@tkeron/html-parser 0.1.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -7
- package/bun.lock +8 -3
- package/index.ts +4 -0
- package/package.json +13 -6
- package/src/css-selector.ts +45 -27
- package/src/dom-simulator.ts +162 -20
- package/src/encoding.ts +39 -0
- package/src/index.ts +9 -0
- package/src/parser.ts +478 -183
- package/src/serializer.ts +450 -0
- package/src/tokenizer.ts +59 -139
- package/tests/advanced.test.ts +119 -106
- package/tests/custom-elements.test.ts +172 -162
- package/tests/dom-extended.test.ts +12 -12
- package/tests/dom-manipulation.test.ts +637 -0
- package/tests/dom.test.ts +32 -27
- package/tests/helpers/tokenizer-adapter.test.ts +70 -0
- package/tests/helpers/tokenizer-adapter.ts +65 -0
- package/tests/helpers/tree-adapter.test.ts +39 -0
- package/tests/helpers/tree-adapter.ts +43 -0
- package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
- package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
- package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
- package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
- package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
- package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
- package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
- package/tests/html5lib-data/tree-construction/math.dat +104 -0
- package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
- package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
- package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
- package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
- package/tests/html5lib-data/tree-construction/svg.dat +104 -0
- package/tests/html5lib-data/tree-construction/template.dat +1673 -0
- package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
- package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
- package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
- package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
- package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
- package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
- package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
- package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
- package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
- package/tests/parser.test.ts +172 -193
- package/tests/selectors.test.ts +64 -1
- package/tests/serializer-core.test.ts +16 -0
- package/tests/serializer-data/core.test +125 -0
- package/tests/serializer-data/injectmeta.test +66 -0
- package/tests/serializer-data/optionaltags.test +965 -0
- package/tests/serializer-data/options.test +60 -0
- package/tests/serializer-data/whitespace.test +51 -0
- package/tests/serializer-injectmeta.test.ts +16 -0
- package/tests/serializer-optionaltags.test.ts +16 -0
- package/tests/serializer-options.test.ts +16 -0
- package/tests/serializer-whitespace.test.ts +16 -0
- package/tests/tokenizer-namedEntities.test.ts +20 -0
- package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
- package/tests/tokenizer.test.ts +83 -0
- package/tests/tree-construction-adoption01.test.ts +37 -0
- package/tests/tree-construction-adoption02.test.ts +34 -0
- package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
- package/tests/tree-construction-entities02.test.ts +33 -0
- package/tests/tree-construction-html5test-com.test.ts +24 -0
- package/tests/tree-construction-math.test.ts +18 -0
- package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
- package/tests/tree-construction-noscript01.test.ts +18 -0
- package/tests/tree-construction-ruby.test.ts +21 -0
- package/tests/tree-construction-scriptdata01.test.ts +21 -0
- package/tests/tree-construction-svg.test.ts +21 -0
- package/tests/tree-construction-template.test.ts +21 -0
- package/tests/tree-construction-tests10.test.ts +21 -0
- package/tests/tree-construction-tests11.test.ts +21 -0
- package/tests/tree-construction-tests20.test.ts +18 -0
- package/tests/tree-construction-tests21.test.ts +18 -0
- package/tests/tree-construction-tests23.test.ts +18 -0
- package/tests/tree-construction-tests24.test.ts +18 -0
- package/tests/tree-construction-tests5.test.ts +21 -0
- package/tests/tree-construction-tests6.test.ts +21 -0
- package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
- package/tests/void-elements.test.ts +471 -0
- package/tests/official/README.md +0 -87
- package/tests/official/acid/acid-tests.test.ts +0 -309
- package/tests/official/final-output/final-output.test.ts +0 -361
- package/tests/official/html5lib/tokenizer-utils.ts +0 -192
- package/tests/official/html5lib/tokenizer.test.ts +0 -171
- package/tests/official/html5lib/tree-construction-utils.ts +0 -194
- package/tests/official/html5lib/tree-construction.test.ts +0 -250
- package/tests/official/validator/validator-tests.test.ts +0 -237
- package/tests/official/validator-nu/validator-nu.test.ts +0 -335
- package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
- package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/src/parser.ts
CHANGED
|
@@ -1,38 +1,14 @@
|
|
|
1
1
|
import type { Token } from './tokenizer.js';
|
|
2
2
|
import { TokenType } from './tokenizer.js';
|
|
3
|
-
|
|
4
|
-
export interface ASTNode {
|
|
5
|
-
type: ASTNodeType;
|
|
6
|
-
tagName?: string;
|
|
7
|
-
attributes?: Record<string, string>;
|
|
8
|
-
children?: ASTNode[];
|
|
9
|
-
content?: string;
|
|
10
|
-
parent?: ASTNode;
|
|
11
|
-
isSelfClosing?: boolean;
|
|
12
|
-
position?: {
|
|
13
|
-
start: number;
|
|
14
|
-
end: number;
|
|
15
|
-
line: number;
|
|
16
|
-
column: number;
|
|
17
|
-
};
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
export enum ASTNodeType {
|
|
21
|
-
DOCUMENT = 'DOCUMENT',
|
|
22
|
-
ELEMENT = 'ELEMENT',
|
|
23
|
-
TEXT = 'TEXT',
|
|
24
|
-
COMMENT = 'COMMENT',
|
|
25
|
-
CDATA = 'CDATA',
|
|
26
|
-
DOCTYPE = 'DOCTYPE',
|
|
27
|
-
PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION'
|
|
28
|
-
}
|
|
3
|
+
import { createDocument, createElement, createTextNode, createComment, createDoctype, appendChild } from './dom-simulator.js';
|
|
29
4
|
|
|
30
5
|
export interface ParserState {
|
|
31
6
|
tokens: Token[];
|
|
32
7
|
position: number;
|
|
33
8
|
length: number;
|
|
34
|
-
stack:
|
|
35
|
-
root:
|
|
9
|
+
stack: any[]; // DOM elements
|
|
10
|
+
root: any; // Document
|
|
11
|
+
insertionMode: InsertionMode;
|
|
36
12
|
errors: ParseError[];
|
|
37
13
|
}
|
|
38
14
|
|
|
@@ -44,6 +20,32 @@ export interface ParseError {
|
|
|
44
20
|
severity: 'error' | 'warning';
|
|
45
21
|
}
|
|
46
22
|
|
|
23
|
+
export enum InsertionMode {
|
|
24
|
+
Initial = 'initial',
|
|
25
|
+
BeforeHtml = 'beforeHtml',
|
|
26
|
+
BeforeHead = 'beforeHead',
|
|
27
|
+
InHead = 'inHead',
|
|
28
|
+
AfterHead = 'afterHead',
|
|
29
|
+
InBody = 'inBody'
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
export enum ASTNodeType {
|
|
33
|
+
Document = 'document',
|
|
34
|
+
Element = 'element',
|
|
35
|
+
Text = 'text',
|
|
36
|
+
Comment = 'comment',
|
|
37
|
+
Doctype = 'doctype',
|
|
38
|
+
CDATA = 'cdata'
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export interface ASTNode {
|
|
42
|
+
type: ASTNodeType;
|
|
43
|
+
tagName?: string;
|
|
44
|
+
value?: string;
|
|
45
|
+
attributes?: Record<string, string>;
|
|
46
|
+
children?: ASTNode[];
|
|
47
|
+
}
|
|
48
|
+
|
|
47
49
|
const VOID_ELEMENTS = new Set([
|
|
48
50
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
|
|
49
51
|
'link', 'meta', 'param', 'source', 'track', 'wbr'
|
|
@@ -57,7 +59,41 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
|
|
|
57
59
|
'li': ['li'],
|
|
58
60
|
'dt': ['dt', 'dd'],
|
|
59
61
|
'dd': ['dt', 'dd'],
|
|
62
|
+
'address': ['p'],
|
|
63
|
+
'article': ['p'],
|
|
64
|
+
'aside': ['p'],
|
|
65
|
+
'blockquote': ['p'],
|
|
66
|
+
'center': ['p'],
|
|
67
|
+
'details': ['p'],
|
|
68
|
+
'dialog': ['p'],
|
|
69
|
+
'dir': ['p'],
|
|
70
|
+
'div': ['p'],
|
|
71
|
+
'dl': ['p'],
|
|
72
|
+
'fieldset': ['p'],
|
|
73
|
+
'figcaption': ['p'],
|
|
74
|
+
'figure': ['p'],
|
|
75
|
+
'footer': ['p'],
|
|
76
|
+
'form': ['p'],
|
|
77
|
+
'h1': ['p'],
|
|
78
|
+
'h2': ['p'],
|
|
79
|
+
'h3': ['p'],
|
|
80
|
+
'h4': ['p'],
|
|
81
|
+
'h5': ['p'],
|
|
82
|
+
'h6': ['p'],
|
|
83
|
+
'header': ['p'],
|
|
84
|
+
'hgroup': ['p'],
|
|
85
|
+
'hr': ['p'],
|
|
86
|
+
'listing': ['p'],
|
|
87
|
+
'main': ['p'],
|
|
88
|
+
'menu': ['p'],
|
|
89
|
+
'nav': ['p'],
|
|
90
|
+
'ol': ['p'],
|
|
60
91
|
'p': ['p'],
|
|
92
|
+
'pre': ['p'],
|
|
93
|
+
'section': ['p'],
|
|
94
|
+
'summary': ['p'],
|
|
95
|
+
'table': ['p'],
|
|
96
|
+
'ul': ['p'],
|
|
61
97
|
'rt': ['rt', 'rp'],
|
|
62
98
|
'rp': ['rt', 'rp'],
|
|
63
99
|
'optgroup': ['optgroup'],
|
|
@@ -70,7 +106,7 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
|
|
|
70
106
|
'th': ['td', 'th']
|
|
71
107
|
};
|
|
72
108
|
|
|
73
|
-
export function parse(tokens: Token[]):
|
|
109
|
+
export function parse(tokens: Token[]): any {
|
|
74
110
|
const state = createParserState(tokens);
|
|
75
111
|
|
|
76
112
|
while (state.position < state.length) {
|
|
@@ -84,21 +120,119 @@ export function parse(tokens: Token[]): ASTNode {
|
|
|
84
120
|
advance(state);
|
|
85
121
|
}
|
|
86
122
|
|
|
123
|
+
// Create implicit html, head, body if needed
|
|
124
|
+
if (state.root.childNodes && state.root.childNodes.length > 0) {
|
|
125
|
+
let hasHtml = false;
|
|
126
|
+
for (const child of state.root.childNodes) {
|
|
127
|
+
if (child.nodeType === 1 && child.tagName === 'HTML') {
|
|
128
|
+
hasHtml = true;
|
|
129
|
+
state.root.documentElement = child;
|
|
130
|
+
break;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
if (!hasHtml) {
|
|
134
|
+
const html = createElement('html', {});
|
|
135
|
+
const head = createElement('head', {});
|
|
136
|
+
const body = createElement('body', {});
|
|
137
|
+
appendChild(html, head);
|
|
138
|
+
appendChild(html, body);
|
|
139
|
+
|
|
140
|
+
const doctypes: any[] = [];
|
|
141
|
+
const children = [...state.root.childNodes];
|
|
142
|
+
for (const child of children) {
|
|
143
|
+
if (child.nodeType === 10) {
|
|
144
|
+
doctypes.push(child);
|
|
145
|
+
} else {
|
|
146
|
+
appendChild(body, child);
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
state.root.childNodes = [];
|
|
151
|
+
for (const doctype of doctypes) {
|
|
152
|
+
doctype.parentNode = null;
|
|
153
|
+
appendChild(state.root, doctype);
|
|
154
|
+
}
|
|
155
|
+
appendChild(state.root, html);
|
|
156
|
+
state.root.documentElement = html;
|
|
157
|
+
state.root.head = head;
|
|
158
|
+
state.root.body = body;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
87
162
|
while (state.stack.length > 1) {
|
|
88
163
|
const unclosedElement = state.stack.pop()!;
|
|
89
164
|
const currentToken = getCurrentToken(state);
|
|
90
|
-
addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.
|
|
165
|
+
addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.offset || 0);
|
|
91
166
|
}
|
|
92
167
|
|
|
93
168
|
return state.root;
|
|
94
169
|
}
|
|
95
170
|
|
|
171
|
+
export function domToAST(dom: any): ASTNode {
|
|
172
|
+
function convert(node: any): ASTNode | null {
|
|
173
|
+
if (!node) return null;
|
|
174
|
+
|
|
175
|
+
if (node.nodeType === 9) {
|
|
176
|
+
const children: ASTNode[] = [];
|
|
177
|
+
if (node.childNodes) {
|
|
178
|
+
for (const child of node.childNodes) {
|
|
179
|
+
const converted = convert(child);
|
|
180
|
+
if (converted) children.push(converted);
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
return {
|
|
184
|
+
type: ASTNodeType.Document,
|
|
185
|
+
children
|
|
186
|
+
};
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
if (node.nodeType === 1) {
|
|
190
|
+
const children: ASTNode[] = [];
|
|
191
|
+
if (node.childNodes) {
|
|
192
|
+
for (const child of node.childNodes) {
|
|
193
|
+
const converted = convert(child);
|
|
194
|
+
if (converted) children.push(converted);
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
const tagName = node.tagName?.toLowerCase();
|
|
198
|
+
return {
|
|
199
|
+
type: ASTNodeType.Element,
|
|
200
|
+
tagName,
|
|
201
|
+
attributes: node.attributes || {},
|
|
202
|
+
children,
|
|
203
|
+
isSelfClosing: VOID_ELEMENTS.has(tagName)
|
|
204
|
+
} as ASTNode & { isSelfClosing: boolean };
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
if (node.nodeType === 3) {
|
|
208
|
+
return {
|
|
209
|
+
type: ASTNodeType.Text,
|
|
210
|
+
content: node.nodeValue || ''
|
|
211
|
+
} as ASTNode & { content: string };
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
if (node.nodeType === 8) {
|
|
215
|
+
return {
|
|
216
|
+
type: ASTNodeType.Comment,
|
|
217
|
+
content: node.nodeValue || ''
|
|
218
|
+
} as ASTNode & { content: string };
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
if (node.nodeType === 10) {
|
|
222
|
+
return {
|
|
223
|
+
type: ASTNodeType.Doctype,
|
|
224
|
+
content: node.name || 'html'
|
|
225
|
+
} as ASTNode & { content: string };
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
return null;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
return convert(dom) || { type: ASTNodeType.Document, children: [] };
|
|
232
|
+
}
|
|
233
|
+
|
|
96
234
|
function createParserState(tokens: Token[]): ParserState {
|
|
97
|
-
const root
|
|
98
|
-
type: ASTNodeType.DOCUMENT,
|
|
99
|
-
children: [],
|
|
100
|
-
tagName: '#document'
|
|
101
|
-
};
|
|
235
|
+
const root = createDocument();
|
|
102
236
|
|
|
103
237
|
return {
|
|
104
238
|
tokens,
|
|
@@ -106,81 +240,223 @@ function createParserState(tokens: Token[]): ParserState {
|
|
|
106
240
|
length: tokens.length,
|
|
107
241
|
stack: [root],
|
|
108
242
|
root,
|
|
243
|
+
insertionMode: InsertionMode.Initial,
|
|
109
244
|
errors: []
|
|
110
245
|
};
|
|
111
246
|
}
|
|
112
247
|
|
|
113
248
|
function parseToken(state: ParserState, token: Token): void {
|
|
114
|
-
switch (
|
|
115
|
-
case
|
|
116
|
-
|
|
249
|
+
switch (state.insertionMode) {
|
|
250
|
+
case InsertionMode.Initial:
|
|
251
|
+
parseTokenInInitialMode(state, token);
|
|
117
252
|
break;
|
|
118
|
-
case
|
|
119
|
-
|
|
253
|
+
case InsertionMode.BeforeHtml:
|
|
254
|
+
parseTokenInBeforeHtmlMode(state, token);
|
|
120
255
|
break;
|
|
121
|
-
case
|
|
122
|
-
|
|
123
|
-
break;
|
|
124
|
-
case TokenType.COMMENT:
|
|
125
|
-
parseComment(state, token);
|
|
256
|
+
case InsertionMode.BeforeHead:
|
|
257
|
+
parseTokenInBeforeHeadMode(state, token);
|
|
126
258
|
break;
|
|
127
|
-
case
|
|
128
|
-
|
|
259
|
+
case InsertionMode.InHead:
|
|
260
|
+
parseTokenInInHeadMode(state, token);
|
|
129
261
|
break;
|
|
130
|
-
case
|
|
131
|
-
|
|
262
|
+
case InsertionMode.AfterHead:
|
|
263
|
+
parseTokenInAfterHeadMode(state, token);
|
|
132
264
|
break;
|
|
133
|
-
case
|
|
134
|
-
|
|
265
|
+
case InsertionMode.InBody:
|
|
266
|
+
parseTokenInInBodyMode(state, token);
|
|
135
267
|
break;
|
|
268
|
+
default:
|
|
269
|
+
parseTokenInInBodyMode(state, token); // fallback
|
|
136
270
|
}
|
|
137
271
|
}
|
|
138
272
|
|
|
139
|
-
function
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
273
|
+
function parseTokenInInitialMode(state: ParserState, token: Token): void {
|
|
274
|
+
if (token.type === TokenType.DOCTYPE) {
|
|
275
|
+
// TODO: Create DOCTYPE node
|
|
276
|
+
parseDoctype(state, token);
|
|
277
|
+
state.insertionMode = InsertionMode.BeforeHtml;
|
|
278
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
279
|
+
parseComment(state, token);
|
|
280
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
281
|
+
// Ignore whitespace
|
|
282
|
+
} else {
|
|
283
|
+
// No DOCTYPE, create implicit DOCTYPE and switch to BeforeHtml
|
|
284
|
+
const doctype = createDoctype('html');
|
|
285
|
+
appendChild(state.root, doctype);
|
|
286
|
+
state.insertionMode = InsertionMode.BeforeHtml;
|
|
287
|
+
parseToken(state, token); // Re-parse in new mode
|
|
288
|
+
}
|
|
289
|
+
}
|
|
145
290
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
291
|
+
function parseTokenInBeforeHtmlMode(state: ParserState, token: Token): void {
|
|
292
|
+
if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'html') {
|
|
293
|
+
const html = createElement('html', token.attributes || {});
|
|
294
|
+
appendChild(state.root, html);
|
|
295
|
+
state.root.documentElement = html;
|
|
296
|
+
state.stack.push(html);
|
|
297
|
+
state.insertionMode = InsertionMode.BeforeHead;
|
|
298
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
299
|
+
parseComment(state, token);
|
|
300
|
+
} else if (token.type === TokenType.DOCTYPE) {
|
|
301
|
+
// Ignore
|
|
302
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
303
|
+
// Ignore whitespace
|
|
304
|
+
} else {
|
|
305
|
+
const html = createElement('html', {});
|
|
306
|
+
appendChild(state.root, html);
|
|
307
|
+
state.root.documentElement = html;
|
|
308
|
+
state.stack.push(html);
|
|
309
|
+
state.insertionMode = InsertionMode.BeforeHead;
|
|
310
|
+
parseToken(state, token);
|
|
311
|
+
}
|
|
312
|
+
}
|
|
155
313
|
|
|
156
|
-
|
|
157
|
-
|
|
314
|
+
function parseTokenInBeforeHeadMode(state: ParserState, token: Token): void {
|
|
315
|
+
if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'head') {
|
|
316
|
+
const head = createElement('head', token.attributes || {});
|
|
317
|
+
appendChild(getCurrentParent(state), head);
|
|
318
|
+
state.root.head = head;
|
|
319
|
+
state.stack.push(head);
|
|
320
|
+
state.insertionMode = InsertionMode.InHead;
|
|
321
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
322
|
+
parseComment(state, token);
|
|
323
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
324
|
+
// Ignore whitespace
|
|
325
|
+
} else {
|
|
326
|
+
const head = createElement('head', {});
|
|
327
|
+
appendChild(getCurrentParent(state), head);
|
|
328
|
+
state.root.head = head;
|
|
329
|
+
state.stack.push(head);
|
|
330
|
+
state.insertionMode = InsertionMode.InHead;
|
|
331
|
+
parseToken(state, token);
|
|
158
332
|
}
|
|
333
|
+
}
|
|
159
334
|
|
|
160
|
-
|
|
335
|
+
function parseOpenTag(state: ParserState, token: Token): void {
|
|
336
|
+
const tagName = token.value.toLowerCase();
|
|
337
|
+
const currentParent = getCurrentParent(state);
|
|
338
|
+
const element = createElement(tagName, token.attributes || {});
|
|
339
|
+
appendChild(currentParent, element);
|
|
340
|
+
|
|
341
|
+
if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
|
|
161
342
|
state.stack.push(element);
|
|
162
343
|
}
|
|
163
344
|
}
|
|
164
345
|
|
|
165
|
-
function
|
|
166
|
-
const
|
|
346
|
+
function parseTokenInInHeadMode(state: ParserState, token: Token): void {
|
|
347
|
+
const currentElement = getCurrentElement(state);
|
|
348
|
+
const currentTagName = currentElement?.tagName?.toLowerCase();
|
|
167
349
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
const unclosedElement = state.stack.pop()!;
|
|
174
|
-
addError(state, `Unclosed tag: ${unclosedElement.tagName}`, token.position?.start || 0);
|
|
175
|
-
}
|
|
350
|
+
if (RAW_TEXT_ELEMENTS.has(currentTagName)) {
|
|
351
|
+
if (token.type === TokenType.TEXT) {
|
|
352
|
+
parseText(state, token);
|
|
353
|
+
return;
|
|
354
|
+
} else if (token.type === TokenType.TAG_CLOSE && token.value.toLowerCase() === currentTagName) {
|
|
176
355
|
state.stack.pop();
|
|
177
|
-
|
|
178
|
-
|
|
356
|
+
return;
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
if (token.type === TokenType.TAG_OPEN) {
|
|
361
|
+
const tagName = token.value.toLowerCase();
|
|
362
|
+
if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
|
|
363
|
+
parseOpenTag(state, token);
|
|
364
|
+
} else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
|
|
365
|
+
parseOpenTag(state, token);
|
|
366
|
+
} else if (tagName === 'head') {
|
|
367
|
+
} else {
|
|
368
|
+
state.stack.pop();
|
|
369
|
+
state.insertionMode = InsertionMode.AfterHead;
|
|
370
|
+
parseToken(state, token);
|
|
371
|
+
}
|
|
372
|
+
} else if (token.type === TokenType.TAG_CLOSE) {
|
|
373
|
+
const tagName = token.value.toLowerCase();
|
|
374
|
+
if (tagName === 'head') {
|
|
375
|
+
state.stack.pop();
|
|
376
|
+
state.insertionMode = InsertionMode.AfterHead;
|
|
377
|
+
} else if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
|
|
378
|
+
if (currentTagName === tagName) {
|
|
379
|
+
state.stack.pop();
|
|
380
|
+
}
|
|
179
381
|
}
|
|
382
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
383
|
+
parseComment(state, token);
|
|
384
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
385
|
+
} else {
|
|
386
|
+
state.stack.pop();
|
|
387
|
+
state.insertionMode = InsertionMode.AfterHead;
|
|
388
|
+
parseToken(state, token);
|
|
180
389
|
}
|
|
390
|
+
}
|
|
181
391
|
|
|
182
|
-
|
|
183
|
-
|
|
392
|
+
function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
|
|
393
|
+
if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'body') {
|
|
394
|
+
const body = createElement('body', token.attributes || {});
|
|
395
|
+
appendChild(getCurrentParent(state), body);
|
|
396
|
+
state.root.body = body;
|
|
397
|
+
state.stack.push(body);
|
|
398
|
+
state.insertionMode = InsertionMode.InBody;
|
|
399
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
400
|
+
parseComment(state, token);
|
|
401
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
402
|
+
// Ignore whitespace
|
|
403
|
+
} else {
|
|
404
|
+
const body = createElement('body', {});
|
|
405
|
+
appendChild(getCurrentParent(state), body);
|
|
406
|
+
state.root.body = body;
|
|
407
|
+
state.stack.push(body);
|
|
408
|
+
state.insertionMode = InsertionMode.InBody;
|
|
409
|
+
parseToken(state, token);
|
|
410
|
+
}
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
function parseTokenInInBodyMode(state: ParserState, token: Token): void {
|
|
414
|
+
if (token.type === TokenType.TAG_OPEN) {
|
|
415
|
+
const tagName = token.value.toLowerCase();
|
|
416
|
+
|
|
417
|
+
handleAutoClosing(state, tagName);
|
|
418
|
+
|
|
419
|
+
const currentParent = getCurrentParent(state);
|
|
420
|
+
|
|
421
|
+
const element = createElement(tagName, token.attributes || {});
|
|
422
|
+
|
|
423
|
+
appendChild(currentParent, element);
|
|
424
|
+
|
|
425
|
+
if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
|
|
426
|
+
state.stack.push(element);
|
|
427
|
+
}
|
|
428
|
+
} else if (token.type === TokenType.TAG_CLOSE) {
|
|
429
|
+
const tagName = token.value.toLowerCase();
|
|
430
|
+
|
|
431
|
+
// Generate implied end tags
|
|
432
|
+
const impliedEndTags = ['dd', 'dt', 'li', 'option', 'optgroup', 'p', 'rb', 'rp', 'rt', 'rtc'];
|
|
433
|
+
while (state.stack.length > 1) { // Don't pop document
|
|
434
|
+
const currentElement = getCurrentElement(state);
|
|
435
|
+
if (!currentElement || !impliedEndTags.includes(currentElement.tagName.toLowerCase()) || currentElement.tagName.toLowerCase() === tagName) {
|
|
436
|
+
break;
|
|
437
|
+
}
|
|
438
|
+
state.stack.pop();
|
|
439
|
+
addError(state, `Implied end tag: ${currentElement.tagName}`, token.position?.offset || 0);
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
const currentElement = getCurrentElement(state);
|
|
443
|
+
if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
|
|
444
|
+
state.stack.pop();
|
|
445
|
+
} else {
|
|
446
|
+
// For now, just ignore unmatched closing tags
|
|
447
|
+
// TODO: Implement full adoption agency algorithm
|
|
448
|
+
addError(state, `Unmatched closing tag: ${tagName}`, token.position?.offset || 0);
|
|
449
|
+
}
|
|
450
|
+
} else if (token.type === TokenType.TEXT) {
|
|
451
|
+
parseText(state, token);
|
|
452
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
453
|
+
parseComment(state, token);
|
|
454
|
+
} else if (token.type === TokenType.CDATA) {
|
|
455
|
+
parseCDATA(state, token);
|
|
456
|
+
} else if (token.type === TokenType.DOCTYPE) {
|
|
457
|
+
// Ignore
|
|
458
|
+
} else if (token.type === TokenType.PROCESSING_INSTRUCTION) {
|
|
459
|
+
parseProcessingInstruction(state, token);
|
|
184
460
|
}
|
|
185
461
|
}
|
|
186
462
|
|
|
@@ -192,76 +468,134 @@ function parseText(state: ParserState, token: Token): void {
|
|
|
192
468
|
return;
|
|
193
469
|
}
|
|
194
470
|
|
|
195
|
-
const textNode
|
|
196
|
-
|
|
197
|
-
content,
|
|
198
|
-
parent: currentParent,
|
|
199
|
-
position: token.position
|
|
200
|
-
};
|
|
201
|
-
|
|
202
|
-
if (currentParent.children) {
|
|
203
|
-
currentParent.children.push(textNode);
|
|
204
|
-
}
|
|
471
|
+
const textNode = createTextNode(content);
|
|
472
|
+
appendChild(currentParent, textNode);
|
|
205
473
|
}
|
|
206
474
|
|
|
207
475
|
function parseComment(state: ParserState, token: Token): void {
|
|
208
476
|
const currentParent = getCurrentParent(state);
|
|
209
477
|
|
|
210
|
-
const commentNode
|
|
211
|
-
|
|
212
|
-
content: token.value,
|
|
213
|
-
parent: currentParent,
|
|
214
|
-
position: token.position
|
|
215
|
-
};
|
|
216
|
-
|
|
217
|
-
if (currentParent.children) {
|
|
218
|
-
currentParent.children.push(commentNode);
|
|
219
|
-
}
|
|
478
|
+
const commentNode = createComment(token.value);
|
|
479
|
+
appendChild(currentParent, commentNode);
|
|
220
480
|
}
|
|
221
481
|
|
|
222
482
|
function parseCDATA(state: ParserState, token: Token): void {
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
const cdataNode: ASTNode = {
|
|
226
|
-
type: ASTNodeType.CDATA,
|
|
227
|
-
content: token.value,
|
|
228
|
-
parent: currentParent,
|
|
229
|
-
position: token.position
|
|
230
|
-
};
|
|
231
|
-
|
|
232
|
-
if (currentParent.children) {
|
|
233
|
-
currentParent.children.push(cdataNode);
|
|
234
|
-
}
|
|
483
|
+
// TODO: implement CDATA
|
|
235
484
|
}
|
|
236
485
|
|
|
237
486
|
function parseDoctype(state: ParserState, token: Token): void {
|
|
238
|
-
const
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
type: ASTNodeType.DOCTYPE,
|
|
242
|
-
content: token.value,
|
|
243
|
-
parent: currentParent,
|
|
244
|
-
position: token.position
|
|
245
|
-
};
|
|
246
|
-
|
|
247
|
-
if (currentParent.children) {
|
|
248
|
-
currentParent.children.push(doctypeNode);
|
|
249
|
-
}
|
|
487
|
+
const doctype = createDoctype(token.value || 'html');
|
|
488
|
+
appendChild(state.root, doctype);
|
|
489
|
+
state.root.doctype = doctype;
|
|
250
490
|
}
|
|
251
491
|
|
|
252
492
|
function parseProcessingInstruction(state: ParserState, token: Token): void {
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
const piNode: ASTNode = {
|
|
256
|
-
type: ASTNodeType.PROCESSING_INSTRUCTION,
|
|
257
|
-
content: token.value,
|
|
258
|
-
parent: currentParent,
|
|
259
|
-
position: token.position
|
|
260
|
-
};
|
|
493
|
+
// TODO: implement ProcessingInstruction
|
|
494
|
+
}
|
|
261
495
|
|
|
262
|
-
|
|
263
|
-
|
|
496
|
+
function runAdoptionAgencyAlgorithm(state: ParserState, tagName: string, token: Token): void {
|
|
497
|
+
// HTML5 Adoption Agency Algorithm - simplified but more correct implementation
|
|
498
|
+
|
|
499
|
+
// 1. If the current node is an HTML element whose tag name matches the token's tag name,
|
|
500
|
+
// then pop the current node off the stack of open elements and abort these steps.
|
|
501
|
+
const currentElement = getCurrentElement(state);
|
|
502
|
+
if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
|
|
503
|
+
state.stack.pop();
|
|
504
|
+
return;
|
|
264
505
|
}
|
|
506
|
+
|
|
507
|
+
// 2. Let outer loop counter be 0
|
|
508
|
+
let outerLoopCounter = 0;
|
|
509
|
+
const formattingElements = ['a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'];
|
|
510
|
+
|
|
511
|
+
while (outerLoopCounter < 8) { // Prevent infinite loops
|
|
512
|
+
outerLoopCounter++;
|
|
513
|
+
|
|
514
|
+
// 3. Let the formatting element be the last element in the list of active formatting elements
|
|
515
|
+
// that is between the end of the list and the last scope marker or the start of the list,
|
|
516
|
+
// if any, that has the same tag name as the token.
|
|
517
|
+
|
|
518
|
+
// For simplicity, find the innermost element with matching tag name
|
|
519
|
+
let formattingElementIndex = -1;
|
|
520
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
521
|
+
const element = state.stack[i];
|
|
522
|
+
if (element.tagName && element.tagName.toLowerCase() === tagName && formattingElements.includes(tagName)) {
|
|
523
|
+
formattingElementIndex = i;
|
|
524
|
+
break;
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
if (formattingElementIndex === -1) {
|
|
529
|
+
// No formatting element found, just find any element with matching tag name
|
|
530
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
531
|
+
const element = state.stack[i];
|
|
532
|
+
if (element.tagName && element.tagName.toLowerCase() === tagName) {
|
|
533
|
+
formattingElementIndex = i;
|
|
534
|
+
break;
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
if (formattingElementIndex === -1) {
|
|
540
|
+
// No matching element found, ignore the token
|
|
541
|
+
addError(state, `Stray end tag: ${tagName}`, token.position?.offset || 0);
|
|
542
|
+
return;
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
const formattingElement = state.stack[formattingElementIndex];
|
|
546
|
+
|
|
547
|
+
// 4. If there is no element in the stack of open elements that has the same tag name as the
|
|
548
|
+
// formatting element, then remove the element from the list of active formatting elements
|
|
549
|
+
// and abort these steps.
|
|
550
|
+
let openElementIndex = -1;
|
|
551
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
552
|
+
if (state.stack[i] === formattingElement) {
|
|
553
|
+
openElementIndex = i;
|
|
554
|
+
break;
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
if (openElementIndex === -1) {
|
|
559
|
+
// Element not in stack, ignore
|
|
560
|
+
return;
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
// 5. If the element is not in the stack of open elements, then this is a parse error;
|
|
564
|
+
// remove the element from the list of active formatting elements and abort these steps.
|
|
565
|
+
// (Already checked above)
|
|
566
|
+
|
|
567
|
+
// 6. Let the furthest block be the topmost node in the stack of open elements that is lower
|
|
568
|
+
// in the stack than the formatting element, and is an element in the special category.
|
|
569
|
+
const specialElements = ['address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'listing', 'main', 'menu', 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul', 'xmp'];
|
|
570
|
+
|
|
571
|
+
let furthestBlockIndex = -1;
|
|
572
|
+
for (let i = openElementIndex + 1; i < state.stack.length; i++) {
|
|
573
|
+
const element = state.stack[i];
|
|
574
|
+
if (element.tagName && specialElements.includes(element.tagName.toLowerCase())) {
|
|
575
|
+
furthestBlockIndex = i;
|
|
576
|
+
break;
|
|
577
|
+
}
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
if (furthestBlockIndex === -1) {
|
|
581
|
+
// No special element found, just pop elements until we reach the formatting element
|
|
582
|
+
while (state.stack.length > openElementIndex + 1) {
|
|
583
|
+
state.stack.pop();
|
|
584
|
+
}
|
|
585
|
+
state.stack.pop(); // Pop the formatting element
|
|
586
|
+
return;
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
// 7. Simplified: just pop everything until the formatting element
|
|
590
|
+
while (state.stack.length > openElementIndex + 1) {
|
|
591
|
+
state.stack.pop();
|
|
592
|
+
}
|
|
593
|
+
state.stack.pop(); // Pop the formatting element
|
|
594
|
+
return;
|
|
595
|
+
}
|
|
596
|
+
|
|
597
|
+
// If we get here, something went wrong, ignore the token
|
|
598
|
+
addError(state, `Adoption agency gave up on: ${tagName}`, token.position?.offset || 0);
|
|
265
599
|
}
|
|
266
600
|
|
|
267
601
|
function handleAutoClosing(state: ParserState, tagName: string): void {
|
|
@@ -269,19 +603,19 @@ function handleAutoClosing(state: ParserState, tagName: string): void {
|
|
|
269
603
|
if (!autoCloseList) return;
|
|
270
604
|
|
|
271
605
|
const currentElement = getCurrentElement(state);
|
|
272
|
-
if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName)) {
|
|
606
|
+
if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName.toLowerCase())) {
|
|
273
607
|
state.stack.pop();
|
|
274
608
|
}
|
|
275
609
|
}
|
|
276
610
|
|
|
277
|
-
function getCurrentParent(state: ParserState):
|
|
278
|
-
return state.stack[state.stack.length - 1]
|
|
611
|
+
function getCurrentParent(state: ParserState): any {
|
|
612
|
+
return state.stack[state.stack.length - 1];
|
|
279
613
|
}
|
|
280
614
|
|
|
281
|
-
function getCurrentElement(state: ParserState):
|
|
615
|
+
function getCurrentElement(state: ParserState): any {
|
|
282
616
|
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
283
|
-
const element = state.stack[i]
|
|
284
|
-
if (element.
|
|
617
|
+
const element = state.stack[i];
|
|
618
|
+
if (element.nodeType === 1) { // ELEMENT_NODE
|
|
285
619
|
return element;
|
|
286
620
|
}
|
|
287
621
|
}
|
|
@@ -306,7 +640,7 @@ function addError(state: ParserState, message: string, position: number): void {
|
|
|
306
640
|
});
|
|
307
641
|
}
|
|
308
642
|
|
|
309
|
-
function shouldSkipWhitespace(parent:
|
|
643
|
+
function shouldSkipWhitespace(parent: any): boolean {
|
|
310
644
|
const skipWhitespaceIn = new Set([
|
|
311
645
|
'html', 'head', 'body', 'table', 'tbody', 'thead', 'tfoot', 'tr',
|
|
312
646
|
'ul', 'ol', 'dl', 'select', 'optgroup'
|
|
@@ -314,42 +648,3 @@ function shouldSkipWhitespace(parent: ASTNode): boolean {
|
|
|
314
648
|
|
|
315
649
|
return parent.tagName ? skipWhitespaceIn.has(parent.tagName) : false;
|
|
316
650
|
}
|
|
317
|
-
|
|
318
|
-
export function traverseAST(node: ASTNode, callback: (node: ASTNode) => void): void {
|
|
319
|
-
callback(node);
|
|
320
|
-
|
|
321
|
-
if (node.children) {
|
|
322
|
-
for (const child of node.children) {
|
|
323
|
-
traverseAST(child, callback);
|
|
324
|
-
}
|
|
325
|
-
}
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
export function findNodesByTagName(root: ASTNode, tagName: string): ASTNode[] {
|
|
329
|
-
const results: ASTNode[] = [];
|
|
330
|
-
|
|
331
|
-
traverseAST(root, (node) => {
|
|
332
|
-
if (node.type === ASTNodeType.ELEMENT && node.tagName === tagName.toLowerCase()) {
|
|
333
|
-
results.push(node);
|
|
334
|
-
}
|
|
335
|
-
});
|
|
336
|
-
|
|
337
|
-
return results;
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
export function findNodesByAttribute(root: ASTNode, attrName: string, attrValue?: string): ASTNode[] {
|
|
341
|
-
const results: ASTNode[] = [];
|
|
342
|
-
|
|
343
|
-
traverseAST(root, (node) => {
|
|
344
|
-
if (node.type === ASTNodeType.ELEMENT && node.attributes) {
|
|
345
|
-
const hasAttr = attrName in node.attributes;
|
|
346
|
-
const valueMatches = attrValue === undefined || node.attributes[attrName] === attrValue;
|
|
347
|
-
|
|
348
|
-
if (hasAttr && valueMatches) {
|
|
349
|
-
results.push(node);
|
|
350
|
-
}
|
|
351
|
-
}
|
|
352
|
-
});
|
|
353
|
-
|
|
354
|
-
return results;
|
|
355
|
-
}
|