@tkeron/html-parser 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +14 -4
- package/README.md +6 -6
- package/bun.lock +6 -8
- package/check-versions.ts +147 -0
- package/index.ts +4 -8
- package/package.json +5 -6
- package/src/dom-simulator/append-child.ts +130 -0
- package/src/dom-simulator/append.ts +18 -0
- package/src/dom-simulator/attributes.ts +23 -0
- package/src/dom-simulator/clone-node.ts +51 -0
- package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
- package/src/dom-simulator/create-cdata.ts +18 -0
- package/src/dom-simulator/create-comment.ts +23 -0
- package/src/dom-simulator/create-doctype.ts +24 -0
- package/src/dom-simulator/create-document.ts +81 -0
- package/src/dom-simulator/create-element.ts +195 -0
- package/src/dom-simulator/create-processing-instruction.ts +19 -0
- package/src/dom-simulator/create-temp-parent.ts +9 -0
- package/src/dom-simulator/create-text-node.ts +23 -0
- package/src/dom-simulator/escape-text-content.ts +6 -0
- package/src/dom-simulator/find-special-elements.ts +14 -0
- package/src/dom-simulator/get-text-content.ts +18 -0
- package/src/dom-simulator/index.ts +36 -0
- package/src/dom-simulator/inner-outer-html.ts +182 -0
- package/src/dom-simulator/insert-after.ts +20 -0
- package/src/dom-simulator/insert-before.ts +108 -0
- package/src/dom-simulator/matches.ts +26 -0
- package/src/dom-simulator/node-types.ts +26 -0
- package/src/dom-simulator/prepend.ts +24 -0
- package/src/dom-simulator/remove-child.ts +68 -0
- package/src/dom-simulator/remove.ts +7 -0
- package/src/dom-simulator/replace-child.ts +152 -0
- package/src/dom-simulator/set-text-content.ts +33 -0
- package/src/dom-simulator/update-element-content.ts +56 -0
- package/src/dom-simulator.ts +12 -1126
- package/src/encoding/constants.ts +8 -0
- package/src/encoding/detect-encoding.ts +21 -0
- package/src/encoding/index.ts +1 -0
- package/src/encoding/normalize-encoding.ts +6 -0
- package/src/html-entities.ts +2127 -0
- package/src/index.ts +5 -5
- package/src/parser/adoption-agency-helpers.ts +145 -0
- package/src/parser/constants.ts +137 -0
- package/src/parser/dom-to-ast.ts +79 -0
- package/src/parser/index.ts +9 -0
- package/src/parser/parse.ts +772 -0
- package/src/parser/types.ts +56 -0
- package/src/selectors/find-elements-descendant.ts +47 -0
- package/src/selectors/index.ts +2 -0
- package/src/selectors/matches-selector.ts +12 -0
- package/src/selectors/matches-token.ts +27 -0
- package/src/selectors/parse-selector.ts +48 -0
- package/src/selectors/query-selector-all.ts +43 -0
- package/src/selectors/query-selector.ts +6 -0
- package/src/selectors/types.ts +10 -0
- package/src/serializer/attributes.ts +74 -0
- package/src/serializer/escape.ts +13 -0
- package/src/serializer/index.ts +1 -0
- package/src/serializer/serialize-tokens.ts +511 -0
- package/src/tokenizer/calculate-position.ts +10 -0
- package/src/tokenizer/constants.ts +11 -0
- package/src/tokenizer/decode-entities.ts +64 -0
- package/src/tokenizer/index.ts +2 -0
- package/src/tokenizer/parse-attributes.ts +74 -0
- package/src/tokenizer/tokenize.ts +165 -0
- package/src/tokenizer/types.ts +25 -0
- package/tests/adoption-agency-helpers.test.ts +304 -0
- package/tests/advanced.test.ts +242 -221
- package/tests/cloneNode.test.ts +19 -66
- package/tests/custom-elements-head.test.ts +54 -55
- package/tests/dom-extended.test.ts +77 -64
- package/tests/dom-manipulation.test.ts +51 -24
- package/tests/dom.test.ts +15 -13
- package/tests/encoding/detect-encoding.test.ts +33 -0
- package/tests/google-dom.test.ts +2 -2
- package/tests/helpers/tokenizer-adapter.test.ts +29 -43
- package/tests/helpers/tokenizer-adapter.ts +36 -33
- package/tests/helpers/tree-adapter.test.ts +20 -20
- package/tests/helpers/tree-adapter.ts +34 -24
- package/tests/html-entities-text.test.ts +6 -2
- package/tests/innerhtml-void-elements.test.ts +52 -36
- package/tests/outerHTML-replacement.test.ts +37 -65
- package/tests/parser/dom-to-ast.test.ts +109 -0
- package/tests/parser/parse.test.ts +139 -0
- package/tests/parser.test.ts +281 -217
- package/tests/selectors/query-selector-all.test.ts +39 -0
- package/tests/selectors/query-selector.test.ts +42 -0
- package/tests/serializer/attributes.test.ts +132 -0
- package/tests/serializer/escape.test.ts +51 -0
- package/tests/serializer/serialize-tokens.test.ts +80 -0
- package/tests/serializer-core.test.ts +6 -6
- package/tests/serializer-injectmeta.test.ts +6 -6
- package/tests/serializer-optionaltags.test.ts +9 -6
- package/tests/serializer-options.test.ts +6 -6
- package/tests/serializer-whitespace.test.ts +6 -6
- package/tests/tokenizer/calculate-position.test.ts +34 -0
- package/tests/tokenizer/decode-entities.test.ts +31 -0
- package/tests/tokenizer/parse-attributes.test.ts +44 -0
- package/tests/tokenizer/tokenize.test.ts +757 -0
- package/tests/tokenizer-namedEntities.test.ts +10 -7
- package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
- package/tests/tokenizer.test.ts +268 -256
- package/tests/tree-construction-adoption01.test.ts +25 -16
- package/tests/tree-construction-adoption02.test.ts +30 -19
- package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
- package/tests/tree-construction-entities02.test.ts +18 -16
- package/tests/tree-construction-html5test-com.test.ts +16 -10
- package/tests/tree-construction-math.test.ts +11 -9
- package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
- package/tests/tree-construction-noscript01.test.ts +11 -9
- package/tests/tree-construction-ruby.test.ts +6 -4
- package/tests/tree-construction-scriptdata01.test.ts +6 -4
- package/tests/tree-construction-svg.test.ts +6 -4
- package/tests/tree-construction-template.test.ts +6 -4
- package/tests/tree-construction-tests10.test.ts +6 -4
- package/tests/tree-construction-tests11.test.ts +6 -4
- package/tests/tree-construction-tests20.test.ts +7 -4
- package/tests/tree-construction-tests21.test.ts +7 -4
- package/tests/tree-construction-tests23.test.ts +7 -4
- package/tests/tree-construction-tests24.test.ts +7 -4
- package/tests/tree-construction-tests5.test.ts +6 -5
- package/tests/tree-construction-tests6.test.ts +6 -5
- package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
- package/tests/void-elements.test.ts +85 -40
- package/tsconfig.json +1 -1
- package/src/css-selector.ts +0 -185
- package/src/encoding.ts +0 -39
- package/src/parser.ts +0 -682
- package/src/serializer.ts +0 -450
- package/src/tokenizer.ts +0 -325
- package/tests/selectors.test.ts +0 -128
package/src/parser.ts
DELETED
|
@@ -1,682 +0,0 @@
|
|
|
1
|
-
import type { Token } from './tokenizer.js';
|
|
2
|
-
import { TokenType } from './tokenizer.js';
|
|
3
|
-
import { createDocument, createElement, createTextNode, createComment, createDoctype, appendChild } from './dom-simulator.js';
|
|
4
|
-
|
|
5
|
-
/**
 * Mutable state shared by every parsing routine in this module.
 */
export interface ParserState {
  /** Token stream being consumed. */
  tokens: Token[];
  /** Index into `tokens` of the token currently being processed. */
  position: number;
  /** `tokens.length`, captured once when the state is created. */
  length: number;
  /** Stack of open nodes: the document root at index 0, open DOM elements above it. */
  stack: any[];
  /** The document node that `parse` returns. */
  root: any;
  /** Insertion mode used to dispatch the next token. */
  insertionMode: InsertionMode;
  /** Diagnostics collected while parsing. */
  errors: ParseError[];
}
|
|
14
|
-
|
|
15
|
-
/**
 * A diagnostic recorded while parsing.
 */
export interface ParseError {
  /** Human-readable description of the problem. */
  message: string;
  /** Position associated with the problem (the parser passes a token offset, or 0 when none is known). */
  position: number;
  /** Line number; `addError` in this module always stores 0. */
  line: number;
  /** Column number; `addError` in this module always stores 0. */
  column: number;
  /** Severity level; `addError` in this module always stores 'error'. */
  severity: 'error' | 'warning';
}
|
|
22
|
-
|
|
23
|
-
/**
 * Insertion modes of the tree builder; `parseToken` dispatches on the current value.
 */
export enum InsertionMode {
  /** Start of input: waiting for a DOCTYPE. */
  Initial = 'initial',
  /** Waiting for (or about to synthesize) the `<html>` element. */
  BeforeHtml = 'beforeHtml',
  /** Waiting for (or about to synthesize) the `<head>` element. */
  BeforeHead = 'beforeHead',
  /** Processing the contents of `<head>`. */
  InHead = 'inHead',
  /** `<head>` has been left: waiting for (or about to synthesize) `<body>`. */
  AfterHead = 'afterHead',
  /** Processing the contents of `<body>`. */
  InBody = 'inBody'
}
|
|
31
|
-
|
|
32
|
-
/**
 * Discriminator for the nodes produced by `domToAST`.
 * `domToAST` in this module emits every kind except `CDATA`.
 */
export enum ASTNodeType {
  Document = 'document',
  Element = 'element',
  Text = 'text',
  Comment = 'comment',
  Doctype = 'doctype',
  CDATA = 'cdata'
}
|
|
40
|
-
|
|
41
|
-
/**
 * Plain-object tree node produced by `domToAST`.
 *
 * Which optional fields are present depends on `type`:
 * - Document: `children`
 * - Element: `tagName`, `attributes`, `children`, `isSelfClosing`
 * - Text / Comment / Doctype: `content`
 */
export interface ASTNode {
  /** Kind of node. */
  type: ASTNodeType;
  /** Lower-cased tag name (element nodes). */
  tagName?: string;
  /** Kept for backward compatibility; `domToAST` does not populate it. */
  value?: string;
  /** Text of a text or comment node, or the name of a doctype node, as emitted by `domToAST`. */
  content?: string;
  /** Attribute map (element nodes). */
  attributes?: Record<string, string>;
  /** Child nodes (document and element nodes). */
  children?: ASTNode[];
  /** True when the element's tag name is a void element (element nodes). */
  isSelfClosing?: boolean;
}
|
|
48
|
-
|
|
49
|
-
/**
 * Lower-case tag names of void elements. These are never pushed onto the stack of open
 * elements, and `domToAST` reports them as self-closing.
 */
const VOID_ELEMENTS = new Set([
  'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
  'link', 'meta', 'param', 'source', 'track', 'wbr'
]);
|
|
53
|
-
|
|
54
|
-
/**
 * Lower-case tag names that the in-head rules treat specially: while one of them is the
 * current element, text tokens are appended to it and only its own end tag closes it.
 */
const RAW_TEXT_ELEMENTS = new Set([
  'script', 'style', 'textarea', 'title'
]);
|
|
57
|
-
|
|
58
|
-
/**
 * Auto-closing rules keyed by the lower-case name of a start tag. The value lists the
 * lower-case names of elements that are implicitly closed when that start tag is seen
 * while one of them is the current element.
 *
 * The table is a null-prototype object: keys come straight from parsed markup, and on an
 * ordinary object literal a tag such as `<constructor>` would resolve to an inherited
 * `Object.prototype` member instead of `undefined`.
 */
const AUTO_CLOSE_RULES: Record<string, string[]> = Object.assign(Object.create(null), {
  'li': ['li'],
  'dt': ['dt', 'dd'],
  'dd': ['dt', 'dd'],
  'address': ['p'],
  'article': ['p'],
  'aside': ['p'],
  'blockquote': ['p'],
  'center': ['p'],
  'details': ['p'],
  'dialog': ['p'],
  'dir': ['p'],
  'div': ['p'],
  'dl': ['p'],
  'fieldset': ['p'],
  'figcaption': ['p'],
  'figure': ['p'],
  'footer': ['p'],
  'form': ['p'],
  'h1': ['p'],
  'h2': ['p'],
  'h3': ['p'],
  'h4': ['p'],
  'h5': ['p'],
  'h6': ['p'],
  'header': ['p'],
  'hgroup': ['p'],
  'hr': ['p'],
  'listing': ['p'],
  'main': ['p'],
  'menu': ['p'],
  'nav': ['p'],
  'ol': ['p'],
  'p': ['p'],
  'pre': ['p'],
  'section': ['p'],
  'summary': ['p'],
  'table': ['p'],
  'ul': ['p'],
  'rt': ['rt', 'rp'],
  'rp': ['rt', 'rp'],
  'optgroup': ['optgroup'],
  'option': ['option'],
  'thead': ['tbody', 'tfoot'],
  'tbody': ['thead', 'tbody', 'tfoot'],
  'tfoot': ['thead', 'tbody'],
  'tr': ['tr'],
  'td': ['td', 'th'],
  'th': ['td', 'th']
});
|
|
108
|
-
|
|
109
|
-
/**
 * Builds a DOM document from a token stream.
 *
 * Tokens are fed through the insertion-mode handlers until the stream ends or an EOF token
 * is reached. If the resulting document has children but no `HTML` element, an
 * html/head/body skeleton is synthesized around them. Any element still open at the end is
 * recorded as an "Unclosed tag" error.
 *
 * @param tokens Tokens to consume.
 * @returns The document node.
 */
export function parse(tokens: Token[]): any {
  const state = createParserState(tokens);

  // Drive the state machine; stop at the end of the stream or at EOF.
  let token = getCurrentToken(state);
  while (state.position < state.length && token && token.type !== TokenType.EOF) {
    parseToken(state, token);
    advance(state);
    token = getCurrentToken(state);
  }

  const root = state.root;
  if (root.childNodes && root.childNodes.length > 0) {
    let htmlElement: any = null;
    for (const node of root.childNodes) {
      if (node.nodeType === 1 && node.tagName === 'HTML') {
        htmlElement = node;
        break;
      }
    }

    if (htmlElement) {
      root.documentElement = htmlElement;
    } else {
      // No <html> was produced: wrap the loose content in an implicit skeleton.
      const html = createElement('html', {});
      const head = createElement('head', {});
      const body = createElement('body', {});
      appendChild(html, head);
      appendChild(html, body);

      // Doctypes and comments that precede the first element stay at document level;
      // everything else moves into <body>.
      const doctypes: any[] = [];
      const leadingComments: any[] = [];
      const bodyContent: any[] = [];
      let sawElement = false;
      for (const node of [...root.childNodes]) {
        if (node.nodeType === 10) {
          doctypes.push(node);
          continue;
        }
        if (node.nodeType === 8 && !sawElement) {
          leadingComments.push(node);
          continue;
        }
        if (node.nodeType === 1) sawElement = true;
        bodyContent.push(node);
      }

      bodyContent.forEach((node) => appendChild(body, node));

      // Rebuild the document's child list: doctypes, leading comments, then <html>.
      root.childNodes = [];
      for (const node of [...doctypes, ...leadingComments]) {
        node.parentNode = null;
        appendChild(root, node);
      }
      appendChild(root, html);
      root.documentElement = html;
      root.head = head;
      root.body = body;
    }
  }

  // Everything above the document root that is still on the stack was never closed.
  const lastToken = getCurrentToken(state);
  const errorOffset = lastToken?.position?.offset || 0;
  while (state.stack.length > 1) {
    const unclosed = state.stack.pop()!;
    addError(state, `Unclosed tag: ${unclosed.tagName}`, errorOffset);
  }

  return root;
}
|
|
185
|
-
|
|
186
|
-
/**
 * Converts a DOM tree into a plain `ASTNode` tree.
 *
 * Document (9), element (1), text (3), comment (8) and doctype (10) nodes are converted;
 * nodes of any other type are dropped.
 *
 * @param dom Root DOM node to convert.
 * @returns The converted node, or an empty Document node when `dom` cannot be converted.
 */
export function domToAST(dom: any): ASTNode {
  // Converts the convertible children of `parent`, preserving order.
  const convertChildren = (parent: any): ASTNode[] => {
    const result: ASTNode[] = [];
    if (parent.childNodes) {
      for (const child of parent.childNodes) {
        const ast = convert(child);
        if (ast) result.push(ast);
      }
    }
    return result;
  };

  const convert = (node: any): ASTNode | null => {
    if (!node) return null;

    switch (node.nodeType) {
      case 9:
        return {
          type: ASTNodeType.Document,
          children: convertChildren(node)
        };
      case 1: {
        const children = convertChildren(node);
        const tagName = node.tagName?.toLowerCase();
        return {
          type: ASTNodeType.Element,
          tagName,
          attributes: node.attributes || {},
          children,
          isSelfClosing: VOID_ELEMENTS.has(tagName)
        } as ASTNode & { isSelfClosing: boolean };
      }
      case 3:
        return {
          type: ASTNodeType.Text,
          content: node.nodeValue || ''
        } as ASTNode & { content: string };
      case 8:
        return {
          type: ASTNodeType.Comment,
          content: node.nodeValue || ''
        } as ASTNode & { content: string };
      case 10:
        return {
          type: ASTNodeType.Doctype,
          content: node.name || 'html'
        } as ASTNode & { content: string };
      default:
        return null;
    }
  };

  return convert(dom) || { type: ASTNodeType.Document, children: [] };
}
|
|
248
|
-
|
|
249
|
-
/**
 * Creates the initial parser state for a token stream: a fresh document that is both the
 * root and the only entry on the stack, position 0, and the Initial insertion mode.
 *
 * @param tokens Tokens to be parsed.
 */
function createParserState(tokens: Token[]): ParserState {
  const documentNode = createDocument();

  const initialState: ParserState = {
    tokens,
    position: 0,
    length: tokens.length,
    stack: [documentNode],
    root: documentNode,
    insertionMode: InsertionMode.Initial,
    errors: []
  };
  return initialState;
}
|
|
262
|
-
|
|
263
|
-
/**
 * Routes a token to the handler for the parser's current insertion mode.
 *
 * @param state Parser state; its `insertionMode` selects the handler.
 * @param token Token to process.
 */
function parseToken(state: ParserState, token: Token): void {
  const mode = state.insertionMode;

  if (mode === InsertionMode.Initial) return parseTokenInInitialMode(state, token);
  if (mode === InsertionMode.BeforeHtml) return parseTokenInBeforeHtmlMode(state, token);
  if (mode === InsertionMode.BeforeHead) return parseTokenInBeforeHeadMode(state, token);
  if (mode === InsertionMode.InHead) return parseTokenInInHeadMode(state, token);
  if (mode === InsertionMode.AfterHead) return parseTokenInAfterHeadMode(state, token);

  // InBody, and any mode not listed above, is handled by the in-body rules.
  parseTokenInInBodyMode(state, token);
}
|
|
287
|
-
|
|
288
|
-
/**
 * Handles a token in the Initial insertion mode.
 *
 * A DOCTYPE token is inserted and moves the parser to BeforeHtml; comments are inserted;
 * whitespace-only text is ignored. Any other token causes an implicit `html` doctype to be
 * appended before the token is reprocessed in BeforeHtml mode.
 */
function parseTokenInInitialMode(state: ParserState, token: Token): void {
  if (token.type === TokenType.DOCTYPE) {
    parseDoctype(state, token);
    state.insertionMode = InsertionMode.BeforeHtml;
    return;
  }

  if (token.type === TokenType.COMMENT) {
    parseComment(state, token);
    return;
  }

  const isBlankText = token.type === TokenType.TEXT && token.value.trim() === '';
  if (isBlankText) return;

  // No DOCTYPE in the input: synthesize one, then reprocess the token in the next mode.
  appendChild(state.root, createDoctype('html'));
  state.insertionMode = InsertionMode.BeforeHtml;
  parseToken(state, token);
}
|
|
305
|
-
|
|
306
|
-
/**
 * Handles a token in the BeforeHtml insertion mode.
 *
 * Comments are inserted; DOCTYPE tokens and whitespace-only text are ignored. An `<html>`
 * start tag creates the root element with its attributes. Any other token creates an
 * attribute-less `<html>` element and is then reprocessed in BeforeHead mode.
 */
function parseTokenInBeforeHtmlMode(state: ParserState, token: Token): void {
  const isHtmlStartTag =
    token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'html';

  if (!isHtmlStartTag) {
    if (token.type === TokenType.COMMENT) {
      parseComment(state, token);
      return;
    }
    if (token.type === TokenType.DOCTYPE) return;
    if (token.type === TokenType.TEXT && token.value.trim() === '') return;
  }

  // Explicit <html> keeps its attributes; an implied one has none.
  const html = createElement('html', isHtmlStartTag ? (token.attributes || {}) : {});
  appendChild(state.root, html);
  state.root.documentElement = html;
  state.stack.push(html);
  state.insertionMode = InsertionMode.BeforeHead;

  // The token that triggered an implied <html> still has to be handled.
  if (!isHtmlStartTag) parseToken(state, token);
}
|
|
328
|
-
|
|
329
|
-
/**
 * Handles a token in the BeforeHead insertion mode.
 *
 * Comments are inserted and whitespace-only text is ignored. A `<head>` start tag creates
 * the head element with its attributes. Any other token creates an attribute-less `<head>`
 * and is then reprocessed in InHead mode.
 */
function parseTokenInBeforeHeadMode(state: ParserState, token: Token): void {
  const isHeadStartTag =
    token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'head';

  if (!isHeadStartTag) {
    if (token.type === TokenType.COMMENT) {
      parseComment(state, token);
      return;
    }
    if (token.type === TokenType.TEXT && token.value.trim() === '') return;
  }

  // Explicit <head> keeps its attributes; an implied one has none.
  const head = createElement('head', isHeadStartTag ? (token.attributes || {}) : {});
  appendChild(getCurrentParent(state), head);
  state.root.head = head;
  state.stack.push(head);
  state.insertionMode = InsertionMode.InHead;

  // The token that triggered an implied <head> still has to be handled.
  if (!isHeadStartTag) parseToken(state, token);
}
|
|
349
|
-
|
|
350
|
-
/**
 * Creates an element for a start tag and appends it to the current parent. The element is
 * pushed onto the stack only when it can have children, i.e. the tag is neither
 * self-closing nor a void element.
 */
function parseOpenTag(state: ParserState, token: Token): void {
  const tagName = token.value.toLowerCase();
  const parent = getCurrentParent(state);
  const element = createElement(tagName, token.attributes || {});
  appendChild(parent, element);

  const staysOpen = !(token.isSelfClosing || VOID_ELEMENTS.has(tagName));
  if (staysOpen) {
    state.stack.push(element);
  }
}
|
|
360
|
-
|
|
361
|
-
/**
 * Handles a token in the InHead insertion mode.
 *
 * - While a raw-text element (script/style/textarea/title) is current, text is appended to
 *   it and its own end tag closes it.
 * - Start tags for title/style/script/noscript/meta/link/base and for custom elements (names
 *   containing '-') are inserted; a duplicate `<head>` start tag is ignored.
 * - `</head>` leaves the head. End tags for title/style/script/noscript and custom elements
 *   pop the current element only when it matches; other end tags are ignored.
 * - Comments are inserted; whitespace-only text is ignored.
 * - Anything else pops the top of the stack, switches to AfterHead and reprocesses the token.
 */
function parseTokenInInHeadMode(state: ParserState, token: Token): void {
  const openTagName = getCurrentElement(state)?.tagName?.toLowerCase();

  // Hands the current token over to the AfterHead rules.
  const leaveHead = (): void => {
    state.stack.pop();
    state.insertionMode = InsertionMode.AfterHead;
    parseToken(state, token);
  };

  if (RAW_TEXT_ELEMENTS.has(openTagName)) {
    if (token.type === TokenType.TEXT) {
      parseText(state, token);
      return;
    }
    if (token.type === TokenType.TAG_CLOSE && token.value.toLowerCase() === openTagName) {
      state.stack.pop();
      return;
    }
  }

  switch (token.type) {
    case TokenType.TAG_OPEN: {
      const tagName = token.value.toLowerCase();
      // Duplicate <head> start tags are ignored.
      if (tagName === 'head') return;

      const belongsInHead =
        ['title', 'style', 'script', 'noscript', 'meta', 'link', 'base'].includes(tagName) ||
        tagName.includes('-'); // custom elements are accepted inside <head>
      if (belongsInHead) {
        parseOpenTag(state, token);
      } else {
        leaveHead();
      }
      return;
    }

    case TokenType.TAG_CLOSE: {
      const tagName = token.value.toLowerCase();
      if (tagName === 'head') {
        state.stack.pop();
        state.insertionMode = InsertionMode.AfterHead;
        return;
      }

      const closableInHead =
        ['title', 'style', 'script', 'noscript'].includes(tagName) || tagName.includes('-');
      if (closableInHead && openTagName === tagName) {
        state.stack.pop();
      }
      return;
    }

    case TokenType.COMMENT:
      parseComment(state, token);
      return;

    case TokenType.TEXT:
      // Whitespace between head children is dropped; real text ends the head.
      if (token.value.trim() !== '') leaveHead();
      return;

    default:
      leaveHead();
  }
}
|
|
413
|
-
|
|
414
|
-
/**
 * Handles a token in the AfterHead insertion mode.
 *
 * Comments are inserted and whitespace-only text is ignored. A `<body>` start tag creates
 * the body element with its attributes. Any other token creates an attribute-less `<body>`
 * and is then reprocessed in InBody mode.
 */
function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
  const isBodyStartTag =
    token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'body';

  if (!isBodyStartTag) {
    if (token.type === TokenType.COMMENT) {
      parseComment(state, token);
      return;
    }
    if (token.type === TokenType.TEXT && token.value.trim() === '') return;
  }

  // Explicit <body> keeps its attributes; an implied one has none.
  const body = createElement('body', isBodyStartTag ? (token.attributes || {}) : {});
  appendChild(getCurrentParent(state), body);
  state.root.body = body;
  state.stack.push(body);
  state.insertionMode = InsertionMode.InBody;

  // The token that triggered an implied <body> still has to be handled.
  if (!isBodyStartTag) parseToken(state, token);
}
|
|
434
|
-
|
|
435
|
-
/** Namespace URI passed to `createElement` for `<svg>` start tags in body mode. */
const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
/** Namespace URI passed to `createElement` for `<math>` start tags in body mode. */
const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
|
|
437
|
-
|
|
438
|
-
/**
 * Handles a token in the InBody insertion mode.
 *
 * - Start tags: apply the auto-close rules, create the element (with the SVG or MathML
 *   namespace for `<svg>` / `<math>`), append it to the current parent and push it on the
 *   stack unless it is self-closing or void.
 * - End tags: close the matching open element if it can be reached through elements whose
 *   end tags may be implied (dd, dt, li, option, optgroup, p, rb, rp, rt, rtc). Each element
 *   closed implicitly on the way is reported as an "Implied end tag" error. If no matching
 *   element is reachable, the end tag is reported as unmatched and the stack is left
 *   untouched, so a stray end tag can no longer close an unrelated open `<li>` or `<p>`.
 * - Text and comments are inserted; CDATA and processing instructions are forwarded to
 *   their handlers; DOCTYPE and any other token types are ignored.
 *
 * @param state Parser state to update.
 * @param token Token to process.
 */
function parseTokenInInBodyMode(state: ParserState, token: Token): void {
  if (token.type === TokenType.TAG_OPEN) {
    const tagName = token.value.toLowerCase();

    // The new tag may implicitly close the element that is currently open.
    handleAutoClosing(state, tagName);

    const currentParent = getCurrentParent(state);

    let namespaceURI: string | undefined;
    if (tagName === 'svg') {
      namespaceURI = SVG_NAMESPACE;
    } else if (tagName === 'math') {
      namespaceURI = MATHML_NAMESPACE;
    }

    const element = createElement(tagName, token.attributes || {}, namespaceURI);
    appendChild(currentParent, element);

    if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
      state.stack.push(element);
    }
  } else if (token.type === TokenType.TAG_CLOSE) {
    const tagName = token.value.toLowerCase();
    const errorOffset = token.position?.offset || 0;
    const impliedEndTags = ['dd', 'dt', 'li', 'option', 'optgroup', 'p', 'rb', 'rp', 'rt', 'rtc'];

    // Look for the element being closed BEFORE popping anything. Only elements whose end
    // tag may be implied can be skipped over; index 0 is the document and is never closed.
    let matchIndex = -1;
    for (let i = state.stack.length - 1; i >= 1; i--) {
      const openName = state.stack[i].tagName?.toLowerCase();
      if (openName === tagName) {
        matchIndex = i;
        break;
      }
      if (!impliedEndTags.includes(openName)) {
        break;
      }
    }

    if (matchIndex === -1) {
      // Unmatched end tags are ignored; the full adoption agency algorithm is not applied.
      addError(state, `Unmatched closing tag: ${tagName}`, errorOffset);
    } else {
      // Close the implied elements sitting above the match, then the match itself.
      while (state.stack.length - 1 > matchIndex) {
        const implied = state.stack.pop();
        addError(state, `Implied end tag: ${implied.tagName}`, errorOffset);
      }
      state.stack.pop();
    }
  } else if (token.type === TokenType.TEXT) {
    parseText(state, token);
  } else if (token.type === TokenType.COMMENT) {
    parseComment(state, token);
  } else if (token.type === TokenType.CDATA) {
    parseCDATA(state, token);
  } else if (token.type === TokenType.DOCTYPE) {
    // A DOCTYPE inside the body is ignored.
  } else if (token.type === TokenType.PROCESSING_INSTRUCTION) {
    parseProcessingInstruction(state, token);
  }
}
|
|
494
|
-
|
|
495
|
-
/**
 * Appends a text node for the token to the current parent, unless the text is
 * whitespace-only and `shouldSkipWhitespace` says the parent does not keep it.
 */
function parseText(state: ParserState, token: Token): void {
  const text = token.value;
  const parent = getCurrentParent(state);

  const isBlank = text.trim() === '';
  if (isBlank && shouldSkipWhitespace(parent)) return;

  appendChild(parent, createTextNode(text));
}
|
|
506
|
-
|
|
507
|
-
/**
 * Appends a comment node carrying the token's value to the current parent.
 */
function parseComment(state: ParserState, token: Token): void {
  const parent = getCurrentParent(state);
  appendChild(parent, createComment(token.value));
}
|
|
513
|
-
|
|
514
|
-
/**
 * Placeholder for CDATA handling. The body is empty, so a CDATA token routed here is
 * dropped without creating a node.
 */
function parseCDATA(state: ParserState, token: Token): void {
  // TODO: implement CDATA
}
|
|
517
|
-
|
|
518
|
-
/**
 * Creates a doctype node named after the token's value (falling back to 'html' when the
 * value is empty), appends it to the document and records it as `root.doctype`.
 */
function parseDoctype(state: ParserState, token: Token): void {
  const { root } = state;
  const name = token.value || 'html';
  const doctypeNode = createDoctype(name);

  appendChild(root, doctypeNode);
  root.doctype = doctypeNode;
}
|
|
523
|
-
|
|
524
|
-
/**
 * Placeholder for processing-instruction handling. The body is empty, so a token routed
 * here is dropped without creating a node.
 */
function parseProcessingInstruction(state: ParserState, token: Token): void {
  // TODO: implement ProcessingInstruction
}
|
|
527
|
-
|
|
528
|
-
/**
 * Heavily simplified stand-in for the HTML "adoption agency" end-tag handling.
 *
 * It does not track active formatting elements and never re-parents nodes. It only
 * unwinds the stack of open elements:
 * 1. If the current element matches `tagName`, pop the top of the stack.
 * 2. Otherwise find the innermost stack entry whose tag name matches and pop it together
 *    with everything above it.
 * 3. If nothing matches, record a "Stray end tag" error and leave the stack unchanged.
 *
 * The previous body wrapped steps 2-3 in a counted loop that always returned on its first
 * pass, searched the stack twice for the same entry, and computed a "furthest block" whose
 * value did not affect the outcome. That dead code is removed; behaviour is unchanged.
 *
 * @param state Parser state whose stack is unwound.
 * @param tagName Lower-case name of the end tag being processed.
 * @param token End-tag token, used for the error offset.
 */
function runAdoptionAgencyAlgorithm(state: ParserState, tagName: string, token: Token): void {
  // Fast path: the end tag closes the element that is currently open.
  const currentElement = getCurrentElement(state);
  if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
    state.stack.pop();
    return;
  }

  // Find the innermost open element with the same tag name.
  let matchIndex = -1;
  for (let i = state.stack.length - 1; i >= 0; i--) {
    const element = state.stack[i];
    if (element.tagName && element.tagName.toLowerCase() === tagName) {
      matchIndex = i;
      break;
    }
  }

  if (matchIndex === -1) {
    // No matching element is open: ignore the token.
    addError(state, `Stray end tag: ${tagName}`, token.position?.offset || 0);
    return;
  }

  // Drop the matching element and everything opened after it.
  state.stack.splice(matchIndex);
}
|
|
632
|
-
|
|
633
|
-
/**
 * Applies the auto-close rules for a start tag: if the current element is one of the
 * elements that `tagName` implicitly closes, the top of the stack is popped.
 *
 * Only the current element is examined; ancestors further down the stack are not closed.
 *
 * @param state Parser state whose stack may be popped.
 * @param tagName Lower-case name of the start tag about to be inserted.
 */
function handleAutoClosing(state: ParserState, tagName: string): void {
  // `tagName` comes from parsed markup. Require an own property so that names such as
  // "constructor" cannot resolve to an inherited Object.prototype member, which has no
  // `includes` method and would make the check below throw.
  if (!Object.prototype.hasOwnProperty.call(AUTO_CLOSE_RULES, tagName)) return;

  const autoCloseList = AUTO_CLOSE_RULES[tagName];
  if (!autoCloseList) return;

  const openTagName = getCurrentElement(state)?.tagName?.toLowerCase();
  if (openTagName && autoCloseList.includes(openTagName)) {
    state.stack.pop();
  }
}
|
|
642
|
-
|
|
643
|
-
/**
 * Returns the node on top of the stack, i.e. the node new children are appended to.
 */
function getCurrentParent(state: ParserState): any {
  const { stack } = state;
  const topIndex = stack.length - 1;
  return stack[topIndex];
}
|
|
646
|
-
|
|
647
|
-
/**
 * Returns the topmost stack entry that is an element (nodeType 1), or null when the stack
 * holds no element.
 */
function getCurrentElement(state: ParserState): any {
  let index = state.stack.length;
  while (index-- > 0) {
    const candidate = state.stack[index];
    if (candidate.nodeType === 1) {
      return candidate;
    }
  }
  return null;
}
|
|
656
|
-
|
|
657
|
-
/**
 * Returns the token at the parser's current position, or null when there is none.
 */
function getCurrentToken(state: ParserState): Token | null {
  const { tokens, position } = state;
  const token = tokens[position];
  return token ? token : null;
}
|
|
660
|
-
|
|
661
|
-
/**
 * Moves the parser to the next token.
 */
function advance(state: ParserState): void {
  state.position += 1;
}
|
|
664
|
-
|
|
665
|
-
/**
 * Records a parse error on the state. Line and column are stored as 0 and the severity is
 * always 'error'.
 *
 * @param state Parser state that collects the error.
 * @param message Description of the problem.
 * @param position Offset associated with the problem.
 */
function addError(state: ParserState, message: string, position: number): void {
  const error: ParseError = {
    message,
    position,
    line: 0,
    column: 0,
    severity: 'error'
  };
  state.errors.push(error);
}
|
|
674
|
-
|
|
675
|
-
/**
 * Reports whether whitespace-only text should be dropped inside `parent`: true when
 * `parent.tagName` is exactly one of the lower-case container names listed below.
 *
 * NOTE(review): the comparison is case-sensitive and the names are lower-case, while
 * `parse` in this file compares element tag names against 'HTML'. If tag names are stored
 * upper-case, this never matches for real elements. Confirm the intended behaviour before
 * normalising the case, because doing so would change which text nodes are created.
 *
 * @param parent Node whose `tagName` is inspected.
 */
function shouldSkipWhitespace(parent: any): boolean {
  const { tagName } = parent;
  if (!tagName) {
    return false;
  }

  const whitespaceFreeContainers = [
    'html', 'head', 'body', 'table', 'tbody', 'thead', 'tfoot', 'tr',
    'ul', 'ol', 'dl', 'select', 'optgroup'
  ];
  return whitespaceFreeContainers.includes(tagName);
}
|