@tkeron/html-parser 0.1.7 → 1.1.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (90)
  1. package/README.md +1 -7
  2. package/bun.lock +5 -0
  3. package/index.ts +4 -0
  4. package/package.json +7 -1
  5. package/src/css-selector.ts +1 -1
  6. package/src/dom-simulator.ts +41 -17
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +509 -143
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +190 -118
  12. package/tests/advanced.test.ts +121 -108
  13. package/tests/custom-elements-head.test.ts +105 -0
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +9 -10
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +60 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +173 -193
  45. package/tests/serializer-core.test.ts +16 -0
  46. package/tests/serializer-data/core.test +125 -0
  47. package/tests/serializer-data/injectmeta.test +66 -0
  48. package/tests/serializer-data/optionaltags.test +965 -0
  49. package/tests/serializer-data/options.test +60 -0
  50. package/tests/serializer-data/whitespace.test +51 -0
  51. package/tests/serializer-injectmeta.test.ts +16 -0
  52. package/tests/serializer-optionaltags.test.ts +16 -0
  53. package/tests/serializer-options.test.ts +16 -0
  54. package/tests/serializer-whitespace.test.ts +16 -0
  55. package/tests/tokenizer-namedEntities.test.ts +20 -0
  56. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  57. package/tests/tokenizer.test.ts +25 -32
  58. package/tests/tree-construction-adoption01.test.ts +37 -0
  59. package/tests/tree-construction-adoption02.test.ts +34 -0
  60. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  61. package/tests/tree-construction-entities02.test.ts +33 -0
  62. package/tests/tree-construction-html5test-com.test.ts +32 -0
  63. package/tests/tree-construction-math.test.ts +18 -0
  64. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  65. package/tests/tree-construction-noscript01.test.ts +18 -0
  66. package/tests/tree-construction-ruby.test.ts +21 -0
  67. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  68. package/tests/tree-construction-svg.test.ts +21 -0
  69. package/tests/tree-construction-template.test.ts +21 -0
  70. package/tests/tree-construction-tests10.test.ts +21 -0
  71. package/tests/tree-construction-tests11.test.ts +21 -0
  72. package/tests/tree-construction-tests20.test.ts +18 -0
  73. package/tests/tree-construction-tests21.test.ts +18 -0
  74. package/tests/tree-construction-tests23.test.ts +18 -0
  75. package/tests/tree-construction-tests24.test.ts +18 -0
  76. package/tests/tree-construction-tests5.test.ts +21 -0
  77. package/tests/tree-construction-tests6.test.ts +21 -0
  78. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  79. package/tests/custom-elements.test.ts +0 -745
  80. package/tests/official/README.md +0 -87
  81. package/tests/official/acid/acid-tests.test.ts +0 -309
  82. package/tests/official/final-output/final-output.test.ts +0 -361
  83. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  84. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  85. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  86. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  87. package/tests/official/validator/validator-tests.test.ts +0 -237
  88. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  89. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  90. package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/src/parser.ts CHANGED
@@ -1,38 +1,14 @@
1
1
  import type { Token } from './tokenizer.js';
2
2
  import { TokenType } from './tokenizer.js';
3
-
4
- export interface ASTNode {
5
- type: ASTNodeType;
6
- tagName?: string;
7
- attributes?: Record<string, string>;
8
- children?: ASTNode[];
9
- content?: string;
10
- parent?: ASTNode;
11
- isSelfClosing?: boolean;
12
- position?: {
13
- start: number;
14
- end: number;
15
- line: number;
16
- column: number;
17
- };
18
- }
19
-
20
- export enum ASTNodeType {
21
- DOCUMENT = 'DOCUMENT',
22
- ELEMENT = 'ELEMENT',
23
- TEXT = 'TEXT',
24
- COMMENT = 'COMMENT',
25
- CDATA = 'CDATA',
26
- DOCTYPE = 'DOCTYPE',
27
- PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION'
28
- }
3
+ import { createDocument, createElement, createTextNode, createComment, createDoctype, appendChild } from './dom-simulator.js';
29
4
 
30
5
  export interface ParserState {
31
6
  tokens: Token[];
32
7
  position: number;
33
8
  length: number;
34
- stack: ASTNode[];
35
- root: ASTNode;
9
+ stack: any[]; // DOM elements
10
+ root: any; // Document
11
+ insertionMode: InsertionMode;
36
12
  errors: ParseError[];
37
13
  }
38
14
 
@@ -44,6 +20,32 @@ export interface ParseError {
44
20
  severity: 'error' | 'warning';
45
21
  }
46
22
 
23
+ export enum InsertionMode {
24
+ Initial = 'initial',
25
+ BeforeHtml = 'beforeHtml',
26
+ BeforeHead = 'beforeHead',
27
+ InHead = 'inHead',
28
+ AfterHead = 'afterHead',
29
+ InBody = 'inBody'
30
+ }
31
+
32
+ export enum ASTNodeType {
33
+ Document = 'document',
34
+ Element = 'element',
35
+ Text = 'text',
36
+ Comment = 'comment',
37
+ Doctype = 'doctype',
38
+ CDATA = 'cdata'
39
+ }
40
+
41
+ export interface ASTNode {
42
+ type: ASTNodeType;
43
+ tagName?: string;
44
+ value?: string;
45
+ attributes?: Record<string, string>;
46
+ children?: ASTNode[];
47
+ }
48
+
47
49
  const VOID_ELEMENTS = new Set([
48
50
  'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
49
51
  'link', 'meta', 'param', 'source', 'track', 'wbr'
@@ -57,7 +59,41 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
57
59
  'li': ['li'],
58
60
  'dt': ['dt', 'dd'],
59
61
  'dd': ['dt', 'dd'],
62
+ 'address': ['p'],
63
+ 'article': ['p'],
64
+ 'aside': ['p'],
65
+ 'blockquote': ['p'],
66
+ 'center': ['p'],
67
+ 'details': ['p'],
68
+ 'dialog': ['p'],
69
+ 'dir': ['p'],
70
+ 'div': ['p'],
71
+ 'dl': ['p'],
72
+ 'fieldset': ['p'],
73
+ 'figcaption': ['p'],
74
+ 'figure': ['p'],
75
+ 'footer': ['p'],
76
+ 'form': ['p'],
77
+ 'h1': ['p'],
78
+ 'h2': ['p'],
79
+ 'h3': ['p'],
80
+ 'h4': ['p'],
81
+ 'h5': ['p'],
82
+ 'h6': ['p'],
83
+ 'header': ['p'],
84
+ 'hgroup': ['p'],
85
+ 'hr': ['p'],
86
+ 'listing': ['p'],
87
+ 'main': ['p'],
88
+ 'menu': ['p'],
89
+ 'nav': ['p'],
90
+ 'ol': ['p'],
60
91
  'p': ['p'],
92
+ 'pre': ['p'],
93
+ 'section': ['p'],
94
+ 'summary': ['p'],
95
+ 'table': ['p'],
96
+ 'ul': ['p'],
61
97
  'rt': ['rt', 'rp'],
62
98
  'rp': ['rt', 'rp'],
63
99
  'optgroup': ['optgroup'],
@@ -70,7 +106,7 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
70
106
  'th': ['td', 'th']
71
107
  };
72
108
 
73
- export function parse(tokens: Token[]): ASTNode {
109
+ export function parse(tokens: Token[]): any {
74
110
  const state = createParserState(tokens);
75
111
 
76
112
  while (state.position < state.length) {
@@ -84,21 +120,134 @@ export function parse(tokens: Token[]): ASTNode {
84
120
  advance(state);
85
121
  }
86
122
 
123
+ // Create implicit html, head, body if needed
124
+ if (state.root.childNodes && state.root.childNodes.length > 0) {
125
+ let hasHtml = false;
126
+ for (const child of state.root.childNodes) {
127
+ if (child.nodeType === 1 && child.tagName === 'HTML') {
128
+ hasHtml = true;
129
+ state.root.documentElement = child;
130
+ break;
131
+ }
132
+ }
133
+ if (!hasHtml) {
134
+ const html = createElement('html', {});
135
+ const head = createElement('head', {});
136
+ const body = createElement('body', {});
137
+ appendChild(html, head);
138
+ appendChild(html, body);
139
+
140
+ const doctypes: any[] = [];
141
+ const commentsBeforeHtml: any[] = [];
142
+ const bodyContent: any[] = [];
143
+ const children = [...state.root.childNodes];
144
+
145
+ let foundElement = false;
146
+ for (const child of children) {
147
+ if (child.nodeType === 10) {
148
+ doctypes.push(child);
149
+ } else if (child.nodeType === 8 && !foundElement) {
150
+ commentsBeforeHtml.push(child);
151
+ } else {
152
+ if (child.nodeType === 1) foundElement = true;
153
+ bodyContent.push(child);
154
+ }
155
+ }
156
+
157
+ for (const content of bodyContent) {
158
+ appendChild(body, content);
159
+ }
160
+
161
+ state.root.childNodes = [];
162
+ for (const doctype of doctypes) {
163
+ doctype.parentNode = null;
164
+ appendChild(state.root, doctype);
165
+ }
166
+ for (const comment of commentsBeforeHtml) {
167
+ comment.parentNode = null;
168
+ appendChild(state.root, comment);
169
+ }
170
+ appendChild(state.root, html);
171
+ state.root.documentElement = html;
172
+ state.root.head = head;
173
+ state.root.body = body;
174
+ }
175
+ }
176
+
87
177
  while (state.stack.length > 1) {
88
178
  const unclosedElement = state.stack.pop()!;
89
179
  const currentToken = getCurrentToken(state);
90
- addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.start || 0);
180
+ addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.offset || 0);
91
181
  }
92
182
 
93
183
  return state.root;
94
184
  }
95
185
 
186
+ export function domToAST(dom: any): ASTNode {
187
+ function convert(node: any): ASTNode | null {
188
+ if (!node) return null;
189
+
190
+ if (node.nodeType === 9) {
191
+ const children: ASTNode[] = [];
192
+ if (node.childNodes) {
193
+ for (const child of node.childNodes) {
194
+ const converted = convert(child);
195
+ if (converted) children.push(converted);
196
+ }
197
+ }
198
+ return {
199
+ type: ASTNodeType.Document,
200
+ children
201
+ };
202
+ }
203
+
204
+ if (node.nodeType === 1) {
205
+ const children: ASTNode[] = [];
206
+ if (node.childNodes) {
207
+ for (const child of node.childNodes) {
208
+ const converted = convert(child);
209
+ if (converted) children.push(converted);
210
+ }
211
+ }
212
+ const tagName = node.tagName?.toLowerCase();
213
+ return {
214
+ type: ASTNodeType.Element,
215
+ tagName,
216
+ attributes: node.attributes || {},
217
+ children,
218
+ isSelfClosing: VOID_ELEMENTS.has(tagName)
219
+ } as ASTNode & { isSelfClosing: boolean };
220
+ }
221
+
222
+ if (node.nodeType === 3) {
223
+ return {
224
+ type: ASTNodeType.Text,
225
+ content: node.nodeValue || ''
226
+ } as ASTNode & { content: string };
227
+ }
228
+
229
+ if (node.nodeType === 8) {
230
+ return {
231
+ type: ASTNodeType.Comment,
232
+ content: node.nodeValue || ''
233
+ } as ASTNode & { content: string };
234
+ }
235
+
236
+ if (node.nodeType === 10) {
237
+ return {
238
+ type: ASTNodeType.Doctype,
239
+ content: node.name || 'html'
240
+ } as ASTNode & { content: string };
241
+ }
242
+
243
+ return null;
244
+ }
245
+
246
+ return convert(dom) || { type: ASTNodeType.Document, children: [] };
247
+ }
248
+
96
249
  function createParserState(tokens: Token[]): ParserState {
97
- const root: ASTNode = {
98
- type: ASTNodeType.DOCUMENT,
99
- children: [],
100
- tagName: '#document'
101
- };
250
+ const root = createDocument();
102
251
 
103
252
  return {
104
253
  tokens,
@@ -106,81 +255,240 @@ function createParserState(tokens: Token[]): ParserState {
106
255
  length: tokens.length,
107
256
  stack: [root],
108
257
  root,
258
+ insertionMode: InsertionMode.Initial,
109
259
  errors: []
110
260
  };
111
261
  }
112
262
 
113
263
  function parseToken(state: ParserState, token: Token): void {
114
- switch (token.type) {
115
- case TokenType.TAG_OPEN:
116
- parseOpenTag(state, token);
264
+ switch (state.insertionMode) {
265
+ case InsertionMode.Initial:
266
+ parseTokenInInitialMode(state, token);
117
267
  break;
118
- case TokenType.TAG_CLOSE:
119
- parseCloseTag(state, token);
268
+ case InsertionMode.BeforeHtml:
269
+ parseTokenInBeforeHtmlMode(state, token);
120
270
  break;
121
- case TokenType.TEXT:
122
- parseText(state, token);
271
+ case InsertionMode.BeforeHead:
272
+ parseTokenInBeforeHeadMode(state, token);
123
273
  break;
124
- case TokenType.COMMENT:
125
- parseComment(state, token);
274
+ case InsertionMode.InHead:
275
+ parseTokenInInHeadMode(state, token);
126
276
  break;
127
- case TokenType.CDATA:
128
- parseCDATA(state, token);
277
+ case InsertionMode.AfterHead:
278
+ parseTokenInAfterHeadMode(state, token);
129
279
  break;
130
- case TokenType.DOCTYPE:
131
- parseDoctype(state, token);
132
- break;
133
- case TokenType.PROCESSING_INSTRUCTION:
134
- parseProcessingInstruction(state, token);
280
+ case InsertionMode.InBody:
281
+ parseTokenInInBodyMode(state, token);
135
282
  break;
283
+ default:
284
+ parseTokenInInBodyMode(state, token); // fallback
136
285
  }
137
286
  }
138
287
 
139
- function parseOpenTag(state: ParserState, token: Token): void {
140
- const tagName = token.value.toLowerCase();
141
-
142
- handleAutoClosing(state, tagName);
143
-
144
- const currentParent = getCurrentParent(state);
288
+ function parseTokenInInitialMode(state: ParserState, token: Token): void {
289
+ if (token.type === TokenType.DOCTYPE) {
290
+ // TODO: Create DOCTYPE node
291
+ parseDoctype(state, token);
292
+ state.insertionMode = InsertionMode.BeforeHtml;
293
+ } else if (token.type === TokenType.COMMENT) {
294
+ parseComment(state, token);
295
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
296
+ // Ignore whitespace
297
+ } else {
298
+ // No DOCTYPE, create implicit DOCTYPE and switch to BeforeHtml
299
+ const doctype = createDoctype('html');
300
+ appendChild(state.root, doctype);
301
+ state.insertionMode = InsertionMode.BeforeHtml;
302
+ parseToken(state, token); // Re-parse in new mode
303
+ }
304
+ }
145
305
 
146
- const element: ASTNode = {
147
- type: ASTNodeType.ELEMENT,
148
- tagName,
149
- attributes: token.attributes || {},
150
- children: [],
151
- parent: currentParent,
152
- isSelfClosing: token.isSelfClosing || VOID_ELEMENTS.has(tagName),
153
- position: token.position
154
- };
306
+ function parseTokenInBeforeHtmlMode(state: ParserState, token: Token): void {
307
+ if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'html') {
308
+ const html = createElement('html', token.attributes || {});
309
+ appendChild(state.root, html);
310
+ state.root.documentElement = html;
311
+ state.stack.push(html);
312
+ state.insertionMode = InsertionMode.BeforeHead;
313
+ } else if (token.type === TokenType.COMMENT) {
314
+ parseComment(state, token);
315
+ } else if (token.type === TokenType.DOCTYPE) {
316
+ // Ignore
317
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
318
+ // Ignore whitespace
319
+ } else {
320
+ const html = createElement('html', {});
321
+ appendChild(state.root, html);
322
+ state.root.documentElement = html;
323
+ state.stack.push(html);
324
+ state.insertionMode = InsertionMode.BeforeHead;
325
+ parseToken(state, token);
326
+ }
327
+ }
155
328
 
156
- if (currentParent.children) {
157
- currentParent.children.push(element);
329
+ function parseTokenInBeforeHeadMode(state: ParserState, token: Token): void {
330
+ if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'head') {
331
+ const head = createElement('head', token.attributes || {});
332
+ appendChild(getCurrentParent(state), head);
333
+ state.root.head = head;
334
+ state.stack.push(head);
335
+ state.insertionMode = InsertionMode.InHead;
336
+ } else if (token.type === TokenType.COMMENT) {
337
+ parseComment(state, token);
338
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
339
+ // Ignore whitespace
340
+ } else {
341
+ const head = createElement('head', {});
342
+ appendChild(getCurrentParent(state), head);
343
+ state.root.head = head;
344
+ state.stack.push(head);
345
+ state.insertionMode = InsertionMode.InHead;
346
+ parseToken(state, token);
158
347
  }
348
+ }
159
349
 
160
- if (!element.isSelfClosing) {
350
+ function parseOpenTag(state: ParserState, token: Token): void {
351
+ const tagName = token.value.toLowerCase();
352
+ const currentParent = getCurrentParent(state);
353
+ const element = createElement(tagName, token.attributes || {});
354
+ appendChild(currentParent, element);
355
+
356
+ if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
161
357
  state.stack.push(element);
162
358
  }
163
359
  }
164
360
 
165
- function parseCloseTag(state: ParserState, token: Token): void {
166
- const tagName = token.value.toLowerCase();
361
+ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
362
+ const currentElement = getCurrentElement(state);
363
+ const currentTagName = currentElement?.tagName?.toLowerCase();
167
364
 
168
- let found = false;
169
- for (let i = state.stack.length - 1; i >= 0; i--) {
170
- const element = state.stack[i]!;
171
- if (element.tagName === tagName) {
172
- while (state.stack.length > i + 1) {
173
- const unclosedElement = state.stack.pop()!;
174
- addError(state, `Unclosed tag: ${unclosedElement.tagName}`, token.position?.start || 0);
365
+ if (RAW_TEXT_ELEMENTS.has(currentTagName)) {
366
+ if (token.type === TokenType.TEXT) {
367
+ parseText(state, token);
368
+ return;
369
+ } else if (token.type === TokenType.TAG_CLOSE && token.value.toLowerCase() === currentTagName) {
370
+ state.stack.pop();
371
+ return;
372
+ }
373
+ }
374
+
375
+ if (token.type === TokenType.TAG_OPEN) {
376
+ const tagName = token.value.toLowerCase();
377
+ if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
378
+ parseOpenTag(state, token);
379
+ } else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
380
+ parseOpenTag(state, token);
381
+ } else if (tagName === 'head') {
382
+ // Ignore duplicate <head> tags
383
+ } else if (tagName.includes('-')) {
384
+ // Custom elements (tags with hyphens) are valid in <head>
385
+ parseOpenTag(state, token);
386
+ } else {
387
+ state.stack.pop();
388
+ state.insertionMode = InsertionMode.AfterHead;
389
+ parseToken(state, token);
390
+ }
391
+ } else if (token.type === TokenType.TAG_CLOSE) {
392
+ const tagName = token.value.toLowerCase();
393
+ if (tagName === 'head') {
394
+ state.stack.pop();
395
+ state.insertionMode = InsertionMode.AfterHead;
396
+ } else if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
397
+ if (currentTagName === tagName) {
398
+ state.stack.pop();
175
399
  }
400
+ } else if (tagName.includes('-') && currentTagName === tagName) {
401
+ // Handle closing tags for custom elements in <head>
176
402
  state.stack.pop();
177
- found = true;
178
- break;
179
403
  }
404
+ } else if (token.type === TokenType.COMMENT) {
405
+ parseComment(state, token);
406
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
407
+ } else {
408
+ state.stack.pop();
409
+ state.insertionMode = InsertionMode.AfterHead;
410
+ parseToken(state, token);
180
411
  }
412
+ }
181
413
 
182
- if (!found) {
183
- addError(state, `Unexpected closing tag: ${tagName}`, token.position?.start || 0);
414
+ function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
415
+ if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'body') {
416
+ const body = createElement('body', token.attributes || {});
417
+ appendChild(getCurrentParent(state), body);
418
+ state.root.body = body;
419
+ state.stack.push(body);
420
+ state.insertionMode = InsertionMode.InBody;
421
+ } else if (token.type === TokenType.COMMENT) {
422
+ parseComment(state, token);
423
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
424
+ // Ignore whitespace
425
+ } else {
426
+ const body = createElement('body', {});
427
+ appendChild(getCurrentParent(state), body);
428
+ state.root.body = body;
429
+ state.stack.push(body);
430
+ state.insertionMode = InsertionMode.InBody;
431
+ parseToken(state, token);
432
+ }
433
+ }
434
+
435
+ const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
436
+ const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
437
+
438
+ function parseTokenInInBodyMode(state: ParserState, token: Token): void {
439
+ if (token.type === TokenType.TAG_OPEN) {
440
+ const tagName = token.value.toLowerCase();
441
+
442
+ handleAutoClosing(state, tagName);
443
+
444
+ const currentParent = getCurrentParent(state);
445
+
446
+ let namespaceURI: string | undefined;
447
+ if (tagName === 'svg') {
448
+ namespaceURI = SVG_NAMESPACE;
449
+ } else if (tagName === 'math') {
450
+ namespaceURI = MATHML_NAMESPACE;
451
+ }
452
+
453
+ const element = createElement(tagName, token.attributes || {}, namespaceURI);
454
+
455
+ appendChild(currentParent, element);
456
+
457
+ if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
458
+ state.stack.push(element);
459
+ }
460
+ } else if (token.type === TokenType.TAG_CLOSE) {
461
+ const tagName = token.value.toLowerCase();
462
+
463
+ // Generate implied end tags
464
+ const impliedEndTags = ['dd', 'dt', 'li', 'option', 'optgroup', 'p', 'rb', 'rp', 'rt', 'rtc'];
465
+ while (state.stack.length > 1) { // Don't pop document
466
+ const currentElement = getCurrentElement(state);
467
+ if (!currentElement || !impliedEndTags.includes(currentElement.tagName.toLowerCase()) || currentElement.tagName.toLowerCase() === tagName) {
468
+ break;
469
+ }
470
+ state.stack.pop();
471
+ addError(state, `Implied end tag: ${currentElement.tagName}`, token.position?.offset || 0);
472
+ }
473
+
474
+ const currentElement = getCurrentElement(state);
475
+ if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
476
+ state.stack.pop();
477
+ } else {
478
+ // For now, just ignore unmatched closing tags
479
+ // TODO: Implement full adoption agency algorithm
480
+ addError(state, `Unmatched closing tag: ${tagName}`, token.position?.offset || 0);
481
+ }
482
+ } else if (token.type === TokenType.TEXT) {
483
+ parseText(state, token);
484
+ } else if (token.type === TokenType.COMMENT) {
485
+ parseComment(state, token);
486
+ } else if (token.type === TokenType.CDATA) {
487
+ parseCDATA(state, token);
488
+ } else if (token.type === TokenType.DOCTYPE) {
489
+ // Ignore
490
+ } else if (token.type === TokenType.PROCESSING_INSTRUCTION) {
491
+ parseProcessingInstruction(state, token);
184
492
  }
185
493
  }
186
494
 
@@ -192,76 +500,134 @@ function parseText(state: ParserState, token: Token): void {
192
500
  return;
193
501
  }
194
502
 
195
- const textNode: ASTNode = {
196
- type: ASTNodeType.TEXT,
197
- content,
198
- parent: currentParent,
199
- position: token.position
200
- };
201
-
202
- if (currentParent.children) {
203
- currentParent.children.push(textNode);
204
- }
503
+ const textNode = createTextNode(content);
504
+ appendChild(currentParent, textNode);
205
505
  }
206
506
 
207
507
  function parseComment(state: ParserState, token: Token): void {
208
508
  const currentParent = getCurrentParent(state);
209
509
 
210
- const commentNode: ASTNode = {
211
- type: ASTNodeType.COMMENT,
212
- content: token.value,
213
- parent: currentParent,
214
- position: token.position
215
- };
216
-
217
- if (currentParent.children) {
218
- currentParent.children.push(commentNode);
219
- }
510
+ const commentNode = createComment(token.value);
511
+ appendChild(currentParent, commentNode);
220
512
  }
221
513
 
222
514
  function parseCDATA(state: ParserState, token: Token): void {
223
- const currentParent = getCurrentParent(state);
224
-
225
- const cdataNode: ASTNode = {
226
- type: ASTNodeType.CDATA,
227
- content: token.value,
228
- parent: currentParent,
229
- position: token.position
230
- };
231
-
232
- if (currentParent.children) {
233
- currentParent.children.push(cdataNode);
234
- }
515
+ // TODO: implement CDATA
235
516
  }
236
517
 
237
518
  function parseDoctype(state: ParserState, token: Token): void {
238
- const currentParent = getCurrentParent(state);
239
-
240
- const doctypeNode: ASTNode = {
241
- type: ASTNodeType.DOCTYPE,
242
- content: token.value,
243
- parent: currentParent,
244
- position: token.position
245
- };
246
-
247
- if (currentParent.children) {
248
- currentParent.children.push(doctypeNode);
249
- }
519
+ const doctype = createDoctype(token.value || 'html');
520
+ appendChild(state.root, doctype);
521
+ state.root.doctype = doctype;
250
522
  }
251
523
 
252
524
  function parseProcessingInstruction(state: ParserState, token: Token): void {
253
- const currentParent = getCurrentParent(state);
254
-
255
- const piNode: ASTNode = {
256
- type: ASTNodeType.PROCESSING_INSTRUCTION,
257
- content: token.value,
258
- parent: currentParent,
259
- position: token.position
260
- };
525
+ // TODO: implement ProcessingInstruction
526
+ }
261
527
 
262
- if (currentParent.children) {
263
- currentParent.children.push(piNode);
528
+ function runAdoptionAgencyAlgorithm(state: ParserState, tagName: string, token: Token): void {
529
+ // HTML5 Adoption Agency Algorithm - simplified but more correct implementation
530
+
531
+ // 1. If the current node is an HTML element whose tag name matches the token's tag name,
532
+ // then pop the current node off the stack of open elements and abort these steps.
533
+ const currentElement = getCurrentElement(state);
534
+ if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
535
+ state.stack.pop();
536
+ return;
537
+ }
538
+
539
+ // 2. Let outer loop counter be 0
540
+ let outerLoopCounter = 0;
541
+ const formattingElements = ['a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'];
542
+
543
+ while (outerLoopCounter < 8) { // Prevent infinite loops
544
+ outerLoopCounter++;
545
+
546
+ // 3. Let the formatting element be the last element in the list of active formatting elements
547
+ // that is between the end of the list and the last scope marker or the start of the list,
548
+ // if any, that has the same tag name as the token.
549
+
550
+ // For simplicity, find the innermost element with matching tag name
551
+ let formattingElementIndex = -1;
552
+ for (let i = state.stack.length - 1; i >= 0; i--) {
553
+ const element = state.stack[i];
554
+ if (element.tagName && element.tagName.toLowerCase() === tagName && formattingElements.includes(tagName)) {
555
+ formattingElementIndex = i;
556
+ break;
557
+ }
558
+ }
559
+
560
+ if (formattingElementIndex === -1) {
561
+ // No formatting element found, just find any element with matching tag name
562
+ for (let i = state.stack.length - 1; i >= 0; i--) {
563
+ const element = state.stack[i];
564
+ if (element.tagName && element.tagName.toLowerCase() === tagName) {
565
+ formattingElementIndex = i;
566
+ break;
567
+ }
568
+ }
569
+ }
570
+
571
+ if (formattingElementIndex === -1) {
572
+ // No matching element found, ignore the token
573
+ addError(state, `Stray end tag: ${tagName}`, token.position?.offset || 0);
574
+ return;
575
+ }
576
+
577
+ const formattingElement = state.stack[formattingElementIndex];
578
+
579
+ // 4. If there is no element in the stack of open elements that has the same tag name as the
580
+ // formatting element, then remove the element from the list of active formatting elements
581
+ // and abort these steps.
582
+ let openElementIndex = -1;
583
+ for (let i = state.stack.length - 1; i >= 0; i--) {
584
+ if (state.stack[i] === formattingElement) {
585
+ openElementIndex = i;
586
+ break;
587
+ }
588
+ }
589
+
590
+ if (openElementIndex === -1) {
591
+ // Element not in stack, ignore
592
+ return;
593
+ }
594
+
595
+ // 5. If the element is not in the stack of open elements, then this is a parse error;
596
+ // remove the element from the list of active formatting elements and abort these steps.
597
+ // (Already checked above)
598
+
599
+ // 6. Let the furthest block be the topmost node in the stack of open elements that is lower
600
+ // in the stack than the formatting element, and is an element in the special category.
601
+ const specialElements = ['address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'listing', 'main', 'menu', 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul', 'xmp'];
602
+
603
+ let furthestBlockIndex = -1;
604
+ for (let i = openElementIndex + 1; i < state.stack.length; i++) {
605
+ const element = state.stack[i];
606
+ if (element.tagName && specialElements.includes(element.tagName.toLowerCase())) {
607
+ furthestBlockIndex = i;
608
+ break;
609
+ }
610
+ }
611
+
612
+ if (furthestBlockIndex === -1) {
613
+ // No special element found, just pop elements until we reach the formatting element
614
+ while (state.stack.length > openElementIndex + 1) {
615
+ state.stack.pop();
616
+ }
617
+ state.stack.pop(); // Pop the formatting element
618
+ return;
619
+ }
620
+
621
+ // 7. Simplified: just pop everything until the formatting element
622
+ while (state.stack.length > openElementIndex + 1) {
623
+ state.stack.pop();
624
+ }
625
+ state.stack.pop(); // Pop the formatting element
626
+ return;
264
627
  }
628
+
629
+ // If we get here, something went wrong, ignore the token
630
+ addError(state, `Adoption agency gave up on: ${tagName}`, token.position?.offset || 0);
265
631
  }
266
632
 
267
633
  function handleAutoClosing(state: ParserState, tagName: string): void {
@@ -269,19 +635,19 @@ function handleAutoClosing(state: ParserState, tagName: string): void {
269
635
  if (!autoCloseList) return;
270
636
 
271
637
  const currentElement = getCurrentElement(state);
272
- if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName)) {
638
+ if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName.toLowerCase())) {
273
639
  state.stack.pop();
274
640
  }
275
641
  }
276
642
 
277
- function getCurrentParent(state: ParserState): ASTNode {
278
- return state.stack[state.stack.length - 1]!;
643
+ function getCurrentParent(state: ParserState): any {
644
+ return state.stack[state.stack.length - 1];
279
645
  }
280
646
 
281
- function getCurrentElement(state: ParserState): ASTNode | null {
647
+ function getCurrentElement(state: ParserState): any {
282
648
  for (let i = state.stack.length - 1; i >= 0; i--) {
283
- const element = state.stack[i]!;
284
- if (element.type === ASTNodeType.ELEMENT) {
649
+ const element = state.stack[i];
650
+ if (element.nodeType === 1) { // ELEMENT_NODE
285
651
  return element;
286
652
  }
287
653
  }
@@ -306,7 +672,7 @@ function addError(state: ParserState, message: string, position: number): void {
306
672
  });
307
673
  }
308
674
 
309
- function shouldSkipWhitespace(parent: ASTNode): boolean {
675
+ function shouldSkipWhitespace(parent: any): boolean {
310
676
  const skipWhitespaceIn = new Set([
311
677
  'html', 'head', 'body', 'table', 'tbody', 'thead', 'tfoot', 'tr',
312
678
  'ul', 'ol', 'dl', 'select', 'optgroup'