@tkeron/html-parser 0.1.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/README.md +1 -7
  2. package/bun.lock +8 -3
  3. package/index.ts +4 -0
  4. package/package.json +13 -6
  5. package/src/css-selector.ts +45 -27
  6. package/src/dom-simulator.ts +162 -20
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +478 -183
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +59 -139
  12. package/tests/advanced.test.ts +119 -106
  13. package/tests/custom-elements.test.ts +172 -162
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +637 -0
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +43 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +172 -193
  45. package/tests/selectors.test.ts +64 -1
  46. package/tests/serializer-core.test.ts +16 -0
  47. package/tests/serializer-data/core.test +125 -0
  48. package/tests/serializer-data/injectmeta.test +66 -0
  49. package/tests/serializer-data/optionaltags.test +965 -0
  50. package/tests/serializer-data/options.test +60 -0
  51. package/tests/serializer-data/whitespace.test +51 -0
  52. package/tests/serializer-injectmeta.test.ts +16 -0
  53. package/tests/serializer-optionaltags.test.ts +16 -0
  54. package/tests/serializer-options.test.ts +16 -0
  55. package/tests/serializer-whitespace.test.ts +16 -0
  56. package/tests/tokenizer-namedEntities.test.ts +20 -0
  57. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  58. package/tests/tokenizer.test.ts +83 -0
  59. package/tests/tree-construction-adoption01.test.ts +37 -0
  60. package/tests/tree-construction-adoption02.test.ts +34 -0
  61. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  62. package/tests/tree-construction-entities02.test.ts +33 -0
  63. package/tests/tree-construction-html5test-com.test.ts +24 -0
  64. package/tests/tree-construction-math.test.ts +18 -0
  65. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  66. package/tests/tree-construction-noscript01.test.ts +18 -0
  67. package/tests/tree-construction-ruby.test.ts +21 -0
  68. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  69. package/tests/tree-construction-svg.test.ts +21 -0
  70. package/tests/tree-construction-template.test.ts +21 -0
  71. package/tests/tree-construction-tests10.test.ts +21 -0
  72. package/tests/tree-construction-tests11.test.ts +21 -0
  73. package/tests/tree-construction-tests20.test.ts +18 -0
  74. package/tests/tree-construction-tests21.test.ts +18 -0
  75. package/tests/tree-construction-tests23.test.ts +18 -0
  76. package/tests/tree-construction-tests24.test.ts +18 -0
  77. package/tests/tree-construction-tests5.test.ts +21 -0
  78. package/tests/tree-construction-tests6.test.ts +21 -0
  79. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  80. package/tests/void-elements.test.ts +471 -0
  81. package/tests/official/README.md +0 -87
  82. package/tests/official/acid/acid-tests.test.ts +0 -309
  83. package/tests/official/final-output/final-output.test.ts +0 -361
  84. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  85. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  86. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  87. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  88. package/tests/official/validator/validator-tests.test.ts +0 -237
  89. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  90. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  91. package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/src/parser.ts CHANGED
@@ -1,38 +1,14 @@
1
1
  import type { Token } from './tokenizer.js';
2
2
  import { TokenType } from './tokenizer.js';
3
-
4
- export interface ASTNode {
5
- type: ASTNodeType;
6
- tagName?: string;
7
- attributes?: Record<string, string>;
8
- children?: ASTNode[];
9
- content?: string;
10
- parent?: ASTNode;
11
- isSelfClosing?: boolean;
12
- position?: {
13
- start: number;
14
- end: number;
15
- line: number;
16
- column: number;
17
- };
18
- }
19
-
20
- export enum ASTNodeType {
21
- DOCUMENT = 'DOCUMENT',
22
- ELEMENT = 'ELEMENT',
23
- TEXT = 'TEXT',
24
- COMMENT = 'COMMENT',
25
- CDATA = 'CDATA',
26
- DOCTYPE = 'DOCTYPE',
27
- PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION'
28
- }
3
+ import { createDocument, createElement, createTextNode, createComment, createDoctype, appendChild } from './dom-simulator.js';
29
4
 
30
5
  export interface ParserState {
31
6
  tokens: Token[];
32
7
  position: number;
33
8
  length: number;
34
- stack: ASTNode[];
35
- root: ASTNode;
9
+ stack: any[]; // DOM elements
10
+ root: any; // Document
11
+ insertionMode: InsertionMode;
36
12
  errors: ParseError[];
37
13
  }
38
14
 
@@ -44,6 +20,32 @@ export interface ParseError {
44
20
  severity: 'error' | 'warning';
45
21
  }
46
22
 
23
+ export enum InsertionMode {
24
+ Initial = 'initial',
25
+ BeforeHtml = 'beforeHtml',
26
+ BeforeHead = 'beforeHead',
27
+ InHead = 'inHead',
28
+ AfterHead = 'afterHead',
29
+ InBody = 'inBody'
30
+ }
31
+
32
+ export enum ASTNodeType {
33
+ Document = 'document',
34
+ Element = 'element',
35
+ Text = 'text',
36
+ Comment = 'comment',
37
+ Doctype = 'doctype',
38
+ CDATA = 'cdata'
39
+ }
40
+
41
+ export interface ASTNode {
42
+ type: ASTNodeType;
43
+ tagName?: string;
44
+ value?: string;
45
+ attributes?: Record<string, string>;
46
+ children?: ASTNode[];
47
+ }
48
+
47
49
  const VOID_ELEMENTS = new Set([
48
50
  'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
49
51
  'link', 'meta', 'param', 'source', 'track', 'wbr'
@@ -57,7 +59,41 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
57
59
  'li': ['li'],
58
60
  'dt': ['dt', 'dd'],
59
61
  'dd': ['dt', 'dd'],
62
+ 'address': ['p'],
63
+ 'article': ['p'],
64
+ 'aside': ['p'],
65
+ 'blockquote': ['p'],
66
+ 'center': ['p'],
67
+ 'details': ['p'],
68
+ 'dialog': ['p'],
69
+ 'dir': ['p'],
70
+ 'div': ['p'],
71
+ 'dl': ['p'],
72
+ 'fieldset': ['p'],
73
+ 'figcaption': ['p'],
74
+ 'figure': ['p'],
75
+ 'footer': ['p'],
76
+ 'form': ['p'],
77
+ 'h1': ['p'],
78
+ 'h2': ['p'],
79
+ 'h3': ['p'],
80
+ 'h4': ['p'],
81
+ 'h5': ['p'],
82
+ 'h6': ['p'],
83
+ 'header': ['p'],
84
+ 'hgroup': ['p'],
85
+ 'hr': ['p'],
86
+ 'listing': ['p'],
87
+ 'main': ['p'],
88
+ 'menu': ['p'],
89
+ 'nav': ['p'],
90
+ 'ol': ['p'],
60
91
  'p': ['p'],
92
+ 'pre': ['p'],
93
+ 'section': ['p'],
94
+ 'summary': ['p'],
95
+ 'table': ['p'],
96
+ 'ul': ['p'],
61
97
  'rt': ['rt', 'rp'],
62
98
  'rp': ['rt', 'rp'],
63
99
  'optgroup': ['optgroup'],
@@ -70,7 +106,7 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
70
106
  'th': ['td', 'th']
71
107
  };
72
108
 
73
- export function parse(tokens: Token[]): ASTNode {
109
+ export function parse(tokens: Token[]): any {
74
110
  const state = createParserState(tokens);
75
111
 
76
112
  while (state.position < state.length) {
@@ -84,21 +120,119 @@ export function parse(tokens: Token[]): ASTNode {
84
120
  advance(state);
85
121
  }
86
122
 
123
+ // Create implicit html, head, body if needed
124
+ if (state.root.childNodes && state.root.childNodes.length > 0) {
125
+ let hasHtml = false;
126
+ for (const child of state.root.childNodes) {
127
+ if (child.nodeType === 1 && child.tagName === 'HTML') {
128
+ hasHtml = true;
129
+ state.root.documentElement = child;
130
+ break;
131
+ }
132
+ }
133
+ if (!hasHtml) {
134
+ const html = createElement('html', {});
135
+ const head = createElement('head', {});
136
+ const body = createElement('body', {});
137
+ appendChild(html, head);
138
+ appendChild(html, body);
139
+
140
+ const doctypes: any[] = [];
141
+ const children = [...state.root.childNodes];
142
+ for (const child of children) {
143
+ if (child.nodeType === 10) {
144
+ doctypes.push(child);
145
+ } else {
146
+ appendChild(body, child);
147
+ }
148
+ }
149
+
150
+ state.root.childNodes = [];
151
+ for (const doctype of doctypes) {
152
+ doctype.parentNode = null;
153
+ appendChild(state.root, doctype);
154
+ }
155
+ appendChild(state.root, html);
156
+ state.root.documentElement = html;
157
+ state.root.head = head;
158
+ state.root.body = body;
159
+ }
160
+ }
161
+
87
162
  while (state.stack.length > 1) {
88
163
  const unclosedElement = state.stack.pop()!;
89
164
  const currentToken = getCurrentToken(state);
90
- addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.start || 0);
165
+ addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.offset || 0);
91
166
  }
92
167
 
93
168
  return state.root;
94
169
  }
95
170
 
171
+ export function domToAST(dom: any): ASTNode {
172
+ function convert(node: any): ASTNode | null {
173
+ if (!node) return null;
174
+
175
+ if (node.nodeType === 9) {
176
+ const children: ASTNode[] = [];
177
+ if (node.childNodes) {
178
+ for (const child of node.childNodes) {
179
+ const converted = convert(child);
180
+ if (converted) children.push(converted);
181
+ }
182
+ }
183
+ return {
184
+ type: ASTNodeType.Document,
185
+ children
186
+ };
187
+ }
188
+
189
+ if (node.nodeType === 1) {
190
+ const children: ASTNode[] = [];
191
+ if (node.childNodes) {
192
+ for (const child of node.childNodes) {
193
+ const converted = convert(child);
194
+ if (converted) children.push(converted);
195
+ }
196
+ }
197
+ const tagName = node.tagName?.toLowerCase();
198
+ return {
199
+ type: ASTNodeType.Element,
200
+ tagName,
201
+ attributes: node.attributes || {},
202
+ children,
203
+ isSelfClosing: VOID_ELEMENTS.has(tagName)
204
+ } as ASTNode & { isSelfClosing: boolean };
205
+ }
206
+
207
+ if (node.nodeType === 3) {
208
+ return {
209
+ type: ASTNodeType.Text,
210
+ content: node.nodeValue || ''
211
+ } as ASTNode & { content: string };
212
+ }
213
+
214
+ if (node.nodeType === 8) {
215
+ return {
216
+ type: ASTNodeType.Comment,
217
+ content: node.nodeValue || ''
218
+ } as ASTNode & { content: string };
219
+ }
220
+
221
+ if (node.nodeType === 10) {
222
+ return {
223
+ type: ASTNodeType.Doctype,
224
+ content: node.name || 'html'
225
+ } as ASTNode & { content: string };
226
+ }
227
+
228
+ return null;
229
+ }
230
+
231
+ return convert(dom) || { type: ASTNodeType.Document, children: [] };
232
+ }
233
+
96
234
  function createParserState(tokens: Token[]): ParserState {
97
- const root: ASTNode = {
98
- type: ASTNodeType.DOCUMENT,
99
- children: [],
100
- tagName: '#document'
101
- };
235
+ const root = createDocument();
102
236
 
103
237
  return {
104
238
  tokens,
@@ -106,81 +240,223 @@ function createParserState(tokens: Token[]): ParserState {
106
240
  length: tokens.length,
107
241
  stack: [root],
108
242
  root,
243
+ insertionMode: InsertionMode.Initial,
109
244
  errors: []
110
245
  };
111
246
  }
112
247
 
113
248
  function parseToken(state: ParserState, token: Token): void {
114
- switch (token.type) {
115
- case TokenType.TAG_OPEN:
116
- parseOpenTag(state, token);
249
+ switch (state.insertionMode) {
250
+ case InsertionMode.Initial:
251
+ parseTokenInInitialMode(state, token);
117
252
  break;
118
- case TokenType.TAG_CLOSE:
119
- parseCloseTag(state, token);
253
+ case InsertionMode.BeforeHtml:
254
+ parseTokenInBeforeHtmlMode(state, token);
120
255
  break;
121
- case TokenType.TEXT:
122
- parseText(state, token);
123
- break;
124
- case TokenType.COMMENT:
125
- parseComment(state, token);
256
+ case InsertionMode.BeforeHead:
257
+ parseTokenInBeforeHeadMode(state, token);
126
258
  break;
127
- case TokenType.CDATA:
128
- parseCDATA(state, token);
259
+ case InsertionMode.InHead:
260
+ parseTokenInInHeadMode(state, token);
129
261
  break;
130
- case TokenType.DOCTYPE:
131
- parseDoctype(state, token);
262
+ case InsertionMode.AfterHead:
263
+ parseTokenInAfterHeadMode(state, token);
132
264
  break;
133
- case TokenType.PROCESSING_INSTRUCTION:
134
- parseProcessingInstruction(state, token);
265
+ case InsertionMode.InBody:
266
+ parseTokenInInBodyMode(state, token);
135
267
  break;
268
+ default:
269
+ parseTokenInInBodyMode(state, token); // fallback
136
270
  }
137
271
  }
138
272
 
139
- function parseOpenTag(state: ParserState, token: Token): void {
140
- const tagName = token.value.toLowerCase();
141
-
142
- handleAutoClosing(state, tagName);
143
-
144
- const currentParent = getCurrentParent(state);
273
+ function parseTokenInInitialMode(state: ParserState, token: Token): void {
274
+ if (token.type === TokenType.DOCTYPE) {
275
+ // TODO: Create DOCTYPE node
276
+ parseDoctype(state, token);
277
+ state.insertionMode = InsertionMode.BeforeHtml;
278
+ } else if (token.type === TokenType.COMMENT) {
279
+ parseComment(state, token);
280
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
281
+ // Ignore whitespace
282
+ } else {
283
+ // No DOCTYPE, create implicit DOCTYPE and switch to BeforeHtml
284
+ const doctype = createDoctype('html');
285
+ appendChild(state.root, doctype);
286
+ state.insertionMode = InsertionMode.BeforeHtml;
287
+ parseToken(state, token); // Re-parse in new mode
288
+ }
289
+ }
145
290
 
146
- const element: ASTNode = {
147
- type: ASTNodeType.ELEMENT,
148
- tagName,
149
- attributes: token.attributes || {},
150
- children: [],
151
- parent: currentParent,
152
- isSelfClosing: token.isSelfClosing || VOID_ELEMENTS.has(tagName),
153
- position: token.position
154
- };
291
+ function parseTokenInBeforeHtmlMode(state: ParserState, token: Token): void {
292
+ if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'html') {
293
+ const html = createElement('html', token.attributes || {});
294
+ appendChild(state.root, html);
295
+ state.root.documentElement = html;
296
+ state.stack.push(html);
297
+ state.insertionMode = InsertionMode.BeforeHead;
298
+ } else if (token.type === TokenType.COMMENT) {
299
+ parseComment(state, token);
300
+ } else if (token.type === TokenType.DOCTYPE) {
301
+ // Ignore
302
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
303
+ // Ignore whitespace
304
+ } else {
305
+ const html = createElement('html', {});
306
+ appendChild(state.root, html);
307
+ state.root.documentElement = html;
308
+ state.stack.push(html);
309
+ state.insertionMode = InsertionMode.BeforeHead;
310
+ parseToken(state, token);
311
+ }
312
+ }
155
313
 
156
- if (currentParent.children) {
157
- currentParent.children.push(element);
314
+ function parseTokenInBeforeHeadMode(state: ParserState, token: Token): void {
315
+ if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'head') {
316
+ const head = createElement('head', token.attributes || {});
317
+ appendChild(getCurrentParent(state), head);
318
+ state.root.head = head;
319
+ state.stack.push(head);
320
+ state.insertionMode = InsertionMode.InHead;
321
+ } else if (token.type === TokenType.COMMENT) {
322
+ parseComment(state, token);
323
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
324
+ // Ignore whitespace
325
+ } else {
326
+ const head = createElement('head', {});
327
+ appendChild(getCurrentParent(state), head);
328
+ state.root.head = head;
329
+ state.stack.push(head);
330
+ state.insertionMode = InsertionMode.InHead;
331
+ parseToken(state, token);
158
332
  }
333
+ }
159
334
 
160
- if (!element.isSelfClosing) {
335
+ function parseOpenTag(state: ParserState, token: Token): void {
336
+ const tagName = token.value.toLowerCase();
337
+ const currentParent = getCurrentParent(state);
338
+ const element = createElement(tagName, token.attributes || {});
339
+ appendChild(currentParent, element);
340
+
341
+ if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
161
342
  state.stack.push(element);
162
343
  }
163
344
  }
164
345
 
165
- function parseCloseTag(state: ParserState, token: Token): void {
166
- const tagName = token.value.toLowerCase();
346
+ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
347
+ const currentElement = getCurrentElement(state);
348
+ const currentTagName = currentElement?.tagName?.toLowerCase();
167
349
 
168
- let found = false;
169
- for (let i = state.stack.length - 1; i >= 0; i--) {
170
- const element = state.stack[i]!;
171
- if (element.tagName === tagName) {
172
- while (state.stack.length > i + 1) {
173
- const unclosedElement = state.stack.pop()!;
174
- addError(state, `Unclosed tag: ${unclosedElement.tagName}`, token.position?.start || 0);
175
- }
350
+ if (RAW_TEXT_ELEMENTS.has(currentTagName)) {
351
+ if (token.type === TokenType.TEXT) {
352
+ parseText(state, token);
353
+ return;
354
+ } else if (token.type === TokenType.TAG_CLOSE && token.value.toLowerCase() === currentTagName) {
176
355
  state.stack.pop();
177
- found = true;
178
- break;
356
+ return;
357
+ }
358
+ }
359
+
360
+ if (token.type === TokenType.TAG_OPEN) {
361
+ const tagName = token.value.toLowerCase();
362
+ if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
363
+ parseOpenTag(state, token);
364
+ } else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
365
+ parseOpenTag(state, token);
366
+ } else if (tagName === 'head') {
367
+ } else {
368
+ state.stack.pop();
369
+ state.insertionMode = InsertionMode.AfterHead;
370
+ parseToken(state, token);
371
+ }
372
+ } else if (token.type === TokenType.TAG_CLOSE) {
373
+ const tagName = token.value.toLowerCase();
374
+ if (tagName === 'head') {
375
+ state.stack.pop();
376
+ state.insertionMode = InsertionMode.AfterHead;
377
+ } else if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
378
+ if (currentTagName === tagName) {
379
+ state.stack.pop();
380
+ }
179
381
  }
382
+ } else if (token.type === TokenType.COMMENT) {
383
+ parseComment(state, token);
384
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
385
+ } else {
386
+ state.stack.pop();
387
+ state.insertionMode = InsertionMode.AfterHead;
388
+ parseToken(state, token);
180
389
  }
390
+ }
181
391
 
182
- if (!found) {
183
- addError(state, `Unexpected closing tag: ${tagName}`, token.position?.start || 0);
392
+ function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
393
+ if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'body') {
394
+ const body = createElement('body', token.attributes || {});
395
+ appendChild(getCurrentParent(state), body);
396
+ state.root.body = body;
397
+ state.stack.push(body);
398
+ state.insertionMode = InsertionMode.InBody;
399
+ } else if (token.type === TokenType.COMMENT) {
400
+ parseComment(state, token);
401
+ } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
402
+ // Ignore whitespace
403
+ } else {
404
+ const body = createElement('body', {});
405
+ appendChild(getCurrentParent(state), body);
406
+ state.root.body = body;
407
+ state.stack.push(body);
408
+ state.insertionMode = InsertionMode.InBody;
409
+ parseToken(state, token);
410
+ }
411
+ }
412
+
413
+ function parseTokenInInBodyMode(state: ParserState, token: Token): void {
414
+ if (token.type === TokenType.TAG_OPEN) {
415
+ const tagName = token.value.toLowerCase();
416
+
417
+ handleAutoClosing(state, tagName);
418
+
419
+ const currentParent = getCurrentParent(state);
420
+
421
+ const element = createElement(tagName, token.attributes || {});
422
+
423
+ appendChild(currentParent, element);
424
+
425
+ if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
426
+ state.stack.push(element);
427
+ }
428
+ } else if (token.type === TokenType.TAG_CLOSE) {
429
+ const tagName = token.value.toLowerCase();
430
+
431
+ // Generate implied end tags
432
+ const impliedEndTags = ['dd', 'dt', 'li', 'option', 'optgroup', 'p', 'rb', 'rp', 'rt', 'rtc'];
433
+ while (state.stack.length > 1) { // Don't pop document
434
+ const currentElement = getCurrentElement(state);
435
+ if (!currentElement || !impliedEndTags.includes(currentElement.tagName.toLowerCase()) || currentElement.tagName.toLowerCase() === tagName) {
436
+ break;
437
+ }
438
+ state.stack.pop();
439
+ addError(state, `Implied end tag: ${currentElement.tagName}`, token.position?.offset || 0);
440
+ }
441
+
442
+ const currentElement = getCurrentElement(state);
443
+ if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
444
+ state.stack.pop();
445
+ } else {
446
+ // For now, just ignore unmatched closing tags
447
+ // TODO: Implement full adoption agency algorithm
448
+ addError(state, `Unmatched closing tag: ${tagName}`, token.position?.offset || 0);
449
+ }
450
+ } else if (token.type === TokenType.TEXT) {
451
+ parseText(state, token);
452
+ } else if (token.type === TokenType.COMMENT) {
453
+ parseComment(state, token);
454
+ } else if (token.type === TokenType.CDATA) {
455
+ parseCDATA(state, token);
456
+ } else if (token.type === TokenType.DOCTYPE) {
457
+ // Ignore
458
+ } else if (token.type === TokenType.PROCESSING_INSTRUCTION) {
459
+ parseProcessingInstruction(state, token);
184
460
  }
185
461
  }
186
462
 
@@ -192,76 +468,134 @@ function parseText(state: ParserState, token: Token): void {
192
468
  return;
193
469
  }
194
470
 
195
- const textNode: ASTNode = {
196
- type: ASTNodeType.TEXT,
197
- content,
198
- parent: currentParent,
199
- position: token.position
200
- };
201
-
202
- if (currentParent.children) {
203
- currentParent.children.push(textNode);
204
- }
471
+ const textNode = createTextNode(content);
472
+ appendChild(currentParent, textNode);
205
473
  }
206
474
 
207
475
  function parseComment(state: ParserState, token: Token): void {
208
476
  const currentParent = getCurrentParent(state);
209
477
 
210
- const commentNode: ASTNode = {
211
- type: ASTNodeType.COMMENT,
212
- content: token.value,
213
- parent: currentParent,
214
- position: token.position
215
- };
216
-
217
- if (currentParent.children) {
218
- currentParent.children.push(commentNode);
219
- }
478
+ const commentNode = createComment(token.value);
479
+ appendChild(currentParent, commentNode);
220
480
  }
221
481
 
222
482
  function parseCDATA(state: ParserState, token: Token): void {
223
- const currentParent = getCurrentParent(state);
224
-
225
- const cdataNode: ASTNode = {
226
- type: ASTNodeType.CDATA,
227
- content: token.value,
228
- parent: currentParent,
229
- position: token.position
230
- };
231
-
232
- if (currentParent.children) {
233
- currentParent.children.push(cdataNode);
234
- }
483
+ // TODO: implement CDATA
235
484
  }
236
485
 
237
486
  function parseDoctype(state: ParserState, token: Token): void {
238
- const currentParent = getCurrentParent(state);
239
-
240
- const doctypeNode: ASTNode = {
241
- type: ASTNodeType.DOCTYPE,
242
- content: token.value,
243
- parent: currentParent,
244
- position: token.position
245
- };
246
-
247
- if (currentParent.children) {
248
- currentParent.children.push(doctypeNode);
249
- }
487
+ const doctype = createDoctype(token.value || 'html');
488
+ appendChild(state.root, doctype);
489
+ state.root.doctype = doctype;
250
490
  }
251
491
 
252
492
  function parseProcessingInstruction(state: ParserState, token: Token): void {
253
- const currentParent = getCurrentParent(state);
254
-
255
- const piNode: ASTNode = {
256
- type: ASTNodeType.PROCESSING_INSTRUCTION,
257
- content: token.value,
258
- parent: currentParent,
259
- position: token.position
260
- };
493
+ // TODO: implement ProcessingInstruction
494
+ }
261
495
 
262
- if (currentParent.children) {
263
- currentParent.children.push(piNode);
496
+ function runAdoptionAgencyAlgorithm(state: ParserState, tagName: string, token: Token): void {
497
+ // HTML5 Adoption Agency Algorithm - simplified but more correct implementation
498
+
499
+ // 1. If the current node is an HTML element whose tag name matches the token's tag name,
500
+ // then pop the current node off the stack of open elements and abort these steps.
501
+ const currentElement = getCurrentElement(state);
502
+ if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
503
+ state.stack.pop();
504
+ return;
264
505
  }
506
+
507
+ // 2. Let outer loop counter be 0
508
+ let outerLoopCounter = 0;
509
+ const formattingElements = ['a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'];
510
+
511
+ while (outerLoopCounter < 8) { // Prevent infinite loops
512
+ outerLoopCounter++;
513
+
514
+ // 3. Let the formatting element be the last element in the list of active formatting elements
515
+ // that is between the end of the list and the last scope marker or the start of the list,
516
+ // if any, that has the same tag name as the token.
517
+
518
+ // For simplicity, find the innermost element with matching tag name
519
+ let formattingElementIndex = -1;
520
+ for (let i = state.stack.length - 1; i >= 0; i--) {
521
+ const element = state.stack[i];
522
+ if (element.tagName && element.tagName.toLowerCase() === tagName && formattingElements.includes(tagName)) {
523
+ formattingElementIndex = i;
524
+ break;
525
+ }
526
+ }
527
+
528
+ if (formattingElementIndex === -1) {
529
+ // No formatting element found, just find any element with matching tag name
530
+ for (let i = state.stack.length - 1; i >= 0; i--) {
531
+ const element = state.stack[i];
532
+ if (element.tagName && element.tagName.toLowerCase() === tagName) {
533
+ formattingElementIndex = i;
534
+ break;
535
+ }
536
+ }
537
+ }
538
+
539
+ if (formattingElementIndex === -1) {
540
+ // No matching element found, ignore the token
541
+ addError(state, `Stray end tag: ${tagName}`, token.position?.offset || 0);
542
+ return;
543
+ }
544
+
545
+ const formattingElement = state.stack[formattingElementIndex];
546
+
547
+ // 4. If there is no element in the stack of open elements that has the same tag name as the
548
+ // formatting element, then remove the element from the list of active formatting elements
549
+ // and abort these steps.
550
+ let openElementIndex = -1;
551
+ for (let i = state.stack.length - 1; i >= 0; i--) {
552
+ if (state.stack[i] === formattingElement) {
553
+ openElementIndex = i;
554
+ break;
555
+ }
556
+ }
557
+
558
+ if (openElementIndex === -1) {
559
+ // Element not in stack, ignore
560
+ return;
561
+ }
562
+
563
+ // 5. If the element is not in the stack of open elements, then this is a parse error;
564
+ // remove the element from the list of active formatting elements and abort these steps.
565
+ // (Already checked above)
566
+
567
+ // 6. Let the furthest block be the topmost node in the stack of open elements that is lower
568
+ // in the stack than the formatting element, and is an element in the special category.
569
+ const specialElements = ['address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'listing', 'main', 'menu', 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul', 'xmp'];
570
+
571
+ let furthestBlockIndex = -1;
572
+ for (let i = openElementIndex + 1; i < state.stack.length; i++) {
573
+ const element = state.stack[i];
574
+ if (element.tagName && specialElements.includes(element.tagName.toLowerCase())) {
575
+ furthestBlockIndex = i;
576
+ break;
577
+ }
578
+ }
579
+
580
+ if (furthestBlockIndex === -1) {
581
+ // No special element found, just pop elements until we reach the formatting element
582
+ while (state.stack.length > openElementIndex + 1) {
583
+ state.stack.pop();
584
+ }
585
+ state.stack.pop(); // Pop the formatting element
586
+ return;
587
+ }
588
+
589
+ // 7. Simplified: just pop everything until the formatting element
590
+ while (state.stack.length > openElementIndex + 1) {
591
+ state.stack.pop();
592
+ }
593
+ state.stack.pop(); // Pop the formatting element
594
+ return;
595
+ }
596
+
597
+ // If we get here, something went wrong, ignore the token
598
+ addError(state, `Adoption agency gave up on: ${tagName}`, token.position?.offset || 0);
265
599
  }
266
600
 
267
601
  function handleAutoClosing(state: ParserState, tagName: string): void {
@@ -269,19 +603,19 @@ function handleAutoClosing(state: ParserState, tagName: string): void {
269
603
  if (!autoCloseList) return;
270
604
 
271
605
  const currentElement = getCurrentElement(state);
272
- if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName)) {
606
+ if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName.toLowerCase())) {
273
607
  state.stack.pop();
274
608
  }
275
609
  }
276
610
 
277
- function getCurrentParent(state: ParserState): ASTNode {
278
- return state.stack[state.stack.length - 1]!;
611
+ function getCurrentParent(state: ParserState): any {
612
+ return state.stack[state.stack.length - 1];
279
613
  }
280
614
 
281
- function getCurrentElement(state: ParserState): ASTNode | null {
615
+ function getCurrentElement(state: ParserState): any {
282
616
  for (let i = state.stack.length - 1; i >= 0; i--) {
283
- const element = state.stack[i]!;
284
- if (element.type === ASTNodeType.ELEMENT) {
617
+ const element = state.stack[i];
618
+ if (element.nodeType === 1) { // ELEMENT_NODE
285
619
  return element;
286
620
  }
287
621
  }
@@ -306,7 +640,7 @@ function addError(state: ParserState, message: string, position: number): void {
306
640
  });
307
641
  }
308
642
 
309
- function shouldSkipWhitespace(parent: ASTNode): boolean {
643
+ function shouldSkipWhitespace(parent: any): boolean {
310
644
  const skipWhitespaceIn = new Set([
311
645
  'html', 'head', 'body', 'table', 'tbody', 'thead', 'tfoot', 'tr',
312
646
  'ul', 'ol', 'dl', 'select', 'optgroup'
@@ -314,42 +648,3 @@ function shouldSkipWhitespace(parent: ASTNode): boolean {
314
648
 
315
649
  return parent.tagName ? skipWhitespaceIn.has(parent.tagName) : false;
316
650
  }
317
-
318
- export function traverseAST(node: ASTNode, callback: (node: ASTNode) => void): void {
319
- callback(node);
320
-
321
- if (node.children) {
322
- for (const child of node.children) {
323
- traverseAST(child, callback);
324
- }
325
- }
326
- }
327
-
328
- export function findNodesByTagName(root: ASTNode, tagName: string): ASTNode[] {
329
- const results: ASTNode[] = [];
330
-
331
- traverseAST(root, (node) => {
332
- if (node.type === ASTNodeType.ELEMENT && node.tagName === tagName.toLowerCase()) {
333
- results.push(node);
334
- }
335
- });
336
-
337
- return results;
338
- }
339
-
340
- export function findNodesByAttribute(root: ASTNode, attrName: string, attrValue?: string): ASTNode[] {
341
- const results: ASTNode[] = [];
342
-
343
- traverseAST(root, (node) => {
344
- if (node.type === ASTNodeType.ELEMENT && node.attributes) {
345
- const hasAttr = attrName in node.attributes;
346
- const valueMatches = attrValue === undefined || node.attributes[attrName] === attrValue;
347
-
348
- if (hasAttr && valueMatches) {
349
- results.push(node);
350
- }
351
- }
352
- });
353
-
354
- return results;
355
- }