@tkeron/html-parser 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/.github/workflows/npm_deploy.yml +14 -4
  2. package/README.md +6 -6
  3. package/bun.lock +6 -8
  4. package/check-versions.ts +147 -0
  5. package/index.ts +4 -8
  6. package/package.json +5 -6
  7. package/src/dom-simulator/append-child.ts +130 -0
  8. package/src/dom-simulator/append.ts +18 -0
  9. package/src/dom-simulator/attributes.ts +23 -0
  10. package/src/dom-simulator/clone-node.ts +51 -0
  11. package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
  12. package/src/dom-simulator/create-cdata.ts +18 -0
  13. package/src/dom-simulator/create-comment.ts +23 -0
  14. package/src/dom-simulator/create-doctype.ts +24 -0
  15. package/src/dom-simulator/create-document.ts +81 -0
  16. package/src/dom-simulator/create-element.ts +195 -0
  17. package/src/dom-simulator/create-processing-instruction.ts +19 -0
  18. package/src/dom-simulator/create-temp-parent.ts +9 -0
  19. package/src/dom-simulator/create-text-node.ts +23 -0
  20. package/src/dom-simulator/escape-text-content.ts +6 -0
  21. package/src/dom-simulator/find-special-elements.ts +14 -0
  22. package/src/dom-simulator/get-text-content.ts +18 -0
  23. package/src/dom-simulator/index.ts +36 -0
  24. package/src/dom-simulator/inner-outer-html.ts +182 -0
  25. package/src/dom-simulator/insert-after.ts +20 -0
  26. package/src/dom-simulator/insert-before.ts +108 -0
  27. package/src/dom-simulator/matches.ts +26 -0
  28. package/src/dom-simulator/node-types.ts +26 -0
  29. package/src/dom-simulator/prepend.ts +24 -0
  30. package/src/dom-simulator/remove-child.ts +68 -0
  31. package/src/dom-simulator/remove.ts +7 -0
  32. package/src/dom-simulator/replace-child.ts +152 -0
  33. package/src/dom-simulator/set-text-content.ts +33 -0
  34. package/src/dom-simulator/update-element-content.ts +56 -0
  35. package/src/dom-simulator.ts +12 -1126
  36. package/src/encoding/constants.ts +8 -0
  37. package/src/encoding/detect-encoding.ts +21 -0
  38. package/src/encoding/index.ts +1 -0
  39. package/src/encoding/normalize-encoding.ts +6 -0
  40. package/src/html-entities.ts +2127 -0
  41. package/src/index.ts +5 -5
  42. package/src/parser/adoption-agency-helpers.ts +145 -0
  43. package/src/parser/constants.ts +137 -0
  44. package/src/parser/dom-to-ast.ts +79 -0
  45. package/src/parser/index.ts +9 -0
  46. package/src/parser/parse.ts +772 -0
  47. package/src/parser/types.ts +56 -0
  48. package/src/selectors/find-elements-descendant.ts +47 -0
  49. package/src/selectors/index.ts +2 -0
  50. package/src/selectors/matches-selector.ts +12 -0
  51. package/src/selectors/matches-token.ts +27 -0
  52. package/src/selectors/parse-selector.ts +48 -0
  53. package/src/selectors/query-selector-all.ts +43 -0
  54. package/src/selectors/query-selector.ts +6 -0
  55. package/src/selectors/types.ts +10 -0
  56. package/src/serializer/attributes.ts +74 -0
  57. package/src/serializer/escape.ts +13 -0
  58. package/src/serializer/index.ts +1 -0
  59. package/src/serializer/serialize-tokens.ts +511 -0
  60. package/src/tokenizer/calculate-position.ts +10 -0
  61. package/src/tokenizer/constants.ts +11 -0
  62. package/src/tokenizer/decode-entities.ts +64 -0
  63. package/src/tokenizer/index.ts +2 -0
  64. package/src/tokenizer/parse-attributes.ts +74 -0
  65. package/src/tokenizer/tokenize.ts +165 -0
  66. package/src/tokenizer/types.ts +25 -0
  67. package/tests/adoption-agency-helpers.test.ts +304 -0
  68. package/tests/advanced.test.ts +242 -221
  69. package/tests/cloneNode.test.ts +19 -66
  70. package/tests/custom-elements-head.test.ts +54 -55
  71. package/tests/dom-extended.test.ts +77 -64
  72. package/tests/dom-manipulation.test.ts +51 -24
  73. package/tests/dom.test.ts +15 -13
  74. package/tests/encoding/detect-encoding.test.ts +33 -0
  75. package/tests/google-dom.test.ts +2 -2
  76. package/tests/helpers/tokenizer-adapter.test.ts +29 -43
  77. package/tests/helpers/tokenizer-adapter.ts +36 -33
  78. package/tests/helpers/tree-adapter.test.ts +20 -20
  79. package/tests/helpers/tree-adapter.ts +34 -24
  80. package/tests/html-entities-text.test.ts +6 -2
  81. package/tests/innerhtml-void-elements.test.ts +52 -36
  82. package/tests/outerHTML-replacement.test.ts +37 -65
  83. package/tests/parser/dom-to-ast.test.ts +109 -0
  84. package/tests/parser/parse.test.ts +139 -0
  85. package/tests/parser.test.ts +281 -217
  86. package/tests/selectors/query-selector-all.test.ts +39 -0
  87. package/tests/selectors/query-selector.test.ts +42 -0
  88. package/tests/serializer/attributes.test.ts +132 -0
  89. package/tests/serializer/escape.test.ts +51 -0
  90. package/tests/serializer/serialize-tokens.test.ts +80 -0
  91. package/tests/serializer-core.test.ts +6 -6
  92. package/tests/serializer-injectmeta.test.ts +6 -6
  93. package/tests/serializer-optionaltags.test.ts +9 -6
  94. package/tests/serializer-options.test.ts +6 -6
  95. package/tests/serializer-whitespace.test.ts +6 -6
  96. package/tests/tokenizer/calculate-position.test.ts +34 -0
  97. package/tests/tokenizer/decode-entities.test.ts +31 -0
  98. package/tests/tokenizer/parse-attributes.test.ts +44 -0
  99. package/tests/tokenizer/tokenize.test.ts +757 -0
  100. package/tests/tokenizer-namedEntities.test.ts +10 -7
  101. package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
  102. package/tests/tokenizer.test.ts +268 -256
  103. package/tests/tree-construction-adoption01.test.ts +25 -16
  104. package/tests/tree-construction-adoption02.test.ts +30 -19
  105. package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
  106. package/tests/tree-construction-entities02.test.ts +18 -16
  107. package/tests/tree-construction-html5test-com.test.ts +16 -10
  108. package/tests/tree-construction-math.test.ts +11 -9
  109. package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
  110. package/tests/tree-construction-noscript01.test.ts +11 -9
  111. package/tests/tree-construction-ruby.test.ts +6 -4
  112. package/tests/tree-construction-scriptdata01.test.ts +6 -4
  113. package/tests/tree-construction-svg.test.ts +6 -4
  114. package/tests/tree-construction-template.test.ts +6 -4
  115. package/tests/tree-construction-tests10.test.ts +6 -4
  116. package/tests/tree-construction-tests11.test.ts +6 -4
  117. package/tests/tree-construction-tests20.test.ts +7 -4
  118. package/tests/tree-construction-tests21.test.ts +7 -4
  119. package/tests/tree-construction-tests23.test.ts +7 -4
  120. package/tests/tree-construction-tests24.test.ts +7 -4
  121. package/tests/tree-construction-tests5.test.ts +6 -5
  122. package/tests/tree-construction-tests6.test.ts +6 -5
  123. package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
  124. package/tests/void-elements.test.ts +85 -40
  125. package/tsconfig.json +1 -1
  126. package/src/css-selector.ts +0 -185
  127. package/src/encoding.ts +0 -39
  128. package/src/parser.ts +0 -682
  129. package/src/serializer.ts +0 -450
  130. package/src/tokenizer.ts +0 -325
  131. package/tests/selectors.test.ts +0 -128
package/src/parser.ts DELETED
@@ -1,682 +0,0 @@
1
- import type { Token } from './tokenizer.js';
2
- import { TokenType } from './tokenizer.js';
3
- import { createDocument, createElement, createTextNode, createComment, createDoctype, appendChild } from './dom-simulator.js';
4
-
5
- export interface ParserState {
6
- tokens: Token[];
7
- position: number;
8
- length: number;
9
- stack: any[]; // DOM elements
10
- root: any; // Document
11
- insertionMode: InsertionMode;
12
- errors: ParseError[];
13
- }
14
-
15
- export interface ParseError {
16
- message: string;
17
- position: number;
18
- line: number;
19
- column: number;
20
- severity: 'error' | 'warning';
21
- }
22
-
23
- export enum InsertionMode {
24
- Initial = 'initial',
25
- BeforeHtml = 'beforeHtml',
26
- BeforeHead = 'beforeHead',
27
- InHead = 'inHead',
28
- AfterHead = 'afterHead',
29
- InBody = 'inBody'
30
- }
31
-
32
- export enum ASTNodeType {
33
- Document = 'document',
34
- Element = 'element',
35
- Text = 'text',
36
- Comment = 'comment',
37
- Doctype = 'doctype',
38
- CDATA = 'cdata'
39
- }
40
-
41
- export interface ASTNode {
42
- type: ASTNodeType;
43
- tagName?: string;
44
- value?: string;
45
- attributes?: Record<string, string>;
46
- children?: ASTNode[];
47
- }
48
-
49
- const VOID_ELEMENTS = new Set([
50
- 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
51
- 'link', 'meta', 'param', 'source', 'track', 'wbr'
52
- ]);
53
-
54
- const RAW_TEXT_ELEMENTS = new Set([
55
- 'script', 'style', 'textarea', 'title'
56
- ]);
57
-
58
- const AUTO_CLOSE_RULES: Record<string, string[]> = {
59
- 'li': ['li'],
60
- 'dt': ['dt', 'dd'],
61
- 'dd': ['dt', 'dd'],
62
- 'address': ['p'],
63
- 'article': ['p'],
64
- 'aside': ['p'],
65
- 'blockquote': ['p'],
66
- 'center': ['p'],
67
- 'details': ['p'],
68
- 'dialog': ['p'],
69
- 'dir': ['p'],
70
- 'div': ['p'],
71
- 'dl': ['p'],
72
- 'fieldset': ['p'],
73
- 'figcaption': ['p'],
74
- 'figure': ['p'],
75
- 'footer': ['p'],
76
- 'form': ['p'],
77
- 'h1': ['p'],
78
- 'h2': ['p'],
79
- 'h3': ['p'],
80
- 'h4': ['p'],
81
- 'h5': ['p'],
82
- 'h6': ['p'],
83
- 'header': ['p'],
84
- 'hgroup': ['p'],
85
- 'hr': ['p'],
86
- 'listing': ['p'],
87
- 'main': ['p'],
88
- 'menu': ['p'],
89
- 'nav': ['p'],
90
- 'ol': ['p'],
91
- 'p': ['p'],
92
- 'pre': ['p'],
93
- 'section': ['p'],
94
- 'summary': ['p'],
95
- 'table': ['p'],
96
- 'ul': ['p'],
97
- 'rt': ['rt', 'rp'],
98
- 'rp': ['rt', 'rp'],
99
- 'optgroup': ['optgroup'],
100
- 'option': ['option'],
101
- 'thead': ['tbody', 'tfoot'],
102
- 'tbody': ['thead', 'tbody', 'tfoot'],
103
- 'tfoot': ['thead', 'tbody'],
104
- 'tr': ['tr'],
105
- 'td': ['td', 'th'],
106
- 'th': ['td', 'th']
107
- };
108
-
109
- export function parse(tokens: Token[]): any {
110
- const state = createParserState(tokens);
111
-
112
- while (state.position < state.length) {
113
- const token = getCurrentToken(state);
114
-
115
- if (!token || token.type === TokenType.EOF) {
116
- break;
117
- }
118
-
119
- parseToken(state, token);
120
- advance(state);
121
- }
122
-
123
- // Create implicit html, head, body if needed
124
- if (state.root.childNodes && state.root.childNodes.length > 0) {
125
- let hasHtml = false;
126
- for (const child of state.root.childNodes) {
127
- if (child.nodeType === 1 && child.tagName === 'HTML') {
128
- hasHtml = true;
129
- state.root.documentElement = child;
130
- break;
131
- }
132
- }
133
- if (!hasHtml) {
134
- const html = createElement('html', {});
135
- const head = createElement('head', {});
136
- const body = createElement('body', {});
137
- appendChild(html, head);
138
- appendChild(html, body);
139
-
140
- const doctypes: any[] = [];
141
- const commentsBeforeHtml: any[] = [];
142
- const bodyContent: any[] = [];
143
- const children = [...state.root.childNodes];
144
-
145
- let foundElement = false;
146
- for (const child of children) {
147
- if (child.nodeType === 10) {
148
- doctypes.push(child);
149
- } else if (child.nodeType === 8 && !foundElement) {
150
- commentsBeforeHtml.push(child);
151
- } else {
152
- if (child.nodeType === 1) foundElement = true;
153
- bodyContent.push(child);
154
- }
155
- }
156
-
157
- for (const content of bodyContent) {
158
- appendChild(body, content);
159
- }
160
-
161
- state.root.childNodes = [];
162
- for (const doctype of doctypes) {
163
- doctype.parentNode = null;
164
- appendChild(state.root, doctype);
165
- }
166
- for (const comment of commentsBeforeHtml) {
167
- comment.parentNode = null;
168
- appendChild(state.root, comment);
169
- }
170
- appendChild(state.root, html);
171
- state.root.documentElement = html;
172
- state.root.head = head;
173
- state.root.body = body;
174
- }
175
- }
176
-
177
- while (state.stack.length > 1) {
178
- const unclosedElement = state.stack.pop()!;
179
- const currentToken = getCurrentToken(state);
180
- addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.offset || 0);
181
- }
182
-
183
- return state.root;
184
- }
185
-
186
- export function domToAST(dom: any): ASTNode {
187
- function convert(node: any): ASTNode | null {
188
- if (!node) return null;
189
-
190
- if (node.nodeType === 9) {
191
- const children: ASTNode[] = [];
192
- if (node.childNodes) {
193
- for (const child of node.childNodes) {
194
- const converted = convert(child);
195
- if (converted) children.push(converted);
196
- }
197
- }
198
- return {
199
- type: ASTNodeType.Document,
200
- children
201
- };
202
- }
203
-
204
- if (node.nodeType === 1) {
205
- const children: ASTNode[] = [];
206
- if (node.childNodes) {
207
- for (const child of node.childNodes) {
208
- const converted = convert(child);
209
- if (converted) children.push(converted);
210
- }
211
- }
212
- const tagName = node.tagName?.toLowerCase();
213
- return {
214
- type: ASTNodeType.Element,
215
- tagName,
216
- attributes: node.attributes || {},
217
- children,
218
- isSelfClosing: VOID_ELEMENTS.has(tagName)
219
- } as ASTNode & { isSelfClosing: boolean };
220
- }
221
-
222
- if (node.nodeType === 3) {
223
- return {
224
- type: ASTNodeType.Text,
225
- content: node.nodeValue || ''
226
- } as ASTNode & { content: string };
227
- }
228
-
229
- if (node.nodeType === 8) {
230
- return {
231
- type: ASTNodeType.Comment,
232
- content: node.nodeValue || ''
233
- } as ASTNode & { content: string };
234
- }
235
-
236
- if (node.nodeType === 10) {
237
- return {
238
- type: ASTNodeType.Doctype,
239
- content: node.name || 'html'
240
- } as ASTNode & { content: string };
241
- }
242
-
243
- return null;
244
- }
245
-
246
- return convert(dom) || { type: ASTNodeType.Document, children: [] };
247
- }
248
-
249
- function createParserState(tokens: Token[]): ParserState {
250
- const root = createDocument();
251
-
252
- return {
253
- tokens,
254
- position: 0,
255
- length: tokens.length,
256
- stack: [root],
257
- root,
258
- insertionMode: InsertionMode.Initial,
259
- errors: []
260
- };
261
- }
262
-
263
- function parseToken(state: ParserState, token: Token): void {
264
- switch (state.insertionMode) {
265
- case InsertionMode.Initial:
266
- parseTokenInInitialMode(state, token);
267
- break;
268
- case InsertionMode.BeforeHtml:
269
- parseTokenInBeforeHtmlMode(state, token);
270
- break;
271
- case InsertionMode.BeforeHead:
272
- parseTokenInBeforeHeadMode(state, token);
273
- break;
274
- case InsertionMode.InHead:
275
- parseTokenInInHeadMode(state, token);
276
- break;
277
- case InsertionMode.AfterHead:
278
- parseTokenInAfterHeadMode(state, token);
279
- break;
280
- case InsertionMode.InBody:
281
- parseTokenInInBodyMode(state, token);
282
- break;
283
- default:
284
- parseTokenInInBodyMode(state, token); // fallback
285
- }
286
- }
287
-
288
- function parseTokenInInitialMode(state: ParserState, token: Token): void {
289
- if (token.type === TokenType.DOCTYPE) {
290
- // TODO: Create DOCTYPE node
291
- parseDoctype(state, token);
292
- state.insertionMode = InsertionMode.BeforeHtml;
293
- } else if (token.type === TokenType.COMMENT) {
294
- parseComment(state, token);
295
- } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
296
- // Ignore whitespace
297
- } else {
298
- // No DOCTYPE, create implicit DOCTYPE and switch to BeforeHtml
299
- const doctype = createDoctype('html');
300
- appendChild(state.root, doctype);
301
- state.insertionMode = InsertionMode.BeforeHtml;
302
- parseToken(state, token); // Re-parse in new mode
303
- }
304
- }
305
-
306
- function parseTokenInBeforeHtmlMode(state: ParserState, token: Token): void {
307
- if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'html') {
308
- const html = createElement('html', token.attributes || {});
309
- appendChild(state.root, html);
310
- state.root.documentElement = html;
311
- state.stack.push(html);
312
- state.insertionMode = InsertionMode.BeforeHead;
313
- } else if (token.type === TokenType.COMMENT) {
314
- parseComment(state, token);
315
- } else if (token.type === TokenType.DOCTYPE) {
316
- // Ignore
317
- } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
318
- // Ignore whitespace
319
- } else {
320
- const html = createElement('html', {});
321
- appendChild(state.root, html);
322
- state.root.documentElement = html;
323
- state.stack.push(html);
324
- state.insertionMode = InsertionMode.BeforeHead;
325
- parseToken(state, token);
326
- }
327
- }
328
-
329
- function parseTokenInBeforeHeadMode(state: ParserState, token: Token): void {
330
- if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'head') {
331
- const head = createElement('head', token.attributes || {});
332
- appendChild(getCurrentParent(state), head);
333
- state.root.head = head;
334
- state.stack.push(head);
335
- state.insertionMode = InsertionMode.InHead;
336
- } else if (token.type === TokenType.COMMENT) {
337
- parseComment(state, token);
338
- } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
339
- // Ignore whitespace
340
- } else {
341
- const head = createElement('head', {});
342
- appendChild(getCurrentParent(state), head);
343
- state.root.head = head;
344
- state.stack.push(head);
345
- state.insertionMode = InsertionMode.InHead;
346
- parseToken(state, token);
347
- }
348
- }
349
-
350
- function parseOpenTag(state: ParserState, token: Token): void {
351
- const tagName = token.value.toLowerCase();
352
- const currentParent = getCurrentParent(state);
353
- const element = createElement(tagName, token.attributes || {});
354
- appendChild(currentParent, element);
355
-
356
- if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
357
- state.stack.push(element);
358
- }
359
- }
360
-
361
- function parseTokenInInHeadMode(state: ParserState, token: Token): void {
362
- const currentElement = getCurrentElement(state);
363
- const currentTagName = currentElement?.tagName?.toLowerCase();
364
-
365
- if (RAW_TEXT_ELEMENTS.has(currentTagName)) {
366
- if (token.type === TokenType.TEXT) {
367
- parseText(state, token);
368
- return;
369
- } else if (token.type === TokenType.TAG_CLOSE && token.value.toLowerCase() === currentTagName) {
370
- state.stack.pop();
371
- return;
372
- }
373
- }
374
-
375
- if (token.type === TokenType.TAG_OPEN) {
376
- const tagName = token.value.toLowerCase();
377
- if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
378
- parseOpenTag(state, token);
379
- } else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
380
- parseOpenTag(state, token);
381
- } else if (tagName === 'head') {
382
- // Ignore duplicate <head> tags
383
- } else if (tagName.includes('-')) {
384
- // Custom elements (tags with hyphens) are valid in <head>
385
- parseOpenTag(state, token);
386
- } else {
387
- state.stack.pop();
388
- state.insertionMode = InsertionMode.AfterHead;
389
- parseToken(state, token);
390
- }
391
- } else if (token.type === TokenType.TAG_CLOSE) {
392
- const tagName = token.value.toLowerCase();
393
- if (tagName === 'head') {
394
- state.stack.pop();
395
- state.insertionMode = InsertionMode.AfterHead;
396
- } else if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
397
- if (currentTagName === tagName) {
398
- state.stack.pop();
399
- }
400
- } else if (tagName.includes('-') && currentTagName === tagName) {
401
- // Handle closing tags for custom elements in <head>
402
- state.stack.pop();
403
- }
404
- } else if (token.type === TokenType.COMMENT) {
405
- parseComment(state, token);
406
- } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
407
- } else {
408
- state.stack.pop();
409
- state.insertionMode = InsertionMode.AfterHead;
410
- parseToken(state, token);
411
- }
412
- }
413
-
414
- function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
415
- if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'body') {
416
- const body = createElement('body', token.attributes || {});
417
- appendChild(getCurrentParent(state), body);
418
- state.root.body = body;
419
- state.stack.push(body);
420
- state.insertionMode = InsertionMode.InBody;
421
- } else if (token.type === TokenType.COMMENT) {
422
- parseComment(state, token);
423
- } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
424
- // Ignore whitespace
425
- } else {
426
- const body = createElement('body', {});
427
- appendChild(getCurrentParent(state), body);
428
- state.root.body = body;
429
- state.stack.push(body);
430
- state.insertionMode = InsertionMode.InBody;
431
- parseToken(state, token);
432
- }
433
- }
434
-
435
- const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
436
- const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
437
-
438
- function parseTokenInInBodyMode(state: ParserState, token: Token): void {
439
- if (token.type === TokenType.TAG_OPEN) {
440
- const tagName = token.value.toLowerCase();
441
-
442
- handleAutoClosing(state, tagName);
443
-
444
- const currentParent = getCurrentParent(state);
445
-
446
- let namespaceURI: string | undefined;
447
- if (tagName === 'svg') {
448
- namespaceURI = SVG_NAMESPACE;
449
- } else if (tagName === 'math') {
450
- namespaceURI = MATHML_NAMESPACE;
451
- }
452
-
453
- const element = createElement(tagName, token.attributes || {}, namespaceURI);
454
-
455
- appendChild(currentParent, element);
456
-
457
- if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
458
- state.stack.push(element);
459
- }
460
- } else if (token.type === TokenType.TAG_CLOSE) {
461
- const tagName = token.value.toLowerCase();
462
-
463
- // Generate implied end tags
464
- const impliedEndTags = ['dd', 'dt', 'li', 'option', 'optgroup', 'p', 'rb', 'rp', 'rt', 'rtc'];
465
- while (state.stack.length > 1) { // Don't pop document
466
- const currentElement = getCurrentElement(state);
467
- if (!currentElement || !impliedEndTags.includes(currentElement.tagName.toLowerCase()) || currentElement.tagName.toLowerCase() === tagName) {
468
- break;
469
- }
470
- state.stack.pop();
471
- addError(state, `Implied end tag: ${currentElement.tagName}`, token.position?.offset || 0);
472
- }
473
-
474
- const currentElement = getCurrentElement(state);
475
- if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
476
- state.stack.pop();
477
- } else {
478
- // For now, just ignore unmatched closing tags
479
- // TODO: Implement full adoption agency algorithm
480
- addError(state, `Unmatched closing tag: ${tagName}`, token.position?.offset || 0);
481
- }
482
- } else if (token.type === TokenType.TEXT) {
483
- parseText(state, token);
484
- } else if (token.type === TokenType.COMMENT) {
485
- parseComment(state, token);
486
- } else if (token.type === TokenType.CDATA) {
487
- parseCDATA(state, token);
488
- } else if (token.type === TokenType.DOCTYPE) {
489
- // Ignore
490
- } else if (token.type === TokenType.PROCESSING_INSTRUCTION) {
491
- parseProcessingInstruction(state, token);
492
- }
493
- }
494
-
495
- function parseText(state: ParserState, token: Token): void {
496
- const content = token.value;
497
- const currentParent = getCurrentParent(state);
498
-
499
- if (content.trim() === '' && shouldSkipWhitespace(currentParent)) {
500
- return;
501
- }
502
-
503
- const textNode = createTextNode(content);
504
- appendChild(currentParent, textNode);
505
- }
506
-
507
- function parseComment(state: ParserState, token: Token): void {
508
- const currentParent = getCurrentParent(state);
509
-
510
- const commentNode = createComment(token.value);
511
- appendChild(currentParent, commentNode);
512
- }
513
-
514
- function parseCDATA(state: ParserState, token: Token): void {
515
- // TODO: implement CDATA
516
- }
517
-
518
- function parseDoctype(state: ParserState, token: Token): void {
519
- const doctype = createDoctype(token.value || 'html');
520
- appendChild(state.root, doctype);
521
- state.root.doctype = doctype;
522
- }
523
-
524
- function parseProcessingInstruction(state: ParserState, token: Token): void {
525
- // TODO: implement ProcessingInstruction
526
- }
527
-
528
- function runAdoptionAgencyAlgorithm(state: ParserState, tagName: string, token: Token): void {
529
- // HTML5 Adoption Agency Algorithm - simplified but more correct implementation
530
-
531
- // 1. If the current node is an HTML element whose tag name matches the token's tag name,
532
- // then pop the current node off the stack of open elements and abort these steps.
533
- const currentElement = getCurrentElement(state);
534
- if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
535
- state.stack.pop();
536
- return;
537
- }
538
-
539
- // 2. Let outer loop counter be 0
540
- let outerLoopCounter = 0;
541
- const formattingElements = ['a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'];
542
-
543
- while (outerLoopCounter < 8) { // Prevent infinite loops
544
- outerLoopCounter++;
545
-
546
- // 3. Let the formatting element be the last element in the list of active formatting elements
547
- // that is between the end of the list and the last scope marker or the start of the list,
548
- // if any, that has the same tag name as the token.
549
-
550
- // For simplicity, find the innermost element with matching tag name
551
- let formattingElementIndex = -1;
552
- for (let i = state.stack.length - 1; i >= 0; i--) {
553
- const element = state.stack[i];
554
- if (element.tagName && element.tagName.toLowerCase() === tagName && formattingElements.includes(tagName)) {
555
- formattingElementIndex = i;
556
- break;
557
- }
558
- }
559
-
560
- if (formattingElementIndex === -1) {
561
- // No formatting element found, just find any element with matching tag name
562
- for (let i = state.stack.length - 1; i >= 0; i--) {
563
- const element = state.stack[i];
564
- if (element.tagName && element.tagName.toLowerCase() === tagName) {
565
- formattingElementIndex = i;
566
- break;
567
- }
568
- }
569
- }
570
-
571
- if (formattingElementIndex === -1) {
572
- // No matching element found, ignore the token
573
- addError(state, `Stray end tag: ${tagName}`, token.position?.offset || 0);
574
- return;
575
- }
576
-
577
- const formattingElement = state.stack[formattingElementIndex];
578
-
579
- // 4. If there is no element in the stack of open elements that has the same tag name as the
580
- // formatting element, then remove the element from the list of active formatting elements
581
- // and abort these steps.
582
- let openElementIndex = -1;
583
- for (let i = state.stack.length - 1; i >= 0; i--) {
584
- if (state.stack[i] === formattingElement) {
585
- openElementIndex = i;
586
- break;
587
- }
588
- }
589
-
590
- if (openElementIndex === -1) {
591
- // Element not in stack, ignore
592
- return;
593
- }
594
-
595
- // 5. If the element is not in the stack of open elements, then this is a parse error;
596
- // remove the element from the list of active formatting elements and abort these steps.
597
- // (Already checked above)
598
-
599
- // 6. Let the furthest block be the topmost node in the stack of open elements that is lower
600
- // in the stack than the formatting element, and is an element in the special category.
601
- const specialElements = ['address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'listing', 'main', 'menu', 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul', 'xmp'];
602
-
603
- let furthestBlockIndex = -1;
604
- for (let i = openElementIndex + 1; i < state.stack.length; i++) {
605
- const element = state.stack[i];
606
- if (element.tagName && specialElements.includes(element.tagName.toLowerCase())) {
607
- furthestBlockIndex = i;
608
- break;
609
- }
610
- }
611
-
612
- if (furthestBlockIndex === -1) {
613
- // No special element found, just pop elements until we reach the formatting element
614
- while (state.stack.length > openElementIndex + 1) {
615
- state.stack.pop();
616
- }
617
- state.stack.pop(); // Pop the formatting element
618
- return;
619
- }
620
-
621
- // 7. Simplified: just pop everything until the formatting element
622
- while (state.stack.length > openElementIndex + 1) {
623
- state.stack.pop();
624
- }
625
- state.stack.pop(); // Pop the formatting element
626
- return;
627
- }
628
-
629
- // If we get here, something went wrong, ignore the token
630
- addError(state, `Adoption agency gave up on: ${tagName}`, token.position?.offset || 0);
631
- }
632
-
633
- function handleAutoClosing(state: ParserState, tagName: string): void {
634
- const autoCloseList = AUTO_CLOSE_RULES[tagName];
635
- if (!autoCloseList) return;
636
-
637
- const currentElement = getCurrentElement(state);
638
- if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName.toLowerCase())) {
639
- state.stack.pop();
640
- }
641
- }
642
-
643
- function getCurrentParent(state: ParserState): any {
644
- return state.stack[state.stack.length - 1];
645
- }
646
-
647
- function getCurrentElement(state: ParserState): any {
648
- for (let i = state.stack.length - 1; i >= 0; i--) {
649
- const element = state.stack[i];
650
- if (element.nodeType === 1) { // ELEMENT_NODE
651
- return element;
652
- }
653
- }
654
- return null;
655
- }
656
-
657
- function getCurrentToken(state: ParserState): Token | null {
658
- return state.tokens[state.position] || null;
659
- }
660
-
661
- function advance(state: ParserState): void {
662
- state.position++;
663
- }
664
-
665
- function addError(state: ParserState, message: string, position: number): void {
666
- state.errors.push({
667
- message,
668
- position,
669
- line: 0,
670
- column: 0,
671
- severity: 'error'
672
- });
673
- }
674
-
675
- function shouldSkipWhitespace(parent: any): boolean {
676
- const skipWhitespaceIn = new Set([
677
- 'html', 'head', 'body', 'table', 'tbody', 'thead', 'tfoot', 'tr',
678
- 'ul', 'ol', 'dl', 'select', 'optgroup'
679
- ]);
680
-
681
- return parent.tagName ? skipWhitespaceIn.has(parent.tagName) : false;
682
- }