@tkeron/html-parser 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tkeron/html-parser",
3
- "version": "1.3.0",
3
+ "version": "1.4.1",
4
4
  "description": "A fast and lightweight HTML parser for Bun",
5
5
  "main": "index.js",
6
6
  "module": "index.ts",
@@ -135,3 +135,15 @@ export const VALID_TR_CHILDREN = new Set([
135
135
  "template",
136
136
  "style",
137
137
  ]);
138
+
139
/**
 * Tag names that end a "button scope" search on the stack of open elements.
 *
 * Scans that look for an open element (for example an open `<p>`) stop as
 * soon as they meet one of these tags, so an end tag or auto-close rule never
 * reaches through a table cell, caption, template, button, etc.
 *
 * `button` is what distinguishes button scope from the default scope list in
 * the HTML tree-construction rules; without it `<p><button></p>` would close
 * the outer paragraph through the open button.
 *
 * NOTE(review): the spec's scope list also contains namespaced MathML/SVG
 * entries (e.g. SVG `title`, `foreignObject`). They are intentionally not
 * listed here because this set is keyed by tag name only and would otherwise
 * match the HTML elements with the same names.
 */
export const BUTTON_SCOPE_TERMINATORS = new Set([
  "applet",
  "caption",
  "html",
  "table",
  "td",
  "th",
  "marquee",
  "object",
  "template",
  "button",
]);
@@ -0,0 +1,48 @@
1
/**
 * Collapses the text node at `insertIndex` with any text nodes directly
 * beside it, so `parent.childNodes` never holds two text nodes in a row
 * around that position.
 *
 * The earliest of the adjacent text nodes survives: its `textContent` and
 * `nodeValue` receive the concatenated text, and the absorbed nodes are
 * removed from `parent.childNodes` and have their `parentNode` cleared.
 *
 * Does nothing when `parent` has fewer than two children (or no child list),
 * or when the node at `insertIndex` is missing or is not a text node
 * (`nodeType !== 3`).
 *
 * @param parent - Node whose `childNodes` array is normalised in place.
 * @param insertIndex - Index of the freshly inserted text node.
 */
export const mergeAdjacentTextNodes = (
  parent: any,
  insertIndex: number,
): void => {
  const siblings = parent?.childNodes;
  if (!siblings || siblings.length < 2) {
    return;
  }

  const isText = (candidate: any): boolean =>
    Boolean(candidate) && candidate.nodeType === 3;

  // Moves the donor's text onto the keeper and drops the donor from the list.
  const absorb = (keeper: any, donor: any, donorIndex: number): void => {
    keeper.textContent = (keeper.textContent || "") + (donor.textContent || "");
    keeper.nodeValue = keeper.textContent;
    siblings.splice(donorIndex, 1);
    donor.parentNode = null;
  };

  const node = siblings[insertIndex];
  if (!isText(node)) {
    return;
  }

  let survivor = node;
  let survivorIndex = insertIndex;

  const previous = insertIndex > 0 ? siblings[insertIndex - 1] : undefined;
  if (isText(previous)) {
    survivor = previous;
    survivorIndex = insertIndex - 1;
    absorb(survivor, node, insertIndex);
  }

  // Checked even after a backward merge: the inserted node may have been
  // sitting between two text nodes, and both must end up in one node.
  const following = siblings[survivorIndex + 1];
  if (isText(following)) {
    absorb(survivor, following, survivorIndex + 1);
  }
};
35
+
36
/**
 * Inserts `node` into `parent.childNodes` immediately before `tableElement`.
 *
 * If `node` is still listed under another parent (or elsewhere under the same
 * parent) it is removed from that child list first, so it never appears in
 * two places at once.
 *
 * NOTE(review): only `parentNode` and the `childNodes` arrays are maintained
 * here; if the DOM simulator also tracks sibling pointers they must be fixed
 * up by the caller — confirm against `appendChild`.
 *
 * @param parent - Node that is expected to contain `tableElement`.
 * @param tableElement - Child of `parent` that the node is placed in front of.
 * @param node - Node to insert.
 * @returns Index at which `node` now sits, or `-1` when `parent` has no child
 *   list or does not contain `tableElement` (nothing is modified then).
 */
export const insertNodeBeforeTable = (
  parent: any,
  tableElement: any,
  node: any,
): number => {
  const siblings = parent?.childNodes;
  if (!siblings) {
    return -1;
  }

  const tableIndex = siblings.indexOf(tableElement);
  if (tableIndex === -1) {
    return -1;
  }

  // Inserting the table in front of itself is a no-op.
  if (node === tableElement) {
    return tableIndex;
  }

  // Detach from wherever the node currently lives.
  const formerSiblings = node.parentNode?.childNodes;
  if (formerSiblings) {
    const formerIndex = formerSiblings.indexOf(node);
    if (formerIndex !== -1) {
      formerSiblings.splice(formerIndex, 1);
    }
  }

  // Recomputed because detaching may have shifted the table's position.
  const idx = siblings.indexOf(tableElement);
  node.parentNode = parent;
  siblings.splice(idx, 0, node);
  return idx;
};
@@ -0,0 +1,65 @@
1
+ import { createElement, appendChild } from "../dom-simulator/index.js";
2
+
3
/** Table cell tag names. */
export const CELL_ELEMENTS = new Set(["td", "th"]);

/** Tag names of the row-group sections of a table. */
export const TABLE_SECTION_ELEMENTS = new Set(["tbody", "thead", "tfoot"]);

/**
 * Tells whether placing `childTagName` directly inside `parentTagName` needs
 * implied table wrappers to be generated first.
 *
 * Both names are compared case-insensitively. The answer is `true` for a
 * cell (`td`/`th`) placed in a `table` or in a row-group section, and for a
 * `tr` placed in a `table`; every other combination yields `false`.
 *
 * @param parentTagName - Tag name of the element that would receive the child.
 * @param childTagName - Tag name of the element about to be inserted.
 */
export const shouldCreateImplicitTableStructure = (
  parentTagName: string,
  childTagName: string,
): boolean => {
  const container = parentTagName.toLowerCase();
  const incoming = childTagName.toLowerCase();
  const directlyInTable = container === "table";

  if (incoming === "tr") {
    return directlyInTable;
  }

  if (!CELL_ELEMENTS.has(incoming)) {
    return false;
  }

  return directlyInTable || TABLE_SECTION_ELEMENTS.has(container);
};
24
+
25
/**
 * Generates the implied table wrappers for `childTagName` and pushes them on
 * the open-element stack.
 *
 * - cell inside `table`: a `tbody` is appended to the current stack top and a
 *   `tr` is appended to that `tbody`; both are pushed, the `tr` is returned.
 * - cell inside a row-group section: a `tr` is appended to the current stack
 *   top, pushed and returned.
 * - `tr` inside `table`: a `tbody` is appended to the current stack top,
 *   pushed and returned.
 * - anything else: the stack is left untouched and its current top is
 *   returned.
 *
 * Tag names are compared case-insensitively.
 *
 * @param stack - Stack of open elements; mutated in place.
 * @param parentTagName - Tag name of the element receiving the child.
 * @param childTagName - Tag name of the element about to be inserted.
 * @returns The element that should now act as the insertion parent.
 */
export const createImplicitTableStructure = (
  stack: any[],
  parentTagName: string,
  childTagName: string,
): any => {
  const container = parentTagName.toLowerCase();
  const incoming = childTagName.toLowerCase();
  const stackTop = stack[stack.length - 1];

  // Creates an attribute-less element under `host` and opens it on the stack.
  const openImplied = (tag: string, host: any): any => {
    const implied = createElement(tag, {});
    appendChild(host, implied);
    stack.push(implied);
    return implied;
  };

  const isCell = CELL_ELEMENTS.has(incoming);

  if (container === "table") {
    if (isCell) {
      return openImplied("tr", openImplied("tbody", stackTop));
    }
    if (incoming === "tr") {
      return openImplied("tbody", stackTop);
    }
    return stackTop;
  }

  if (isCell && TABLE_SECTION_ELEMENTS.has(container)) {
    return openImplied("tr", stackTop);
  }

  return stackTop;
};
@@ -21,14 +21,20 @@ import {
21
21
  VALID_TABLE_CHILDREN,
22
22
  VALID_TABLE_SECTION_CHILDREN,
23
23
  VALID_TR_CHILDREN,
24
+ BUTTON_SCOPE_TERMINATORS,
24
25
  } from "./constants";
25
26
  import {
26
- findFormattingElementInStack,
27
27
  findFurthestBlock,
28
28
  getCommonAncestor,
29
29
  cloneFormattingElement,
30
30
  reparentChildren,
31
31
  } from "./adoption-agency-helpers.js";
32
+ import {
33
+ shouldCreateImplicitTableStructure,
34
+ createImplicitTableStructure,
35
+ CELL_ELEMENTS,
36
+ } from "./implicit-table-structure.js";
37
+ import { mergeAdjacentTextNodes } from "./foster-parenting-helpers.js";
32
38
 
33
39
  export const parse = (tokens: Token[]): any => {
34
40
  const state = createParserState(tokens);
@@ -327,17 +333,49 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
327
333
  if (token.type === TokenType.TAG_OPEN) {
328
334
  const tagName = token.value.toLowerCase();
329
335
 
330
- handleAutoClosing(state, tagName);
336
+ if (tagName === "a") {
337
+ const existingA = state.activeFormattingElements.find(
338
+ (el) => el && el.tagName && el.tagName.toLowerCase() === "a",
339
+ );
340
+ if (existingA) {
341
+ runAdoptionAgencyAlgorithm(state, "a");
342
+ }
343
+ }
331
344
 
332
- reconstructActiveFormattingElements(state);
345
+ const closedParagraph = handleAutoClosing(state, tagName);
333
346
 
334
- const currentParent = getCurrentParent(state);
347
+ const inTableContext = isInTableContext(state);
348
+ const isTableStructureElement =
349
+ CELL_ELEMENTS.has(tagName) ||
350
+ tagName === "tr" ||
351
+ tagName === "tbody" ||
352
+ tagName === "thead" ||
353
+ tagName === "tfoot";
354
+ const currentStackParent = getCurrentParent(state);
355
+ const currentStackParentTag =
356
+ currentStackParent.tagName?.toLowerCase() || "";
357
+ const parentIsTableContext = TABLE_CONTEXT_ELEMENTS.has(
358
+ currentStackParentTag,
359
+ );
360
+
361
+ if (inTableContext && isTableStructureElement) {
362
+ const tableParent = findTableContextParent(state);
363
+ if (tableParent) {
364
+ popStackUntilTableContext(state);
365
+ }
366
+ } else if (!parentIsTableContext && !closedParagraph) {
367
+ reconstructActiveFormattingElements(state);
368
+ }
369
+
370
+ let currentParent = getCurrentParent(state);
335
371
 
336
372
  let namespaceURI: string | undefined;
337
373
  if (tagName === "svg") {
338
374
  namespaceURI = SVG_NAMESPACE;
339
375
  } else if (tagName === "math") {
340
376
  namespaceURI = MATHML_NAMESPACE;
377
+ } else {
378
+ namespaceURI = getCurrentNamespace(state);
341
379
  }
342
380
 
343
381
  const element = createElement(
@@ -346,8 +384,8 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
346
384
  namespaceURI,
347
385
  );
348
386
 
349
- const inTableContext = isInTableContext(state);
350
- const parentTagName = currentParent.tagName || "";
387
+ let parentTagName = currentParent.tagName || "";
388
+
351
389
  const isValidForParent = isValidChildForTableParent(parentTagName, tagName);
352
390
  const isHiddenInput =
353
391
  tagName === "input" &&
@@ -355,43 +393,59 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
355
393
  token.attributes.type &&
356
394
  token.attributes.type.toLowerCase() === "hidden";
357
395
  const isFormInTable = tagName === "form" && inTableContext;
396
+
397
+ const needsImplicitStructure =
398
+ inTableContext &&
399
+ shouldCreateImplicitTableStructure(parentTagName, tagName);
400
+
358
401
  const needsFosterParenting =
359
402
  inTableContext &&
360
403
  TABLE_CONTEXT_ELEMENTS.has(parentTagName.toLowerCase()) &&
361
404
  !isValidForParent &&
362
405
  !isHiddenInput &&
363
- !isFormInTable;
406
+ !isFormInTable &&
407
+ !needsImplicitStructure;
364
408
 
365
- if (needsFosterParenting) {
409
+ if (needsImplicitStructure) {
410
+ createImplicitTableStructure(state.stack, parentTagName, tagName);
411
+ appendChild(getCurrentParent(state), element);
412
+ } else if (needsFosterParenting) {
366
413
  insertWithFosterParenting(state, element);
367
414
  } else {
368
415
  appendChild(currentParent, element);
369
416
  }
370
417
 
418
+ const wasFosterParented = needsFosterParenting;
419
+ const isFormattingElement = FORMATTING_ELEMENTS.has(tagName);
420
+
371
421
  if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
372
- if (!isFormInTable) {
422
+ if (!isFormInTable && !(wasFosterParented && isFormattingElement)) {
373
423
  state.stack.push(element);
374
424
  }
375
425
 
376
- if (FORMATTING_ELEMENTS.has(tagName)) {
377
- state.activeFormattingElements.push(element);
426
+ if (isFormattingElement) {
427
+ pushToActiveFormattingElements(state, element);
378
428
  }
379
429
  }
380
430
  } else if (token.type === TokenType.TAG_CLOSE) {
381
431
  const tagName = token.value.toLowerCase();
382
432
 
383
- if (FORMATTING_ELEMENTS.has(tagName)) {
433
+ if (FORMATTING_ELEMENTS.has(tagName) && !isInForeignContent(state)) {
384
434
  runAdoptionAgencyAlgorithm(state, tagName);
385
435
  return;
386
436
  }
387
437
 
438
+ if (tagName === "p") {
439
+ closeParagraphElement(state);
440
+ return;
441
+ }
442
+
388
443
  const impliedEndTags = [
389
444
  "dd",
390
445
  "dt",
391
446
  "li",
392
447
  "option",
393
448
  "optgroup",
394
- "p",
395
449
  "rb",
396
450
  "rp",
397
451
  "rt",
@@ -440,76 +494,132 @@ const runAdoptionAgencyAlgorithm = (
440
494
  state: ParserState,
441
495
  tagName: string,
442
496
  ): void => {
443
- const result = findFormattingElementInStack(state.stack, tagName);
497
+ const maxIterations = 8;
444
498
 
445
- if (!result) {
446
- return;
447
- }
499
+ for (let iteration = 0; iteration < maxIterations; iteration++) {
500
+ const formattingElementIndex = state.activeFormattingElements.findIndex(
501
+ (el) =>
502
+ el && el.tagName && el.tagName.toLowerCase() === tagName.toLowerCase(),
503
+ );
448
504
 
449
- const { element: formattingElement, index: formattingElementIndex } = result;
505
+ if (formattingElementIndex === -1) {
506
+ return;
507
+ }
450
508
 
451
- const currentElement = getCurrentElement(state);
452
- if (currentElement === formattingElement) {
453
- state.stack.pop();
454
- removeFromActiveFormattingElements(state, formattingElement);
455
- return;
456
- }
509
+ const formattingElement =
510
+ state.activeFormattingElements[formattingElementIndex];
511
+ const stackIndex = state.stack.indexOf(formattingElement);
457
512
 
458
- const fbResult = findFurthestBlock(state.stack, formattingElementIndex);
513
+ if (stackIndex === -1) {
514
+ state.activeFormattingElements.splice(formattingElementIndex, 1);
515
+ return;
516
+ }
459
517
 
460
- if (!fbResult) {
461
- while (state.stack.length > formattingElementIndex) {
518
+ const currentElement = getCurrentElement(state);
519
+ if (currentElement === formattingElement) {
462
520
  state.stack.pop();
521
+ removeFromActiveFormattingElements(state, formattingElement);
522
+ return;
463
523
  }
464
- removeFromActiveFormattingElements(state, formattingElement);
465
- return;
466
- }
467
524
 
468
- const { element: furthestBlock, index: furthestBlockIndex } = fbResult;
469
- const commonAncestor = getCommonAncestor(state.stack, formattingElementIndex);
525
+ const fbResult = findFurthestBlock(state.stack, stackIndex);
470
526
 
471
- if (!commonAncestor) {
472
- return;
473
- }
527
+ if (!fbResult) {
528
+ while (state.stack.length > stackIndex) {
529
+ state.stack.pop();
530
+ }
531
+ removeFromActiveFormattingElements(state, formattingElement);
532
+ return;
533
+ }
534
+
535
+ const { element: furthestBlock, index: furthestBlockIndex } = fbResult;
536
+ const commonAncestor = getCommonAncestor(state.stack, stackIndex);
537
+
538
+ if (!commonAncestor) {
539
+ return;
540
+ }
541
+
542
+ let lastNode = furthestBlock;
543
+ const clonedNodes: any[] = [];
544
+ const nodesToRemoveFromStack: any[] = [];
545
+ let innerLoopCounter = 0;
546
+ let nodeIndex = furthestBlockIndex;
547
+
548
+ while (true) {
549
+ innerLoopCounter++;
550
+ nodeIndex--;
551
+ const node = state.stack[nodeIndex];
552
+
553
+ if (node === formattingElement) {
554
+ break;
555
+ }
474
556
 
475
- let lastNode = furthestBlock;
476
- const clonedNodes: any[] = [];
557
+ if (
558
+ innerLoopCounter > 3 &&
559
+ state.activeFormattingElements.includes(node)
560
+ ) {
561
+ removeFromActiveFormattingElements(state, node);
562
+ }
477
563
 
478
- for (let i = furthestBlockIndex - 1; i > formattingElementIndex; i--) {
479
- const node = state.stack[i];
480
- const nodeClone = cloneFormattingElement(node);
481
- clonedNodes.unshift(nodeClone);
564
+ if (!state.activeFormattingElements.includes(node)) {
565
+ nodesToRemoveFromStack.push(node);
566
+ continue;
567
+ }
568
+
569
+ const nodeClone = cloneFormattingElement(node);
570
+ clonedNodes.unshift(nodeClone);
482
571
 
483
- replaceInActiveFormattingElements(state, node, nodeClone);
572
+ replaceInActiveFormattingElements(state, node, nodeClone);
484
573
 
485
- const nodeChildIdx = node.childNodes.indexOf(lastNode);
486
- if (nodeChildIdx !== -1) {
487
- node.childNodes.splice(nodeChildIdx, 1);
574
+ const nodeChildIdx = node.childNodes.indexOf(lastNode);
575
+ if (nodeChildIdx !== -1) {
576
+ node.childNodes.splice(nodeChildIdx, 1);
577
+ }
578
+
579
+ appendChild(nodeClone, lastNode);
580
+ lastNode = nodeClone;
488
581
  }
489
582
 
490
- appendChild(nodeClone, lastNode);
491
- lastNode = nodeClone;
492
- }
583
+ for (const node of nodesToRemoveFromStack) {
584
+ const idx = state.stack.indexOf(node);
585
+ if (idx !== -1) {
586
+ state.stack.splice(idx, 1);
587
+ }
588
+ }
493
589
 
494
- const fbIdx = formattingElement.childNodes.indexOf(furthestBlock);
495
- if (fbIdx !== -1) {
496
- formattingElement.childNodes.splice(fbIdx, 1);
497
- furthestBlock.parentNode = null;
498
- }
590
+ const fbIdx = formattingElement.childNodes.indexOf(furthestBlock);
591
+ if (fbIdx !== -1) {
592
+ formattingElement.childNodes.splice(fbIdx, 1);
593
+ furthestBlock.parentNode = null;
594
+ }
499
595
 
500
- appendChild(commonAncestor, lastNode);
596
+ appendChild(commonAncestor, lastNode);
501
597
 
502
- const newFormattingElement = cloneFormattingElement(formattingElement);
503
- reparentChildren(furthestBlock, newFormattingElement);
504
- appendChild(furthestBlock, newFormattingElement);
598
+ const newFormattingElement = cloneFormattingElement(formattingElement);
599
+ reparentChildren(furthestBlock, newFormattingElement);
600
+ appendChild(furthestBlock, newFormattingElement);
505
601
 
506
- removeFromActiveFormattingElements(state, formattingElement);
602
+ removeFromActiveFormattingElements(state, formattingElement);
603
+ state.activeFormattingElements.splice(
604
+ formattingElementIndex,
605
+ 0,
606
+ newFormattingElement,
607
+ );
608
+
609
+ const elementsAfterFurthestBlock = state.stack.slice(
610
+ furthestBlockIndex + 1,
611
+ );
507
612
 
508
- state.stack.length = formattingElementIndex;
509
- for (const clonedNode of clonedNodes) {
510
- state.stack.push(clonedNode);
613
+ state.stack.length = stackIndex;
614
+ for (const clonedNode of clonedNodes) {
615
+ state.stack.push(clonedNode);
616
+ }
617
+ state.stack.push(furthestBlock);
618
+ state.stack.push(newFormattingElement);
619
+ for (const element of elementsAfterFurthestBlock) {
620
+ state.stack.push(element);
621
+ }
511
622
  }
512
- state.stack.push(furthestBlock);
513
623
  };
514
624
 
515
625
  const removeFromActiveFormattingElements = (
@@ -533,6 +643,60 @@ const replaceInActiveFormattingElements = (
533
643
  }
534
644
  };
535
645
 
646
/**
 * Appends `element` to the list of active formatting elements, keeping at
 * most three equivalent entries after the last marker ("Noah's Ark" clause).
 *
 * The list is scanned backwards from its end and the scan stops at the first
 * `null` marker. Entries count as equivalent when their lower-cased tag name
 * equals the new element's and `attributesMatch` accepts the pair. When a
 * third equivalent entry is reached it is the earliest of the three, and that
 * entry is the one removed before the new element is pushed.
 *
 * @param state - Parser state whose `activeFormattingElements` is mutated.
 * @param element - Formatting element that was just inserted.
 */
const pushToActiveFormattingElements = (
  state: ParserState,
  element: any,
): void => {
  const list = state.activeFormattingElements;
  const tagName = element.tagName?.toLowerCase();
  const maxEquivalentEntries = 3;

  let equivalentSeen = 0;

  for (let i = list.length - 1; i >= 0; i--) {
    const entry = list[i];
    if (entry === null) {
      // Marker: entries before it belong to an outer scope and are ignored.
      break;
    }

    if (
      entry.tagName?.toLowerCase() !== tagName ||
      !attributesMatch(entry, element)
    ) {
      continue;
    }

    equivalentSeen++;
    if (equivalentSeen >= maxEquivalentEntries) {
      // Scanning backwards, the match at `i` is the earliest equivalent
      // entry, which is the one that has to go.
      list.splice(i, 1);
      break;
    }
  }

  list.push(element);
};

/**
 * Compares the `attributes` maps of two elements.
 *
 * A missing map is treated as empty. The maps match when they have the same
 * number of keys and every key of the first maps to a strictly equal value in
 * the second.
 *
 * @param el1 - First element.
 * @param el2 - Second element.
 * @returns `true` when both elements carry the same attributes.
 */
const attributesMatch = (el1: any, el2: any): boolean => {
  const attrs1 = el1.attributes || {};
  const attrs2 = el2.attributes || {};
  const keys1 = Object.keys(attrs1);

  if (keys1.length !== Object.keys(attrs2).length) {
    return false;
  }

  return keys1.every((key) => attrs1[key] === attrs2[key]);
};
699
+
536
700
  const parseText = (state: ParserState, token: Token): void => {
537
701
  const content = token.value;
538
702
 
@@ -541,8 +705,6 @@ const parseText = (state: ParserState, token: Token): void => {
541
705
  return;
542
706
  }
543
707
 
544
- reconstructActiveFormattingElements(state);
545
-
546
708
  const textNode = createTextNode(content);
547
709
 
548
710
  const inTableContext = isInTableContext(state);
@@ -552,9 +714,10 @@ const parseText = (state: ParserState, token: Token): void => {
552
714
  currentParent.tagName &&
553
715
  TABLE_CONTEXT_ELEMENTS.has(currentParent.tagName.toLowerCase())
554
716
  ) {
555
- insertWithFosterParenting(state, textNode);
717
+ insertWithFosterParentingAndReconstruct(state, textNode);
556
718
  } else {
557
- appendChild(currentParent, textNode);
719
+ reconstructActiveFormattingElements(state);
720
+ appendChild(getCurrentParent(state), textNode);
558
721
  }
559
722
  };
560
723
 
@@ -583,18 +746,57 @@ const parseProcessingInstruction = (state: ParserState, token: Token): void => {
583
746
  appendChild(currentParent, piNode);
584
747
  };
585
748
 
586
- const handleAutoClosing = (state: ParserState, tagName: string): void => {
749
/**
 * Handles a `</p>` end tag by closing the nearest open `<p>`.
 *
 * The stack of open elements is walked from the top down. When a `p` is
 * found, it and everything above it are removed from the stack. If an element
 * listed in `BUTTON_SCOPE_TERMINATORS` is met first, or no `p` is open at
 * all, the stack is left unchanged.
 *
 * @param state - Parser state whose `stack` may be truncated.
 */
const closeParagraphElement = (state: ParserState): void => {
  for (let depth = state.stack.length - 1; depth >= 0; depth--) {
    const tag = state.stack[depth].tagName?.toLowerCase();

    if (tag === "p") {
      // Drop the paragraph together with every element opened inside it.
      state.stack.splice(depth);
      return;
    }

    if (tag && BUTTON_SCOPE_TERMINATORS.has(tag)) {
      return;
    }
  }
};
773
+
774
+ const handleAutoClosing = (state: ParserState, tagName: string): boolean => {
587
775
  const autoCloseList = AUTO_CLOSE_RULES[tagName];
588
- if (!autoCloseList) return;
776
+ if (!autoCloseList) return false;
589
777
 
590
- const currentElement = getCurrentElement(state);
591
- if (
592
- currentElement &&
593
- currentElement.tagName &&
594
- autoCloseList.includes(currentElement.tagName.toLowerCase())
595
- ) {
778
+ let targetIndex = -1;
779
+ for (let i = state.stack.length - 1; i >= 0; i--) {
780
+ const element = state.stack[i];
781
+ const elementTag = element.tagName?.toLowerCase();
782
+
783
+ if (elementTag && autoCloseList.includes(elementTag)) {
784
+ targetIndex = i;
785
+ break;
786
+ }
787
+
788
+ if (elementTag && BUTTON_SCOPE_TERMINATORS.has(elementTag)) {
789
+ return false;
790
+ }
791
+ }
792
+
793
+ if (targetIndex === -1) return false;
794
+
795
+ while (state.stack.length > targetIndex) {
596
796
  state.stack.pop();
597
797
  }
798
+
799
+ return true;
598
800
  };
599
801
 
600
802
  const getCurrentParent = (state: ParserState): any => {
@@ -710,6 +912,57 @@ const isInTableContext = (state: ParserState): boolean => {
710
912
  return false;
711
913
  };
712
914
 
915
/**
 * Reports whether the parser currently sits inside SVG or MathML content.
 *
 * Walks the stack of open elements from the top down and returns `true` at
 * the first element whose `namespaceURI` is the SVG or MathML namespace. The
 * walk ends with `false` as soon as an element with tag name `html`
 * (case-insensitive) is reached, or when the stack is exhausted.
 *
 * @param state - Parser state providing the stack of open elements.
 */
const isInForeignContent = (state: ParserState): boolean => {
  let depth = state.stack.length;

  while (depth-- > 0) {
    const { namespaceURI, tagName } = state.stack[depth];

    const isForeign =
      namespaceURI === SVG_NAMESPACE || namespaceURI === MATHML_NAMESPACE;
    if (isForeign) {
      return true;
    }

    if (tagName && tagName.toLowerCase() === "html") {
      return false;
    }
  }

  return false;
};
930
+
931
/**
 * Returns the namespace inherited from the open elements.
 *
 * The stack is searched from the top down and the first truthy
 * `namespaceURI` found is returned; `undefined` is returned when no open
 * element carries one.
 *
 * @param state - Parser state providing the stack of open elements.
 */
const getCurrentNamespace = (state: ParserState): string | undefined => {
  let depth = state.stack.length;

  while (depth-- > 0) {
    const inherited = state.stack[depth].namespaceURI;
    if (inherited) {
      return inherited;
    }
  }

  return undefined;
};
940
+
941
/**
 * Finds the innermost open element that is a table-context element.
 *
 * The stack is searched from the top down for an element whose lower-cased
 * tag name is in `TABLE_CONTEXT_ELEMENTS`.
 *
 * @param state - Parser state providing the stack of open elements.
 * @returns The matching element, or `null` when none is open.
 */
const findTableContextParent = (state: ParserState): any | null => {
  let depth = state.stack.length;

  while (depth-- > 0) {
    const candidate = state.stack[depth];
    const tag = candidate.tagName;
    if (tag && TABLE_CONTEXT_ELEMENTS.has(tag.toLowerCase())) {
      return candidate;
    }
  }

  return null;
};
950
+
951
/**
 * Unwinds the stack of open elements back to the nearest table-context
 * element and then records a marker in the active formatting list.
 *
 * Elements are popped while more than one entry remains and the current
 * element is not in `TABLE_CONTEXT_ELEMENTS`; the bottom entry is never
 * popped. A `null` marker is pushed onto `activeFormattingElements` on every
 * call, whether or not anything was popped.
 *
 * @param state - Parser state whose stack and formatting list are mutated.
 */
const popStackUntilTableContext = (state: ParserState): void => {
  for (;;) {
    if (state.stack.length <= 1) {
      break;
    }

    const top = getCurrentElement(state);
    const tag = top ? top.tagName : undefined;
    if (tag && TABLE_CONTEXT_ELEMENTS.has(tag.toLowerCase())) {
      break;
    }

    state.stack.pop();
  }

  state.activeFormattingElements.push(null);
};
965
+
713
966
  const isValidChildForTableParent = (
714
967
  parentTagName: string,
715
968
  childTagName: string,
@@ -760,13 +1013,102 @@ const insertWithFosterParenting = (state: ParserState, node: any): void => {
760
1013
  if (idx !== -1) {
761
1014
  node.parentNode = target.parent;
762
1015
  target.parent.childNodes.splice(idx, 0, node);
1016
+ if (node.nodeType === 3) {
1017
+ mergeAdjacentTextNodes(target.parent, idx);
1018
+ }
763
1019
  return;
764
1020
  }
765
1021
  }
766
1022
  appendChild(target.parent, node);
1023
+ if (node.nodeType === 3) {
1024
+ const insertedIdx = target.parent.childNodes.indexOf(node);
1025
+ if (insertedIdx !== -1) {
1026
+ mergeAdjacentTextNodes(target.parent, insertedIdx);
1027
+ }
1028
+ }
767
1029
  return;
768
1030
  }
769
1031
  }
770
1032
 
771
1033
  appendChild(currentParent, node);
772
1034
  };
1035
+
1036
/**
 * Foster-parents `node` out of a table while re-applying the formatting
 * elements that are active but no longer open.
 *
 * Steps, in order:
 * 1. Without a foster-parent target the node is appended to the current
 *    parent.
 * 2. With no pending formatting elements the node is placed at the target
 *    (before `target.before` when that is a child of `target.parent`,
 *    otherwise appended); a text node is then merged with adjacent text.
 * 3. If the formatting list holds no marker and the last pending formatting
 *    element already sits in `target.parent` ahead of `target.before`, the
 *    node is appended to that element.
 * 4. Otherwise every pending formatting element is cloned, the clones are
 *    nested in list order around the node, and the outermost clone is placed
 *    at the target (no text merging in this case).
 *
 * @param state - Parser state (stack and active formatting elements).
 * @param node - Node being inserted.
 */
const insertWithFosterParentingAndReconstruct = (
  state: ParserState,
  node: any,
): void => {
  const target = findFosterParentTarget(state);
  if (!target) {
    appendChild(getCurrentParent(state), node);
    return;
  }

  const fosterParent = target.parent;
  const anchor = target.before;

  // Puts `incoming` in front of the anchor when possible, else at the end of
  // the foster parent. `mergeText` enables text-node coalescing afterwards.
  const placeAtTarget = (incoming: any, mergeText: boolean): void => {
    if (anchor) {
      const anchorIndex = fosterParent.childNodes.indexOf(anchor);
      if (anchorIndex !== -1) {
        incoming.parentNode = fosterParent;
        fosterParent.childNodes.splice(anchorIndex, 0, incoming);
        if (mergeText && incoming.nodeType === 3) {
          mergeAdjacentTextNodes(fosterParent, anchorIndex);
        }
        return;
      }
    }

    appendChild(fosterParent, incoming);
    if (mergeText && incoming.nodeType === 3) {
      const landedAt = fosterParent.childNodes.indexOf(incoming);
      if (landedAt !== -1) {
        mergeAdjacentTextNodes(fosterParent, landedAt);
      }
    }
  };

  const pending = getActiveFormattingElementsBeforeMarker(state);
  if (pending.length === 0) {
    placeAtTarget(node, true);
    return;
  }

  const hasMarker = state.activeFormattingElements.includes(null);
  const innermost = pending[pending.length - 1];

  const canReuseInnermost =
    !hasMarker &&
    innermost.parentNode === fosterParent &&
    fosterParent.childNodes.indexOf(innermost) <
      fosterParent.childNodes.indexOf(anchor);
  if (canReuseInnermost) {
    appendChild(innermost, node);
    return;
  }

  // Wrap from the innermost formatting element outwards.
  const wrapped = pending.reduceRight((inner: any, formatEl: any) => {
    const clone = cloneFormattingElement(formatEl);
    appendChild(clone, inner);
    return clone;
  }, node);

  placeAtTarget(wrapped, false);
};
1101
+
1102
/**
 * Lists the active formatting elements that are not currently open.
 *
 * Every non-`null` entry of `state.activeFormattingElements` for which
 * `isInStack` reports `false` is returned, in list order.
 *
 * NOTE(review): despite the name, `null` markers are skipped rather than
 * treated as a stopping point, so entries on both sides of a marker are
 * returned.
 *
 * @param state - Parser state providing the formatting list and the stack.
 */
const getActiveFormattingElementsBeforeMarker = (state: ParserState): any[] =>
  state.activeFormattingElements.filter(
    (entry: any) => entry !== null && !isInStack(state.stack, entry),
  );
@@ -0,0 +1,118 @@
1
+ import { it, expect } from "bun:test";
2
+ import { parseHTML } from "../index.js";
3
+ import { serializeToHtml5lib } from "./helpers/tree-adapter.js";
4
+
5
+ it("should run AAA 2 times - test case with nested divs", () => {
6
+ const html = "<a>1<div>2<div>3</a>4</div>5</div>";
7
+ const doc = parseHTML(html);
8
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
9
+
10
+ const expected = `| <html>
11
+ | <head>
12
+ | <body>
13
+ | <a>
14
+ | "1"
15
+ | <div>
16
+ | <a>
17
+ | "2"
18
+ | <div>
19
+ | <a>
20
+ | "3"
21
+ | "4"
22
+ | "5"
23
+ `;
24
+
25
+ expect(serialized).toBe(expected);
26
+ });
27
+
28
+ it("should run AAA 8 times - deeply nested divs", () => {
29
+ const html =
30
+ "<div><a><b><div><div><div><div><div><div><div><div><div><div></a>";
31
+ const doc = parseHTML(html);
32
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
33
+
34
+ const expected = `| <html>
35
+ | <head>
36
+ | <body>
37
+ | <div>
38
+ | <a>
39
+ | <b>
40
+ | <b>
41
+ | <div>
42
+ | <a>
43
+ | <div>
44
+ | <a>
45
+ | <div>
46
+ | <a>
47
+ | <div>
48
+ | <a>
49
+ | <div>
50
+ | <a>
51
+ | <div>
52
+ | <a>
53
+ | <div>
54
+ | <a>
55
+ | <div>
56
+ | <a>
57
+ | <div>
58
+ | <div>
59
+ `;
60
+
61
+ expect(serialized).toBe(expected);
62
+ });
63
+
64
+ it("should run AAA 2 times - with style and address elements", () => {
65
+ const html = "<a><div><style></style><address><a>";
66
+ const doc = parseHTML(html);
67
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
68
+
69
+ const expected = `| <html>
70
+ | <head>
71
+ | <body>
72
+ | <a>
73
+ | <div>
74
+ | <a>
75
+ | <style>
76
+ | <address>
77
+ | <a>
78
+ | <a>
79
+ `;
80
+
81
+ expect(serialized).toBe(expected);
82
+ });
83
+
84
+ it("should run AAA with formatting element cloning", () => {
85
+ const html = "<a>x<div>y</a>z</div>";
86
+ const doc = parseHTML(html);
87
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
88
+
89
+ const expected = `| <html>
90
+ | <head>
91
+ | <body>
92
+ | <a>
93
+ | "x"
94
+ | <div>
95
+ | <a>
96
+ | "y"
97
+ | "z"
98
+ `;
99
+
100
+ expect(serialized).toBe(expected);
101
+ });
102
+
103
+ it("should stop AAA when no more formatting elements to adopt", () => {
104
+ const html = "<b>text</b><div>content</div>";
105
+ const doc = parseHTML(html);
106
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
107
+
108
+ const expected = `| <html>
109
+ | <head>
110
+ | <body>
111
+ | <b>
112
+ | "text"
113
+ | <div>
114
+ | "content"
115
+ `;
116
+
117
+ expect(serialized).toBe(expected);
118
+ });
@@ -0,0 +1,127 @@
1
+ import { expect, it, describe } from "bun:test";
2
+ import { parseHTML } from "../index";
3
+ import { serializeToHtml5lib } from "./helpers/tree-adapter";
4
+
5
+ describe("Foster Parenting", () => {
6
+ describe("Text foster parenting", () => {
7
+ it("should foster parent text before table and merge adjacent text nodes", () => {
8
+ const doc = parseHTML("<table>A<td>B</td>C</table>");
9
+ const serialized = serializeToHtml5lib(doc, {
10
+ skipImplicitDoctype: true,
11
+ });
12
+ expect(serialized).toBe(`| <html>
13
+ | <head>
14
+ | <body>
15
+ | "AC"
16
+ | <table>
17
+ | <tbody>
18
+ | <tr>
19
+ | <td>
20
+ | "B"
21
+ `);
22
+ });
23
+
24
+ it("should foster parent text with whitespace correctly", () => {
25
+ const doc = parseHTML("<table> X </table>");
26
+ const serialized = serializeToHtml5lib(doc, {
27
+ skipImplicitDoctype: true,
28
+ });
29
+ expect(serialized).toBe(`| <html>
30
+ | <head>
31
+ | <body>
32
+ | " X "
33
+ | <table>
34
+ `);
35
+ });
36
+ });
37
+
38
+ describe("Element foster parenting", () => {
39
+ it("should foster parent <a> before table with AAA reconstruction", () => {
40
+ const doc = parseHTML("<table><a>1<td>2</td>3</table>");
41
+ const serialized = serializeToHtml5lib(doc, {
42
+ skipImplicitDoctype: true,
43
+ });
44
+ expect(serialized).toBe(`| <html>
45
+ | <head>
46
+ | <body>
47
+ | <a>
48
+ | "1"
49
+ | <a>
50
+ | "3"
51
+ | <table>
52
+ | <tbody>
53
+ | <tr>
54
+ | <td>
55
+ | "2"
56
+ `);
57
+ });
58
+
59
+ it("should foster parent elements with AAA for formatting in <p>", () => {
60
+ const doc = parseHTML("<table><a>1<p>2</a>3</p>");
61
+ const serialized = serializeToHtml5lib(doc, {
62
+ skipImplicitDoctype: true,
63
+ });
64
+ expect(serialized).toBe(`| <html>
65
+ | <head>
66
+ | <body>
67
+ | <a>
68
+ | "1"
69
+ | <p>
70
+ | <a>
71
+ | "2"
72
+ | "3"
73
+ | <table>
74
+ `);
75
+ });
76
+ });
77
+
78
+ describe("Implicit table structure", () => {
79
+ it("should create implicit tbody and tr for td in table", () => {
80
+ const doc = parseHTML("<table><td>X</td></table>");
81
+ const serialized = serializeToHtml5lib(doc, {
82
+ skipImplicitDoctype: true,
83
+ });
84
+ expect(serialized).toBe(`| <html>
85
+ | <head>
86
+ | <body>
87
+ | <table>
88
+ | <tbody>
89
+ | <tr>
90
+ | <td>
91
+ | "X"
92
+ `);
93
+ });
94
+
95
+ it("should create implicit tr for td in tbody", () => {
96
+ const doc = parseHTML("<table><tbody><td>X</td></tbody></table>");
97
+ const serialized = serializeToHtml5lib(doc, {
98
+ skipImplicitDoctype: true,
99
+ });
100
+ expect(serialized).toBe(`| <html>
101
+ | <head>
102
+ | <body>
103
+ | <table>
104
+ | <tbody>
105
+ | <tr>
106
+ | <td>
107
+ | "X"
108
+ `);
109
+ });
110
+
111
+ it("should not create implicit structure when tr is present", () => {
112
+ const doc = parseHTML("<table><tr><td>X</td></tr></table>");
113
+ const serialized = serializeToHtml5lib(doc, {
114
+ skipImplicitDoctype: true,
115
+ });
116
+ expect(serialized).toBe(`| <html>
117
+ | <head>
118
+ | <body>
119
+ | <table>
120
+ | <tbody>
121
+ | <tr>
122
+ | <td>
123
+ | "X"
124
+ `);
125
+ });
126
+ });
127
+ });
@@ -25,12 +25,12 @@ export function serializeToHtml5lib(
25
25
 
26
26
  let nsPrefix = "";
27
27
  if (ns === "http://www.w3.org/2000/svg") {
28
- nsPrefix = " svg";
28
+ nsPrefix = "svg ";
29
29
  } else if (ns === "http://www.w3.org/1998/Math/MathML") {
30
- nsPrefix = " math";
30
+ nsPrefix = "math ";
31
31
  }
32
32
 
33
- lines.push(`${indent}<${tagName}${nsPrefix}>`);
33
+ lines.push(`${indent}<${nsPrefix}${tagName}>`);
34
34
 
35
35
  // Atributos en orden alfabético
36
36
  const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
@@ -0,0 +1,113 @@
1
+ import { describe, it, expect } from "bun:test";
2
+ import { parseHTML } from "../index.js";
3
+ import { serializeToHtml5lib } from "./helpers/tree-adapter.js";
4
+
5
+ describe("implicit close with formatting element reconstruction", () => {
6
+ it("should close <p> and reconstruct <b> elements when new <p> opens", () => {
7
+ const html = "<p><b><b><b><b><p>x";
8
+ const doc = parseHTML(html);
9
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
10
+
11
+ expect(result).toBe(`| <html>
12
+ | <head>
13
+ | <body>
14
+ | <p>
15
+ | <b>
16
+ | <b>
17
+ | <b>
18
+ | <b>
19
+ | <p>
20
+ | <b>
21
+ | <b>
22
+ | <b>
23
+ | "x"
24
+ `);
25
+ });
26
+
27
+ it("should close <p> through nested formatting and reconstruct (single <b>)", () => {
28
+ const html = "<p><b><p>x";
29
+ const doc = parseHTML(html);
30
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
31
+
32
+ expect(result).toBe(`| <html>
33
+ | <head>
34
+ | <body>
35
+ | <p>
36
+ | <b>
37
+ | <p>
38
+ | <b>
39
+ | "x"
40
+ `);
41
+ });
42
+
43
+ it("should handle text before and after implicit close", () => {
44
+ const html = "<p><b>1<p>2";
45
+ const doc = parseHTML(html);
46
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
47
+
48
+ expect(result).toBe(`| <html>
49
+ | <head>
50
+ | <body>
51
+ | <p>
52
+ | <b>
53
+ | "1"
54
+ | <p>
55
+ | <b>
56
+ | "2"
57
+ `);
58
+ });
59
+
60
+ it("should handle multiple different formatting elements", () => {
61
+ const html = "<p><b><i><p>x";
62
+ const doc = parseHTML(html);
63
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
64
+
65
+ expect(result).toBe(`| <html>
66
+ | <head>
67
+ | <body>
68
+ | <p>
69
+ | <b>
70
+ | <i>
71
+ | <p>
72
+ | <b>
73
+ | <i>
74
+ | "x"
75
+ `);
76
+ });
77
+
78
+ it("should handle div closing <p> and reconstructing formatting", () => {
79
+ const html = "<p><b><div>x";
80
+ const doc = parseHTML(html);
81
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
82
+
83
+ expect(result).toBe(`| <html>
84
+ | <head>
85
+ | <body>
86
+ | <p>
87
+ | <b>
88
+ | <div>
89
+ | <b>
90
+ | "x"
91
+ `);
92
+ });
93
+
94
+ it("should handle multiple auto-closing with formatting", () => {
95
+ const html = "<p><b><p><i><p>x";
96
+ const doc = parseHTML(html);
97
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
98
+
99
+ expect(result).toBe(`| <html>
100
+ | <head>
101
+ | <body>
102
+ | <p>
103
+ | <b>
104
+ | <p>
105
+ | <b>
106
+ | <i>
107
+ | <p>
108
+ | <b>
109
+ | <i>
110
+ | "x"
111
+ `);
112
+ });
113
+ });
@@ -31,7 +31,9 @@ describe("Tree Construction Adoption01 Tests", () => {
31
31
  }
32
32
  }
33
33
 
34
- const passingTests = [1, 2, 3, 4, 7, 8, 9, 16];
34
+ const passingTests = [
35
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
36
+ ];
35
37
  const testFn = passingTests.includes(index + 1) ? it : it.skip;
36
38
 
37
39
  testFn(`Adoption test ${index + 1}`, () => {
@@ -9,7 +9,7 @@ describe("Tree Construction Adoption02 Tests", () => {
9
9
  "utf8",
10
10
  );
11
11
  const sections = content.split("#data\n").slice(1);
12
- const passingTests = [1];
12
+ const passingTests = [1, 2];
13
13
 
14
14
  sections.forEach((section, index) => {
15
15
  const lines = section.trim().split("\n");