@tkeron/html-parser 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/parser/constants.ts +12 -0
- package/src/parser/parse.ts +253 -63
- package/tests/adoption-multiple-iterations.test.ts +118 -0
- package/tests/helpers/tree-adapter.ts +3 -3
- package/tests/implicit-close-formatting.test.ts +113 -0
- package/tests/tree-construction-adoption01.test.ts +3 -1
- package/tests/tree-construction-adoption02.test.ts +1 -1
package/package.json
CHANGED
package/src/parser/constants.ts
CHANGED
|
@@ -135,3 +135,15 @@ export const VALID_TR_CHILDREN = new Set([
|
|
|
135
135
|
"template",
|
|
136
136
|
"style",
|
|
137
137
|
]);
|
|
138
|
+
|
|
139
|
+
/**
 * Lower-cased tag names that stop an upward scan of the open-element stack
 * when looking for an element "in button scope".
 *
 * The entries are the HTML-namespace members of the default scope list plus
 * `button`, which is what distinguishes button scope from the default scope.
 * Without `button`, a scan would walk through an open `<button>` and close a
 * `<p>` that sits outside of it.
 *
 * NOTE(review): the set is keyed by tag name only, so the MathML/SVG scope
 * terminators from the HTML specification (e.g. `foreignObject`, `desc`) are
 * intentionally not listed here; adding them would require a namespace check.
 */
export const BUTTON_SCOPE_TERMINATORS = new Set([
  "applet",
  "button",
  "caption",
  "html",
  "table",
  "td",
  "th",
  "marquee",
  "object",
  "template",
]);
|
package/src/parser/parse.ts
CHANGED
|
@@ -21,9 +21,9 @@ import {
|
|
|
21
21
|
VALID_TABLE_CHILDREN,
|
|
22
22
|
VALID_TABLE_SECTION_CHILDREN,
|
|
23
23
|
VALID_TR_CHILDREN,
|
|
24
|
+
BUTTON_SCOPE_TERMINATORS,
|
|
24
25
|
} from "./constants";
|
|
25
26
|
import {
|
|
26
|
-
findFormattingElementInStack,
|
|
27
27
|
findFurthestBlock,
|
|
28
28
|
getCommonAncestor,
|
|
29
29
|
cloneFormattingElement,
|
|
@@ -333,7 +333,16 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
|
|
|
333
333
|
if (token.type === TokenType.TAG_OPEN) {
|
|
334
334
|
const tagName = token.value.toLowerCase();
|
|
335
335
|
|
|
336
|
-
|
|
336
|
+
if (tagName === "a") {
|
|
337
|
+
const existingA = state.activeFormattingElements.find(
|
|
338
|
+
(el) => el && el.tagName && el.tagName.toLowerCase() === "a",
|
|
339
|
+
);
|
|
340
|
+
if (existingA) {
|
|
341
|
+
runAdoptionAgencyAlgorithm(state, "a");
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
const closedParagraph = handleAutoClosing(state, tagName);
|
|
337
346
|
|
|
338
347
|
const inTableContext = isInTableContext(state);
|
|
339
348
|
const isTableStructureElement =
|
|
@@ -354,7 +363,7 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
|
|
|
354
363
|
if (tableParent) {
|
|
355
364
|
popStackUntilTableContext(state);
|
|
356
365
|
}
|
|
357
|
-
} else if (!parentIsTableContext) {
|
|
366
|
+
} else if (!parentIsTableContext && !closedParagraph) {
|
|
358
367
|
reconstructActiveFormattingElements(state);
|
|
359
368
|
}
|
|
360
369
|
|
|
@@ -365,6 +374,8 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
|
|
|
365
374
|
namespaceURI = SVG_NAMESPACE;
|
|
366
375
|
} else if (tagName === "math") {
|
|
367
376
|
namespaceURI = MATHML_NAMESPACE;
|
|
377
|
+
} else {
|
|
378
|
+
namespaceURI = getCurrentNamespace(state);
|
|
368
379
|
}
|
|
369
380
|
|
|
370
381
|
const element = createElement(
|
|
@@ -413,24 +424,28 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
|
|
|
413
424
|
}
|
|
414
425
|
|
|
415
426
|
if (isFormattingElement) {
|
|
416
|
-
state
|
|
427
|
+
pushToActiveFormattingElements(state, element);
|
|
417
428
|
}
|
|
418
429
|
}
|
|
419
430
|
} else if (token.type === TokenType.TAG_CLOSE) {
|
|
420
431
|
const tagName = token.value.toLowerCase();
|
|
421
432
|
|
|
422
|
-
if (FORMATTING_ELEMENTS.has(tagName)) {
|
|
433
|
+
if (FORMATTING_ELEMENTS.has(tagName) && !isInForeignContent(state)) {
|
|
423
434
|
runAdoptionAgencyAlgorithm(state, tagName);
|
|
424
435
|
return;
|
|
425
436
|
}
|
|
426
437
|
|
|
438
|
+
if (tagName === "p") {
|
|
439
|
+
closeParagraphElement(state);
|
|
440
|
+
return;
|
|
441
|
+
}
|
|
442
|
+
|
|
427
443
|
const impliedEndTags = [
|
|
428
444
|
"dd",
|
|
429
445
|
"dt",
|
|
430
446
|
"li",
|
|
431
447
|
"option",
|
|
432
448
|
"optgroup",
|
|
433
|
-
"p",
|
|
434
449
|
"rb",
|
|
435
450
|
"rp",
|
|
436
451
|
"rt",
|
|
@@ -479,76 +494,132 @@ const runAdoptionAgencyAlgorithm = (
|
|
|
479
494
|
state: ParserState,
|
|
480
495
|
tagName: string,
|
|
481
496
|
): void => {
|
|
482
|
-
const
|
|
497
|
+
const maxIterations = 8;
|
|
483
498
|
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
499
|
+
for (let iteration = 0; iteration < maxIterations; iteration++) {
|
|
500
|
+
const formattingElementIndex = state.activeFormattingElements.findIndex(
|
|
501
|
+
(el) =>
|
|
502
|
+
el && el.tagName && el.tagName.toLowerCase() === tagName.toLowerCase(),
|
|
503
|
+
);
|
|
487
504
|
|
|
488
|
-
|
|
505
|
+
if (formattingElementIndex === -1) {
|
|
506
|
+
return;
|
|
507
|
+
}
|
|
489
508
|
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
state.stack.
|
|
493
|
-
removeFromActiveFormattingElements(state, formattingElement);
|
|
494
|
-
return;
|
|
495
|
-
}
|
|
509
|
+
const formattingElement =
|
|
510
|
+
state.activeFormattingElements[formattingElementIndex];
|
|
511
|
+
const stackIndex = state.stack.indexOf(formattingElement);
|
|
496
512
|
|
|
497
|
-
|
|
513
|
+
if (stackIndex === -1) {
|
|
514
|
+
state.activeFormattingElements.splice(formattingElementIndex, 1);
|
|
515
|
+
return;
|
|
516
|
+
}
|
|
498
517
|
|
|
499
|
-
|
|
500
|
-
|
|
518
|
+
const currentElement = getCurrentElement(state);
|
|
519
|
+
if (currentElement === formattingElement) {
|
|
501
520
|
state.stack.pop();
|
|
521
|
+
removeFromActiveFormattingElements(state, formattingElement);
|
|
522
|
+
return;
|
|
502
523
|
}
|
|
503
|
-
removeFromActiveFormattingElements(state, formattingElement);
|
|
504
|
-
return;
|
|
505
|
-
}
|
|
506
524
|
|
|
507
|
-
|
|
508
|
-
const commonAncestor = getCommonAncestor(state.stack, formattingElementIndex);
|
|
525
|
+
const fbResult = findFurthestBlock(state.stack, stackIndex);
|
|
509
526
|
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
527
|
+
if (!fbResult) {
|
|
528
|
+
while (state.stack.length > stackIndex) {
|
|
529
|
+
state.stack.pop();
|
|
530
|
+
}
|
|
531
|
+
removeFromActiveFormattingElements(state, formattingElement);
|
|
532
|
+
return;
|
|
533
|
+
}
|
|
513
534
|
|
|
514
|
-
|
|
515
|
-
|
|
535
|
+
const { element: furthestBlock, index: furthestBlockIndex } = fbResult;
|
|
536
|
+
const commonAncestor = getCommonAncestor(state.stack, stackIndex);
|
|
516
537
|
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
538
|
+
if (!commonAncestor) {
|
|
539
|
+
return;
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
let lastNode = furthestBlock;
|
|
543
|
+
const clonedNodes: any[] = [];
|
|
544
|
+
const nodesToRemoveFromStack: any[] = [];
|
|
545
|
+
let innerLoopCounter = 0;
|
|
546
|
+
let nodeIndex = furthestBlockIndex;
|
|
521
547
|
|
|
522
|
-
|
|
548
|
+
while (true) {
|
|
549
|
+
innerLoopCounter++;
|
|
550
|
+
nodeIndex--;
|
|
551
|
+
const node = state.stack[nodeIndex];
|
|
523
552
|
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
553
|
+
if (node === formattingElement) {
|
|
554
|
+
break;
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
if (
|
|
558
|
+
innerLoopCounter > 3 &&
|
|
559
|
+
state.activeFormattingElements.includes(node)
|
|
560
|
+
) {
|
|
561
|
+
removeFromActiveFormattingElements(state, node);
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
if (!state.activeFormattingElements.includes(node)) {
|
|
565
|
+
nodesToRemoveFromStack.push(node);
|
|
566
|
+
continue;
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
const nodeClone = cloneFormattingElement(node);
|
|
570
|
+
clonedNodes.unshift(nodeClone);
|
|
571
|
+
|
|
572
|
+
replaceInActiveFormattingElements(state, node, nodeClone);
|
|
573
|
+
|
|
574
|
+
const nodeChildIdx = node.childNodes.indexOf(lastNode);
|
|
575
|
+
if (nodeChildIdx !== -1) {
|
|
576
|
+
node.childNodes.splice(nodeChildIdx, 1);
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
appendChild(nodeClone, lastNode);
|
|
580
|
+
lastNode = nodeClone;
|
|
527
581
|
}
|
|
528
582
|
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
583
|
+
for (const node of nodesToRemoveFromStack) {
|
|
584
|
+
const idx = state.stack.indexOf(node);
|
|
585
|
+
if (idx !== -1) {
|
|
586
|
+
state.stack.splice(idx, 1);
|
|
587
|
+
}
|
|
588
|
+
}
|
|
532
589
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
590
|
+
const fbIdx = formattingElement.childNodes.indexOf(furthestBlock);
|
|
591
|
+
if (fbIdx !== -1) {
|
|
592
|
+
formattingElement.childNodes.splice(fbIdx, 1);
|
|
593
|
+
furthestBlock.parentNode = null;
|
|
594
|
+
}
|
|
538
595
|
|
|
539
|
-
|
|
596
|
+
appendChild(commonAncestor, lastNode);
|
|
540
597
|
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
598
|
+
const newFormattingElement = cloneFormattingElement(formattingElement);
|
|
599
|
+
reparentChildren(furthestBlock, newFormattingElement);
|
|
600
|
+
appendChild(furthestBlock, newFormattingElement);
|
|
544
601
|
|
|
545
|
-
|
|
602
|
+
removeFromActiveFormattingElements(state, formattingElement);
|
|
603
|
+
state.activeFormattingElements.splice(
|
|
604
|
+
formattingElementIndex,
|
|
605
|
+
0,
|
|
606
|
+
newFormattingElement,
|
|
607
|
+
);
|
|
608
|
+
|
|
609
|
+
const elementsAfterFurthestBlock = state.stack.slice(
|
|
610
|
+
furthestBlockIndex + 1,
|
|
611
|
+
);
|
|
546
612
|
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
613
|
+
state.stack.length = stackIndex;
|
|
614
|
+
for (const clonedNode of clonedNodes) {
|
|
615
|
+
state.stack.push(clonedNode);
|
|
616
|
+
}
|
|
617
|
+
state.stack.push(furthestBlock);
|
|
618
|
+
state.stack.push(newFormattingElement);
|
|
619
|
+
for (const element of elementsAfterFurthestBlock) {
|
|
620
|
+
state.stack.push(element);
|
|
621
|
+
}
|
|
550
622
|
}
|
|
551
|
-
state.stack.push(furthestBlock);
|
|
552
623
|
};
|
|
553
624
|
|
|
554
625
|
const removeFromActiveFormattingElements = (
|
|
@@ -572,6 +643,60 @@ const replaceInActiveFormattingElements = (
|
|
|
572
643
|
}
|
|
573
644
|
};
|
|
574
645
|
|
|
646
|
+
/**
 * Appends `element` to `state.activeFormattingElements`, keeping at most
 * three equivalent entries after the last `null` entry.
 *
 * The list is scanned from the end towards the start and the scan stops at the
 * first `null` entry (presumably a scope marker — confirm against the code that
 * inserts them). Entries count as equivalent when their lower-cased tag name
 * equals the new element's and `attributesMatch` returns true. When a third
 * equivalent entry is found, that entry — the earliest of the three, because
 * the scan runs backwards — is removed before the new element is pushed.
 *
 * @param state   Parser state owning the `activeFormattingElements` list.
 * @param element Formatting element to append.
 */
const pushToActiveFormattingElements = (
  state: ParserState,
  element: any,
): void => {
  const list = state.activeFormattingElements;
  const tagName = element.tagName?.toLowerCase();

  let matchCount = 0;

  for (let i = list.length - 1; i >= 0; i--) {
    const entry = list[i];
    if (entry === null) {
      break;
    }

    const isEquivalent =
      entry.tagName?.toLowerCase() === tagName &&
      attributesMatch(entry, element);
    if (!isEquivalent) {
      continue;
    }

    matchCount++;
    if (matchCount === 3) {
      // `i` is the index of the third match seen while walking backwards,
      // i.e. the earliest equivalent entry: that is the one to drop.
      list.splice(i, 1);
      break;
    }
  }

  list.push(element);
};

/**
 * Reports whether two elements carry exactly the same attributes.
 *
 * A missing `attributes` property is treated as an empty object. The elements
 * match when both have the same number of attribute keys, every key of the
 * first is an own key of the second, and the values are strictly equal.
 *
 * @param el1 First element.
 * @param el2 Second element.
 * @returns `true` when the attribute sets are identical, otherwise `false`.
 */
const attributesMatch = (el1: any, el2: any): boolean => {
  const attrs1 = el1.attributes || {};
  const attrs2 = el2.attributes || {};
  const keys1 = Object.keys(attrs1);

  if (keys1.length !== Object.keys(attrs2).length) {
    return false;
  }

  // The own-key check prevents `{ a: undefined }` from matching
  // `{ b: undefined }`, where both lookups would yield `undefined`.
  return keys1.every(
    (key) =>
      Object.prototype.hasOwnProperty.call(attrs2, key) &&
      attrs1[key] === attrs2[key],
  );
};
|
|
699
|
+
|
|
575
700
|
const parseText = (state: ParserState, token: Token): void => {
|
|
576
701
|
const content = token.value;
|
|
577
702
|
|
|
@@ -621,18 +746,57 @@ const parseProcessingInstruction = (state: ParserState, token: Token): void => {
|
|
|
621
746
|
appendChild(currentParent, piNode);
|
|
622
747
|
};
|
|
623
748
|
|
|
624
|
-
const
|
|
749
|
+
/**
 * Closes the nearest open `<p>` by truncating the open-element stack.
 *
 * The stack is walked from its top towards its bottom. If a `p` element is
 * found, it and everything above it are removed from the stack. If an element
 * listed in `BUTTON_SCOPE_TERMINATORS` is reached first, or no `p` exists, the
 * stack is left untouched.
 *
 * @param state Parser state whose `stack` is inspected and possibly shortened.
 */
const closeParagraphElement = (state: ParserState): void => {
  const { stack } = state;

  for (let depth = stack.length - 1; depth >= 0; depth--) {
    const openTag = stack[depth].tagName?.toLowerCase();

    if (openTag === "p") {
      // Drop the paragraph together with every element opened inside it.
      stack.length = depth;
      return;
    }

    if (openTag && BUTTON_SCOPE_TERMINATORS.has(openTag)) {
      return;
    }
  }
};
|
|
773
|
+
|
|
774
|
+
/**
 * Applies the auto-close rule registered for `tagName`, if any.
 *
 * Looks up `AUTO_CLOSE_RULES[tagName]`; when a rule exists, the open-element
 * stack is walked from the top down. The first element whose lower-cased tag
 * name is in the rule's list is removed together with everything above it.
 * Reaching an element listed in `BUTTON_SCOPE_TERMINATORS` first aborts the
 * search.
 *
 * @param state   Parser state whose `stack` may be shortened.
 * @param tagName Lower-cased name of the start tag being processed.
 * @returns `true` when elements were popped, `false` when nothing was closed.
 */
const handleAutoClosing = (state: ParserState, tagName: string): boolean => {
  const closableTags = AUTO_CLOSE_RULES[tagName];
  if (!closableTags) {
    return false;
  }

  const { stack } = state;
  let depth = stack.length;

  while (depth-- > 0) {
    const openTag = stack[depth].tagName?.toLowerCase();
    if (!openTag) {
      continue;
    }

    if (closableTags.includes(openTag)) {
      // Remove the matched element and everything opened after it.
      stack.length = depth;
      return true;
    }

    if (BUTTON_SCOPE_TERMINATORS.has(openTag)) {
      return false;
    }
  }

  return false;
};
|
|
637
801
|
|
|
638
802
|
const getCurrentParent = (state: ParserState): any => {
|
|
@@ -748,6 +912,32 @@ const isInTableContext = (state: ParserState): boolean => {
|
|
|
748
912
|
return false;
|
|
749
913
|
};
|
|
750
914
|
|
|
915
|
+
/**
 * Tells whether the open-element stack contains an SVG or MathML element
 * above the nearest `html` element.
 *
 * The stack is examined from the top down; the namespace test is performed
 * before the `html` tag-name test on each element.
 *
 * @param state Parser state whose `stack` is inspected (not modified).
 * @returns `true` if an element with the SVG or MathML namespace is found
 *          before an `html` element is reached, otherwise `false`.
 */
const isInForeignContent = (state: ParserState): boolean => {
  let depth = state.stack.length;

  while (depth-- > 0) {
    const { namespaceURI, tagName } = state.stack[depth];

    const isForeign =
      namespaceURI === SVG_NAMESPACE || namespaceURI === MATHML_NAMESPACE;
    if (isForeign) {
      return true;
    }

    if (tagName && tagName.toLowerCase() === "html") {
      return false;
    }
  }

  return false;
};
|
|
930
|
+
|
|
931
|
+
/**
 * Returns the namespace of the innermost open element that declares one.
 *
 * The open-element stack is read from the top down and the first truthy
 * `namespaceURI` encountered is returned.
 *
 * @param state Parser state whose `stack` is inspected (not modified).
 * @returns The namespace URI found, or `undefined` when no element on the
 *          stack has a truthy `namespaceURI`.
 */
const getCurrentNamespace = (state: ParserState): string | undefined => {
  let depth = state.stack.length;

  while (depth-- > 0) {
    const namespace = state.stack[depth].namespaceURI;
    if (namespace) {
      return namespace;
    }
  }

  return undefined;
};
|
|
940
|
+
|
|
751
941
|
const findTableContextParent = (state: ParserState): any | null => {
|
|
752
942
|
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
753
943
|
const el = state.stack[i];
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { it, expect } from "bun:test";
|
|
2
|
+
import { parseHTML } from "../index.js";
|
|
3
|
+
import { serializeToHtml5lib } from "./helpers/tree-adapter.js";
|
|
4
|
+
|
|
5
|
+
it("should run AAA 2 times - test case with nested divs", () => {
|
|
6
|
+
const html = "<a>1<div>2<div>3</a>4</div>5</div>";
|
|
7
|
+
const doc = parseHTML(html);
|
|
8
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
9
|
+
|
|
10
|
+
const expected = `| <html>
|
|
11
|
+
| <head>
|
|
12
|
+
| <body>
|
|
13
|
+
| <a>
|
|
14
|
+
| "1"
|
|
15
|
+
| <div>
|
|
16
|
+
| <a>
|
|
17
|
+
| "2"
|
|
18
|
+
| <div>
|
|
19
|
+
| <a>
|
|
20
|
+
| "3"
|
|
21
|
+
| "4"
|
|
22
|
+
| "5"
|
|
23
|
+
`;
|
|
24
|
+
|
|
25
|
+
expect(serialized).toBe(expected);
|
|
26
|
+
});
|
|
27
|
+
|
|
28
|
+
it("should run AAA 8 times - deeply nested divs", () => {
|
|
29
|
+
const html =
|
|
30
|
+
"<div><a><b><div><div><div><div><div><div><div><div><div><div></a>";
|
|
31
|
+
const doc = parseHTML(html);
|
|
32
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
33
|
+
|
|
34
|
+
const expected = `| <html>
|
|
35
|
+
| <head>
|
|
36
|
+
| <body>
|
|
37
|
+
| <div>
|
|
38
|
+
| <a>
|
|
39
|
+
| <b>
|
|
40
|
+
| <b>
|
|
41
|
+
| <div>
|
|
42
|
+
| <a>
|
|
43
|
+
| <div>
|
|
44
|
+
| <a>
|
|
45
|
+
| <div>
|
|
46
|
+
| <a>
|
|
47
|
+
| <div>
|
|
48
|
+
| <a>
|
|
49
|
+
| <div>
|
|
50
|
+
| <a>
|
|
51
|
+
| <div>
|
|
52
|
+
| <a>
|
|
53
|
+
| <div>
|
|
54
|
+
| <a>
|
|
55
|
+
| <div>
|
|
56
|
+
| <a>
|
|
57
|
+
| <div>
|
|
58
|
+
| <div>
|
|
59
|
+
`;
|
|
60
|
+
|
|
61
|
+
expect(serialized).toBe(expected);
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
it("should run AAA 2 times - with style and address elements", () => {
|
|
65
|
+
const html = "<a><div><style></style><address><a>";
|
|
66
|
+
const doc = parseHTML(html);
|
|
67
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
68
|
+
|
|
69
|
+
const expected = `| <html>
|
|
70
|
+
| <head>
|
|
71
|
+
| <body>
|
|
72
|
+
| <a>
|
|
73
|
+
| <div>
|
|
74
|
+
| <a>
|
|
75
|
+
| <style>
|
|
76
|
+
| <address>
|
|
77
|
+
| <a>
|
|
78
|
+
| <a>
|
|
79
|
+
`;
|
|
80
|
+
|
|
81
|
+
expect(serialized).toBe(expected);
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it("should run AAA with formatting element cloning", () => {
|
|
85
|
+
const html = "<a>x<div>y</a>z</div>";
|
|
86
|
+
const doc = parseHTML(html);
|
|
87
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
88
|
+
|
|
89
|
+
const expected = `| <html>
|
|
90
|
+
| <head>
|
|
91
|
+
| <body>
|
|
92
|
+
| <a>
|
|
93
|
+
| "x"
|
|
94
|
+
| <div>
|
|
95
|
+
| <a>
|
|
96
|
+
| "y"
|
|
97
|
+
| "z"
|
|
98
|
+
`;
|
|
99
|
+
|
|
100
|
+
expect(serialized).toBe(expected);
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
it("should stop AAA when no more formatting elements to adopt", () => {
|
|
104
|
+
const html = "<b>text</b><div>content</div>";
|
|
105
|
+
const doc = parseHTML(html);
|
|
106
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
107
|
+
|
|
108
|
+
const expected = `| <html>
|
|
109
|
+
| <head>
|
|
110
|
+
| <body>
|
|
111
|
+
| <b>
|
|
112
|
+
| "text"
|
|
113
|
+
| <div>
|
|
114
|
+
| "content"
|
|
115
|
+
`;
|
|
116
|
+
|
|
117
|
+
expect(serialized).toBe(expected);
|
|
118
|
+
});
|
|
@@ -25,12 +25,12 @@ export function serializeToHtml5lib(
|
|
|
25
25
|
|
|
26
26
|
let nsPrefix = "";
|
|
27
27
|
if (ns === "http://www.w3.org/2000/svg") {
|
|
28
|
-
nsPrefix = "
|
|
28
|
+
nsPrefix = "svg ";
|
|
29
29
|
} else if (ns === "http://www.w3.org/1998/Math/MathML") {
|
|
30
|
-
nsPrefix = "
|
|
30
|
+
nsPrefix = "math ";
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
-
lines.push(`${indent}<${
|
|
33
|
+
lines.push(`${indent}<${nsPrefix}${tagName}>`);
|
|
34
34
|
|
|
35
35
|
// Atributos en orden alfabético
|
|
36
36
|
const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
|
+
import { parseHTML } from "../index.js";
|
|
3
|
+
import { serializeToHtml5lib } from "./helpers/tree-adapter.js";
|
|
4
|
+
|
|
5
|
+
describe("implicit close with formatting element reconstruction", () => {
|
|
6
|
+
it("should close <p> and reconstruct <b> elements when new <p> opens", () => {
|
|
7
|
+
const html = "<p><b><b><b><b><p>x";
|
|
8
|
+
const doc = parseHTML(html);
|
|
9
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
10
|
+
|
|
11
|
+
expect(result).toBe(`| <html>
|
|
12
|
+
| <head>
|
|
13
|
+
| <body>
|
|
14
|
+
| <p>
|
|
15
|
+
| <b>
|
|
16
|
+
| <b>
|
|
17
|
+
| <b>
|
|
18
|
+
| <b>
|
|
19
|
+
| <p>
|
|
20
|
+
| <b>
|
|
21
|
+
| <b>
|
|
22
|
+
| <b>
|
|
23
|
+
| "x"
|
|
24
|
+
`);
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
it("should close <p> through nested formatting and reconstruct (single <b>)", () => {
|
|
28
|
+
const html = "<p><b><p>x";
|
|
29
|
+
const doc = parseHTML(html);
|
|
30
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
31
|
+
|
|
32
|
+
expect(result).toBe(`| <html>
|
|
33
|
+
| <head>
|
|
34
|
+
| <body>
|
|
35
|
+
| <p>
|
|
36
|
+
| <b>
|
|
37
|
+
| <p>
|
|
38
|
+
| <b>
|
|
39
|
+
| "x"
|
|
40
|
+
`);
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
it("should handle text before and after implicit close", () => {
|
|
44
|
+
const html = "<p><b>1<p>2";
|
|
45
|
+
const doc = parseHTML(html);
|
|
46
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
47
|
+
|
|
48
|
+
expect(result).toBe(`| <html>
|
|
49
|
+
| <head>
|
|
50
|
+
| <body>
|
|
51
|
+
| <p>
|
|
52
|
+
| <b>
|
|
53
|
+
| "1"
|
|
54
|
+
| <p>
|
|
55
|
+
| <b>
|
|
56
|
+
| "2"
|
|
57
|
+
`);
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
it("should handle multiple different formatting elements", () => {
|
|
61
|
+
const html = "<p><b><i><p>x";
|
|
62
|
+
const doc = parseHTML(html);
|
|
63
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
64
|
+
|
|
65
|
+
expect(result).toBe(`| <html>
|
|
66
|
+
| <head>
|
|
67
|
+
| <body>
|
|
68
|
+
| <p>
|
|
69
|
+
| <b>
|
|
70
|
+
| <i>
|
|
71
|
+
| <p>
|
|
72
|
+
| <b>
|
|
73
|
+
| <i>
|
|
74
|
+
| "x"
|
|
75
|
+
`);
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
it("should handle div closing <p> and reconstructing formatting", () => {
|
|
79
|
+
const html = "<p><b><div>x";
|
|
80
|
+
const doc = parseHTML(html);
|
|
81
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
82
|
+
|
|
83
|
+
expect(result).toBe(`| <html>
|
|
84
|
+
| <head>
|
|
85
|
+
| <body>
|
|
86
|
+
| <p>
|
|
87
|
+
| <b>
|
|
88
|
+
| <div>
|
|
89
|
+
| <b>
|
|
90
|
+
| "x"
|
|
91
|
+
`);
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
it("should handle multiple auto-closing with formatting", () => {
|
|
95
|
+
const html = "<p><b><p><i><p>x";
|
|
96
|
+
const doc = parseHTML(html);
|
|
97
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
98
|
+
|
|
99
|
+
expect(result).toBe(`| <html>
|
|
100
|
+
| <head>
|
|
101
|
+
| <body>
|
|
102
|
+
| <p>
|
|
103
|
+
| <b>
|
|
104
|
+
| <p>
|
|
105
|
+
| <b>
|
|
106
|
+
| <i>
|
|
107
|
+
| <p>
|
|
108
|
+
| <b>
|
|
109
|
+
| <i>
|
|
110
|
+
| "x"
|
|
111
|
+
`);
|
|
112
|
+
});
|
|
113
|
+
});
|
|
@@ -31,7 +31,9 @@ describe("Tree Construction Adoption01 Tests", () => {
|
|
|
31
31
|
}
|
|
32
32
|
}
|
|
33
33
|
|
|
34
|
-
const passingTests = [
|
|
34
|
+
const passingTests = [
|
|
35
|
+
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
|
|
36
|
+
];
|
|
35
37
|
const testFn = passingTests.includes(index + 1) ? it : it.skip;
|
|
36
38
|
|
|
37
39
|
testFn(`Adoption test ${index + 1}`, () => {
|
|
@@ -9,7 +9,7 @@ describe("Tree Construction Adoption02 Tests", () => {
|
|
|
9
9
|
"utf8",
|
|
10
10
|
);
|
|
11
11
|
const sections = content.split("#data\n").slice(1);
|
|
12
|
-
const passingTests = [1];
|
|
12
|
+
const passingTests = [1, 2];
|
|
13
13
|
|
|
14
14
|
sections.forEach((section, index) => {
|
|
15
15
|
const lines = section.trim().split("\n");
|