@tkeron/html-parser 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -1
- package/index.ts +6 -2
- package/package.json +1 -1
- package/src/parser/constants.ts +12 -0
- package/src/parser/index.ts +1 -1
- package/src/parser/parse.ts +283 -63
- package/tests/adoption-multiple-iterations.test.ts +118 -0
- package/tests/helpers/tree-adapter.ts +52 -3
- package/tests/implicit-close-formatting.test.ts +113 -0
- package/tests/tree-construction-adoption01.test.ts +30 -14
- package/tests/tree-construction-adoption02.test.ts +1 -1
package/README.md
CHANGED
|
@@ -9,8 +9,9 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
|
|
|
9
9
|
- 🪶 **Lightweight**: Zero external dependencies
|
|
10
10
|
- ð **Standards Compliant**: Returns standard DOM Document objects
|
|
11
11
|
- 🔧 **TypeScript Support**: Full TypeScript definitions included
|
|
12
|
-
- ✅ **Well Tested**: Comprehensive test suite (
|
|
12
|
+
- ✅ **Well Tested**: Comprehensive test suite (5660+ tests passing)
|
|
13
13
|
- 🎯 **HTML5 Spec**: Implements Adoption Agency Algorithm for proper formatting element handling
|
|
14
|
+
- 🧩 **Fragment Parsing**: Parse HTML fragments with context element support
|
|
14
15
|
|
|
15
16
|
## Installation
|
|
16
17
|
|
|
@@ -76,6 +77,28 @@ Parses an HTML string and returns a DOM Document object.
|
|
|
76
77
|
|
|
77
78
|
- `Document`: A standard DOM Document object with all the usual methods like `querySelector`, `getElementById`, etc.
|
|
78
79
|
|
|
80
|
+
### `parseHTMLFragment(html: string, contextTagName: string): Node[]`
|
|
81
|
+
|
|
82
|
+
Parses an HTML string as a fragment within a context element. Useful for parsing innerHTML-style content.
|
|
83
|
+
|
|
84
|
+
**Parameters:**
|
|
85
|
+
|
|
86
|
+
- `html` (string): The HTML string to parse
|
|
87
|
+
- `contextTagName` (string): The tag name of the context element (e.g., `"div"`, `"body"`)
|
|
88
|
+
|
|
89
|
+
**Returns:**
|
|
90
|
+
|
|
91
|
+
- `Node[]`: An array of parsed nodes
|
|
92
|
+
|
|
93
|
+
**Example:**
|
|
94
|
+
|
|
95
|
+
```typescript
|
|
96
|
+
import { parseHTMLFragment } from "@tkeron/html-parser";
|
|
97
|
+
|
|
98
|
+
const nodes = parseHTMLFragment("<b>Hello</b> <i>World</i>", "div");
|
|
99
|
+
console.log(nodes.length); // 3 (b element, text node, i element)
|
|
100
|
+
```
|
|
101
|
+
|
|
79
102
|
## Development
|
|
80
103
|
|
|
81
104
|
This project is built with Bun. To get started:
|
package/index.ts
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
import { tokenize } from "./src/tokenizer/index.js";
|
|
2
|
-
import { parse } from "./src/parser/index.js";
|
|
2
|
+
import { parse, parseFragment } from "./src/parser/index.js";
|
|
3
3
|
import { astToDOM } from "./src/dom-simulator.js";
|
|
4
4
|
|
|
5
5
|
export function parseHTML(html: string = ""): Document {
|
|
6
6
|
const tokens = tokenize(html);
|
|
7
7
|
const ast = parse(tokens);
|
|
8
|
-
// If parse already returns a DOM document, return it directly
|
|
9
8
|
if (ast && typeof ast.nodeType === "number" && ast.nodeType === 9) {
|
|
10
9
|
return ast;
|
|
11
10
|
}
|
|
12
11
|
return astToDOM(ast);
|
|
13
12
|
}
|
|
13
|
+
|
|
14
|
+
export function parseHTMLFragment(html: string, contextTagName: string): any[] {
|
|
15
|
+
const tokens = tokenize(html);
|
|
16
|
+
return parseFragment(tokens, contextTagName);
|
|
17
|
+
}
|
package/package.json
CHANGED
package/src/parser/constants.ts
CHANGED
|
@@ -135,3 +135,15 @@ export const VALID_TR_CHILDREN = new Set([
|
|
|
135
135
|
"template",
|
|
136
136
|
"style",
|
|
137
137
|
]);
|
|
138
|
+
|
|
139
|
+
export const BUTTON_SCOPE_TERMINATORS = new Set([
|
|
140
|
+
"applet",
|
|
141
|
+
"caption",
|
|
142
|
+
"html",
|
|
143
|
+
"table",
|
|
144
|
+
"td",
|
|
145
|
+
"th",
|
|
146
|
+
"marquee",
|
|
147
|
+
"object",
|
|
148
|
+
"template",
|
|
149
|
+
]);
|
package/src/parser/index.ts
CHANGED
package/src/parser/parse.ts
CHANGED
|
@@ -21,9 +21,9 @@ import {
|
|
|
21
21
|
VALID_TABLE_CHILDREN,
|
|
22
22
|
VALID_TABLE_SECTION_CHILDREN,
|
|
23
23
|
VALID_TR_CHILDREN,
|
|
24
|
+
BUTTON_SCOPE_TERMINATORS,
|
|
24
25
|
} from "./constants";
|
|
25
26
|
import {
|
|
26
|
-
findFormattingElementInStack,
|
|
27
27
|
findFurthestBlock,
|
|
28
28
|
getCommonAncestor,
|
|
29
29
|
cloneFormattingElement,
|
|
@@ -333,7 +333,16 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
|
|
|
333
333
|
if (token.type === TokenType.TAG_OPEN) {
|
|
334
334
|
const tagName = token.value.toLowerCase();
|
|
335
335
|
|
|
336
|
-
|
|
336
|
+
if (tagName === "a") {
|
|
337
|
+
const existingA = state.activeFormattingElements.find(
|
|
338
|
+
(el) => el && el.tagName && el.tagName.toLowerCase() === "a",
|
|
339
|
+
);
|
|
340
|
+
if (existingA) {
|
|
341
|
+
runAdoptionAgencyAlgorithm(state, "a");
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
const closedParagraph = handleAutoClosing(state, tagName);
|
|
337
346
|
|
|
338
347
|
const inTableContext = isInTableContext(state);
|
|
339
348
|
const isTableStructureElement =
|
|
@@ -354,7 +363,7 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
|
|
|
354
363
|
if (tableParent) {
|
|
355
364
|
popStackUntilTableContext(state);
|
|
356
365
|
}
|
|
357
|
-
} else if (!parentIsTableContext) {
|
|
366
|
+
} else if (!parentIsTableContext && !closedParagraph) {
|
|
358
367
|
reconstructActiveFormattingElements(state);
|
|
359
368
|
}
|
|
360
369
|
|
|
@@ -365,6 +374,8 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
|
|
|
365
374
|
namespaceURI = SVG_NAMESPACE;
|
|
366
375
|
} else if (tagName === "math") {
|
|
367
376
|
namespaceURI = MATHML_NAMESPACE;
|
|
377
|
+
} else {
|
|
378
|
+
namespaceURI = getCurrentNamespace(state);
|
|
368
379
|
}
|
|
369
380
|
|
|
370
381
|
const element = createElement(
|
|
@@ -413,24 +424,28 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
|
|
|
413
424
|
}
|
|
414
425
|
|
|
415
426
|
if (isFormattingElement) {
|
|
416
|
-
state
|
|
427
|
+
pushToActiveFormattingElements(state, element);
|
|
417
428
|
}
|
|
418
429
|
}
|
|
419
430
|
} else if (token.type === TokenType.TAG_CLOSE) {
|
|
420
431
|
const tagName = token.value.toLowerCase();
|
|
421
432
|
|
|
422
|
-
if (FORMATTING_ELEMENTS.has(tagName)) {
|
|
433
|
+
if (FORMATTING_ELEMENTS.has(tagName) && !isInForeignContent(state)) {
|
|
423
434
|
runAdoptionAgencyAlgorithm(state, tagName);
|
|
424
435
|
return;
|
|
425
436
|
}
|
|
426
437
|
|
|
438
|
+
if (tagName === "p") {
|
|
439
|
+
closeParagraphElement(state);
|
|
440
|
+
return;
|
|
441
|
+
}
|
|
442
|
+
|
|
427
443
|
const impliedEndTags = [
|
|
428
444
|
"dd",
|
|
429
445
|
"dt",
|
|
430
446
|
"li",
|
|
431
447
|
"option",
|
|
432
448
|
"optgroup",
|
|
433
|
-
"p",
|
|
434
449
|
"rb",
|
|
435
450
|
"rp",
|
|
436
451
|
"rt",
|
|
@@ -479,76 +494,132 @@ const runAdoptionAgencyAlgorithm = (
|
|
|
479
494
|
state: ParserState,
|
|
480
495
|
tagName: string,
|
|
481
496
|
): void => {
|
|
482
|
-
const
|
|
497
|
+
const maxIterations = 8;
|
|
483
498
|
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
499
|
+
for (let iteration = 0; iteration < maxIterations; iteration++) {
|
|
500
|
+
const formattingElementIndex = state.activeFormattingElements.findIndex(
|
|
501
|
+
(el) =>
|
|
502
|
+
el && el.tagName && el.tagName.toLowerCase() === tagName.toLowerCase(),
|
|
503
|
+
);
|
|
487
504
|
|
|
488
|
-
|
|
505
|
+
if (formattingElementIndex === -1) {
|
|
506
|
+
return;
|
|
507
|
+
}
|
|
489
508
|
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
state.stack.
|
|
493
|
-
removeFromActiveFormattingElements(state, formattingElement);
|
|
494
|
-
return;
|
|
495
|
-
}
|
|
509
|
+
const formattingElement =
|
|
510
|
+
state.activeFormattingElements[formattingElementIndex];
|
|
511
|
+
const stackIndex = state.stack.indexOf(formattingElement);
|
|
496
512
|
|
|
497
|
-
|
|
513
|
+
if (stackIndex === -1) {
|
|
514
|
+
state.activeFormattingElements.splice(formattingElementIndex, 1);
|
|
515
|
+
return;
|
|
516
|
+
}
|
|
498
517
|
|
|
499
|
-
|
|
500
|
-
|
|
518
|
+
const currentElement = getCurrentElement(state);
|
|
519
|
+
if (currentElement === formattingElement) {
|
|
501
520
|
state.stack.pop();
|
|
521
|
+
removeFromActiveFormattingElements(state, formattingElement);
|
|
522
|
+
return;
|
|
502
523
|
}
|
|
503
|
-
removeFromActiveFormattingElements(state, formattingElement);
|
|
504
|
-
return;
|
|
505
|
-
}
|
|
506
524
|
|
|
507
|
-
|
|
508
|
-
const commonAncestor = getCommonAncestor(state.stack, formattingElementIndex);
|
|
525
|
+
const fbResult = findFurthestBlock(state.stack, stackIndex);
|
|
509
526
|
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
527
|
+
if (!fbResult) {
|
|
528
|
+
while (state.stack.length > stackIndex) {
|
|
529
|
+
state.stack.pop();
|
|
530
|
+
}
|
|
531
|
+
removeFromActiveFormattingElements(state, formattingElement);
|
|
532
|
+
return;
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
const { element: furthestBlock, index: furthestBlockIndex } = fbResult;
|
|
536
|
+
const commonAncestor = getCommonAncestor(state.stack, stackIndex);
|
|
537
|
+
|
|
538
|
+
if (!commonAncestor) {
|
|
539
|
+
return;
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
let lastNode = furthestBlock;
|
|
543
|
+
const clonedNodes: any[] = [];
|
|
544
|
+
const nodesToRemoveFromStack: any[] = [];
|
|
545
|
+
let innerLoopCounter = 0;
|
|
546
|
+
let nodeIndex = furthestBlockIndex;
|
|
547
|
+
|
|
548
|
+
while (true) {
|
|
549
|
+
innerLoopCounter++;
|
|
550
|
+
nodeIndex--;
|
|
551
|
+
const node = state.stack[nodeIndex];
|
|
552
|
+
|
|
553
|
+
if (node === formattingElement) {
|
|
554
|
+
break;
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
if (
|
|
558
|
+
innerLoopCounter > 3 &&
|
|
559
|
+
state.activeFormattingElements.includes(node)
|
|
560
|
+
) {
|
|
561
|
+
removeFromActiveFormattingElements(state, node);
|
|
562
|
+
}
|
|
513
563
|
|
|
514
|
-
|
|
515
|
-
|
|
564
|
+
if (!state.activeFormattingElements.includes(node)) {
|
|
565
|
+
nodesToRemoveFromStack.push(node);
|
|
566
|
+
continue;
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
const nodeClone = cloneFormattingElement(node);
|
|
570
|
+
clonedNodes.unshift(nodeClone);
|
|
516
571
|
|
|
517
|
-
|
|
518
|
-
const node = state.stack[i];
|
|
519
|
-
const nodeClone = cloneFormattingElement(node);
|
|
520
|
-
clonedNodes.unshift(nodeClone);
|
|
572
|
+
replaceInActiveFormattingElements(state, node, nodeClone);
|
|
521
573
|
|
|
522
|
-
|
|
574
|
+
const nodeChildIdx = node.childNodes.indexOf(lastNode);
|
|
575
|
+
if (nodeChildIdx !== -1) {
|
|
576
|
+
node.childNodes.splice(nodeChildIdx, 1);
|
|
577
|
+
}
|
|
523
578
|
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
node.childNodes.splice(nodeChildIdx, 1);
|
|
579
|
+
appendChild(nodeClone, lastNode);
|
|
580
|
+
lastNode = nodeClone;
|
|
527
581
|
}
|
|
528
582
|
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
583
|
+
for (const node of nodesToRemoveFromStack) {
|
|
584
|
+
const idx = state.stack.indexOf(node);
|
|
585
|
+
if (idx !== -1) {
|
|
586
|
+
state.stack.splice(idx, 1);
|
|
587
|
+
}
|
|
588
|
+
}
|
|
532
589
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
590
|
+
const fbIdx = formattingElement.childNodes.indexOf(furthestBlock);
|
|
591
|
+
if (fbIdx !== -1) {
|
|
592
|
+
formattingElement.childNodes.splice(fbIdx, 1);
|
|
593
|
+
furthestBlock.parentNode = null;
|
|
594
|
+
}
|
|
538
595
|
|
|
539
|
-
|
|
596
|
+
appendChild(commonAncestor, lastNode);
|
|
540
597
|
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
598
|
+
const newFormattingElement = cloneFormattingElement(formattingElement);
|
|
599
|
+
reparentChildren(furthestBlock, newFormattingElement);
|
|
600
|
+
appendChild(furthestBlock, newFormattingElement);
|
|
544
601
|
|
|
545
|
-
|
|
602
|
+
removeFromActiveFormattingElements(state, formattingElement);
|
|
603
|
+
state.activeFormattingElements.splice(
|
|
604
|
+
formattingElementIndex,
|
|
605
|
+
0,
|
|
606
|
+
newFormattingElement,
|
|
607
|
+
);
|
|
608
|
+
|
|
609
|
+
const elementsAfterFurthestBlock = state.stack.slice(
|
|
610
|
+
furthestBlockIndex + 1,
|
|
611
|
+
);
|
|
546
612
|
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
613
|
+
state.stack.length = stackIndex;
|
|
614
|
+
for (const clonedNode of clonedNodes) {
|
|
615
|
+
state.stack.push(clonedNode);
|
|
616
|
+
}
|
|
617
|
+
state.stack.push(furthestBlock);
|
|
618
|
+
state.stack.push(newFormattingElement);
|
|
619
|
+
for (const element of elementsAfterFurthestBlock) {
|
|
620
|
+
state.stack.push(element);
|
|
621
|
+
}
|
|
550
622
|
}
|
|
551
|
-
state.stack.push(furthestBlock);
|
|
552
623
|
};
|
|
553
624
|
|
|
554
625
|
const removeFromActiveFormattingElements = (
|
|
@@ -572,6 +643,60 @@ const replaceInActiveFormattingElements = (
|
|
|
572
643
|
}
|
|
573
644
|
};
|
|
574
645
|
|
|
646
|
+
const pushToActiveFormattingElements = (
|
|
647
|
+
state: ParserState,
|
|
648
|
+
element: any,
|
|
649
|
+
): void => {
|
|
650
|
+
const list = state.activeFormattingElements;
|
|
651
|
+
const tagName = element.tagName?.toLowerCase();
|
|
652
|
+
|
|
653
|
+
let count = 0;
|
|
654
|
+
let oldestMatchIndex = -1;
|
|
655
|
+
|
|
656
|
+
for (let i = list.length - 1; i >= 0; i--) {
|
|
657
|
+
const entry = list[i];
|
|
658
|
+
if (entry === null) {
|
|
659
|
+
break;
|
|
660
|
+
}
|
|
661
|
+
|
|
662
|
+
if (
|
|
663
|
+
entry.tagName?.toLowerCase() === tagName &&
|
|
664
|
+
attributesMatch(entry, element)
|
|
665
|
+
) {
|
|
666
|
+
if (oldestMatchIndex === -1) {
|
|
667
|
+
oldestMatchIndex = i;
|
|
668
|
+
}
|
|
669
|
+
count++;
|
|
670
|
+
if (count >= 3) {
|
|
671
|
+
list.splice(oldestMatchIndex, 1);
|
|
672
|
+
break;
|
|
673
|
+
}
|
|
674
|
+
oldestMatchIndex = i;
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
list.push(element);
|
|
679
|
+
};
|
|
680
|
+
|
|
681
|
+
const attributesMatch = (el1: any, el2: any): boolean => {
|
|
682
|
+
const attrs1 = el1.attributes || {};
|
|
683
|
+
const attrs2 = el2.attributes || {};
|
|
684
|
+
const keys1 = Object.keys(attrs1);
|
|
685
|
+
const keys2 = Object.keys(attrs2);
|
|
686
|
+
|
|
687
|
+
if (keys1.length !== keys2.length) {
|
|
688
|
+
return false;
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
for (const key of keys1) {
|
|
692
|
+
if (attrs1[key] !== attrs2[key]) {
|
|
693
|
+
return false;
|
|
694
|
+
}
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
return true;
|
|
698
|
+
};
|
|
699
|
+
|
|
575
700
|
const parseText = (state: ParserState, token: Token): void => {
|
|
576
701
|
const content = token.value;
|
|
577
702
|
|
|
@@ -621,18 +746,57 @@ const parseProcessingInstruction = (state: ParserState, token: Token): void => {
|
|
|
621
746
|
appendChild(currentParent, piNode);
|
|
622
747
|
};
|
|
623
748
|
|
|
624
|
-
const
|
|
749
|
+
const closeParagraphElement = (state: ParserState): void => {
|
|
750
|
+
let pIndex = -1;
|
|
751
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
752
|
+
const element = state.stack[i];
|
|
753
|
+
const elementTag = element.tagName?.toLowerCase();
|
|
754
|
+
|
|
755
|
+
if (elementTag === "p") {
|
|
756
|
+
pIndex = i;
|
|
757
|
+
break;
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
if (elementTag && BUTTON_SCOPE_TERMINATORS.has(elementTag)) {
|
|
761
|
+
return;
|
|
762
|
+
}
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
if (pIndex === -1) {
|
|
766
|
+
return;
|
|
767
|
+
}
|
|
768
|
+
|
|
769
|
+
while (state.stack.length > pIndex) {
|
|
770
|
+
state.stack.pop();
|
|
771
|
+
}
|
|
772
|
+
};
|
|
773
|
+
|
|
774
|
+
const handleAutoClosing = (state: ParserState, tagName: string): boolean => {
|
|
625
775
|
const autoCloseList = AUTO_CLOSE_RULES[tagName];
|
|
626
|
-
if (!autoCloseList) return;
|
|
776
|
+
if (!autoCloseList) return false;
|
|
627
777
|
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
778
|
+
let targetIndex = -1;
|
|
779
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
780
|
+
const element = state.stack[i];
|
|
781
|
+
const elementTag = element.tagName?.toLowerCase();
|
|
782
|
+
|
|
783
|
+
if (elementTag && autoCloseList.includes(elementTag)) {
|
|
784
|
+
targetIndex = i;
|
|
785
|
+
break;
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
if (elementTag && BUTTON_SCOPE_TERMINATORS.has(elementTag)) {
|
|
789
|
+
return false;
|
|
790
|
+
}
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
if (targetIndex === -1) return false;
|
|
794
|
+
|
|
795
|
+
while (state.stack.length > targetIndex) {
|
|
634
796
|
state.stack.pop();
|
|
635
797
|
}
|
|
798
|
+
|
|
799
|
+
return true;
|
|
636
800
|
};
|
|
637
801
|
|
|
638
802
|
const getCurrentParent = (state: ParserState): any => {
|
|
@@ -748,6 +912,32 @@ const isInTableContext = (state: ParserState): boolean => {
|
|
|
748
912
|
return false;
|
|
749
913
|
};
|
|
750
914
|
|
|
915
|
+
const isInForeignContent = (state: ParserState): boolean => {
|
|
916
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
917
|
+
const el = state.stack[i];
|
|
918
|
+
if (
|
|
919
|
+
el.namespaceURI === SVG_NAMESPACE ||
|
|
920
|
+
el.namespaceURI === MATHML_NAMESPACE
|
|
921
|
+
) {
|
|
922
|
+
return true;
|
|
923
|
+
}
|
|
924
|
+
if (el.tagName && el.tagName.toLowerCase() === "html") {
|
|
925
|
+
return false;
|
|
926
|
+
}
|
|
927
|
+
}
|
|
928
|
+
return false;
|
|
929
|
+
};
|
|
930
|
+
|
|
931
|
+
const getCurrentNamespace = (state: ParserState): string | undefined => {
|
|
932
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
933
|
+
const el = state.stack[i];
|
|
934
|
+
if (el.namespaceURI) {
|
|
935
|
+
return el.namespaceURI;
|
|
936
|
+
}
|
|
937
|
+
}
|
|
938
|
+
return undefined;
|
|
939
|
+
};
|
|
940
|
+
|
|
751
941
|
const findTableContextParent = (state: ParserState): any | null => {
|
|
752
942
|
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
753
943
|
const el = state.stack[i];
|
|
@@ -922,3 +1112,33 @@ const getActiveFormattingElementsBeforeMarker = (state: ParserState): any[] => {
|
|
|
922
1112
|
}
|
|
923
1113
|
return result;
|
|
924
1114
|
};
|
|
1115
|
+
|
|
1116
|
+
export const parseFragment = (tokens: Token[], contextTagName: string): any => {
|
|
1117
|
+
const root = createDocument();
|
|
1118
|
+
const contextElement = createElement(contextTagName.toLowerCase(), {});
|
|
1119
|
+
appendChild(root, contextElement);
|
|
1120
|
+
|
|
1121
|
+
const state: ParserState = {
|
|
1122
|
+
tokens,
|
|
1123
|
+
position: 0,
|
|
1124
|
+
length: tokens.length,
|
|
1125
|
+
stack: [root, contextElement],
|
|
1126
|
+
root,
|
|
1127
|
+
insertionMode: InsertionMode.InBody,
|
|
1128
|
+
errors: [],
|
|
1129
|
+
activeFormattingElements: [],
|
|
1130
|
+
};
|
|
1131
|
+
|
|
1132
|
+
while (state.position < state.length) {
|
|
1133
|
+
const token = getCurrentToken(state);
|
|
1134
|
+
|
|
1135
|
+
if (!token || token.type === TokenType.EOF) {
|
|
1136
|
+
break;
|
|
1137
|
+
}
|
|
1138
|
+
|
|
1139
|
+
parseToken(state, token);
|
|
1140
|
+
advance(state);
|
|
1141
|
+
}
|
|
1142
|
+
|
|
1143
|
+
return contextElement.childNodes;
|
|
1144
|
+
};
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { it, expect } from "bun:test";
|
|
2
|
+
import { parseHTML } from "../index.js";
|
|
3
|
+
import { serializeToHtml5lib } from "./helpers/tree-adapter.js";
|
|
4
|
+
|
|
5
|
+
it("should run AAA 2 times - test case with nested divs", () => {
|
|
6
|
+
const html = "<a>1<div>2<div>3</a>4</div>5</div>";
|
|
7
|
+
const doc = parseHTML(html);
|
|
8
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
9
|
+
|
|
10
|
+
const expected = `| <html>
|
|
11
|
+
| <head>
|
|
12
|
+
| <body>
|
|
13
|
+
| <a>
|
|
14
|
+
| "1"
|
|
15
|
+
| <div>
|
|
16
|
+
| <a>
|
|
17
|
+
| "2"
|
|
18
|
+
| <div>
|
|
19
|
+
| <a>
|
|
20
|
+
| "3"
|
|
21
|
+
| "4"
|
|
22
|
+
| "5"
|
|
23
|
+
`;
|
|
24
|
+
|
|
25
|
+
expect(serialized).toBe(expected);
|
|
26
|
+
});
|
|
27
|
+
|
|
28
|
+
it("should run AAA 8 times - deeply nested divs", () => {
|
|
29
|
+
const html =
|
|
30
|
+
"<div><a><b><div><div><div><div><div><div><div><div><div><div></a>";
|
|
31
|
+
const doc = parseHTML(html);
|
|
32
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
33
|
+
|
|
34
|
+
const expected = `| <html>
|
|
35
|
+
| <head>
|
|
36
|
+
| <body>
|
|
37
|
+
| <div>
|
|
38
|
+
| <a>
|
|
39
|
+
| <b>
|
|
40
|
+
| <b>
|
|
41
|
+
| <div>
|
|
42
|
+
| <a>
|
|
43
|
+
| <div>
|
|
44
|
+
| <a>
|
|
45
|
+
| <div>
|
|
46
|
+
| <a>
|
|
47
|
+
| <div>
|
|
48
|
+
| <a>
|
|
49
|
+
| <div>
|
|
50
|
+
| <a>
|
|
51
|
+
| <div>
|
|
52
|
+
| <a>
|
|
53
|
+
| <div>
|
|
54
|
+
| <a>
|
|
55
|
+
| <div>
|
|
56
|
+
| <a>
|
|
57
|
+
| <div>
|
|
58
|
+
| <div>
|
|
59
|
+
`;
|
|
60
|
+
|
|
61
|
+
expect(serialized).toBe(expected);
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
it("should run AAA 2 times - with style and address elements", () => {
|
|
65
|
+
const html = "<a><div><style></style><address><a>";
|
|
66
|
+
const doc = parseHTML(html);
|
|
67
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
68
|
+
|
|
69
|
+
const expected = `| <html>
|
|
70
|
+
| <head>
|
|
71
|
+
| <body>
|
|
72
|
+
| <a>
|
|
73
|
+
| <div>
|
|
74
|
+
| <a>
|
|
75
|
+
| <style>
|
|
76
|
+
| <address>
|
|
77
|
+
| <a>
|
|
78
|
+
| <a>
|
|
79
|
+
`;
|
|
80
|
+
|
|
81
|
+
expect(serialized).toBe(expected);
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it("should run AAA with formatting element cloning", () => {
|
|
85
|
+
const html = "<a>x<div>y</a>z</div>";
|
|
86
|
+
const doc = parseHTML(html);
|
|
87
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
88
|
+
|
|
89
|
+
const expected = `| <html>
|
|
90
|
+
| <head>
|
|
91
|
+
| <body>
|
|
92
|
+
| <a>
|
|
93
|
+
| "x"
|
|
94
|
+
| <div>
|
|
95
|
+
| <a>
|
|
96
|
+
| "y"
|
|
97
|
+
| "z"
|
|
98
|
+
`;
|
|
99
|
+
|
|
100
|
+
expect(serialized).toBe(expected);
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
it("should stop AAA when no more formatting elements to adopt", () => {
|
|
104
|
+
const html = "<b>text</b><div>content</div>";
|
|
105
|
+
const doc = parseHTML(html);
|
|
106
|
+
const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
107
|
+
|
|
108
|
+
const expected = `| <html>
|
|
109
|
+
| <head>
|
|
110
|
+
| <body>
|
|
111
|
+
| <b>
|
|
112
|
+
| "text"
|
|
113
|
+
| <div>
|
|
114
|
+
| "content"
|
|
115
|
+
`;
|
|
116
|
+
|
|
117
|
+
expect(serialized).toBe(expected);
|
|
118
|
+
});
|
|
@@ -25,12 +25,12 @@ export function serializeToHtml5lib(
|
|
|
25
25
|
|
|
26
26
|
let nsPrefix = "";
|
|
27
27
|
if (ns === "http://www.w3.org/2000/svg") {
|
|
28
|
-
nsPrefix = "
|
|
28
|
+
nsPrefix = "svg ";
|
|
29
29
|
} else if (ns === "http://www.w3.org/1998/Math/MathML") {
|
|
30
|
-
nsPrefix = "
|
|
30
|
+
nsPrefix = "math ";
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
-
lines.push(`${indent}<${
|
|
33
|
+
lines.push(`${indent}<${nsPrefix}${tagName}>`);
|
|
34
34
|
|
|
35
35
|
// Atributos en orden alfabético
|
|
36
36
|
const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
|
|
@@ -68,3 +68,52 @@ export function serializeToHtml5lib(
|
|
|
68
68
|
serialize(doc, 0);
|
|
69
69
|
return lines.join("\n") + "\n";
|
|
70
70
|
}
|
|
71
|
+
|
|
72
|
+
export function serializeFragmentToHtml5lib(nodes: any[]): string {
|
|
73
|
+
const lines: string[] = [];
|
|
74
|
+
|
|
75
|
+
function serialize(node: any, depth: number): void {
|
|
76
|
+
const indent = "| " + " ".repeat(depth);
|
|
77
|
+
|
|
78
|
+
if (node.nodeType === 1) {
|
|
79
|
+
const tagName = node.tagName.toLowerCase();
|
|
80
|
+
const ns = node.namespaceURI;
|
|
81
|
+
|
|
82
|
+
let nsPrefix = "";
|
|
83
|
+
if (ns === "http://www.w3.org/2000/svg") {
|
|
84
|
+
nsPrefix = "svg ";
|
|
85
|
+
} else if (ns === "http://www.w3.org/1998/Math/MathML") {
|
|
86
|
+
nsPrefix = "math ";
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
lines.push(`${indent}<${nsPrefix}${tagName}>`);
|
|
90
|
+
|
|
91
|
+
const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
|
|
92
|
+
a.localeCompare(b),
|
|
93
|
+
);
|
|
94
|
+
for (const [name, value] of attrs) {
|
|
95
|
+
lines.push(`${indent} ${name}="${value}"`);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
if (node.tagName.toLowerCase() === "template" && node.content) {
|
|
99
|
+
lines.push(`${indent} content`);
|
|
100
|
+
serialize(node.content, depth + 2);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
for (const child of node.childNodes || []) {
|
|
104
|
+
serialize(child, depth + 1);
|
|
105
|
+
}
|
|
106
|
+
} else if (node.nodeType === 3) {
|
|
107
|
+
lines.push(`${indent}"${node.textContent}"`);
|
|
108
|
+
} else if (node.nodeType === 8) {
|
|
109
|
+
const commentData = node.data || node.nodeValue || node.textContent || "";
|
|
110
|
+
lines.push(`${indent}<!-- ${commentData} -->`);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
for (const node of nodes) {
|
|
115
|
+
serialize(node, 0);
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
return lines.join("\n") + "\n";
|
|
119
|
+
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
|
+
import { parseHTML } from "../index.js";
|
|
3
|
+
import { serializeToHtml5lib } from "./helpers/tree-adapter.js";
|
|
4
|
+
|
|
5
|
+
describe("implicit close with formatting element reconstruction", () => {
|
|
6
|
+
it("should close <p> and reconstruct <b> elements when new <p> opens", () => {
|
|
7
|
+
const html = "<p><b><b><b><b><p>x";
|
|
8
|
+
const doc = parseHTML(html);
|
|
9
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
10
|
+
|
|
11
|
+
expect(result).toBe(`| <html>
|
|
12
|
+
| <head>
|
|
13
|
+
| <body>
|
|
14
|
+
| <p>
|
|
15
|
+
| <b>
|
|
16
|
+
| <b>
|
|
17
|
+
| <b>
|
|
18
|
+
| <b>
|
|
19
|
+
| <p>
|
|
20
|
+
| <b>
|
|
21
|
+
| <b>
|
|
22
|
+
| <b>
|
|
23
|
+
| "x"
|
|
24
|
+
`);
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
it("should close <p> through nested formatting and reconstruct (single <b>)", () => {
|
|
28
|
+
const html = "<p><b><p>x";
|
|
29
|
+
const doc = parseHTML(html);
|
|
30
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
31
|
+
|
|
32
|
+
expect(result).toBe(`| <html>
|
|
33
|
+
| <head>
|
|
34
|
+
| <body>
|
|
35
|
+
| <p>
|
|
36
|
+
| <b>
|
|
37
|
+
| <p>
|
|
38
|
+
| <b>
|
|
39
|
+
| "x"
|
|
40
|
+
`);
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
it("should handle text before and after implicit close", () => {
|
|
44
|
+
const html = "<p><b>1<p>2";
|
|
45
|
+
const doc = parseHTML(html);
|
|
46
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
47
|
+
|
|
48
|
+
expect(result).toBe(`| <html>
|
|
49
|
+
| <head>
|
|
50
|
+
| <body>
|
|
51
|
+
| <p>
|
|
52
|
+
| <b>
|
|
53
|
+
| "1"
|
|
54
|
+
| <p>
|
|
55
|
+
| <b>
|
|
56
|
+
| "2"
|
|
57
|
+
`);
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
it("should handle multiple different formatting elements", () => {
|
|
61
|
+
const html = "<p><b><i><p>x";
|
|
62
|
+
const doc = parseHTML(html);
|
|
63
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
64
|
+
|
|
65
|
+
expect(result).toBe(`| <html>
|
|
66
|
+
| <head>
|
|
67
|
+
| <body>
|
|
68
|
+
| <p>
|
|
69
|
+
| <b>
|
|
70
|
+
| <i>
|
|
71
|
+
| <p>
|
|
72
|
+
| <b>
|
|
73
|
+
| <i>
|
|
74
|
+
| "x"
|
|
75
|
+
`);
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
it("should handle div closing <p> and reconstructing formatting", () => {
|
|
79
|
+
const html = "<p><b><div>x";
|
|
80
|
+
const doc = parseHTML(html);
|
|
81
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
82
|
+
|
|
83
|
+
expect(result).toBe(`| <html>
|
|
84
|
+
| <head>
|
|
85
|
+
| <body>
|
|
86
|
+
| <p>
|
|
87
|
+
| <b>
|
|
88
|
+
| <div>
|
|
89
|
+
| <b>
|
|
90
|
+
| "x"
|
|
91
|
+
`);
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
it("should handle multiple auto-closing with formatting", () => {
|
|
95
|
+
const html = "<p><b><p><i><p>x";
|
|
96
|
+
const doc = parseHTML(html);
|
|
97
|
+
const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
|
|
98
|
+
|
|
99
|
+
expect(result).toBe(`| <html>
|
|
100
|
+
| <head>
|
|
101
|
+
| <body>
|
|
102
|
+
| <p>
|
|
103
|
+
| <b>
|
|
104
|
+
| <p>
|
|
105
|
+
| <b>
|
|
106
|
+
| <i>
|
|
107
|
+
| <p>
|
|
108
|
+
| <b>
|
|
109
|
+
| <i>
|
|
110
|
+
| "x"
|
|
111
|
+
`);
|
|
112
|
+
});
|
|
113
|
+
});
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import { expect, it, describe } from "bun:test";
|
|
2
|
-
import { parseHTML } from "../index";
|
|
3
|
-
import {
|
|
2
|
+
import { parseHTML, parseHTMLFragment } from "../index";
|
|
3
|
+
import {
|
|
4
|
+
serializeToHtml5lib,
|
|
5
|
+
serializeFragmentToHtml5lib,
|
|
6
|
+
} from "./helpers/tree-adapter";
|
|
4
7
|
import { readFileSync } from "fs";
|
|
5
8
|
|
|
6
9
|
describe("Tree Construction Adoption01 Tests", () => {
|
|
@@ -15,10 +18,18 @@ describe("Tree Construction Adoption01 Tests", () => {
|
|
|
15
18
|
let data = "";
|
|
16
19
|
let document = "";
|
|
17
20
|
let inDocument = false;
|
|
18
|
-
let inData = true;
|
|
21
|
+
let inData = true;
|
|
22
|
+
let isFragmentTest = false;
|
|
23
|
+
let fragmentContext = "";
|
|
19
24
|
|
|
20
25
|
for (const line of lines) {
|
|
21
|
-
if (line.startsWith("#document")) {
|
|
26
|
+
if (line.startsWith("#document-fragment")) {
|
|
27
|
+
isFragmentTest = true;
|
|
28
|
+
inDocument = false;
|
|
29
|
+
inData = false;
|
|
30
|
+
} else if (isFragmentTest && !fragmentContext && !line.startsWith("#")) {
|
|
31
|
+
fragmentContext = line.trim();
|
|
32
|
+
} else if (line.startsWith("#document")) {
|
|
22
33
|
inDocument = true;
|
|
23
34
|
inData = false;
|
|
24
35
|
} else if (line.startsWith("#errors")) {
|
|
@@ -31,16 +42,21 @@ describe("Tree Construction Adoption01 Tests", () => {
|
|
|
31
42
|
}
|
|
32
43
|
}
|
|
33
44
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
|
|
40
|
-
const serialized = serializeToHtml5lib(doc, {
|
|
41
|
-
skipImplicitDoctype: !hasExplicitDoctype,
|
|
45
|
+
if (isFragmentTest) {
|
|
46
|
+
it(`Adoption test ${index + 1} (fragment: ${fragmentContext})`, () => {
|
|
47
|
+
const nodes = parseHTMLFragment(data, fragmentContext);
|
|
48
|
+
const serialized = serializeFragmentToHtml5lib(nodes);
|
|
49
|
+
expect(serialized).toBe(document);
|
|
42
50
|
});
|
|
43
|
-
|
|
44
|
-
|
|
51
|
+
} else {
|
|
52
|
+
it(`Adoption test ${index + 1}`, () => {
|
|
53
|
+
const doc = parseHTML(data);
|
|
54
|
+
const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
|
|
55
|
+
const serialized = serializeToHtml5lib(doc, {
|
|
56
|
+
skipImplicitDoctype: !hasExplicitDoctype,
|
|
57
|
+
});
|
|
58
|
+
expect(serialized).toBe(document);
|
|
59
|
+
});
|
|
60
|
+
}
|
|
45
61
|
});
|
|
46
62
|
});
|
|
@@ -9,7 +9,7 @@ describe("Tree Construction Adoption02 Tests", () => {
|
|
|
9
9
|
"utf8",
|
|
10
10
|
);
|
|
11
11
|
const sections = content.split("#data\n").slice(1);
|
|
12
|
-
const passingTests = [1];
|
|
12
|
+
const passingTests = [1, 2];
|
|
13
13
|
|
|
14
14
|
sections.forEach((section, index) => {
|
|
15
15
|
const lines = section.trim().split("\n");
|