@tkeron/html-parser 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,8 +9,9 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
9
9
  - 🪶 **Lightweight**: Zero external dependencies
10
10
  - 🌐 **Standards Compliant**: Returns standard DOM Document objects
11
11
  - 🔧 **TypeScript Support**: Full TypeScript definitions included
12
- - ✅ **Well Tested**: Comprehensive test suite (5600+ tests passing)
12
+ - ✅ **Well Tested**: Comprehensive test suite (5660+ tests passing)
13
13
  - 🎯 **HTML5 Spec**: Implements Adoption Agency Algorithm for proper formatting element handling
14
+ - 🧩 **Fragment Parsing**: Parse HTML fragments with context element support
14
15
 
15
16
  ## Installation
16
17
 
@@ -76,6 +77,28 @@ Parses an HTML string and returns a DOM Document object.
76
77
 
77
78
  - `Document`: A standard DOM Document object with all the usual methods like `querySelector`, `getElementById`, etc.
78
79
 
80
+ ### `parseHTMLFragment(html: string, contextTagName: string): Node[]`
81
+
82
+ Parses an HTML string as a fragment within a context element. Useful for parsing innerHTML-style content.
83
+
84
+ **Parameters:**
85
+
86
+ - `html` (string): The HTML string to parse
87
+ - `contextTagName` (string): The tag name of the context element (e.g., `"div"`, `"body"`)
88
+
89
+ **Returns:**
90
+
91
+ - `Node[]`: An array of parsed nodes
92
+
93
+ **Example:**
94
+
95
+ ```typescript
96
+ import { parseHTMLFragment } from "@tkeron/html-parser";
97
+
98
+ const nodes = parseHTMLFragment("<b>Hello</b> <i>World</i>", "div");
99
+ console.log(nodes.length); // 3 (b element, text node, i element)
100
+ ```
101
+
79
102
  ## Development
80
103
 
81
104
  This project is built with Bun. To get started:
package/index.ts CHANGED
@@ -1,13 +1,17 @@
1
1
  import { tokenize } from "./src/tokenizer/index.js";
2
- import { parse } from "./src/parser/index.js";
2
+ import { parse, parseFragment } from "./src/parser/index.js";
3
3
  import { astToDOM } from "./src/dom-simulator.js";
4
4
 
5
5
  export function parseHTML(html: string = ""): Document {
6
6
  const tokens = tokenize(html);
7
7
  const ast = parse(tokens);
8
- // If parse already returns a DOM document, return it directly
9
8
  if (ast && typeof ast.nodeType === "number" && ast.nodeType === 9) {
10
9
  return ast;
11
10
  }
12
11
  return astToDOM(ast);
13
12
  }
13
+
14
+ export function parseHTMLFragment(html: string, contextTagName: string): any[] {
15
+ const tokens = tokenize(html);
16
+ return parseFragment(tokens, contextTagName);
17
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tkeron/html-parser",
3
- "version": "1.4.0",
3
+ "version": "1.5.0",
4
4
  "description": "A fast and lightweight HTML parser for Bun",
5
5
  "main": "index.js",
6
6
  "module": "index.ts",
@@ -135,3 +135,15 @@ export const VALID_TR_CHILDREN = new Set([
135
135
  "template",
136
136
  "style",
137
137
  ]);
138
+
139
+ export const BUTTON_SCOPE_TERMINATORS = new Set([
140
+ "applet",
141
+ "caption",
142
+ "html",
143
+ "table",
144
+ "td",
145
+ "th",
146
+ "marquee",
147
+ "object",
148
+ "template",
149
+ ]);
@@ -1,4 +1,4 @@
1
- export { parse } from "./parse";
1
+ export { parse, parseFragment } from "./parse";
2
2
  export { domToAST } from "./dom-to-ast";
3
3
  export type { ParserState, ParseError, InsertionMode, ASTNode } from "./types";
4
4
  export { ASTNodeType } from "./types";
@@ -21,9 +21,9 @@ import {
21
21
  VALID_TABLE_CHILDREN,
22
22
  VALID_TABLE_SECTION_CHILDREN,
23
23
  VALID_TR_CHILDREN,
24
+ BUTTON_SCOPE_TERMINATORS,
24
25
  } from "./constants";
25
26
  import {
26
- findFormattingElementInStack,
27
27
  findFurthestBlock,
28
28
  getCommonAncestor,
29
29
  cloneFormattingElement,
@@ -333,7 +333,16 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
333
333
  if (token.type === TokenType.TAG_OPEN) {
334
334
  const tagName = token.value.toLowerCase();
335
335
 
336
- handleAutoClosing(state, tagName);
336
+ if (tagName === "a") {
337
+ const existingA = state.activeFormattingElements.find(
338
+ (el) => el && el.tagName && el.tagName.toLowerCase() === "a",
339
+ );
340
+ if (existingA) {
341
+ runAdoptionAgencyAlgorithm(state, "a");
342
+ }
343
+ }
344
+
345
+ const closedParagraph = handleAutoClosing(state, tagName);
337
346
 
338
347
  const inTableContext = isInTableContext(state);
339
348
  const isTableStructureElement =
@@ -354,7 +363,7 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
354
363
  if (tableParent) {
355
364
  popStackUntilTableContext(state);
356
365
  }
357
- } else if (!parentIsTableContext) {
366
+ } else if (!parentIsTableContext && !closedParagraph) {
358
367
  reconstructActiveFormattingElements(state);
359
368
  }
360
369
 
@@ -365,6 +374,8 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
365
374
  namespaceURI = SVG_NAMESPACE;
366
375
  } else if (tagName === "math") {
367
376
  namespaceURI = MATHML_NAMESPACE;
377
+ } else {
378
+ namespaceURI = getCurrentNamespace(state);
368
379
  }
369
380
 
370
381
  const element = createElement(
@@ -413,24 +424,28 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
413
424
  }
414
425
 
415
426
  if (isFormattingElement) {
416
- state.activeFormattingElements.push(element);
427
+ pushToActiveFormattingElements(state, element);
417
428
  }
418
429
  }
419
430
  } else if (token.type === TokenType.TAG_CLOSE) {
420
431
  const tagName = token.value.toLowerCase();
421
432
 
422
- if (FORMATTING_ELEMENTS.has(tagName)) {
433
+ if (FORMATTING_ELEMENTS.has(tagName) && !isInForeignContent(state)) {
423
434
  runAdoptionAgencyAlgorithm(state, tagName);
424
435
  return;
425
436
  }
426
437
 
438
+ if (tagName === "p") {
439
+ closeParagraphElement(state);
440
+ return;
441
+ }
442
+
427
443
  const impliedEndTags = [
428
444
  "dd",
429
445
  "dt",
430
446
  "li",
431
447
  "option",
432
448
  "optgroup",
433
- "p",
434
449
  "rb",
435
450
  "rp",
436
451
  "rt",
@@ -479,76 +494,132 @@ const runAdoptionAgencyAlgorithm = (
479
494
  state: ParserState,
480
495
  tagName: string,
481
496
  ): void => {
482
- const result = findFormattingElementInStack(state.stack, tagName);
497
+ const maxIterations = 8;
483
498
 
484
- if (!result) {
485
- return;
486
- }
499
+ for (let iteration = 0; iteration < maxIterations; iteration++) {
500
+ const formattingElementIndex = state.activeFormattingElements.findIndex(
501
+ (el) =>
502
+ el && el.tagName && el.tagName.toLowerCase() === tagName.toLowerCase(),
503
+ );
487
504
 
488
- const { element: formattingElement, index: formattingElementIndex } = result;
505
+ if (formattingElementIndex === -1) {
506
+ return;
507
+ }
489
508
 
490
- const currentElement = getCurrentElement(state);
491
- if (currentElement === formattingElement) {
492
- state.stack.pop();
493
- removeFromActiveFormattingElements(state, formattingElement);
494
- return;
495
- }
509
+ const formattingElement =
510
+ state.activeFormattingElements[formattingElementIndex];
511
+ const stackIndex = state.stack.indexOf(formattingElement);
496
512
 
497
- const fbResult = findFurthestBlock(state.stack, formattingElementIndex);
513
+ if (stackIndex === -1) {
514
+ state.activeFormattingElements.splice(formattingElementIndex, 1);
515
+ return;
516
+ }
498
517
 
499
- if (!fbResult) {
500
- while (state.stack.length > formattingElementIndex) {
518
+ const currentElement = getCurrentElement(state);
519
+ if (currentElement === formattingElement) {
501
520
  state.stack.pop();
521
+ removeFromActiveFormattingElements(state, formattingElement);
522
+ return;
502
523
  }
503
- removeFromActiveFormattingElements(state, formattingElement);
504
- return;
505
- }
506
524
 
507
- const { element: furthestBlock, index: furthestBlockIndex } = fbResult;
508
- const commonAncestor = getCommonAncestor(state.stack, formattingElementIndex);
525
+ const fbResult = findFurthestBlock(state.stack, stackIndex);
509
526
 
510
- if (!commonAncestor) {
511
- return;
512
- }
527
+ if (!fbResult) {
528
+ while (state.stack.length > stackIndex) {
529
+ state.stack.pop();
530
+ }
531
+ removeFromActiveFormattingElements(state, formattingElement);
532
+ return;
533
+ }
534
+
535
+ const { element: furthestBlock, index: furthestBlockIndex } = fbResult;
536
+ const commonAncestor = getCommonAncestor(state.stack, stackIndex);
537
+
538
+ if (!commonAncestor) {
539
+ return;
540
+ }
541
+
542
+ let lastNode = furthestBlock;
543
+ const clonedNodes: any[] = [];
544
+ const nodesToRemoveFromStack: any[] = [];
545
+ let innerLoopCounter = 0;
546
+ let nodeIndex = furthestBlockIndex;
547
+
548
+ while (true) {
549
+ innerLoopCounter++;
550
+ nodeIndex--;
551
+ const node = state.stack[nodeIndex];
552
+
553
+ if (node === formattingElement) {
554
+ break;
555
+ }
556
+
557
+ if (
558
+ innerLoopCounter > 3 &&
559
+ state.activeFormattingElements.includes(node)
560
+ ) {
561
+ removeFromActiveFormattingElements(state, node);
562
+ }
513
563
 
514
- let lastNode = furthestBlock;
515
- const clonedNodes: any[] = [];
564
+ if (!state.activeFormattingElements.includes(node)) {
565
+ nodesToRemoveFromStack.push(node);
566
+ continue;
567
+ }
568
+
569
+ const nodeClone = cloneFormattingElement(node);
570
+ clonedNodes.unshift(nodeClone);
516
571
 
517
- for (let i = furthestBlockIndex - 1; i > formattingElementIndex; i--) {
518
- const node = state.stack[i];
519
- const nodeClone = cloneFormattingElement(node);
520
- clonedNodes.unshift(nodeClone);
572
+ replaceInActiveFormattingElements(state, node, nodeClone);
521
573
 
522
- replaceInActiveFormattingElements(state, node, nodeClone);
574
+ const nodeChildIdx = node.childNodes.indexOf(lastNode);
575
+ if (nodeChildIdx !== -1) {
576
+ node.childNodes.splice(nodeChildIdx, 1);
577
+ }
523
578
 
524
- const nodeChildIdx = node.childNodes.indexOf(lastNode);
525
- if (nodeChildIdx !== -1) {
526
- node.childNodes.splice(nodeChildIdx, 1);
579
+ appendChild(nodeClone, lastNode);
580
+ lastNode = nodeClone;
527
581
  }
528
582
 
529
- appendChild(nodeClone, lastNode);
530
- lastNode = nodeClone;
531
- }
583
+ for (const node of nodesToRemoveFromStack) {
584
+ const idx = state.stack.indexOf(node);
585
+ if (idx !== -1) {
586
+ state.stack.splice(idx, 1);
587
+ }
588
+ }
532
589
 
533
- const fbIdx = formattingElement.childNodes.indexOf(furthestBlock);
534
- if (fbIdx !== -1) {
535
- formattingElement.childNodes.splice(fbIdx, 1);
536
- furthestBlock.parentNode = null;
537
- }
590
+ const fbIdx = formattingElement.childNodes.indexOf(furthestBlock);
591
+ if (fbIdx !== -1) {
592
+ formattingElement.childNodes.splice(fbIdx, 1);
593
+ furthestBlock.parentNode = null;
594
+ }
538
595
 
539
- appendChild(commonAncestor, lastNode);
596
+ appendChild(commonAncestor, lastNode);
540
597
 
541
- const newFormattingElement = cloneFormattingElement(formattingElement);
542
- reparentChildren(furthestBlock, newFormattingElement);
543
- appendChild(furthestBlock, newFormattingElement);
598
+ const newFormattingElement = cloneFormattingElement(formattingElement);
599
+ reparentChildren(furthestBlock, newFormattingElement);
600
+ appendChild(furthestBlock, newFormattingElement);
544
601
 
545
- removeFromActiveFormattingElements(state, formattingElement);
602
+ removeFromActiveFormattingElements(state, formattingElement);
603
+ state.activeFormattingElements.splice(
604
+ formattingElementIndex,
605
+ 0,
606
+ newFormattingElement,
607
+ );
608
+
609
+ const elementsAfterFurthestBlock = state.stack.slice(
610
+ furthestBlockIndex + 1,
611
+ );
546
612
 
547
- state.stack.length = formattingElementIndex;
548
- for (const clonedNode of clonedNodes) {
549
- state.stack.push(clonedNode);
613
+ state.stack.length = stackIndex;
614
+ for (const clonedNode of clonedNodes) {
615
+ state.stack.push(clonedNode);
616
+ }
617
+ state.stack.push(furthestBlock);
618
+ state.stack.push(newFormattingElement);
619
+ for (const element of elementsAfterFurthestBlock) {
620
+ state.stack.push(element);
621
+ }
550
622
  }
551
- state.stack.push(furthestBlock);
552
623
  };
553
624
 
554
625
  const removeFromActiveFormattingElements = (
@@ -572,6 +643,60 @@ const replaceInActiveFormattingElements = (
572
643
  }
573
644
  };
574
645
 
646
+ const pushToActiveFormattingElements = (
647
+ state: ParserState,
648
+ element: any,
649
+ ): void => {
650
+ const list = state.activeFormattingElements;
651
+ const tagName = element.tagName?.toLowerCase();
652
+
653
+ let count = 0;
654
+ let oldestMatchIndex = -1;
655
+
656
+ for (let i = list.length - 1; i >= 0; i--) {
657
+ const entry = list[i];
658
+ if (entry === null) {
659
+ break;
660
+ }
661
+
662
+ if (
663
+ entry.tagName?.toLowerCase() === tagName &&
664
+ attributesMatch(entry, element)
665
+ ) {
666
+ if (oldestMatchIndex === -1) {
667
+ oldestMatchIndex = i;
668
+ }
669
+ count++;
670
+ if (count >= 3) {
671
+ list.splice(oldestMatchIndex, 1);
672
+ break;
673
+ }
674
+ oldestMatchIndex = i;
675
+ }
676
+ }
677
+
678
+ list.push(element);
679
+ };
680
+
681
+ const attributesMatch = (el1: any, el2: any): boolean => {
682
+ const attrs1 = el1.attributes || {};
683
+ const attrs2 = el2.attributes || {};
684
+ const keys1 = Object.keys(attrs1);
685
+ const keys2 = Object.keys(attrs2);
686
+
687
+ if (keys1.length !== keys2.length) {
688
+ return false;
689
+ }
690
+
691
+ for (const key of keys1) {
692
+ if (attrs1[key] !== attrs2[key]) {
693
+ return false;
694
+ }
695
+ }
696
+
697
+ return true;
698
+ };
699
+
575
700
  const parseText = (state: ParserState, token: Token): void => {
576
701
  const content = token.value;
577
702
 
@@ -621,18 +746,57 @@ const parseProcessingInstruction = (state: ParserState, token: Token): void => {
621
746
  appendChild(currentParent, piNode);
622
747
  };
623
748
 
624
- const handleAutoClosing = (state: ParserState, tagName: string): void => {
749
+ const closeParagraphElement = (state: ParserState): void => {
750
+ let pIndex = -1;
751
+ for (let i = state.stack.length - 1; i >= 0; i--) {
752
+ const element = state.stack[i];
753
+ const elementTag = element.tagName?.toLowerCase();
754
+
755
+ if (elementTag === "p") {
756
+ pIndex = i;
757
+ break;
758
+ }
759
+
760
+ if (elementTag && BUTTON_SCOPE_TERMINATORS.has(elementTag)) {
761
+ return;
762
+ }
763
+ }
764
+
765
+ if (pIndex === -1) {
766
+ return;
767
+ }
768
+
769
+ while (state.stack.length > pIndex) {
770
+ state.stack.pop();
771
+ }
772
+ };
773
+
774
+ const handleAutoClosing = (state: ParserState, tagName: string): boolean => {
625
775
  const autoCloseList = AUTO_CLOSE_RULES[tagName];
626
- if (!autoCloseList) return;
776
+ if (!autoCloseList) return false;
627
777
 
628
- const currentElement = getCurrentElement(state);
629
- if (
630
- currentElement &&
631
- currentElement.tagName &&
632
- autoCloseList.includes(currentElement.tagName.toLowerCase())
633
- ) {
778
+ let targetIndex = -1;
779
+ for (let i = state.stack.length - 1; i >= 0; i--) {
780
+ const element = state.stack[i];
781
+ const elementTag = element.tagName?.toLowerCase();
782
+
783
+ if (elementTag && autoCloseList.includes(elementTag)) {
784
+ targetIndex = i;
785
+ break;
786
+ }
787
+
788
+ if (elementTag && BUTTON_SCOPE_TERMINATORS.has(elementTag)) {
789
+ return false;
790
+ }
791
+ }
792
+
793
+ if (targetIndex === -1) return false;
794
+
795
+ while (state.stack.length > targetIndex) {
634
796
  state.stack.pop();
635
797
  }
798
+
799
+ return true;
636
800
  };
637
801
 
638
802
  const getCurrentParent = (state: ParserState): any => {
@@ -748,6 +912,32 @@ const isInTableContext = (state: ParserState): boolean => {
748
912
  return false;
749
913
  };
750
914
 
915
+ const isInForeignContent = (state: ParserState): boolean => {
916
+ for (let i = state.stack.length - 1; i >= 0; i--) {
917
+ const el = state.stack[i];
918
+ if (
919
+ el.namespaceURI === SVG_NAMESPACE ||
920
+ el.namespaceURI === MATHML_NAMESPACE
921
+ ) {
922
+ return true;
923
+ }
924
+ if (el.tagName && el.tagName.toLowerCase() === "html") {
925
+ return false;
926
+ }
927
+ }
928
+ return false;
929
+ };
930
+
931
+ const getCurrentNamespace = (state: ParserState): string | undefined => {
932
+ for (let i = state.stack.length - 1; i >= 0; i--) {
933
+ const el = state.stack[i];
934
+ if (el.namespaceURI) {
935
+ return el.namespaceURI;
936
+ }
937
+ }
938
+ return undefined;
939
+ };
940
+
751
941
  const findTableContextParent = (state: ParserState): any | null => {
752
942
  for (let i = state.stack.length - 1; i >= 0; i--) {
753
943
  const el = state.stack[i];
@@ -922,3 +1112,33 @@ const getActiveFormattingElementsBeforeMarker = (state: ParserState): any[] => {
922
1112
  }
923
1113
  return result;
924
1114
  };
1115
+
1116
+ export const parseFragment = (tokens: Token[], contextTagName: string): any => {
1117
+ const root = createDocument();
1118
+ const contextElement = createElement(contextTagName.toLowerCase(), {});
1119
+ appendChild(root, contextElement);
1120
+
1121
+ const state: ParserState = {
1122
+ tokens,
1123
+ position: 0,
1124
+ length: tokens.length,
1125
+ stack: [root, contextElement],
1126
+ root,
1127
+ insertionMode: InsertionMode.InBody,
1128
+ errors: [],
1129
+ activeFormattingElements: [],
1130
+ };
1131
+
1132
+ while (state.position < state.length) {
1133
+ const token = getCurrentToken(state);
1134
+
1135
+ if (!token || token.type === TokenType.EOF) {
1136
+ break;
1137
+ }
1138
+
1139
+ parseToken(state, token);
1140
+ advance(state);
1141
+ }
1142
+
1143
+ return contextElement.childNodes;
1144
+ };
@@ -0,0 +1,118 @@
1
+ import { it, expect } from "bun:test";
2
+ import { parseHTML } from "../index.js";
3
+ import { serializeToHtml5lib } from "./helpers/tree-adapter.js";
4
+
5
+ it("should run AAA 2 times - test case with nested divs", () => {
6
+ const html = "<a>1<div>2<div>3</a>4</div>5</div>";
7
+ const doc = parseHTML(html);
8
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
9
+
10
+ const expected = `| <html>
11
+ | <head>
12
+ | <body>
13
+ | <a>
14
+ | "1"
15
+ | <div>
16
+ | <a>
17
+ | "2"
18
+ | <div>
19
+ | <a>
20
+ | "3"
21
+ | "4"
22
+ | "5"
23
+ `;
24
+
25
+ expect(serialized).toBe(expected);
26
+ });
27
+
28
+ it("should run AAA 8 times - deeply nested divs", () => {
29
+ const html =
30
+ "<div><a><b><div><div><div><div><div><div><div><div><div><div></a>";
31
+ const doc = parseHTML(html);
32
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
33
+
34
+ const expected = `| <html>
35
+ | <head>
36
+ | <body>
37
+ | <div>
38
+ | <a>
39
+ | <b>
40
+ | <b>
41
+ | <div>
42
+ | <a>
43
+ | <div>
44
+ | <a>
45
+ | <div>
46
+ | <a>
47
+ | <div>
48
+ | <a>
49
+ | <div>
50
+ | <a>
51
+ | <div>
52
+ | <a>
53
+ | <div>
54
+ | <a>
55
+ | <div>
56
+ | <a>
57
+ | <div>
58
+ | <div>
59
+ `;
60
+
61
+ expect(serialized).toBe(expected);
62
+ });
63
+
64
+ it("should run AAA 2 times - with style and address elements", () => {
65
+ const html = "<a><div><style></style><address><a>";
66
+ const doc = parseHTML(html);
67
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
68
+
69
+ const expected = `| <html>
70
+ | <head>
71
+ | <body>
72
+ | <a>
73
+ | <div>
74
+ | <a>
75
+ | <style>
76
+ | <address>
77
+ | <a>
78
+ | <a>
79
+ `;
80
+
81
+ expect(serialized).toBe(expected);
82
+ });
83
+
84
+ it("should run AAA with formatting element cloning", () => {
85
+ const html = "<a>x<div>y</a>z</div>";
86
+ const doc = parseHTML(html);
87
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
88
+
89
+ const expected = `| <html>
90
+ | <head>
91
+ | <body>
92
+ | <a>
93
+ | "x"
94
+ | <div>
95
+ | <a>
96
+ | "y"
97
+ | "z"
98
+ `;
99
+
100
+ expect(serialized).toBe(expected);
101
+ });
102
+
103
+ it("should stop AAA when no more formatting elements to adopt", () => {
104
+ const html = "<b>text</b><div>content</div>";
105
+ const doc = parseHTML(html);
106
+ const serialized = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
107
+
108
+ const expected = `| <html>
109
+ | <head>
110
+ | <body>
111
+ | <b>
112
+ | "text"
113
+ | <div>
114
+ | "content"
115
+ `;
116
+
117
+ expect(serialized).toBe(expected);
118
+ });
@@ -25,12 +25,12 @@ export function serializeToHtml5lib(
25
25
 
26
26
  let nsPrefix = "";
27
27
  if (ns === "http://www.w3.org/2000/svg") {
28
- nsPrefix = " svg";
28
+ nsPrefix = "svg ";
29
29
  } else if (ns === "http://www.w3.org/1998/Math/MathML") {
30
- nsPrefix = " math";
30
+ nsPrefix = "math ";
31
31
  }
32
32
 
33
- lines.push(`${indent}<${tagName}${nsPrefix}>`);
33
+ lines.push(`${indent}<${nsPrefix}${tagName}>`);
34
34
 
35
35
  // Atributos en orden alfabético
36
36
  const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
@@ -68,3 +68,52 @@ export function serializeToHtml5lib(
68
68
  serialize(doc, 0);
69
69
  return lines.join("\n") + "\n";
70
70
  }
71
+
72
+ export function serializeFragmentToHtml5lib(nodes: any[]): string {
73
+ const lines: string[] = [];
74
+
75
+ function serialize(node: any, depth: number): void {
76
+ const indent = "| " + " ".repeat(depth);
77
+
78
+ if (node.nodeType === 1) {
79
+ const tagName = node.tagName.toLowerCase();
80
+ const ns = node.namespaceURI;
81
+
82
+ let nsPrefix = "";
83
+ if (ns === "http://www.w3.org/2000/svg") {
84
+ nsPrefix = "svg ";
85
+ } else if (ns === "http://www.w3.org/1998/Math/MathML") {
86
+ nsPrefix = "math ";
87
+ }
88
+
89
+ lines.push(`${indent}<${nsPrefix}${tagName}>`);
90
+
91
+ const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
92
+ a.localeCompare(b),
93
+ );
94
+ for (const [name, value] of attrs) {
95
+ lines.push(`${indent} ${name}="${value}"`);
96
+ }
97
+
98
+ if (node.tagName.toLowerCase() === "template" && node.content) {
99
+ lines.push(`${indent} content`);
100
+ serialize(node.content, depth + 2);
101
+ }
102
+
103
+ for (const child of node.childNodes || []) {
104
+ serialize(child, depth + 1);
105
+ }
106
+ } else if (node.nodeType === 3) {
107
+ lines.push(`${indent}"${node.textContent}"`);
108
+ } else if (node.nodeType === 8) {
109
+ const commentData = node.data || node.nodeValue || node.textContent || "";
110
+ lines.push(`${indent}<!-- ${commentData} -->`);
111
+ }
112
+ }
113
+
114
+ for (const node of nodes) {
115
+ serialize(node, 0);
116
+ }
117
+
118
+ return lines.join("\n") + "\n";
119
+ }
@@ -0,0 +1,113 @@
1
+ import { describe, it, expect } from "bun:test";
2
+ import { parseHTML } from "../index.js";
3
+ import { serializeToHtml5lib } from "./helpers/tree-adapter.js";
4
+
5
+ describe("implicit close with formatting element reconstruction", () => {
6
+ it("should close <p> and reconstruct <b> elements when new <p> opens", () => {
7
+ const html = "<p><b><b><b><b><p>x";
8
+ const doc = parseHTML(html);
9
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
10
+
11
+ expect(result).toBe(`| <html>
12
+ | <head>
13
+ | <body>
14
+ | <p>
15
+ | <b>
16
+ | <b>
17
+ | <b>
18
+ | <b>
19
+ | <p>
20
+ | <b>
21
+ | <b>
22
+ | <b>
23
+ | "x"
24
+ `);
25
+ });
26
+
27
+ it("should close <p> through nested formatting and reconstruct (single <b>)", () => {
28
+ const html = "<p><b><p>x";
29
+ const doc = parseHTML(html);
30
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
31
+
32
+ expect(result).toBe(`| <html>
33
+ | <head>
34
+ | <body>
35
+ | <p>
36
+ | <b>
37
+ | <p>
38
+ | <b>
39
+ | "x"
40
+ `);
41
+ });
42
+
43
+ it("should handle text before and after implicit close", () => {
44
+ const html = "<p><b>1<p>2";
45
+ const doc = parseHTML(html);
46
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
47
+
48
+ expect(result).toBe(`| <html>
49
+ | <head>
50
+ | <body>
51
+ | <p>
52
+ | <b>
53
+ | "1"
54
+ | <p>
55
+ | <b>
56
+ | "2"
57
+ `);
58
+ });
59
+
60
+ it("should handle multiple different formatting elements", () => {
61
+ const html = "<p><b><i><p>x";
62
+ const doc = parseHTML(html);
63
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
64
+
65
+ expect(result).toBe(`| <html>
66
+ | <head>
67
+ | <body>
68
+ | <p>
69
+ | <b>
70
+ | <i>
71
+ | <p>
72
+ | <b>
73
+ | <i>
74
+ | "x"
75
+ `);
76
+ });
77
+
78
+ it("should handle div closing <p> and reconstructing formatting", () => {
79
+ const html = "<p><b><div>x";
80
+ const doc = parseHTML(html);
81
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
82
+
83
+ expect(result).toBe(`| <html>
84
+ | <head>
85
+ | <body>
86
+ | <p>
87
+ | <b>
88
+ | <div>
89
+ | <b>
90
+ | "x"
91
+ `);
92
+ });
93
+
94
+ it("should handle multiple auto-closing with formatting", () => {
95
+ const html = "<p><b><p><i><p>x";
96
+ const doc = parseHTML(html);
97
+ const result = serializeToHtml5lib(doc, { skipImplicitDoctype: true });
98
+
99
+ expect(result).toBe(`| <html>
100
+ | <head>
101
+ | <body>
102
+ | <p>
103
+ | <b>
104
+ | <p>
105
+ | <b>
106
+ | <i>
107
+ | <p>
108
+ | <b>
109
+ | <i>
110
+ | "x"
111
+ `);
112
+ });
113
+ });
@@ -1,6 +1,9 @@
1
1
  import { expect, it, describe } from "bun:test";
2
- import { parseHTML } from "../index";
3
- import { serializeToHtml5lib } from "./helpers/tree-adapter";
2
+ import { parseHTML, parseHTMLFragment } from "../index";
3
+ import {
4
+ serializeToHtml5lib,
5
+ serializeFragmentToHtml5lib,
6
+ } from "./helpers/tree-adapter";
4
7
  import { readFileSync } from "fs";
5
8
 
6
9
  describe("Tree Construction Adoption01 Tests", () => {
@@ -15,10 +18,18 @@ describe("Tree Construction Adoption01 Tests", () => {
15
18
  let data = "";
16
19
  let document = "";
17
20
  let inDocument = false;
18
- let inData = true; // Start with data since we split on #data\n
21
+ let inData = true;
22
+ let isFragmentTest = false;
23
+ let fragmentContext = "";
19
24
 
20
25
  for (const line of lines) {
21
- if (line.startsWith("#document")) {
26
+ if (line.startsWith("#document-fragment")) {
27
+ isFragmentTest = true;
28
+ inDocument = false;
29
+ inData = false;
30
+ } else if (isFragmentTest && !fragmentContext && !line.startsWith("#")) {
31
+ fragmentContext = line.trim();
32
+ } else if (line.startsWith("#document")) {
22
33
  inDocument = true;
23
34
  inData = false;
24
35
  } else if (line.startsWith("#errors")) {
@@ -31,16 +42,21 @@ describe("Tree Construction Adoption01 Tests", () => {
31
42
  }
32
43
  }
33
44
 
34
- const passingTests = [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 16];
35
- const testFn = passingTests.includes(index + 1) ? it : it.skip;
36
-
37
- testFn(`Adoption test ${index + 1}`, () => {
38
- const doc = parseHTML(data);
39
- const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
40
- const serialized = serializeToHtml5lib(doc, {
41
- skipImplicitDoctype: !hasExplicitDoctype,
45
+ if (isFragmentTest) {
46
+ it(`Adoption test ${index + 1} (fragment: ${fragmentContext})`, () => {
47
+ const nodes = parseHTMLFragment(data, fragmentContext);
48
+ const serialized = serializeFragmentToHtml5lib(nodes);
49
+ expect(serialized).toBe(document);
42
50
  });
43
- expect(serialized).toBe(document);
44
- });
51
+ } else {
52
+ it(`Adoption test ${index + 1}`, () => {
53
+ const doc = parseHTML(data);
54
+ const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
55
+ const serialized = serializeToHtml5lib(doc, {
56
+ skipImplicitDoctype: !hasExplicitDoctype,
57
+ });
58
+ expect(serialized).toBe(document);
59
+ });
60
+ }
45
61
  });
46
62
  });
@@ -9,7 +9,7 @@ describe("Tree Construction Adoption02 Tests", () => {
9
9
  "utf8",
10
10
  );
11
11
  const sections = content.split("#data\n").slice(1);
12
- const passingTests = [1];
12
+ const passingTests = [1, 2];
13
13
 
14
14
  sections.forEach((section, index) => {
15
15
  const lines = section.trim().split("\n");