@tkeron/html-parser 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/index.ts +0 -5
- package/package.json +1 -1
- package/src/css-selector.ts +0 -5
- package/src/dom-simulator.ts +122 -45
- package/src/tokenizer.ts +0 -20
- package/tests/advanced.test.ts +2 -2
- package/tests/cloneNode.test.ts +50 -50
- package/tests/custom-elements.test.ts +8 -8
- package/tests/official/acid/acid-tests.test.ts +6 -6
- package/tests/official/final-output/final-output.test.ts +15 -15
- package/tests/official/html5lib/tokenizer-utils.ts +19 -31
- package/tests/official/html5lib/tokenizer.test.ts +4 -4
- package/tests/official/html5lib/tree-construction-utils.ts +20 -34
- package/tests/official/html5lib/tree-construction.test.ts +5 -5
- package/tests/official/validator/validator-tests.test.ts +11 -11
- package/tests/official/wpt/wpt-tests.test.ts +5 -5
- package/tests/outerHTML-replacement.test.ts +208 -0
- package/tests/parser.test.ts +1 -1
- package/tests/test-page-0.txt +12 -355
- package/tests/api-integration.test.ts +0 -114
- package/tests/cloneNode-bug-reproduction.test.ts +0 -325
- package/tests/cloneNode-interactive.ts +0 -235
- package/tests/dom-adoption.test.ts +0 -363
- package/tests/dom-synchronization.test.ts +0 -675
- package/tests/setAttribute-outerHTML.test.ts +0 -102
package/README.md
CHANGED
|
@@ -21,19 +21,19 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
|
|
|
21
21
|
Once published, it will be available as:
|
|
22
22
|
|
|
23
23
|
```bash
|
|
24
|
-
npm install html-parser
|
|
24
|
+
npm install @tkeron/html-parser
|
|
25
25
|
```
|
|
26
26
|
|
|
27
27
|
Or with Bun:
|
|
28
28
|
|
|
29
29
|
```bash
|
|
30
|
-
bun add html-parser
|
|
30
|
+
bun add @tkeron/html-parser
|
|
31
31
|
```
|
|
32
32
|
|
|
33
33
|
## Usage
|
|
34
34
|
|
|
35
35
|
```typescript
|
|
36
|
-
import { parseHTML } from "html-parser";
|
|
36
|
+
import { parseHTML } from "@tkeron/html-parser";
|
|
37
37
|
|
|
38
38
|
// Parse HTML string into DOM Document
|
|
39
39
|
const html =
|
|
@@ -51,7 +51,7 @@ console.log(heading); // "Hello World"
|
|
|
51
51
|
### Simple Example
|
|
52
52
|
|
|
53
53
|
```typescript
|
|
54
|
-
import { parseHTML } from "html-parser";
|
|
54
|
+
import { parseHTML } from "@tkeron/html-parser";
|
|
55
55
|
|
|
56
56
|
const html = `
|
|
57
57
|
<div class="container">
|
|
@@ -117,4 +117,4 @@ MIT
|
|
|
117
117
|
|
|
118
118
|
## Support
|
|
119
119
|
|
|
120
|
-
If you encounter any issues or have questions, please file an issue on the [GitHub repository](https://github.com/
|
|
120
|
+
If you encounter any issues or have questions, please file an issue on the [GitHub repository](https://github.com/tkeron/html-parser).
|
package/index.ts
CHANGED
|
@@ -4,11 +4,6 @@ import {
|
|
|
4
4
|
astToDOM,
|
|
5
5
|
} from './src/dom-simulator.js';
|
|
6
6
|
|
|
7
|
-
/**
|
|
8
|
-
* Parse HTML string into Document object
|
|
9
|
-
* @param html The HTML string to parse
|
|
10
|
-
* @returns A Document object
|
|
11
|
-
*/
|
|
12
7
|
export function parseHTML(html: string = ""): Document {
|
|
13
8
|
const tokens = tokenize(html);
|
|
14
9
|
const ast = parse(tokens);
|
package/package.json
CHANGED
package/src/css-selector.ts
CHANGED
|
@@ -21,18 +21,15 @@ function parseSelector(selector: string): SelectorGroup[] {
|
|
|
21
21
|
} else if (trimmed.startsWith(".")) {
|
|
22
22
|
tokens = [{ type: "class", value: trimmed.slice(1) }];
|
|
23
23
|
} else if (trimmed.includes("[") && trimmed.includes("]")) {
|
|
24
|
-
// Handle attribute selectors like input[type="email"], meta[charset], or [role="button"]
|
|
25
24
|
const attributeMatch = trimmed.match(/^([^[\]]*)\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]$/);
|
|
26
25
|
if (attributeMatch) {
|
|
27
26
|
const [, tagName, attrName, attrValue] = attributeMatch;
|
|
28
27
|
tokens = [];
|
|
29
28
|
|
|
30
|
-
// Add tag token if there's a tag name
|
|
31
29
|
if (tagName && tagName.trim()) {
|
|
32
30
|
tokens.push({ type: "tag", value: tagName.trim().toLowerCase() });
|
|
33
31
|
}
|
|
34
32
|
|
|
35
|
-
// Add attribute token
|
|
36
33
|
tokens.push({
|
|
37
34
|
type: "attribute",
|
|
38
35
|
value: (attrName || "").trim(),
|
|
@@ -67,11 +64,9 @@ function matchesToken(element: any, token: SelectorToken): boolean {
|
|
|
67
64
|
return element.attributes?.id === token.value;
|
|
68
65
|
case "attribute":
|
|
69
66
|
const attrValue = element.attributes?.[token.attributeName || ""];
|
|
70
|
-
// If no attribute value specified in selector, just check if attribute exists
|
|
71
67
|
if (token.attributeValue === undefined) {
|
|
72
68
|
return attrValue !== undefined;
|
|
73
69
|
}
|
|
74
|
-
// Otherwise check for exact match
|
|
75
70
|
return attrValue === token.attributeValue;
|
|
76
71
|
default:
|
|
77
72
|
return false;
|
package/src/dom-simulator.ts
CHANGED
|
@@ -22,7 +22,7 @@ export function createElement(
|
|
|
22
22
|
): any {
|
|
23
23
|
const innerHTML = "";
|
|
24
24
|
const tagNameLower = tagName.toLowerCase();
|
|
25
|
-
const
|
|
25
|
+
const initialOuterHTML = `<${tagNameLower}${Object.entries(attributes)
|
|
26
26
|
.map(([k, v]) => ` ${k}="${v}"`)
|
|
27
27
|
.join("")}></${tagNameLower}>`;
|
|
28
28
|
const textContent = "";
|
|
@@ -37,7 +37,7 @@ export function createElement(
|
|
|
37
37
|
children: [],
|
|
38
38
|
textContent,
|
|
39
39
|
innerHTML,
|
|
40
|
-
|
|
40
|
+
_internalOuterHTML: initialOuterHTML,
|
|
41
41
|
parentNode: null,
|
|
42
42
|
parentElement: null,
|
|
43
43
|
firstChild: null,
|
|
@@ -123,7 +123,6 @@ export function createElement(
|
|
|
123
123
|
configurable: true,
|
|
124
124
|
});
|
|
125
125
|
|
|
126
|
-
// Add className property
|
|
127
126
|
Object.defineProperty(element, "className", {
|
|
128
127
|
get() {
|
|
129
128
|
return element.attributes.class || "";
|
|
@@ -135,7 +134,6 @@ export function createElement(
|
|
|
135
134
|
configurable: true,
|
|
136
135
|
});
|
|
137
136
|
|
|
138
|
-
// Add id property
|
|
139
137
|
Object.defineProperty(element, "id", {
|
|
140
138
|
get() {
|
|
141
139
|
return element.attributes.id || "";
|
|
@@ -147,6 +145,17 @@ export function createElement(
|
|
|
147
145
|
configurable: true,
|
|
148
146
|
});
|
|
149
147
|
|
|
148
|
+
Object.defineProperty(element, "outerHTML", {
|
|
149
|
+
get() {
|
|
150
|
+
return element._internalOuterHTML || "";
|
|
151
|
+
},
|
|
152
|
+
set(value: string) {
|
|
153
|
+
setOuterHTML(element, value);
|
|
154
|
+
},
|
|
155
|
+
enumerable: true,
|
|
156
|
+
configurable: true,
|
|
157
|
+
});
|
|
158
|
+
|
|
150
159
|
return element;
|
|
151
160
|
}
|
|
152
161
|
|
|
@@ -326,8 +335,6 @@ function convertASTNodeToDOM(astNode: ASTNode): any {
|
|
|
326
335
|
}
|
|
327
336
|
|
|
328
337
|
function appendChild(parent: any, child: any): void {
|
|
329
|
-
// Check for hierarchy request error: prevent circular references
|
|
330
|
-
// Check if parent is a descendant of child
|
|
331
338
|
if (child.nodeType === NodeType.ELEMENT_NODE || child.nodeType === NodeType.DOCUMENT_NODE) {
|
|
332
339
|
let ancestor = parent;
|
|
333
340
|
while (ancestor) {
|
|
@@ -338,7 +345,6 @@ function appendChild(parent: any, child: any): void {
|
|
|
338
345
|
}
|
|
339
346
|
}
|
|
340
347
|
|
|
341
|
-
// Remove child from its current parent if it has one
|
|
342
348
|
if (child.parentNode) {
|
|
343
349
|
removeChild(child.parentNode, child);
|
|
344
350
|
}
|
|
@@ -411,7 +417,6 @@ function removeChild(parent: any, child: any): any {
|
|
|
411
417
|
parent.lastChild = child.previousSibling;
|
|
412
418
|
}
|
|
413
419
|
|
|
414
|
-
// Only handle element-specific relationships if parent is an element
|
|
415
420
|
if (parent.nodeType === NodeType.ELEMENT_NODE && child.nodeType === NodeType.ELEMENT_NODE) {
|
|
416
421
|
const childElement = child;
|
|
417
422
|
const elemIndex = parent.children.indexOf(childElement);
|
|
@@ -454,19 +459,16 @@ function removeChild(parent: any, child: any): any {
|
|
|
454
459
|
}
|
|
455
460
|
|
|
456
461
|
function insertBefore(parent: any, newNode: any, referenceNode: any): any {
|
|
457
|
-
// If referenceNode is null, append to the end
|
|
458
462
|
if (referenceNode === null) {
|
|
459
463
|
appendChild(parent, newNode);
|
|
460
464
|
return newNode;
|
|
461
465
|
}
|
|
462
466
|
|
|
463
|
-
// Verify referenceNode is actually a child of parent
|
|
464
467
|
const refIndex = parent.childNodes.indexOf(referenceNode);
|
|
465
468
|
if (refIndex === -1) {
|
|
466
469
|
throw new Error("Reference node is not a child of this node");
|
|
467
470
|
}
|
|
468
471
|
|
|
469
|
-
// Check for hierarchy request error: prevent circular references
|
|
470
472
|
if (newNode.nodeType === NodeType.ELEMENT_NODE || newNode.nodeType === NodeType.DOCUMENT_NODE) {
|
|
471
473
|
let ancestor = parent;
|
|
472
474
|
while (ancestor) {
|
|
@@ -477,16 +479,13 @@ function insertBefore(parent: any, newNode: any, referenceNode: any): any {
|
|
|
477
479
|
}
|
|
478
480
|
}
|
|
479
481
|
|
|
480
|
-
// Remove newNode from its current parent if it has one
|
|
481
482
|
if (newNode.parentNode) {
|
|
482
483
|
removeChild(newNode.parentNode, newNode);
|
|
483
484
|
}
|
|
484
485
|
|
|
485
|
-
// Insert into childNodes
|
|
486
486
|
parent.childNodes.splice(refIndex, 0, newNode);
|
|
487
487
|
newNode.parentNode = parent;
|
|
488
488
|
|
|
489
|
-
// Update sibling relationships for all nodes
|
|
490
489
|
newNode.previousSibling = referenceNode.previousSibling;
|
|
491
490
|
newNode.nextSibling = referenceNode;
|
|
492
491
|
|
|
@@ -495,12 +494,10 @@ function insertBefore(parent: any, newNode: any, referenceNode: any): any {
|
|
|
495
494
|
}
|
|
496
495
|
referenceNode.previousSibling = newNode;
|
|
497
496
|
|
|
498
|
-
// Update firstChild if inserting at the beginning
|
|
499
497
|
if (parent.firstChild === referenceNode) {
|
|
500
498
|
parent.firstChild = newNode;
|
|
501
499
|
}
|
|
502
500
|
|
|
503
|
-
// Handle element-specific relationships
|
|
504
501
|
if (
|
|
505
502
|
parent.nodeType === NodeType.ELEMENT_NODE &&
|
|
506
503
|
newNode.nodeType === NodeType.ELEMENT_NODE
|
|
@@ -510,12 +507,10 @@ function insertBefore(parent: any, newNode: any, referenceNode: any): any {
|
|
|
510
507
|
|
|
511
508
|
newElement.parentElement = parentElement;
|
|
512
509
|
|
|
513
|
-
// Find the reference node in the children array
|
|
514
510
|
let refElementIndex = -1;
|
|
515
511
|
if (referenceNode.nodeType === NodeType.ELEMENT_NODE) {
|
|
516
512
|
refElementIndex = parentElement.children.indexOf(referenceNode);
|
|
517
513
|
} else {
|
|
518
|
-
// Find the next element sibling
|
|
519
514
|
let nextElement = referenceNode.nextSibling;
|
|
520
515
|
while (nextElement && nextElement.nodeType !== NodeType.ELEMENT_NODE) {
|
|
521
516
|
nextElement = nextElement.nextSibling;
|
|
@@ -526,14 +521,11 @@ function insertBefore(parent: any, newNode: any, referenceNode: any): any {
|
|
|
526
521
|
}
|
|
527
522
|
|
|
528
523
|
if (refElementIndex === -1) {
|
|
529
|
-
// No element siblings after, append to children
|
|
530
524
|
parentElement.children.push(newElement);
|
|
531
525
|
} else {
|
|
532
|
-
// Insert before the reference element
|
|
533
526
|
parentElement.children.splice(refElementIndex, 0, newElement);
|
|
534
527
|
}
|
|
535
528
|
|
|
536
|
-
// Update element sibling relationships
|
|
537
529
|
const newElemIndex = parentElement.children.indexOf(newElement);
|
|
538
530
|
newElement.previousElementSibling =
|
|
539
531
|
newElemIndex > 0 ? parentElement.children[newElemIndex - 1] : null;
|
|
@@ -549,12 +541,9 @@ function insertBefore(parent: any, newNode: any, referenceNode: any): any {
|
|
|
549
541
|
newElement.nextElementSibling.previousElementSibling = newElement;
|
|
550
542
|
}
|
|
551
543
|
|
|
552
|
-
// Update firstElementChild if needed
|
|
553
544
|
if (newElemIndex === 0) {
|
|
554
545
|
parentElement.firstElementChild = newElement;
|
|
555
546
|
}
|
|
556
|
-
|
|
557
|
-
// lastElementChild is not affected since we're inserting before
|
|
558
547
|
}
|
|
559
548
|
|
|
560
549
|
if (parent.nodeType === NodeType.ELEMENT_NODE) {
|
|
@@ -565,13 +554,11 @@ function insertBefore(parent: any, newNode: any, referenceNode: any): any {
|
|
|
565
554
|
}
|
|
566
555
|
|
|
567
556
|
function replaceChild(parent: any, newChild: any, oldChild: any): any {
|
|
568
|
-
// Verify oldChild is actually a child of parent
|
|
569
557
|
const oldIndex = parent.childNodes.indexOf(oldChild);
|
|
570
558
|
if (oldIndex === -1) {
|
|
571
559
|
throw new Error("Old child is not a child of this node");
|
|
572
560
|
}
|
|
573
561
|
|
|
574
|
-
// Check for hierarchy request error: prevent circular references
|
|
575
562
|
if (newChild.nodeType === NodeType.ELEMENT_NODE || newChild.nodeType === NodeType.DOCUMENT_NODE) {
|
|
576
563
|
let ancestor = parent;
|
|
577
564
|
while (ancestor) {
|
|
@@ -582,16 +569,13 @@ function replaceChild(parent: any, newChild: any, oldChild: any): any {
|
|
|
582
569
|
}
|
|
583
570
|
}
|
|
584
571
|
|
|
585
|
-
// Remove newChild from its current parent if it has one
|
|
586
572
|
if (newChild.parentNode) {
|
|
587
573
|
removeChild(newChild.parentNode, newChild);
|
|
588
574
|
}
|
|
589
575
|
|
|
590
|
-
// Replace in childNodes array
|
|
591
576
|
parent.childNodes[oldIndex] = newChild;
|
|
592
577
|
newChild.parentNode = parent;
|
|
593
578
|
|
|
594
|
-
// Transfer sibling relationships
|
|
595
579
|
newChild.previousSibling = oldChild.previousSibling;
|
|
596
580
|
newChild.nextSibling = oldChild.nextSibling;
|
|
597
581
|
|
|
@@ -602,7 +586,6 @@ function replaceChild(parent: any, newChild: any, oldChild: any): any {
|
|
|
602
586
|
oldChild.nextSibling.previousSibling = newChild;
|
|
603
587
|
}
|
|
604
588
|
|
|
605
|
-
// Update first/last child if needed
|
|
606
589
|
if (parent.firstChild === oldChild) {
|
|
607
590
|
parent.firstChild = newChild;
|
|
608
591
|
}
|
|
@@ -610,20 +593,16 @@ function replaceChild(parent: any, newChild: any, oldChild: any): any {
|
|
|
610
593
|
parent.lastChild = newChild;
|
|
611
594
|
}
|
|
612
595
|
|
|
613
|
-
// Handle element-specific relationships
|
|
614
596
|
if (parent.nodeType === NodeType.ELEMENT_NODE) {
|
|
615
597
|
const parentElement = parent;
|
|
616
598
|
|
|
617
|
-
// Remove old element from children if it's an element
|
|
618
599
|
if (oldChild.nodeType === NodeType.ELEMENT_NODE) {
|
|
619
600
|
const oldElemIndex = parentElement.children.indexOf(oldChild);
|
|
620
601
|
if (oldElemIndex !== -1) {
|
|
621
602
|
if (newChild.nodeType === NodeType.ELEMENT_NODE) {
|
|
622
|
-
// Replace with new element
|
|
623
603
|
parentElement.children[oldElemIndex] = newChild;
|
|
624
604
|
newChild.parentElement = parentElement;
|
|
625
605
|
|
|
626
|
-
// Transfer element sibling relationships
|
|
627
606
|
newChild.previousElementSibling = oldChild.previousElementSibling;
|
|
628
607
|
newChild.nextElementSibling = oldChild.nextElementSibling;
|
|
629
608
|
|
|
@@ -641,7 +620,6 @@ function replaceChild(parent: any, newChild: any, oldChild: any): any {
|
|
|
641
620
|
parentElement.lastElementChild = newChild;
|
|
642
621
|
}
|
|
643
622
|
} else {
|
|
644
|
-
// Replacing element with non-element, remove from children
|
|
645
623
|
parentElement.children.splice(oldElemIndex, 1);
|
|
646
624
|
|
|
647
625
|
if (oldChild.previousElementSibling) {
|
|
@@ -662,11 +640,9 @@ function replaceChild(parent: any, newChild: any, oldChild: any): any {
|
|
|
662
640
|
}
|
|
663
641
|
}
|
|
664
642
|
} else if (newChild.nodeType === NodeType.ELEMENT_NODE) {
|
|
665
|
-
// Replacing non-element with element, need to insert into children array
|
|
666
643
|
const newElement = newChild;
|
|
667
644
|
newElement.parentElement = parentElement;
|
|
668
645
|
|
|
669
|
-
// Find correct position in children array
|
|
670
646
|
let insertIndex = 0;
|
|
671
647
|
for (let i = 0; i < oldIndex; i++) {
|
|
672
648
|
if (parent.childNodes[i].nodeType === NodeType.ELEMENT_NODE) {
|
|
@@ -676,7 +652,6 @@ function replaceChild(parent: any, newChild: any, oldChild: any): any {
|
|
|
676
652
|
|
|
677
653
|
parentElement.children.splice(insertIndex, 0, newElement);
|
|
678
654
|
|
|
679
|
-
// Update element sibling relationships
|
|
680
655
|
newElement.previousElementSibling =
|
|
681
656
|
insertIndex > 0 ? parentElement.children[insertIndex - 1] : null;
|
|
682
657
|
newElement.nextElementSibling =
|
|
@@ -700,7 +675,6 @@ function replaceChild(parent: any, newChild: any, oldChild: any): any {
|
|
|
700
675
|
}
|
|
701
676
|
}
|
|
702
677
|
|
|
703
|
-
// Clear oldChild's relationships
|
|
704
678
|
oldChild.parentNode = null;
|
|
705
679
|
if (oldChild.nodeType === NodeType.ELEMENT_NODE) {
|
|
706
680
|
oldChild.parentElement = null;
|
|
@@ -720,19 +694,16 @@ function replaceChild(parent: any, newChild: any, oldChild: any): any {
|
|
|
720
694
|
}
|
|
721
695
|
|
|
722
696
|
function insertAfter(parent: any, newNode: any, referenceNode: any): any {
|
|
723
|
-
// If referenceNode is null, insert at the beginning
|
|
724
697
|
if (referenceNode === null) {
|
|
725
698
|
insertBefore(parent, newNode, parent.firstChild);
|
|
726
699
|
return newNode;
|
|
727
700
|
}
|
|
728
701
|
|
|
729
|
-
// Verify referenceNode is actually a child of parent
|
|
730
702
|
const refIndex = parent.childNodes.indexOf(referenceNode);
|
|
731
703
|
if (refIndex === -1) {
|
|
732
704
|
throw new Error("Reference node is not a child of this node");
|
|
733
705
|
}
|
|
734
706
|
|
|
735
|
-
// Insert after means insert before the next sibling
|
|
736
707
|
const nextSibling = referenceNode.nextSibling;
|
|
737
708
|
return insertBefore(parent, newNode, nextSibling);
|
|
738
709
|
}
|
|
@@ -762,7 +733,13 @@ function updateElementContent(element: any): void {
|
|
|
762
733
|
.map(([k, v]) => ` ${k}="${v}"`)
|
|
763
734
|
.join("");
|
|
764
735
|
const tagNameLower = element.tagName.toLowerCase();
|
|
765
|
-
|
|
736
|
+
|
|
737
|
+
Object.defineProperty(element, "_internalOuterHTML", {
|
|
738
|
+
value: `<${tagNameLower}${attrs}>${innerHTML}</${tagNameLower}>`,
|
|
739
|
+
writable: true,
|
|
740
|
+
enumerable: false,
|
|
741
|
+
configurable: true,
|
|
742
|
+
});
|
|
766
743
|
|
|
767
744
|
const computedTextContent = getTextContent(element);
|
|
768
745
|
Object.defineProperty(element, "_internalTextContent", {
|
|
@@ -772,7 +749,6 @@ function updateElementContent(element: any): void {
|
|
|
772
749
|
configurable: true,
|
|
773
750
|
});
|
|
774
751
|
|
|
775
|
-
// Propagate changes up to parent elements
|
|
776
752
|
if (element.parentElement) {
|
|
777
753
|
updateElementContent(element.parentElement);
|
|
778
754
|
}
|
|
@@ -854,7 +830,108 @@ export function setInnerHTML(element: any, html: string): void {
|
|
|
854
830
|
.map(([k, v]) => ` ${k}="${v}"`)
|
|
855
831
|
.join("");
|
|
856
832
|
const tagNameLower = element.tagName.toLowerCase();
|
|
857
|
-
|
|
833
|
+
|
|
834
|
+
Object.defineProperty(element, "_internalOuterHTML", {
|
|
835
|
+
value: `<${tagNameLower}${attrs}>${actualInnerHTML}</${tagNameLower}>`,
|
|
836
|
+
writable: true,
|
|
837
|
+
enumerable: false,
|
|
838
|
+
configurable: true,
|
|
839
|
+
});
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
export function setOuterHTML(element: any, html: string): void {
|
|
843
|
+
if (!element.parentNode) {
|
|
844
|
+
throw new Error("Cannot set outerHTML on element without a parent");
|
|
845
|
+
}
|
|
846
|
+
|
|
847
|
+
const parent = element.parentNode;
|
|
848
|
+
const indexInParent = parent.childNodes.indexOf(element);
|
|
849
|
+
|
|
850
|
+
if (indexInParent === -1) {
|
|
851
|
+
throw new Error("Element not found in parent's childNodes");
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
let newNodes: any[] = [];
|
|
855
|
+
|
|
856
|
+
if (html.trim()) {
|
|
857
|
+
const tokens = tokenize(html);
|
|
858
|
+
const ast = parse(tokens);
|
|
859
|
+
|
|
860
|
+
if (ast.children) {
|
|
861
|
+
for (const child of ast.children) {
|
|
862
|
+
const domChild = convertASTNodeToDOM(child);
|
|
863
|
+
if (domChild) {
|
|
864
|
+
newNodes.push(domChild);
|
|
865
|
+
}
|
|
866
|
+
}
|
|
867
|
+
}
|
|
868
|
+
}
|
|
869
|
+
|
|
870
|
+
const previousSibling = element.previousSibling;
|
|
871
|
+
const nextSibling = element.nextSibling;
|
|
872
|
+
|
|
873
|
+
parent.childNodes.splice(indexInParent, 1);
|
|
874
|
+
|
|
875
|
+
if (newNodes.length > 0) {
|
|
876
|
+
parent.childNodes.splice(indexInParent, 0, ...newNodes);
|
|
877
|
+
|
|
878
|
+
for (const newNode of newNodes) {
|
|
879
|
+
newNode.parentNode = parent;
|
|
880
|
+
newNode.parentElement = parent.nodeType === NodeType.ELEMENT_NODE ? parent : null;
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
for (let i = 0; i < newNodes.length; i++) {
|
|
884
|
+
const currentNode = newNodes[i];
|
|
885
|
+
|
|
886
|
+
if (i === 0) {
|
|
887
|
+
currentNode.previousSibling = previousSibling;
|
|
888
|
+
if (previousSibling) {
|
|
889
|
+
previousSibling.nextSibling = currentNode;
|
|
890
|
+
}
|
|
891
|
+
} else {
|
|
892
|
+
currentNode.previousSibling = newNodes[i - 1];
|
|
893
|
+
}
|
|
894
|
+
|
|
895
|
+
if (i === newNodes.length - 1) {
|
|
896
|
+
currentNode.nextSibling = nextSibling;
|
|
897
|
+
if (nextSibling) {
|
|
898
|
+
nextSibling.previousSibling = currentNode;
|
|
899
|
+
}
|
|
900
|
+
} else {
|
|
901
|
+
currentNode.nextSibling = newNodes[i + 1];
|
|
902
|
+
}
|
|
903
|
+
}
|
|
904
|
+
} else {
|
|
905
|
+
if (previousSibling) {
|
|
906
|
+
previousSibling.nextSibling = nextSibling;
|
|
907
|
+
}
|
|
908
|
+
if (nextSibling) {
|
|
909
|
+
nextSibling.previousSibling = previousSibling;
|
|
910
|
+
}
|
|
911
|
+
}
|
|
912
|
+
|
|
913
|
+
element.parentNode = null;
|
|
914
|
+
element.parentElement = null;
|
|
915
|
+
element.previousSibling = null;
|
|
916
|
+
element.nextSibling = null;
|
|
917
|
+
|
|
918
|
+
parent.children = parent.childNodes.filter(
|
|
919
|
+
(child: any) => child.nodeType === NodeType.ELEMENT_NODE
|
|
920
|
+
);
|
|
921
|
+
|
|
922
|
+
parent.firstChild = parent.childNodes.length > 0 ? parent.childNodes[0] : null;
|
|
923
|
+
parent.lastChild = parent.childNodes.length > 0 ? parent.childNodes[parent.childNodes.length - 1] : null;
|
|
924
|
+
|
|
925
|
+
parent.firstElementChild = parent.children.length > 0 ? parent.children[0] : null;
|
|
926
|
+
parent.lastElementChild = parent.children.length > 0 ? parent.children[parent.children.length - 1] : null;
|
|
927
|
+
|
|
928
|
+
for (let i = 0; i < parent.children.length; i++) {
|
|
929
|
+
const child = parent.children[i];
|
|
930
|
+
child.previousElementSibling = i > 0 ? parent.children[i - 1] : null;
|
|
931
|
+
child.nextElementSibling = i < parent.children.length - 1 ? parent.children[i + 1] : null;
|
|
932
|
+
}
|
|
933
|
+
|
|
934
|
+
updateElementContent(parent);
|
|
858
935
|
}
|
|
859
936
|
|
|
860
937
|
function setTextContent(element: any, text: string): void {
|
package/src/tokenizer.ts
CHANGED
|
@@ -44,9 +44,6 @@ const HTML_ENTITIES: Record<string, string> = {
|
|
|
44
44
|
'&not;': '¬'
|
|
45
45
|
};
|
|
46
46
|
|
|
47
|
-
/**
|
|
48
|
-
* Decode HTML entities in a string and handle null characters
|
|
49
|
-
*/
|
|
50
47
|
function decodeEntities(text: string): string {
|
|
51
48
|
let result = text.replace(/\u0000/g, '\uFFFD');
|
|
52
49
|
|
|
@@ -78,9 +75,6 @@ function decodeEntities(text: string): string {
|
|
|
78
75
|
});
|
|
79
76
|
}
|
|
80
77
|
|
|
81
|
-
/**
|
|
82
|
-
* Parse attributes from a tag string
|
|
83
|
-
*/
|
|
84
78
|
function parseAttributes(attributeString: string): Record<string, string> {
|
|
85
79
|
const attributes: Record<string, string> = {};
|
|
86
80
|
|
|
@@ -98,9 +92,6 @@ function parseAttributes(attributeString: string): Record<string, string> {
|
|
|
98
92
|
return attributes;
|
|
99
93
|
}
|
|
100
94
|
|
|
101
|
-
/**
|
|
102
|
-
* Calculate position in text
|
|
103
|
-
*/
|
|
104
95
|
function calculatePosition(text: string, offset: number): Position {
|
|
105
96
|
const lines = text.slice(0, offset).split('\n');
|
|
106
97
|
return {
|
|
@@ -110,10 +101,6 @@ function calculatePosition(text: string, offset: number): Position {
|
|
|
110
101
|
};
|
|
111
102
|
}
|
|
112
103
|
|
|
113
|
-
/**
|
|
114
|
-
* Tokenize HTML using a combination of HTMLRewriter and manual parsing
|
|
115
|
-
* HTMLRewriter is great for structured HTML but we need manual parsing for edge cases
|
|
116
|
-
*/
|
|
117
104
|
export function tokenize(html: string): Token[] {
|
|
118
105
|
const tokens: Token[] = [];
|
|
119
106
|
let position = 0;
|
|
@@ -254,10 +241,8 @@ export function tokenize(html: string): Token[] {
|
|
|
254
241
|
}
|
|
255
242
|
}
|
|
256
243
|
|
|
257
|
-
// Sort tokens by position
|
|
258
244
|
tokens.sort((a, b) => a.position.offset - b.position.offset);
|
|
259
245
|
|
|
260
|
-
// Add EOF token
|
|
261
246
|
tokens.push({
|
|
262
247
|
type: TokenType.EOF,
|
|
263
248
|
value: '',
|
|
@@ -285,7 +270,6 @@ export function tokenizeWithRewriter(html: string): Token[] {
|
|
|
285
270
|
textBuffer = '';
|
|
286
271
|
}
|
|
287
272
|
|
|
288
|
-
// Add opening tag
|
|
289
273
|
const attributes: Record<string, string> = {};
|
|
290
274
|
for (const [name, value] of element.attributes) {
|
|
291
275
|
attributes[name] = value;
|
|
@@ -325,14 +309,12 @@ export function tokenizeWithRewriter(html: string): Token[] {
|
|
|
325
309
|
});
|
|
326
310
|
|
|
327
311
|
try {
|
|
328
|
-
// Transform the HTML (this triggers the rewriter)
|
|
329
312
|
const response = new Response(html, {
|
|
330
313
|
headers: { 'Content-Type': 'text/html' }
|
|
331
314
|
});
|
|
332
315
|
|
|
333
316
|
rewriter.transform(response);
|
|
334
317
|
|
|
335
|
-
// Flush any remaining text
|
|
336
318
|
if (textBuffer.trim()) {
|
|
337
319
|
tokens.push({
|
|
338
320
|
type: TokenType.TEXT,
|
|
@@ -342,12 +324,10 @@ export function tokenizeWithRewriter(html: string): Token[] {
|
|
|
342
324
|
}
|
|
343
325
|
|
|
344
326
|
} catch (error) {
|
|
345
|
-
// If HTMLRewriter fails, fall back to manual parsing
|
|
346
327
|
console.warn('HTMLRewriter failed, falling back to manual parsing:', error);
|
|
347
328
|
return tokenize(html);
|
|
348
329
|
}
|
|
349
330
|
|
|
350
|
-
// Sort tokens by position and add EOF
|
|
351
331
|
tokens.sort((a, b) => a.position.offset - b.position.offset);
|
|
352
332
|
tokens.push({
|
|
353
333
|
type: TokenType.EOF,
|
package/tests/advanced.test.ts
CHANGED
|
@@ -28,14 +28,14 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
28
28
|
});
|
|
29
29
|
|
|
30
30
|
test('should handle unicode characters', () => {
|
|
31
|
-
const tokens = tokenize('<div title="测试" data-emoji="🚀" class="
|
|
31
|
+
const tokens = tokenize('<div title="测试" data-emoji="🚀" class="lorem">');
|
|
32
32
|
expect(tokens.length).toBeGreaterThan(0);
|
|
33
33
|
const tag = tokens[0]!;
|
|
34
34
|
|
|
35
35
|
expect(tag.attributes).toEqual({
|
|
36
36
|
title: '测试',
|
|
37
37
|
'data-emoji': '🚀',
|
|
38
|
-
class: '
|
|
38
|
+
class: 'lorem'
|
|
39
39
|
});
|
|
40
40
|
});
|
|
41
41
|
|