@tkeron/html-parser 0.1.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/README.md +1 -7
  2. package/bun.lock +8 -3
  3. package/index.ts +4 -0
  4. package/package.json +13 -6
  5. package/src/css-selector.ts +45 -27
  6. package/src/dom-simulator.ts +162 -20
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +478 -183
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +59 -139
  12. package/tests/advanced.test.ts +119 -106
  13. package/tests/custom-elements.test.ts +172 -162
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +637 -0
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +43 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +172 -193
  45. package/tests/selectors.test.ts +64 -1
  46. package/tests/serializer-core.test.ts +16 -0
  47. package/tests/serializer-data/core.test +125 -0
  48. package/tests/serializer-data/injectmeta.test +66 -0
  49. package/tests/serializer-data/optionaltags.test +965 -0
  50. package/tests/serializer-data/options.test +60 -0
  51. package/tests/serializer-data/whitespace.test +51 -0
  52. package/tests/serializer-injectmeta.test.ts +16 -0
  53. package/tests/serializer-optionaltags.test.ts +16 -0
  54. package/tests/serializer-options.test.ts +16 -0
  55. package/tests/serializer-whitespace.test.ts +16 -0
  56. package/tests/tokenizer-namedEntities.test.ts +20 -0
  57. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  58. package/tests/tokenizer.test.ts +83 -0
  59. package/tests/tree-construction-adoption01.test.ts +37 -0
  60. package/tests/tree-construction-adoption02.test.ts +34 -0
  61. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  62. package/tests/tree-construction-entities02.test.ts +33 -0
  63. package/tests/tree-construction-html5test-com.test.ts +24 -0
  64. package/tests/tree-construction-math.test.ts +18 -0
  65. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  66. package/tests/tree-construction-noscript01.test.ts +18 -0
  67. package/tests/tree-construction-ruby.test.ts +21 -0
  68. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  69. package/tests/tree-construction-svg.test.ts +21 -0
  70. package/tests/tree-construction-template.test.ts +21 -0
  71. package/tests/tree-construction-tests10.test.ts +21 -0
  72. package/tests/tree-construction-tests11.test.ts +21 -0
  73. package/tests/tree-construction-tests20.test.ts +18 -0
  74. package/tests/tree-construction-tests21.test.ts +18 -0
  75. package/tests/tree-construction-tests23.test.ts +18 -0
  76. package/tests/tree-construction-tests24.test.ts +18 -0
  77. package/tests/tree-construction-tests5.test.ts +21 -0
  78. package/tests/tree-construction-tests6.test.ts +21 -0
  79. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  80. package/tests/void-elements.test.ts +471 -0
  81. package/tests/official/README.md +0 -87
  82. package/tests/official/acid/acid-tests.test.ts +0 -309
  83. package/tests/official/final-output/final-output.test.ts +0 -361
  84. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  85. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  86. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  87. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  88. package/tests/official/validator/validator-tests.test.ts +0 -237
  89. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  90. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  91. package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/README.md CHANGED
@@ -1,7 +1,5 @@
1
1
  # HTML Parser - Powered by Bun Native Tokenizer
2
2
 
3
- > ⚠️ **Work in Progress** - This package is currently under active development.
4
-
5
3
  A fast and lightweight HTML parser for Bun that converts HTML strings into DOM Document objects. **Now powered by a native Bun tokenizer** for optimal performance.
6
4
 
7
5
  ## Features
@@ -11,15 +9,11 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
11
9
  - 🪶 **Lightweight**: Minimal dependencies, native implementation
12
10
  - 🌐 **Standards Compliant**: Returns standard DOM Document objects
13
11
  - 🔧 **TypeScript Support**: Full TypeScript definitions included
14
- - ✅ **Well Tested**: Comprehensive unit test suite (181/181 passing)
12
+ - ✅ **Well Tested**: Comprehensive test suite (5200+ tests passing)
15
13
  - 🔄 **100% Compatible**: Drop-in replacement, same API
16
14
 
17
15
  ## Installation
18
16
 
19
- > **Note**: This package is not yet published to npm. For now, you can clone and build locally.
20
-
21
- Once published, it will be available as:
22
-
23
17
  ```bash
24
18
  npm install @tkeron/html-parser
25
19
  ```
package/bun.lock CHANGED
@@ -4,8 +4,11 @@
4
4
  "workspaces": {
5
5
  "": {
6
6
  "name": "@tkeron/html-parser",
7
+ "dependencies": {
8
+ "all-named-html-entities": "^3.1.3",
9
+ },
7
10
  "devDependencies": {
8
- "@types/bun": "^1.3.4",
11
+ "@types/bun": "^1.3.6",
9
12
  },
10
13
  "peerDependencies": {
11
14
  "typescript": "^5.9.3",
@@ -13,11 +16,13 @@
13
16
  },
14
17
  },
15
18
  "packages": {
16
- "@types/bun": ["@types/bun@1.3.4", "", { "dependencies": { "bun-types": "1.3.4" } }, "sha512-EEPTKXHP+zKGPkhRLv+HI0UEX8/o+65hqARxLy8Ov5rIxMBPNTjeZww00CIihrIQGEQBYg+0roO5qOnS/7boGA=="],
19
+ "@types/bun": ["@types/bun@1.3.6", "", { "dependencies": { "bun-types": "1.3.6" } }, "sha512-uWCv6FO/8LcpREhenN1d1b6fcspAB+cefwD7uti8C8VffIv0Um08TKMn98FynpTiU38+y2dUO55T11NgDt8VAA=="],
17
20
 
18
21
  "@types/node": ["@types/node@25.0.3", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA=="],
19
22
 
20
- "bun-types": ["bun-types@1.3.4", "", { "dependencies": { "@types/node": "*" } }, "sha512-5ua817+BZPZOlNaRgGBpZJOSAQ9RQ17pkwPD0yR7CfJg+r8DgIILByFifDTa+IPDDxzf5VNhtNlcKqFzDgJvlQ=="],
23
+ "all-named-html-entities": ["all-named-html-entities@3.1.3", "", {}, "sha512-eG7/XkhxyIUWApWvhVPcusxZ3PTebJo1AvkFkQj7MDSkBYmzXZsNadKZWuo1UxEX6QrE7y7JQx7G3Fx0YjVtnA=="],
24
+
25
+ "bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="],
21
26
 
22
27
  "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
23
28
 
package/index.ts CHANGED
@@ -7,6 +7,10 @@ import {
7
7
  export function parseHTML(html: string = ""): Document {
8
8
  const tokens = tokenize(html);
9
9
  const ast = parse(tokens);
10
+ // If parse already returns a DOM document, return it directly
11
+ if (ast && typeof ast.nodeType === 'number' && ast.nodeType === 9) {
12
+ return ast;
13
+ }
10
14
  return astToDOM(ast);
11
15
  }
12
16
 
package/package.json CHANGED
@@ -1,25 +1,32 @@
1
1
  {
2
2
  "name": "@tkeron/html-parser",
3
- "version": "0.1.5",
3
+ "version": "1.0.0",
4
4
  "description": "A fast and lightweight HTML parser for Bun",
5
5
  "main": "index.js",
6
6
  "module": "index.ts",
7
7
  "type": "module",
8
8
  "author": "tkeron",
9
9
  "license": "MIT",
10
+ "scripts": {
11
+ "test": "bun test --concurrent"
12
+ },
10
13
  "devDependencies": {
11
- "@types/bun": "^1.3.4"
14
+ "@types/bun": "^1.3.6"
12
15
  },
13
16
  "peerDependencies": {
14
17
  "typescript": "^5.9.3"
15
18
  },
16
19
  "keywords": [
17
- "cli",
18
- "commands",
19
- "command-line",
20
- "arguments"
20
+ "html",
21
+ "parser",
22
+ "dom",
23
+ "bun",
24
+ "tokenizer"
21
25
  ],
22
26
  "repository": {
23
27
  "url": "git@github.com:tkeron/html-parser.git"
28
+ },
29
+ "dependencies": {
30
+ "all-named-html-entities": "^3.1.3"
24
31
  }
25
32
  }
package/src/css-selector.ts CHANGED
@@ -14,33 +14,47 @@ function parseSelector(selector: string): SelectorGroup[] {
14
14
 
15
15
  return parts.map((part) => {
16
16
  const trimmed = part.trim();
17
- let tokens: SelectorToken[];
18
-
19
- if (trimmed.startsWith("#")) {
20
- tokens = [{ type: "id", value: trimmed.slice(1) }];
21
- } else if (trimmed.startsWith(".")) {
22
- tokens = [{ type: "class", value: trimmed.slice(1) }];
23
- } else if (trimmed.includes("[") && trimmed.includes("]")) {
24
- const attributeMatch = trimmed.match(/^([^[\]]*)\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]$/);
25
- if (attributeMatch) {
26
- const [, tagName, attrName, attrValue] = attributeMatch;
27
- tokens = [];
28
-
29
- if (tagName && tagName.trim()) {
30
- tokens.push({ type: "tag", value: tagName.trim().toLowerCase() });
31
- }
32
-
33
- tokens.push({
34
- type: "attribute",
35
- value: (attrName || "").trim(),
36
- attributeName: (attrName || "").trim(),
37
- attributeValue: attrValue ? attrValue.trim() : undefined
38
- });
39
- } else {
40
- tokens = [{ type: "tag", value: trimmed.toLowerCase() }];
41
- }
42
- } else {
43
- tokens = [{ type: "tag", value: trimmed.toLowerCase() }];
17
+ let tokens: SelectorToken[] = [];
18
+
19
+ // Handle universal selector
20
+ if (trimmed === '*') {
21
+ // Match any element - we'll handle this specially
22
+ return { tokens: [] };
23
+ }
24
+
25
+ // Parse complex selectors like p#intro.first or .foo.bar.baz
26
+ let remaining = trimmed;
27
+
28
+ // Extract tag name first if present
29
+ const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9-]*)/);
30
+ if (tagMatch) {
31
+ tokens.push({ type: "tag", value: tagMatch[1].toLowerCase() });
32
+ remaining = remaining.slice(tagMatch[1].length);
33
+ }
34
+
35
+ // Extract all IDs (HTML5 allows IDs starting with digits)
36
+ const idMatches = remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g);
37
+ for (const match of idMatches) {
38
+ tokens.push({ type: "id", value: match[1] });
39
+ }
40
+ remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, '');
41
+
42
+ // Extract all classes
43
+ const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
44
+ for (const match of classMatches) {
45
+ tokens.push({ type: "class", value: match[1] });
46
+ }
47
+ remaining = remaining.replace(/\.[a-zA-Z][a-zA-Z0-9_-]*/g, '');
48
+
49
+ // Extract attributes
50
+ const attrMatches = remaining.matchAll(/\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g);
51
+ for (const match of attrMatches) {
52
+ tokens.push({
53
+ type: "attribute",
54
+ value: match[1].trim(),
55
+ attributeName: match[1].trim(),
56
+ attributeValue: match[2] ? match[2].trim() : undefined
57
+ });
44
58
  }
45
59
 
46
60
  return { tokens };
@@ -74,6 +88,10 @@ function matchesToken(element: any, token: SelectorToken): boolean {
74
88
  }
75
89
 
76
90
  function matchesSelector(element: any, tokens: SelectorToken[]): boolean {
91
+ // Universal selector - matches any element
92
+ if (tokens.length === 0) {
93
+ return true;
94
+ }
77
95
  return tokens.every((token) => matchesToken(element, token));
78
96
  }
79
97
 
package/src/dom-simulator.ts CHANGED
@@ -6,6 +6,16 @@ import {
6
6
  querySelectorAll as querySelectorAllFunction,
7
7
  } from "./css-selector.js";
8
8
 
9
+ // Escape special HTML characters in text content
10
+ function escapeTextContent(text: string): string {
11
+ return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
12
+ }
13
+
14
+ const VOID_ELEMENTS = new Set([
15
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
16
+ 'link', 'meta', 'param', 'source', 'track', 'wbr'
17
+ ]);
18
+
9
19
  export const enum NodeType {
10
20
  ELEMENT_NODE = 1,
11
21
  TEXT_NODE = 3,
@@ -22,9 +32,13 @@ export function createElement(
22
32
  ): any {
23
33
  const innerHTML = "";
24
34
  const tagNameLower = tagName.toLowerCase();
25
- const initialOuterHTML = `<${tagNameLower}${Object.entries(attributes)
35
+ const isVoid = VOID_ELEMENTS.has(tagNameLower);
36
+ const attrsStr = Object.entries(attributes)
26
37
  .map(([k, v]) => ` ${k}="${v}"`)
27
- .join("")}></${tagNameLower}>`;
38
+ .join("");
39
+ const initialOuterHTML = isVoid
40
+ ? `<${tagNameLower}${attrsStr}>`
41
+ : `<${tagNameLower}${attrsStr}></${tagNameLower}>`;
28
42
  const textContent = "";
29
43
 
30
44
  const element: any = {
@@ -54,6 +68,18 @@ export function createElement(
54
68
  return child;
55
69
  },
56
70
 
71
+ prepend(...nodes: any[]): void {
72
+ prepend(element, ...nodes);
73
+ },
74
+
75
+ append(...nodes: any[]): void {
76
+ append(element, ...nodes);
77
+ },
78
+
79
+ remove(): void {
80
+ remove(element);
81
+ },
82
+
57
83
  removeChild(child: any): any {
58
84
  return removeChild(element, child);
59
85
  },
@@ -96,6 +122,10 @@ export function createElement(
96
122
  return querySelectorAllFunction(element, selector);
97
123
  },
98
124
 
125
+ matches(selector: string): boolean {
126
+ return matches(element, selector);
127
+ },
128
+
99
129
  cloneNode(deep: boolean = false): any {
100
130
  return cloneNode(element, deep);
101
131
  },
@@ -172,6 +202,10 @@ export function createTextNode(content: string): any {
172
202
  lastChild: null,
173
203
  nextSibling: null,
174
204
  previousSibling: null,
205
+
206
+ remove(): void {
207
+ remove(textNode);
208
+ },
175
209
  };
176
210
  return textNode;
177
211
  }
@@ -189,10 +223,33 @@ export function createComment(content: string): any {
189
223
  lastChild: null,
190
224
  nextSibling: null,
191
225
  previousSibling: null,
226
+
227
+ remove(): void {
228
+ remove(commentNode);
229
+ },
192
230
  };
193
231
  return commentNode;
194
232
  }
195
233
 
234
+ export function createDoctype(name: string = 'html'): any {
235
+ const doctypeNode: any = {
236
+ nodeType: NodeType.DOCUMENT_TYPE_NODE,
237
+ nodeName: name.toUpperCase(),
238
+ name: name.toLowerCase(),
239
+ nodeValue: null,
240
+ textContent: "",
241
+ publicId: null,
242
+ systemId: null,
243
+ childNodes: [],
244
+ parentNode: null,
245
+ firstChild: null,
246
+ lastChild: null,
247
+ nextSibling: null,
248
+ previousSibling: null,
249
+ };
250
+ return doctypeNode;
251
+ }
252
+
196
253
  export function createDocument(): any {
197
254
  const document: any = {
198
255
  nodeType: NodeType.DOCUMENT_NODE,
@@ -222,6 +279,14 @@ export function createDocument(): any {
222
279
  return child;
223
280
  },
224
281
 
282
+ prepend(...nodes: any[]): void {
283
+ prepend(document, ...nodes);
284
+ },
285
+
286
+ append(...nodes: any[]): void {
287
+ append(document, ...nodes);
288
+ },
289
+
225
290
  removeChild(child: any): any {
226
291
  return removeChild(document, child);
227
292
  },
@@ -334,7 +399,7 @@ function convertASTNodeToDOM(astNode: ASTNode): any {
334
399
  }
335
400
  }
336
401
 
337
- function appendChild(parent: any, child: any): void {
402
+ export function appendChild(parent: any, child: any): void {
338
403
  if (child.nodeType === NodeType.ELEMENT_NODE || child.nodeType === NodeType.DOCUMENT_NODE) {
339
404
  let ancestor = parent;
340
405
  while (ancestor) {
@@ -395,6 +460,83 @@ function appendChild(parent: any, child: any): void {
395
460
  }
396
461
  }
397
462
 
463
+ function prepend(parent: any, ...nodes: any[]): void {
464
+ if (nodes.length === 0) return;
465
+
466
+ for (let i = nodes.length - 1; i >= 0; i--) {
467
+ const node = nodes[i];
468
+ let childNode: any;
469
+
470
+ if (typeof node === 'string') {
471
+ childNode = createTextNode(node);
472
+ } else {
473
+ childNode = node;
474
+ }
475
+
476
+ if (parent.firstChild) {
477
+ insertBefore(parent, childNode, parent.firstChild);
478
+ } else {
479
+ appendChild(parent, childNode);
480
+ }
481
+ }
482
+ }
483
+
484
+ function append(parent: any, ...nodes: any[]): void {
485
+ if (nodes.length === 0) return;
486
+
487
+ for (const node of nodes) {
488
+ let childNode: any;
489
+
490
+ if (typeof node === 'string') {
491
+ childNode = createTextNode(node);
492
+ } else {
493
+ childNode = node;
494
+ }
495
+
496
+ appendChild(parent, childNode);
497
+ }
498
+ }
499
+
500
+ function remove(node: any): void {
501
+ if (node.parentNode) {
502
+ removeChild(node.parentNode, node);
503
+ }
504
+ }
505
+
506
+ function matches(element: any, selector: string): boolean {
507
+ if (!selector || element.nodeType !== NodeType.ELEMENT_NODE) {
508
+ return false;
509
+ }
510
+
511
+ try {
512
+ // Para selectores complejos con descendientes, necesitamos buscar desde un ancestro
513
+ if (selector.includes(' ') || selector.includes('>')) {
514
+ // Buscar desde la raíz del documento
515
+ let root = element;
516
+ while (root.parentNode) {
517
+ root = root.parentNode;
518
+ }
519
+ const results = querySelectorAllFunction(root, selector);
520
+ return results.includes(element);
521
+ }
522
+
523
+ // Para selectores simples, usar el padre o crear uno temporal
524
+ const parent = element.parentNode || createTempParent(element);
525
+ const results = querySelectorAllFunction(parent, selector);
526
+ return results.includes(element);
527
+ } catch (error) {
528
+ return false;
529
+ }
530
+ }
531
+
532
+ function createTempParent(element: any): any {
533
+ const temp = createElement('div');
534
+ temp.childNodes.push(element);
535
+ temp.children.push(element);
536
+ element._tempParent = temp;
537
+ return temp;
538
+ }
539
+
398
540
  function removeChild(parent: any, child: any): any {
399
541
  const index = parent.childNodes.indexOf(child);
400
542
  if (index === -1) {
@@ -733,9 +875,10 @@ function updateElementContent(element: any): void {
733
875
  .map(([k, v]) => ` ${k}="${v}"`)
734
876
  .join("");
735
877
  const tagNameLower = element.tagName.toLowerCase();
878
+ const isVoid = VOID_ELEMENTS.has(tagNameLower);
736
879
 
737
880
  Object.defineProperty(element, "_internalOuterHTML", {
738
- value: `<${tagNameLower}${attrs}>${innerHTML}</${tagNameLower}>`,
881
+ value: isVoid ? `<${tagNameLower}${attrs}>` : `<${tagNameLower}${attrs}>${innerHTML}</${tagNameLower}>`,
739
882
  writable: true,
740
883
  enumerable: false,
741
884
  configurable: true,
@@ -799,13 +942,13 @@ export function setInnerHTML(element: any, html: string): void {
799
942
 
800
943
  if (html.trim()) {
801
944
  const tokens = tokenize(html);
802
- const ast = parse(tokens);
803
- if (ast.children) {
804
- for (const child of ast.children) {
805
- const domChild = convertASTNodeToDOM(child);
806
- if (domChild) {
807
- appendChild(element, domChild);
808
- }
945
+ const doc = parse(tokens);
946
+ const body = doc.body;
947
+ if (body && body.childNodes) {
948
+ const nodesToMove = [...body.childNodes];
949
+ for (const child of nodesToMove) {
950
+ child.parentNode = null;
951
+ appendChild(element, child);
809
952
  }
810
953
  }
811
954
  }
@@ -830,9 +973,10 @@ export function setInnerHTML(element: any, html: string): void {
830
973
  .map(([k, v]) => ` ${k}="${v}"`)
831
974
  .join("");
832
975
  const tagNameLower = element.tagName.toLowerCase();
976
+ const isVoid = VOID_ELEMENTS.has(tagNameLower);
833
977
 
834
978
  Object.defineProperty(element, "_internalOuterHTML", {
835
- value: `<${tagNameLower}${attrs}>${actualInnerHTML}</${tagNameLower}>`,
979
+ value: isVoid ? `<${tagNameLower}${attrs}>` : `<${tagNameLower}${attrs}>${actualInnerHTML}</${tagNameLower}>`,
836
980
  writable: true,
837
981
  enumerable: false,
838
982
  configurable: true,
@@ -855,14 +999,12 @@ export function setOuterHTML(element: any, html: string): void {
855
999
 
856
1000
  if (html.trim()) {
857
1001
  const tokens = tokenize(html);
858
- const ast = parse(tokens);
859
-
860
- if (ast.children) {
861
- for (const child of ast.children) {
862
- const domChild = convertASTNodeToDOM(child);
863
- if (domChild) {
864
- newNodes.push(domChild);
865
- }
1002
+ const doc = parse(tokens);
1003
+ const body = doc.body;
1004
+ if (body && body.childNodes) {
1005
+ for (const child of body.childNodes) {
1006
+ child.parentNode = null;
1007
+ newNodes.push(child);
866
1008
  }
867
1009
  }
868
1010
  }
package/src/encoding.ts ADDED
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Detects the character encoding of an HTML document.
3
+ * Based on HTML5 specification for encoding detection.
4
+ */
5
+
6
+ const encodingAliases: Record<string, string> = {
7
+ 'iso-8859-1': 'windows-1252',
8
+ 'iso8859-1': 'windows-1252',
9
+ 'iso-8859-2': 'iso-8859-2',
10
+ 'iso8859-2': 'iso-8859-2',
11
+ 'utf-8': 'utf-8',
12
+ 'utf8': 'utf-8',
13
+ // Add more as needed
14
+ };
15
+
16
+ function normalizeEncoding(name: string): string | null {
17
+ const lower = name.toLowerCase().replace(/[^a-z0-9-]/g, '');
18
+ return encodingAliases[lower] || lower;
19
+ }
20
+
21
+ export function detectEncoding(html: string): string | null {
22
+ // Limit to first 1024 characters for performance
23
+ const prefix = html.substring(0, 1024);
24
+
25
+ // Look for <meta charset="...">
26
+ const charsetMatch = prefix.match(/<meta[^>]*charset\s*=\s*["']?([^"'\s>]+)["']?/i);
27
+ if (charsetMatch) {
28
+ return normalizeEncoding(charsetMatch[1]);
29
+ }
30
+
31
+ // Look for <meta http-equiv="Content-Type" content="text/html; charset=...">
32
+ const contentTypeMatch = prefix.match(/<meta[^>]*http-equiv\s*=\s*["']?\s*content-type\s*["']?[^>]*content\s*=\s*["']?\s*text\/html;\s*charset\s*=\s*([^"'\s>]+)["']?/i);
33
+ if (contentTypeMatch) {
34
+ return normalizeEncoding(contentTypeMatch[1]);
35
+ }
36
+
37
+ // Default to Windows-1252 if no encoding found (as per HTML5 spec)
38
+ return 'windows-1252';
39
+ }
package/src/index.ts ADDED
@@ -0,0 +1,9 @@
1
+ import { tokenize } from './tokenizer.js';
2
+ import { parse } from './parser.js';
3
+
4
+ export function parseHTML(html: string): any {
5
+ const tokens = tokenize(html);
6
+ return parse(tokens);
7
+ }
8
+
9
+ export { parse } from './parser';