@tkeron/html-parser 0.1.7 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/README.md +1 -7
  2. package/bun.lock +5 -0
  3. package/index.ts +4 -0
  4. package/package.json +7 -1
  5. package/src/css-selector.ts +1 -1
  6. package/src/dom-simulator.ts +41 -17
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +509 -143
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +190 -118
  12. package/tests/advanced.test.ts +121 -108
  13. package/tests/custom-elements-head.test.ts +105 -0
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +9 -10
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +60 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +173 -193
  45. package/tests/serializer-core.test.ts +16 -0
  46. package/tests/serializer-data/core.test +125 -0
  47. package/tests/serializer-data/injectmeta.test +66 -0
  48. package/tests/serializer-data/optionaltags.test +965 -0
  49. package/tests/serializer-data/options.test +60 -0
  50. package/tests/serializer-data/whitespace.test +51 -0
  51. package/tests/serializer-injectmeta.test.ts +16 -0
  52. package/tests/serializer-optionaltags.test.ts +16 -0
  53. package/tests/serializer-options.test.ts +16 -0
  54. package/tests/serializer-whitespace.test.ts +16 -0
  55. package/tests/tokenizer-namedEntities.test.ts +20 -0
  56. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  57. package/tests/tokenizer.test.ts +25 -32
  58. package/tests/tree-construction-adoption01.test.ts +37 -0
  59. package/tests/tree-construction-adoption02.test.ts +34 -0
  60. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  61. package/tests/tree-construction-entities02.test.ts +33 -0
  62. package/tests/tree-construction-html5test-com.test.ts +32 -0
  63. package/tests/tree-construction-math.test.ts +18 -0
  64. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  65. package/tests/tree-construction-noscript01.test.ts +18 -0
  66. package/tests/tree-construction-ruby.test.ts +21 -0
  67. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  68. package/tests/tree-construction-svg.test.ts +21 -0
  69. package/tests/tree-construction-template.test.ts +21 -0
  70. package/tests/tree-construction-tests10.test.ts +21 -0
  71. package/tests/tree-construction-tests11.test.ts +21 -0
  72. package/tests/tree-construction-tests20.test.ts +18 -0
  73. package/tests/tree-construction-tests21.test.ts +18 -0
  74. package/tests/tree-construction-tests23.test.ts +18 -0
  75. package/tests/tree-construction-tests24.test.ts +18 -0
  76. package/tests/tree-construction-tests5.test.ts +21 -0
  77. package/tests/tree-construction-tests6.test.ts +21 -0
  78. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  79. package/tests/custom-elements.test.ts +0 -745
  80. package/tests/official/README.md +0 -87
  81. package/tests/official/acid/acid-tests.test.ts +0 -309
  82. package/tests/official/final-output/final-output.test.ts +0 -361
  83. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  84. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  85. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  86. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  87. package/tests/official/validator/validator-tests.test.ts +0 -237
  88. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  89. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  90. package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/README.md CHANGED
@@ -1,7 +1,5 @@
  # HTML Parser - Powered by Bun Native Tokenizer
 
- > ⚠️ **Work in Progress** - This package is currently under active development.
-
  A fast and lightweight HTML parser for Bun that converts HTML strings into DOM Document objects. **Now powered by a native Bun tokenizer** for optimal performance.
 
  ## Features
@@ -11,15 +9,11 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
  - 🪶 **Lightweight**: Minimal dependencies, native implementation
  - 🌐 **Standards Compliant**: Returns standard DOM Document objects
  - 🔧 **TypeScript Support**: Full TypeScript definitions included
- - ✅ **Well Tested**: Comprehensive unit test suite (569 tests passing)
+ - ✅ **Well Tested**: Comprehensive test suite (5200+ tests passing)
  - 🔄 **100% Compatible**: Drop-in replacement, same API
 
  ## Installation
 
- > **Note**: This package is not yet published to npm. For now, you can clone and build locally.
-
- Once published, it will be available as:
-
  ```bash
  npm install @tkeron/html-parser
  ```
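
The README now presents the package as published and powered by the Bun-native tokenizer. For context, basic usage would look roughly like this — a minimal sketch based on the README's description and the `parseHTML` export in the `index.ts` diff below; the `querySelector` call assumes the returned document exposes the standard query API:

```ts
// Minimal usage sketch; the input HTML is a made-up example.
import { parseHTML } from "@tkeron/html-parser";

const doc = parseHTML("<h1>Hello</h1><p>Parsed with the Bun-native tokenizer</p>");

console.log(doc.nodeType);                          // 9 (DOCUMENT_NODE)
console.log(doc.querySelector("h1")?.textContent);  // "Hello"
```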
package/bun.lock CHANGED
@@ -4,6 +4,9 @@
    "workspaces": {
      "": {
        "name": "@tkeron/html-parser",
+       "dependencies": {
+         "all-named-html-entities": "^3.1.3",
+       },
        "devDependencies": {
          "@types/bun": "^1.3.6",
        },
@@ -17,6 +20,8 @@
 
    "@types/node": ["@types/node@25.0.3", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA=="],
 
+   "all-named-html-entities": ["all-named-html-entities@3.1.3", "", {}, "sha512-eG7/XkhxyIUWApWvhVPcusxZ3PTebJo1AvkFkQj7MDSkBYmzXZsNadKZWuo1UxEX6QrE7y7JQx7G3Fx0YjVtnA=="],
+
    "bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="],
 
    "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
package/index.ts CHANGED
@@ -7,6 +7,10 @@ import {
  export function parseHTML(html: string = ""): Document {
    const tokens = tokenize(html);
    const ast = parse(tokens);
+   // If parse already returns a DOM document, return it directly
+   if (ast && typeof ast.nodeType === 'number' && ast.nodeType === 9) {
+     return ast;
+   }
    return astToDOM(ast);
  }
 
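
For reference, `9` is the standard `Node.DOCUMENT_NODE` value, so the new guard simply detects when `parse` already produced a full document and skips the `astToDOM` conversion. The same check, factored out as a sketch (the helper name is hypothetical):

```ts
// Hypothetical helper equivalent to the inline guard above.
const DOCUMENT_NODE = 9; // Node.DOCUMENT_NODE per the DOM spec

function isDomDocument(node: any): boolean {
  return node != null && typeof node.nodeType === "number" && node.nodeType === DOCUMENT_NODE;
}
```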
package/package.json CHANGED
@@ -1,12 +1,15 @@
  {
    "name": "@tkeron/html-parser",
-   "version": "0.1.7",
+   "version": "1.1.0",
    "description": "A fast and lightweight HTML parser for Bun",
    "main": "index.js",
    "module": "index.ts",
    "type": "module",
    "author": "tkeron",
    "license": "MIT",
+   "scripts": {
+     "test": "bun test --concurrent"
+   },
    "devDependencies": {
      "@types/bun": "^1.3.6"
    },
@@ -22,5 +25,8 @@
    ],
    "repository": {
      "url": "git@github.com:tkeron/html-parser.git"
+   },
+   "dependencies": {
+     "all-named-html-entities": "^3.1.3"
    }
  }
package/src/css-selector.ts CHANGED
@@ -26,7 +26,7 @@ function parseSelector(selector: string): SelectorGroup[] {
    let remaining = trimmed;
 
    // Extract tag name first if present
-   const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9]*)/);
+   const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9-]*)/);
    if (tagMatch) {
      tokens.push({ type: "tag", value: tagMatch[1].toLowerCase() });
      remaining = remaining.slice(tagMatch[1].length);
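
The widened character class lets selector tag tokens contain hyphens, so custom-element names now parse as a single tag instead of stopping at the first `-`. A quick illustration (the pattern is copied from the diff above; the sample selectors are made up):

```ts
// The tag-name pattern after this change; sample selectors are hypothetical.
const tagPattern = /^([a-zA-Z][a-zA-Z0-9-]*)/;

console.log(tagPattern.exec("div.card")?.[1]);          // "div"
console.log(tagPattern.exec("my-element[open]")?.[1]);  // "my-element" (previously only "my" matched)
```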
package/src/dom-simulator.ts CHANGED
@@ -6,6 +6,11 @@ import {
    querySelectorAll as querySelectorAllFunction,
  } from "./css-selector.js";
 
+ // Escape special HTML characters in text content
+ function escapeTextContent(text: string): string {
+   return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+ }
+
  const VOID_ELEMENTS = new Set([
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr'
@@ -23,7 +28,8 @@ export const enum NodeType {
 
  export function createElement(
    tagName: string,
-   attributes: Record<string, string> = {}
+   attributes: Record<string, string> = {},
+   namespaceURI?: string
  ): any {
    const innerHTML = "";
    const tagNameLower = tagName.toLowerCase();
@@ -41,6 +47,7 @@
    nodeName: tagName.toUpperCase(),
    nodeValue: null,
    tagName: tagName.toUpperCase(),
+   namespaceURI: namespaceURI || null,
    attributes: { ...attributes },
    childNodes: [],
    children: [],
@@ -226,6 +233,25 @@ export function createComment(content: string): any {
    return commentNode;
  }
 
+ export function createDoctype(name: string = 'html'): any {
+   const doctypeNode: any = {
+     nodeType: NodeType.DOCUMENT_TYPE_NODE,
+     nodeName: name.toUpperCase(),
+     name: name.toLowerCase(),
+     nodeValue: null,
+     textContent: "",
+     publicId: null,
+     systemId: null,
+     childNodes: [],
+     parentNode: null,
+     firstChild: null,
+     lastChild: null,
+     nextSibling: null,
+     previousSibling: null,
+   };
+   return doctypeNode;
+ }
+
  export function createDocument(): any {
    const document: any = {
      nodeType: NodeType.DOCUMENT_NODE,
@@ -375,7 +401,7 @@ function convertASTNodeToDOM(astNode: ASTNode): any {
    }
  }
 
- function appendChild(parent: any, child: any): void {
+ export function appendChild(parent: any, child: any): void {
    if (child.nodeType === NodeType.ELEMENT_NODE || child.nodeType === NodeType.DOCUMENT_NODE) {
      let ancestor = parent;
      while (ancestor) {
@@ -918,13 +944,13 @@ export function setInnerHTML(element: any, html: string): void {
 
    if (html.trim()) {
      const tokens = tokenize(html);
-     const ast = parse(tokens);
-     if (ast.children) {
-       for (const child of ast.children) {
-         const domChild = convertASTNodeToDOM(child);
-         if (domChild) {
-           appendChild(element, domChild);
-         }
+     const doc = parse(tokens);
+     const body = doc.body;
+     if (body && body.childNodes) {
+       const nodesToMove = [...body.childNodes];
+       for (const child of nodesToMove) {
+         child.parentNode = null;
+         appendChild(element, child);
        }
      }
    }
@@ -975,14 +1001,12 @@ export function setOuterHTML(element: any, html: string): void {
 
    if (html.trim()) {
      const tokens = tokenize(html);
-     const ast = parse(tokens);
-
-     if (ast.children) {
-       for (const child of ast.children) {
-         const domChild = convertASTNodeToDOM(child);
-         if (domChild) {
-           newNodes.push(domChild);
-         }
+     const doc = parse(tokens);
+     const body = doc.body;
+     if (body && body.childNodes) {
+       for (const child of body.childNodes) {
+         child.parentNode = null;
+         newNodes.push(child);
        }
      }
    }
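
The `setInnerHTML`/`setOuterHTML` changes route fragment parsing through the full parser and then re-parent the children of the resulting document's `body` into the target, instead of converting AST children one by one. A rough usage sketch of the effect (the relative import path is an assumption; `createElement` and `setInnerHTML` keep the signatures shown above):

```ts
// Sketch of the new setInnerHTML behavior; the import path is illustrative.
import { createElement, setInnerHTML } from "./src/dom-simulator.js";

const div = createElement("div");
setInnerHTML(div, "<p>one</p><p>two</p>");

// The fragment is parsed into a document; the children of doc.body are then
// detached and appended to the target element.
console.log(div.childNodes.length);      // expected: 2
console.log(div.childNodes[0]?.tagName); // expected: "P"
```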
package/src/encoding.ts ADDED
@@ -0,0 +1,39 @@
+ /**
+  * Detects the character encoding of an HTML document.
+  * Based on HTML5 specification for encoding detection.
+  */
+
+ const encodingAliases: Record<string, string> = {
+   'iso-8859-1': 'windows-1252',
+   'iso8859-1': 'windows-1252',
+   'iso-8859-2': 'iso-8859-2',
+   'iso8859-2': 'iso-8859-2',
+   'utf-8': 'utf-8',
+   'utf8': 'utf-8',
+   // Add more as needed
+ };
+
+ function normalizeEncoding(name: string): string | null {
+   const lower = name.toLowerCase().replace(/[^a-z0-9-]/g, '');
+   return encodingAliases[lower] || lower;
+ }
+
+ export function detectEncoding(html: string): string | null {
+   // Limit to first 1024 characters for performance
+   const prefix = html.substring(0, 1024);
+
+   // Look for <meta charset="...">
+   const charsetMatch = prefix.match(/<meta[^>]*charset\s*=\s*["']?([^"'\s>]+)["']?/i);
+   if (charsetMatch) {
+     return normalizeEncoding(charsetMatch[1]);
+   }
+
+   // Look for <meta http-equiv="Content-Type" content="text/html; charset=...">
+   const contentTypeMatch = prefix.match(/<meta[^>]*http-equiv\s*=\s*["']?\s*content-type\s*["']?[^>]*content\s*=\s*["']?\s*text\/html;\s*charset\s*=\s*([^"'\s>]+)["']?/i);
+   if (contentTypeMatch) {
+     return normalizeEncoding(contentTypeMatch[1]);
+   }
+
+   // Default to Windows-1252 if no encoding found (as per HTML5 spec)
+   return 'windows-1252';
+ }
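
A short usage sketch of the new helper (the sample documents are made up; the relative import path is an assumption):

```ts
// detectEncoding scans only the first 1024 characters of the input.
import { detectEncoding } from "./src/encoding.js";

detectEncoding('<meta charset="UTF-8"><title>x</title>');
// => "utf-8"

detectEncoding('<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">');
// => "windows-1252" (iso-8859-1 is aliased to windows-1252)

detectEncoding("<p>no meta tag at all</p>");
// => "windows-1252" (the fallback default)
```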
package/src/index.ts ADDED
@@ -0,0 +1,9 @@
+ import { tokenize } from './tokenizer.js';
+ import { parse } from './parser.js';
+
+ export function parseHTML(html: string): any {
+   const tokens = tokenize(html);
+   return parse(tokens);
+ }
+
+ export { parse } from './parser';
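
Unlike the top-level `index.ts` wrapper, this internal entry point returns whatever `parse` builds directly, which with the new parser appears to already be a document-shaped node. Roughly (the relative import path is an assumption):

```ts
// Sketch of calling the internal entry point added in src/index.ts.
import { parseHTML } from "./src/index.js";

const doc = parseHTML("<!doctype html><p>hi</p>");
console.log(doc.nodeType); // expected: 9 (document)
```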