@tkeron/html-parser 1.4.1 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,8 +9,9 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
9
9
  - 🪶 **Lightweight**: Zero external dependencies
10
10
  - 🌐 **Standards Compliant**: Returns standard DOM Document objects
11
11
  - 🔧 **TypeScript Support**: Full TypeScript definitions included
12
- - ✅ **Well Tested**: Comprehensive test suite (5600+ tests passing)
12
+ - ✅ **Well Tested**: Comprehensive test suite (5660+ tests passing)
13
13
  - 🎯 **HTML5 Spec**: Implements Adoption Agency Algorithm for proper formatting element handling
14
+ - 🧩 **Fragment Parsing**: Parse HTML fragments with context element support
14
15
 
15
16
  ## Installation
16
17
 
@@ -76,6 +77,28 @@ Parses an HTML string and returns a DOM Document object.
76
77
 
77
78
  - `Document`: A standard DOM Document object with all the usual methods like `querySelector`, `getElementById`, etc.
78
79
 
80
+ ### `parseHTMLFragment(html: string, contextTagName: string): Node[]`
81
+
82
+ Parses an HTML string as a fragment within a context element. Useful for parsing innerHTML-style content.
83
+
84
+ **Parameters:**
85
+
86
+ - `html` (string): The HTML string to parse
87
+ - `contextTagName` (string): The tag name of the context element (e.g., `"div"`, `"body"`)
88
+
89
+ **Returns:**
90
+
91
+ - `Node[]`: An array of parsed nodes
92
+
93
+ **Example:**
94
+
95
+ ```typescript
96
+ import { parseHTMLFragment } from "@tkeron/html-parser";
97
+
98
+ const nodes = parseHTMLFragment("<b>Hello</b> <i>World</i>", "div");
99
+ console.log(nodes.length); // 3 (b element, text node, i element)
100
+ ```
101
+
79
102
  ## Development
80
103
 
81
104
  This project is built with Bun. To get started:
package/index.ts CHANGED
@@ -1,13 +1,17 @@
1
1
  import { tokenize } from "./src/tokenizer/index.js";
2
- import { parse } from "./src/parser/index.js";
2
+ import { parse, parseFragment } from "./src/parser/index.js";
3
3
  import { astToDOM } from "./src/dom-simulator.js";
4
4
 
5
5
  export function parseHTML(html: string = ""): Document {
6
6
  const tokens = tokenize(html);
7
7
  const ast = parse(tokens);
8
- // If parse already returns a DOM document, return it directly
9
8
  if (ast && typeof ast.nodeType === "number" && ast.nodeType === 9) {
10
9
  return ast;
11
10
  }
12
11
  return astToDOM(ast);
13
12
  }
13
+
14
+ export function parseHTMLFragment(html: string, contextTagName: string): any[] {
15
+ const tokens = tokenize(html);
16
+ return parseFragment(tokens, contextTagName);
17
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tkeron/html-parser",
3
- "version": "1.4.1",
3
+ "version": "1.5.1",
4
4
  "description": "A fast and lightweight HTML parser for Bun",
5
5
  "main": "index.js",
6
6
  "module": "index.ts",
@@ -1,4 +1,4 @@
1
- export { parse } from "./parse";
1
+ export { parse, parseFragment } from "./parse";
2
2
  export { domToAST } from "./dom-to-ast";
3
3
  export type { ParserState, ParseError, InsertionMode, ASTNode } from "./types";
4
4
  export { ASTNodeType } from "./types";
@@ -1112,3 +1112,33 @@ const getActiveFormattingElementsBeforeMarker = (state: ParserState): any[] => {
1112
1112
  }
1113
1113
  return result;
1114
1114
  };
1115
+
1116
+ export const parseFragment = (tokens: Token[], contextTagName: string): any => {
1117
+ const root = createDocument();
1118
+ const contextElement = createElement(contextTagName.toLowerCase(), {});
1119
+ appendChild(root, contextElement);
1120
+
1121
+ const state: ParserState = {
1122
+ tokens,
1123
+ position: 0,
1124
+ length: tokens.length,
1125
+ stack: [root, contextElement],
1126
+ root,
1127
+ insertionMode: InsertionMode.InBody,
1128
+ errors: [],
1129
+ activeFormattingElements: [],
1130
+ };
1131
+
1132
+ while (state.position < state.length) {
1133
+ const token = getCurrentToken(state);
1134
+
1135
+ if (!token || token.type === TokenType.EOF) {
1136
+ break;
1137
+ }
1138
+
1139
+ parseToken(state, token);
1140
+ advance(state);
1141
+ }
1142
+
1143
+ return contextElement.childNodes;
1144
+ };
@@ -19,11 +19,11 @@ export const parseSelector = (selector: string): SelectorGroup[] => {
19
19
  remaining = remaining.slice(tagMatch[1].length);
20
20
  }
21
21
 
22
- const idMatches = remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g);
22
+ const idMatches = remaining.matchAll(/#([a-zA-Z0-9_-][a-zA-Z0-9_-]*)/g);
23
23
  for (const match of idMatches) {
24
24
  tokens.push({ type: "id", value: match[1] });
25
25
  }
26
- remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, "");
26
+ remaining = remaining.replace(/#[a-zA-Z0-9_-][a-zA-Z0-9_-]*/g, "");
27
27
 
28
28
  const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
29
29
  for (const match of classMatches) {
@@ -68,3 +68,52 @@ export function serializeToHtml5lib(
68
68
  serialize(doc, 0);
69
69
  return lines.join("\n") + "\n";
70
70
  }
71
+
72
+ export function serializeFragmentToHtml5lib(nodes: any[]): string {
73
+ const lines: string[] = [];
74
+
75
+ function serialize(node: any, depth: number): void {
76
+ const indent = "| " + " ".repeat(depth);
77
+
78
+ if (node.nodeType === 1) {
79
+ const tagName = node.tagName.toLowerCase();
80
+ const ns = node.namespaceURI;
81
+
82
+ let nsPrefix = "";
83
+ if (ns === "http://www.w3.org/2000/svg") {
84
+ nsPrefix = "svg ";
85
+ } else if (ns === "http://www.w3.org/1998/Math/MathML") {
86
+ nsPrefix = "math ";
87
+ }
88
+
89
+ lines.push(`${indent}<${nsPrefix}${tagName}>`);
90
+
91
+ const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
92
+ a.localeCompare(b),
93
+ );
94
+ for (const [name, value] of attrs) {
95
+ lines.push(`${indent} ${name}="${value}"`);
96
+ }
97
+
98
+ if (node.tagName.toLowerCase() === "template" && node.content) {
99
+ lines.push(`${indent} content`);
100
+ serialize(node.content, depth + 2);
101
+ }
102
+
103
+ for (const child of node.childNodes || []) {
104
+ serialize(child, depth + 1);
105
+ }
106
+ } else if (node.nodeType === 3) {
107
+ lines.push(`${indent}"${node.textContent}"`);
108
+ } else if (node.nodeType === 8) {
109
+ const commentData = node.data || node.nodeValue || node.textContent || "";
110
+ lines.push(`${indent}<!-- ${commentData} -->`);
111
+ }
112
+ }
113
+
114
+ for (const node of nodes) {
115
+ serialize(node, 0);
116
+ }
117
+
118
+ return lines.join("\n") + "\n";
119
+ }
@@ -0,0 +1,90 @@
1
+ import { describe, it, expect } from "bun:test";
2
+ import { parseHTML } from "../index";
3
+
4
+ describe("querySelector with underscore IDs", () => {
5
+ it("should find element with ID starting with single underscore", () => {
6
+ const doc = parseHTML(
7
+ "<html><body><div id='_test'>Content</div></body></html>",
8
+ );
9
+ const result = doc.querySelector("#_test");
10
+ expect(result).not.toBeNull();
11
+ expect(result?.tagName).toBe("DIV");
12
+ expect(result?.id).toBe("_test");
13
+ });
14
+
15
+ it("should find element with ID starting with double underscore", () => {
16
+ const doc = parseHTML(
17
+ "<html><body><div id='__test'>Content</div></body></html>",
18
+ );
19
+ const result = doc.querySelector("#__test");
20
+ expect(result).not.toBeNull();
21
+ expect(result?.tagName).toBe("DIV");
22
+ expect(result?.id).toBe("__test");
23
+ });
24
+
25
+ it("should find element with complex underscore ID", () => {
26
+ const doc = parseHTML(
27
+ "<html><body><div id='__tkeron_component_root__'>Content</div></body></html>",
28
+ );
29
+ const result = doc.querySelector("#__tkeron_component_root__");
30
+ expect(result).not.toBeNull();
31
+ expect(result?.tagName).toBe("DIV");
32
+ expect(result?.id).toBe("__tkeron_component_root__");
33
+ });
34
+
35
+ it("should find underscore ID from child element context", () => {
36
+ const doc = parseHTML(
37
+ "<html><body><div id='__root'><p>Nested</p></div></body></html>",
38
+ );
39
+ const body = doc.querySelector("body");
40
+ const result = body?.querySelector("#__root");
41
+ expect(result).not.toBeNull();
42
+ expect(result?.tagName).toBe("DIV");
43
+ });
44
+
45
+ it("should find nested element with underscore ID", () => {
46
+ const doc = parseHTML(
47
+ "<html><body><div><span id='_nested'>Text</span></div></body></html>",
48
+ );
49
+ const result = doc.querySelector("#_nested");
50
+ expect(result).not.toBeNull();
51
+ expect(result?.tagName).toBe("SPAN");
52
+ });
53
+
54
+ it("should return null for non-existent underscore ID", () => {
55
+ const doc = parseHTML(
56
+ "<html><body><div id='other'>Content</div></body></html>",
57
+ );
58
+ const result = doc.querySelector("#_nonexistent");
59
+ expect(result).toBeNull();
60
+ });
61
+
62
+ it("should work with querySelectorAll for underscore IDs", () => {
63
+ const doc = parseHTML(
64
+ "<html><body><div id='_a'>A</div><div id='_b'>B</div></body></html>",
65
+ );
66
+ const resultA = doc.querySelectorAll("#_a");
67
+ const resultB = doc.querySelectorAll("#_b");
68
+ expect(resultA.length).toBe(1);
69
+ expect(resultB.length).toBe(1);
70
+ });
71
+
72
+ it("should find ID starting with hyphen", () => {
73
+ const doc = parseHTML(
74
+ "<html><body><div id='-test'>Content</div></body></html>",
75
+ );
76
+ const result = doc.querySelector("#-test");
77
+ expect(result).not.toBeNull();
78
+ expect(result?.tagName).toBe("DIV");
79
+ expect(result?.id).toBe("-test");
80
+ });
81
+
82
+ it("should find ID with mixed underscore and hyphen at start", () => {
83
+ const doc = parseHTML(
84
+ "<html><body><div id='_-mixed'>Content</div></body></html>",
85
+ );
86
+ const result = doc.querySelector("#_-mixed");
87
+ expect(result).not.toBeNull();
88
+ expect(result?.id).toBe("_-mixed");
89
+ });
90
+ });
@@ -1,6 +1,9 @@
1
1
  import { expect, it, describe } from "bun:test";
2
- import { parseHTML } from "../index";
3
- import { serializeToHtml5lib } from "./helpers/tree-adapter";
2
+ import { parseHTML, parseHTMLFragment } from "../index";
3
+ import {
4
+ serializeToHtml5lib,
5
+ serializeFragmentToHtml5lib,
6
+ } from "./helpers/tree-adapter";
4
7
  import { readFileSync } from "fs";
5
8
 
6
9
  describe("Tree Construction Adoption01 Tests", () => {
@@ -15,10 +18,18 @@ describe("Tree Construction Adoption01 Tests", () => {
15
18
  let data = "";
16
19
  let document = "";
17
20
  let inDocument = false;
18
- let inData = true; // Start with data since we split on #data\n
21
+ let inData = true;
22
+ let isFragmentTest = false;
23
+ let fragmentContext = "";
19
24
 
20
25
  for (const line of lines) {
21
- if (line.startsWith("#document")) {
26
+ if (line.startsWith("#document-fragment")) {
27
+ isFragmentTest = true;
28
+ inDocument = false;
29
+ inData = false;
30
+ } else if (isFragmentTest && !fragmentContext && !line.startsWith("#")) {
31
+ fragmentContext = line.trim();
32
+ } else if (line.startsWith("#document")) {
22
33
  inDocument = true;
23
34
  inData = false;
24
35
  } else if (line.startsWith("#errors")) {
@@ -31,18 +42,21 @@ describe("Tree Construction Adoption01 Tests", () => {
31
42
  }
32
43
  }
33
44
 
34
- const passingTests = [
35
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
36
- ];
37
- const testFn = passingTests.includes(index + 1) ? it : it.skip;
38
-
39
- testFn(`Adoption test ${index + 1}`, () => {
40
- const doc = parseHTML(data);
41
- const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
42
- const serialized = serializeToHtml5lib(doc, {
43
- skipImplicitDoctype: !hasExplicitDoctype,
45
+ if (isFragmentTest) {
46
+ it(`Adoption test ${index + 1} (fragment: ${fragmentContext})`, () => {
47
+ const nodes = parseHTMLFragment(data, fragmentContext);
48
+ const serialized = serializeFragmentToHtml5lib(nodes);
49
+ expect(serialized).toBe(document);
44
50
  });
45
- expect(serialized).toBe(document);
46
- });
51
+ } else {
52
+ it(`Adoption test ${index + 1}`, () => {
53
+ const doc = parseHTML(data);
54
+ const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
55
+ const serialized = serializeToHtml5lib(doc, {
56
+ skipImplicitDoctype: !hasExplicitDoctype,
57
+ });
58
+ expect(serialized).toBe(document);
59
+ });
60
+ }
47
61
  });
48
62
  });