npm - @tkeron/html-parser - Versions diffs - 1.4.1 → 1.5.1 - Mend

@tkeron/html-parser 1.4.1 → 1.5.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (9)

package/README.md +24 -1
package/index.ts +6 -2
package/package.json +1 -1
package/src/parser/index.ts +1 -1
package/src/parser/parse.ts +30 -0
package/src/selectors/parse-selector.ts +2 -2
package/tests/helpers/tree-adapter.ts +49 -0
package/tests/selector-underscore-ids.test.ts +90 -0
package/tests/tree-construction-adoption01.test.ts +30 -16

package/README.md CHANGED Viewed

@@ -9,8 +9,9 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
 - 🪶 **Lightweight**: Zero external dependencies
 - 🌐 **Standards Compliant**: Returns standard DOM Document objects
 - 🔧 **TypeScript Support**: Full TypeScript definitions included
-- ✅ **Well Tested**: Comprehensive test suite (5600+ tests passing)
+- ✅ **Well Tested**: Comprehensive test suite (5660+ tests passing)
 - 🎯 **HTML5 Spec**: Implements Adoption Agency Algorithm for proper formatting element handling
+- 🧩 **Fragment Parsing**: Parse HTML fragments with context element support
 ## Installation
@@ -76,6 +77,28 @@ Parses an HTML string and returns a DOM Document object.
 - `Document`: A standard DOM Document object with all the usual methods like `querySelector`, `getElementById`, etc.
+### `parseHTMLFragment(html: string, contextTagName: string): Node[]`
+Parses an HTML string as a fragment within a context element. Useful for parsing innerHTML-style content.
+**Parameters:**
+- `html` (string): The HTML string to parse
+- `contextTagName` (string): The tag name of the context element (e.g., `"div"`, `"body"`)
+**Returns:**
+- `Node[]`: An array of parsed nodes
+**Example:**
+```typescript
+import { parseHTMLFragment } from "@tkeron/html-parser";
+const nodes = parseHTMLFragment("<b>Hello</b> <i>World</i>", "div");
+console.log(nodes.length); // 3 (b element, text node, i element)
+```
 ## Development
 This project is built with Bun. To get started:

package/index.ts CHANGED Viewed

@@ -1,13 +1,17 @@
 import { tokenize } from "./src/tokenizer/index.js";
-import { parse } from "./src/parser/index.js";
+import { parse, parseFragment } from "./src/parser/index.js";
 import { astToDOM } from "./src/dom-simulator.js";
 export function parseHTML(html: string = ""): Document {
   const tokens = tokenize(html);
   const ast = parse(tokens);
-  // If parse already returns a DOM document, return it directly
   if (ast && typeof ast.nodeType === "number" && ast.nodeType === 9) {
     return ast;
   }
   return astToDOM(ast);
 }
+export function parseHTMLFragment(html: string, contextTagName: string): any[] {
+  const tokens = tokenize(html);
+  return parseFragment(tokens, contextTagName);
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tkeron/html-parser",
-  "version": "1.4.1",
+  "version": "1.5.1",
   "description": "A fast and lightweight HTML parser for Bun",
   "main": "index.js",
   "module": "index.ts",

package/src/parser/index.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-export { parse } from "./parse";
+export { parse, parseFragment } from "./parse";
 export { domToAST } from "./dom-to-ast";
 export type { ParserState, ParseError, InsertionMode, ASTNode } from "./types";
 export { ASTNodeType } from "./types";

package/src/parser/parse.ts CHANGED Viewed

@@ -1112,3 +1112,33 @@ const getActiveFormattingElementsBeforeMarker = (state: ParserState): any[] => {
   }
   return result;
 };
+export const parseFragment = (tokens: Token[], contextTagName: string): any => {
+  const root = createDocument();
+  const contextElement = createElement(contextTagName.toLowerCase(), {});
+  appendChild(root, contextElement);
+  const state: ParserState = {
+    tokens,
+    position: 0,
+    length: tokens.length,
+    stack: [root, contextElement],
+    root,
+    insertionMode: InsertionMode.InBody,
+    errors: [],
+    activeFormattingElements: [],
+  };
+  while (state.position < state.length) {
+    const token = getCurrentToken(state);
+    if (!token || token.type === TokenType.EOF) {
+      break;
+    }
+    parseToken(state, token);
+    advance(state);
+  }
+  return contextElement.childNodes;
+};

package/src/selectors/parse-selector.ts CHANGED Viewed

@@ -19,11 +19,11 @@ export const parseSelector = (selector: string): SelectorGroup[] => {
       remaining = remaining.slice(tagMatch[1].length);
     }
-    const idMatches = remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g);
+    const idMatches = remaining.matchAll(/#([a-zA-Z0-9_-][a-zA-Z0-9_-]*)/g);
     for (const match of idMatches) {
       tokens.push({ type: "id", value: match[1] });
     }
-    remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, "");
+    remaining = remaining.replace(/#[a-zA-Z0-9_-][a-zA-Z0-9_-]*/g, "");
     const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
     for (const match of classMatches) {

package/tests/helpers/tree-adapter.ts CHANGED Viewed

@@ -68,3 +68,52 @@ export function serializeToHtml5lib(
   serialize(doc, 0);
   return lines.join("\n") + "\n";
 }
+export function serializeFragmentToHtml5lib(nodes: any[]): string {
+  const lines: string[] = [];
+  function serialize(node: any, depth: number): void {
+    const indent = "| " + "  ".repeat(depth);
+    if (node.nodeType === 1) {
+      const tagName = node.tagName.toLowerCase();
+      const ns = node.namespaceURI;
+      let nsPrefix = "";
+      if (ns === "http://www.w3.org/2000/svg") {
+        nsPrefix = "svg ";
+      } else if (ns === "http://www.w3.org/1998/Math/MathML") {
+        nsPrefix = "math ";
+      }
+      lines.push(`${indent}<${nsPrefix}${tagName}>`);
+      const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
+        a.localeCompare(b),
+      );
+      for (const [name, value] of attrs) {
+        lines.push(`${indent}  ${name}="${value}"`);
+      }
+      if (node.tagName.toLowerCase() === "template" && node.content) {
+        lines.push(`${indent}  content`);
+        serialize(node.content, depth + 2);
+      }
+      for (const child of node.childNodes || []) {
+        serialize(child, depth + 1);
+      }
+    } else if (node.nodeType === 3) {
+      lines.push(`${indent}"${node.textContent}"`);
+    } else if (node.nodeType === 8) {
+      const commentData = node.data || node.nodeValue || node.textContent || "";
+      lines.push(`${indent}<!-- ${commentData} -->`);
+    }
+  }
+  for (const node of nodes) {
+    serialize(node, 0);
+  }
+  return lines.join("\n") + "\n";
+}

package/tests/selector-underscore-ids.test.ts ADDED Viewed

@@ -0,0 +1,90 @@
+import { describe, it, expect } from "bun:test";
+import { parseHTML } from "../index";
+describe("querySelector with underscore IDs", () => {
+  it("should find element with ID starting with single underscore", () => {
+    const doc = parseHTML(
+      "<html><body><div id='_test'>Content</div></body></html>",
+    );
+    const result = doc.querySelector("#_test");
+    expect(result).not.toBeNull();
+    expect(result?.tagName).toBe("DIV");
+    expect(result?.id).toBe("_test");
+  });
+  it("should find element with ID starting with double underscore", () => {
+    const doc = parseHTML(
+      "<html><body><div id='__test'>Content</div></body></html>",
+    );
+    const result = doc.querySelector("#__test");
+    expect(result).not.toBeNull();
+    expect(result?.tagName).toBe("DIV");
+    expect(result?.id).toBe("__test");
+  });
+  it("should find element with complex underscore ID", () => {
+    const doc = parseHTML(
+      "<html><body><div id='__tkeron_component_root__'>Content</div></body></html>",
+    );
+    const result = doc.querySelector("#__tkeron_component_root__");
+    expect(result).not.toBeNull();
+    expect(result?.tagName).toBe("DIV");
+    expect(result?.id).toBe("__tkeron_component_root__");
+  });
+  it("should find underscore ID from child element context", () => {
+    const doc = parseHTML(
+      "<html><body><div id='__root'><p>Nested</p></div></body></html>",
+    );
+    const body = doc.querySelector("body");
+    const result = body?.querySelector("#__root");
+    expect(result).not.toBeNull();
+    expect(result?.tagName).toBe("DIV");
+  });
+  it("should find nested element with underscore ID", () => {
+    const doc = parseHTML(
+      "<html><body><div><span id='_nested'>Text</span></div></body></html>",
+    );
+    const result = doc.querySelector("#_nested");
+    expect(result).not.toBeNull();
+    expect(result?.tagName).toBe("SPAN");
+  });
+  it("should return null for non-existent underscore ID", () => {
+    const doc = parseHTML(
+      "<html><body><div id='other'>Content</div></body></html>",
+    );
+    const result = doc.querySelector("#_nonexistent");
+    expect(result).toBeNull();
+  });
+  it("should work with querySelectorAll for underscore IDs", () => {
+    const doc = parseHTML(
+      "<html><body><div id='_a'>A</div><div id='_b'>B</div></body></html>",
+    );
+    const resultA = doc.querySelectorAll("#_a");
+    const resultB = doc.querySelectorAll("#_b");
+    expect(resultA.length).toBe(1);
+    expect(resultB.length).toBe(1);
+  });
+  it("should find ID starting with hyphen", () => {
+    const doc = parseHTML(
+      "<html><body><div id='-test'>Content</div></body></html>",
+    );
+    const result = doc.querySelector("#-test");
+    expect(result).not.toBeNull();
+    expect(result?.tagName).toBe("DIV");
+    expect(result?.id).toBe("-test");
+  });
+  it("should find ID with mixed underscore and hyphen at start", () => {
+    const doc = parseHTML(
+      "<html><body><div id='_-mixed'>Content</div></body></html>",
+    );
+    const result = doc.querySelector("#_-mixed");
+    expect(result).not.toBeNull();
+    expect(result?.id).toBe("_-mixed");
+  });
+});

package/tests/tree-construction-adoption01.test.ts CHANGED Viewed

@@ -1,6 +1,9 @@
 import { expect, it, describe } from "bun:test";
-import { parseHTML } from "../index";
-import { serializeToHtml5lib } from "./helpers/tree-adapter";
+import { parseHTML, parseHTMLFragment } from "../index";
+import {
+  serializeToHtml5lib,
+  serializeFragmentToHtml5lib,
+} from "./helpers/tree-adapter";
 import { readFileSync } from "fs";
 describe("Tree Construction Adoption01 Tests", () => {
@@ -15,10 +18,18 @@ describe("Tree Construction Adoption01 Tests", () => {
     let data = "";
     let document = "";
     let inDocument = false;
-    let inData = true; // Start with data since we split on #data\n
+    let inData = true;
+    let isFragmentTest = false;
+    let fragmentContext = "";
     for (const line of lines) {
-      if (line.startsWith("#document")) {
+      if (line.startsWith("#document-fragment")) {
+        isFragmentTest = true;
+        inDocument = false;
+        inData = false;
+      } else if (isFragmentTest && !fragmentContext && !line.startsWith("#")) {
+        fragmentContext = line.trim();
+      } else if (line.startsWith("#document")) {
         inDocument = true;
         inData = false;
       } else if (line.startsWith("#errors")) {
@@ -31,18 +42,21 @@ describe("Tree Construction Adoption01 Tests", () => {
       }
     }
-    const passingTests = [
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
-    ];
-    const testFn = passingTests.includes(index + 1) ? it : it.skip;
-    testFn(`Adoption test ${index + 1}`, () => {
-      const doc = parseHTML(data);
-      const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
-      const serialized = serializeToHtml5lib(doc, {
-        skipImplicitDoctype: !hasExplicitDoctype,
+    if (isFragmentTest) {
+      it(`Adoption test ${index + 1} (fragment: ${fragmentContext})`, () => {
+        const nodes = parseHTMLFragment(data, fragmentContext);
+        const serialized = serializeFragmentToHtml5lib(nodes);
+        expect(serialized).toBe(document);
       });
-      expect(serialized).toBe(document);
-    });
+    } else {
+      it(`Adoption test ${index + 1}`, () => {
+        const doc = parseHTML(data);
+        const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
+        const serialized = serializeToHtml5lib(doc, {
+          skipImplicitDoctype: !hasExplicitDoctype,
+        });
+        expect(serialized).toBe(document);
+      });
+    }
   });
 });