npm - @tkeron/html-parser - Versions diffs - 1.0.0 → 1.1.1 - Mend

@tkeron/html-parser 1.0.0 → 1.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (14)

package/package.json +1 -1
package/src/dom-simulator.ts +8 -5
package/src/parser.ts +34 -2
package/src/tokenizer.ts +131 -75
package/tests/advanced.test.ts +3 -3
package/tests/custom-elements-head.test.ts +105 -0
package/tests/edge-cases.test.ts +457 -0
package/tests/helpers/tree-adapter.test.ts +1 -1
package/tests/helpers/tree-adapter.ts +21 -4
package/tests/innerhtml-void-elements.test.ts +84 -0
package/tests/parser.test.ts +2 -1
package/tests/tokenizer.test.ts +22 -26
package/tests/tree-construction-html5test-com.test.ts +16 -8
package/tests/custom-elements.test.ts +0 -755

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tkeron/html-parser",
-  "version": "1.0.0",
+  "version": "1.1.1",
   "description": "A fast and lightweight HTML parser for Bun",
   "main": "index.js",
   "module": "index.ts",

package/src/dom-simulator.ts CHANGED Viewed

@@ -28,7 +28,8 @@ export const enum NodeType {
 export function createElement(
   tagName: string,
-  attributes: Record<string, string> = {}
+  attributes: Record<string, string> = {},
+  namespaceURI?: string
 ): any {
   const innerHTML = "";
   const tagNameLower = tagName.toLowerCase();
@@ -46,6 +47,7 @@ export function createElement(
     nodeName: tagName.toUpperCase(),
     nodeValue: null,
     tagName: tagName.toUpperCase(),
+    namespaceURI: namespaceURI || null,
     attributes: { ...attributes },
     childNodes: [],
     children: [],
@@ -941,11 +943,12 @@ export function setInnerHTML(element: any, html: string): void {
   element.lastElementChild = null;
   if (html.trim()) {
-    const tokens = tokenize(html);
+    const wrappedHtml = '<div>' + html + '</div>';
+    const tokens = tokenize(wrappedHtml);
     const doc = parse(tokens);
-    const body = doc.body;
-    if (body && body.childNodes) {
-      const nodesToMove = [...body.childNodes];
+    const div = doc.querySelector('div');
+    if (div && div.childNodes) {
+      const nodesToMove = [...div.childNodes];
       for (const child of nodesToMove) {
         child.parentNode = null;
         appendChild(element, child);

package/src/parser.ts CHANGED Viewed

@@ -138,20 +138,35 @@ export function parse(tokens: Token[]): any {
       appendChild(html, body);
       const doctypes: any[] = [];
+      const commentsBeforeHtml: any[] = [];
+      const bodyContent: any[] = [];
       const children = [...state.root.childNodes];
+      let foundElement = false;
       for (const child of children) {
         if (child.nodeType === 10) {
           doctypes.push(child);
+        } else if (child.nodeType === 8 && !foundElement) {
+          commentsBeforeHtml.push(child);
         } else {
-          appendChild(body, child);
+          if (child.nodeType === 1) foundElement = true;
+          bodyContent.push(child);
         }
       }
+      for (const content of bodyContent) {
+        appendChild(body, content);
+      }
       state.root.childNodes = [];
       for (const doctype of doctypes) {
         doctype.parentNode = null;
         appendChild(state.root, doctype);
       }
+      for (const comment of commentsBeforeHtml) {
+        comment.parentNode = null;
+        appendChild(state.root, comment);
+      }
       appendChild(state.root, html);
       state.root.documentElement = html;
       state.root.head = head;
@@ -364,6 +379,10 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
     } else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
       parseOpenTag(state, token);
     } else if (tagName === 'head') {
+      // Ignore duplicate <head> tags
+    } else if (tagName.includes('-')) {
+      // Custom elements (tags with hyphens) are valid in <head>
+      parseOpenTag(state, token);
     } else {
       state.stack.pop();
       state.insertionMode = InsertionMode.AfterHead;
@@ -378,6 +397,9 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
       if (currentTagName === tagName) {
         state.stack.pop();
       }
+    } else if (tagName.includes('-') && currentTagName === tagName) {
+      // Handle closing tags for custom elements in <head>
+      state.stack.pop();
     }
   } else if (token.type === TokenType.COMMENT) {
     parseComment(state, token);
@@ -410,6 +432,9 @@ function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
   }
 }
+const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
+const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
 function parseTokenInInBodyMode(state: ParserState, token: Token): void {
   if (token.type === TokenType.TAG_OPEN) {
     const tagName = token.value.toLowerCase();
@@ -418,7 +443,14 @@ function parseTokenInInBodyMode(state: ParserState, token: Token): void {
     const currentParent = getCurrentParent(state);
-    const element = createElement(tagName, token.attributes || {});
+    let namespaceURI: string | undefined;
+    if (tagName === 'svg') {
+      namespaceURI = SVG_NAMESPACE;
+    } else if (tagName === 'math') {
+      namespaceURI = MATHML_NAMESPACE;
+    }
+    const element = createElement(tagName, token.attributes || {}, namespaceURI);
     appendChild(currentParent, element);

package/src/tokenizer.ts CHANGED Viewed

@@ -93,16 +93,63 @@ function decodeEntities(text: string): string {
 function parseAttributes(attributeString: string): Record<string, string> {
   const attributes: Record<string, string> = {};
+  let i = 0;
-  const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
-  let match;
-  while ((match = attrRegex.exec(attributeString)) !== null) {
-    const [, name, doubleQuoted, singleQuoted, unquoted] = match;
-    if (name) {
-      const value = doubleQuoted ?? singleQuoted ?? unquoted ?? '';
-      attributes[name.toLowerCase()] = decodeEntities(value);
+  while (i < attributeString.length) {
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
     }
+    if (i >= attributeString.length || attributeString[i] === '/' || attributeString[i] === '>') {
+      break;
+    }
+    let name = '';
+    while (i < attributeString.length && !/[\s=\/>]/.test(attributeString[i])) {
+      name += attributeString[i];
+      i++;
+    }
+    if (!name) {
+      i++;
+      continue;
+    }
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
+    }
+    let value = '';
+    if (i < attributeString.length && attributeString[i] === '=') {
+      i++;
+      while (i < attributeString.length && /\s/.test(attributeString[i])) {
+        i++;
+      }
+      if (i < attributeString.length) {
+        if (attributeString[i] === '"') {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== '"') {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else if (attributeString[i] === "'") {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== "'") {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else {
+          while (i < attributeString.length && !/[\s>]/.test(attributeString[i])) {
+            value += attributeString[i];
+            i++;
+          }
+        }
+      }
+    }
+    attributes[name.toLowerCase()] = decodeEntities(value);
   }
   return attributes;
@@ -117,79 +164,72 @@ function calculatePosition(text: string, offset: number): Position {
   };
 }
+const RAW_TEXT_ELEMENTS = new Set(['script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript']);
+const RCDATA_ELEMENTS = new Set(['textarea', 'title']);
 export function tokenize(html: string): Token[] {
   const tokens: Token[] = [];
-  let position = 0;
-  const specialCases = [
-    {
-      pattern: /<!DOCTYPE\s+[^>]*>/gi,
-      type: TokenType.DOCTYPE,
-      getValue: (match: string) => {
-        const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
-        return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
-      }
-    },
-    {
-      pattern: /<!--([\s\S]*?)(?:-->|$)/g,
-      type: TokenType.COMMENT,
-      getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
-    },
-    {
-      pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
-      type: TokenType.CDATA,
-      getValue: (match: string) => match.slice(9, -3)
-    },
-    {
-      pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
-      type: TokenType.PROCESSING_INSTRUCTION,
-      getValue: (match: string) => match.slice(0, -2)
-    }
-  ];
-  const processedRanges: Array<[number, number]> = [];
-  for (const { pattern, type, getValue } of specialCases) {
-    const regex = new RegExp(pattern);
-    let match;
-    while ((match = regex.exec(html)) !== null) {
-      const start = match.index;
-      const end = start + match[0].length;
-      tokens.push({
-        type,
-        value: getValue(match[0]),
-        position: calculatePosition(html, start)
-      });
-      processedRanges.push([start, end]);
-    }
-  }
-  processedRanges.sort((a, b) => a[0] - b[0]);
   let currentPos = 0;
   while (currentPos < html.length) {
-    const inProcessedRange = processedRanges.some(([start, end]) =>
-      currentPos >= start && currentPos < end
-    );
-    if (inProcessedRange) {
-      const range = processedRanges.find(([start, end]) =>
-        currentPos >= start && currentPos < end
-      );
-      if (range) {
-        currentPos = range[1];
-      }
-      continue;
-    }
     const char = html[currentPos];
     if (char === '<') {
-      const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
+      const remaining = html.slice(currentPos);
+      const doctypeMatch = remaining.match(/^<!DOCTYPE\s+[^>]*>/i);
+      if (doctypeMatch) {
+        const match = doctypeMatch[0];
+        const nameMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
+        tokens.push({
+          type: TokenType.DOCTYPE,
+          value: nameMatch && nameMatch[1] ? nameMatch[1].toLowerCase() : match,
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+      const commentMatch = remaining.match(/^<!--([\s\S]*?)(?:-->|$)/);
+      if (commentMatch) {
+        const match = commentMatch[0];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: match.slice(4, match.endsWith('-->') ? -3 : match.length),
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+      const cdataMatch = remaining.match(/^<!\[CDATA\[([\s\S]*?)\]\]>/);
+      if (cdataMatch) {
+        const content = cdataMatch[1];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '[CDATA[' + content + ']]',
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += cdataMatch[0].length;
+        continue;
+      }
+      const piMatch = remaining.match(/^<\?([^>]*)/);
+      if (piMatch) {
+        let consumed = piMatch[0].length;
+        if (remaining[consumed] === '>') {
+          consumed++;
+        }
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '?' + piMatch[1],
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += consumed;
+        continue;
+      }
+      const tagMatch = remaining.match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
       if (tagMatch) {
         const fullTag = tagMatch[0];
@@ -222,6 +262,24 @@ export function tokenize(html: string): Token[] {
         });
         currentPos += fullTag.length;
+        if (!isClosing && !isSelfClosing && (RAW_TEXT_ELEMENTS.has(tagName) || RCDATA_ELEMENTS.has(tagName))) {
+          const closeTagPattern = new RegExp(`</${tagName}\\s*>`, 'i');
+          const restOfHtml = html.slice(currentPos);
+          const closeMatch = restOfHtml.match(closeTagPattern);
+          if (closeMatch && closeMatch.index !== undefined) {
+            const rawContent = restOfHtml.slice(0, closeMatch.index);
+            if (rawContent) {
+              tokens.push({
+                type: TokenType.TEXT,
+                value: RCDATA_ELEMENTS.has(tagName) ? decodeEntities(rawContent) : rawContent,
+                position: calculatePosition(html, currentPos)
+              });
+            }
+            currentPos += rawContent.length;
+          }
+        }
       } else {
         const textStart = currentPos;
         currentPos++;
@@ -257,8 +315,6 @@ export function tokenize(html: string): Token[] {
     }
   }
-  tokens.sort((a, b) => a.position.offset - b.position.offset);
   tokens.push({
     type: TokenType.EOF,
     value: '',

package/tests/advanced.test.ts CHANGED Viewed

@@ -60,7 +60,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
       });
     });
-    it('should handle complex CDATA content', () => {
+    it('should handle complex CDATA content as bogus comment', () => {
       const complexContent = `
         function test() {
           return "<div>HTML inside JS</div>";
@@ -71,8 +71,8 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
       expect(tokens.length).toBeGreaterThan(0);
       const cdataToken = tokens[0]!;
-      expect(cdataToken.type).toBe(TokenType.CDATA);
-      expect(cdataToken.value).toBe(complexContent);
+      expect(cdataToken.type).toBe(TokenType.COMMENT);
+      expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
     });
     it('should handle performance with large documents', () => {

package/tests/custom-elements-head.test.ts ADDED Viewed

@@ -0,0 +1,105 @@
+import { describe, it, expect } from 'bun:test';
+import { parseHTML } from '../index';
+describe('Custom Elements in <head>', () => {
+  it('should keep <meta-tags> custom element in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><meta-tags></meta-tags></head><body></body></html>'
+    );
+    const metaTags = doc.head?.querySelector('meta-tags');
+    expect(metaTags).toBeTruthy();
+    expect(metaTags?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should keep <social-meta> custom element in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><social-meta></social-meta></head><body></body></html>'
+    );
+    const socialMeta = doc.head?.querySelector('social-meta');
+    expect(socialMeta).toBeTruthy();
+    expect(socialMeta?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should keep any <custom-element> with hyphen in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><my-component></my-component></head><body></body></html>'
+    );
+    const myComponent = doc.head?.querySelector('my-component');
+    expect(myComponent).toBeTruthy();
+    expect(myComponent?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should still eject non-custom elements like <div> to body', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><div>test</div></head><body></body></html>'
+    );
+    const divInHead = doc.head?.querySelector('div');
+    const divInBody = doc.body?.querySelector('div');
+    expect(divInHead).toBeFalsy();
+    expect(divInBody).toBeTruthy();
+  });
+  it('should handle nested custom elements in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><my-wrapper><inner-comp></inner-comp></my-wrapper></head><body></body></html>'
+    );
+    const myWrapper = doc.head?.querySelector('my-wrapper');
+    expect(myWrapper).toBeTruthy();
+    expect(myWrapper?.parentElement?.tagName).toBe('HEAD');
+    const innerComp = myWrapper?.querySelector('inner-comp');
+    expect(innerComp).toBeTruthy();
+  });
+  it('should keep custom elements with attributes in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><seo-meta property="og:title" content="Test"></seo-meta></head><body></body></html>'
+    );
+    const seoMeta = doc.head?.querySelector('seo-meta');
+    expect(seoMeta).toBeTruthy();
+    expect(seoMeta?.getAttribute('property')).toBe('og:title');
+    expect(seoMeta?.getAttribute('content')).toBe('Test');
+    expect(seoMeta?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should keep self-closing custom elements in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><custom-void /></head><body></body></html>'
+    );
+    const customVoid = doc.head?.querySelector('custom-void');
+    expect(customVoid).toBeTruthy();
+    expect(customVoid?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should handle custom elements mixed with standard head elements', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><title>Test</title><meta-tags></meta-tags><link rel="stylesheet" href="style.css"></head><body></body></html>'
+    );
+    const title = doc.head?.querySelector('title');
+    const metaTags = doc.head?.querySelector('meta-tags');
+    const link = doc.head?.querySelector('link');
+    expect(title).toBeTruthy();
+    expect(metaTags).toBeTruthy();
+    expect(link).toBeTruthy();
+  });
+  it('should handle custom element containing text in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><inline-script>console.log("test")</inline-script></head><body></body></html>'
+    );
+    const inlineScript = doc.head?.querySelector('inline-script');
+    expect(inlineScript).toBeTruthy();
+    expect(inlineScript?.parentElement?.tagName).toBe('HEAD');
+  });
+});