npm - @mastra/rag - Versions diffs - 0.1.0-alpha.83 → 0.1.0-alpha.85 - Mend

@mastra/rag 0.1.0-alpha.83 → 0.1.0-alpha.85

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/CHANGELOG.md +13 -0
package/dist/index.js +97 -55
package/package.json +3 -4
package/src/document/document.test.ts +171 -0
package/src/document/transformers/html.ts +137 -76

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,18 @@
 # @mastra/rag
+## 0.1.0-alpha.85
+### Patch Changes
+- b27bdb8: Swap jsdom for node-html-better-parser in rag
+## 0.1.0-alpha.84
+### Patch Changes
+- Updated dependencies [4d4f6b6]
+  - @mastra/core@0.2.0-alpha.92
 ## 0.1.0-alpha.83
 ### Patch Changes

package/dist/index.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { Document, SummaryExtractor, QuestionsAnsweredExtractor, KeywordExtractor, TitleExtractor, IngestionPipeline } from 'llamaindex';
-import { JSDOM } from 'jsdom';
+import { parse } from 'node-html-better-parser';
 import { encodingForModel, getEncoding } from 'js-tiktoken';
 import { embed as embed$1, embedMany as embedMany$1 } from '@mastra/core/embeddings';
 import { MastraAgentRelevanceScorer, CohereRelevanceScorer } from '@mastra/core/relevance';
@@ -353,50 +353,81 @@ var HTMLHeaderTransformer = class {
     this.headersToSplitOn = [...headersToSplitOn].sort();
   }
   splitText({ text }) {
-    const dom = new JSDOM(text);
-    const { document } = dom.window;
+    const root = parse(text);
     const headerFilter = this.headersToSplitOn.map(([header]) => header);
     const headerMapping = Object.fromEntries(this.headersToSplitOn);
     const elements = [];
-    const headers = document.querySelectorAll(headerFilter.join(","));
+    const headers = root.querySelectorAll(headerFilter.join(","));
     headers.forEach((header) => {
       let content = "";
-      let nextElement = header.nextElementSibling;
-      while (nextElement && !headerFilter.includes(nextElement.tagName.toLowerCase())) {
-        content += nextElement.textContent + " ";
-        nextElement = nextElement.nextElementSibling;
+      const parentNode = header.parentNode;
+      if (parentNode && parentNode.childNodes) {
+        let foundHeader = false;
+        for (const node of parentNode.childNodes) {
+          if (node === header) {
+            foundHeader = true;
+            continue;
+          }
+          if (foundHeader && node.tagName && headerFilter.includes(node.tagName.toLowerCase())) {
+            break;
+          }
+          if (foundHeader) {
+            content += this.getTextContent(node) + " ";
+          }
+        }
       }
       elements.push({
         url: text,
         xpath: this.getXPath(header),
         content: content.trim(),
         metadata: {
-          [headerMapping?.[header.tagName.toLowerCase()]]: header.textContent?.trim() || ""
+          [headerMapping?.[header.tagName.toLowerCase()]]: header.text || ""
         }
       });
     });
     return this.returnEachElement ? elements.map(
       (el) => new Document({
         text: el.content,
-        metadata: el.metadata
+        metadata: { ...el.metadata, xpath: el.xpath }
       })
     ) : this.aggregateElementsToChunks(elements);
   }
   getXPath(element) {
+    if (!element) return "";
     const parts = [];
     let current = element;
-    while (current && current.nodeType === 1) {
+    while (current && current.tagName) {
       let index = 1;
-      for (let sibling = current.previousElementSibling; sibling; sibling = sibling.previousElementSibling) {
-        if (sibling.nodeName === current.nodeName) {
-          index++;
+      const parent = current.parentNode;
+      if (parent && parent.childNodes) {
+        for (const sibling of parent.childNodes) {
+          if (sibling === current) break;
+          if (sibling.tagName === current.tagName) {
+            index++;
+          }
         }
       }
       parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
-      current = current.parentElement;
+      current = current.parentNode;
     }
     return "/" + parts.join("/");
   }
+  getTextContent(element) {
+    if (!element) return "";
+    if (!element.tagName) {
+      return element.text || "";
+    }
+    let content = element.text || "";
+    if (element.childNodes) {
+      for (const child of element.childNodes) {
+        const childText = this.getTextContent(child);
+        if (childText) {
+          content += " " + childText;
+        }
+      }
+    }
+    return content.trim();
+  }
   aggregateElementsToChunks(elements) {
     const aggregatedChunks = [];
     for (const element of elements) {
@@ -409,7 +440,7 @@ var HTMLHeaderTransformer = class {
     return aggregatedChunks.map(
       (chunk) => new Document({
         text: chunk.content,
-        metadata: chunk.metadata
+        metadata: { ...chunk.metadata, xpath: chunk.xpath }
       })
     );
   }
@@ -450,7 +481,7 @@ var HTMLHeaderTransformer = class {
 };
 var HTMLSectionTransformer = class {
   constructor(headersToSplitOn, options = {}) {
-    this.headersToSplitOn = Object.fromEntries(headersToSplitOn);
+    this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name]) => [tag.toLowerCase(), name]));
     this.options = options;
   }
   splitText(text) {
@@ -459,11 +490,59 @@ var HTMLSectionTransformer = class {
       (section) => new Document({
         text: section.content,
         metadata: {
-          [this.headersToSplitOn[section.tagName]]: section.header
+          [this.headersToSplitOn[section.tagName.toLowerCase()]]: section.header,
+          xpath: section.xpath
         }
       })
     );
   }
+  getXPath(element) {
+    const parts = [];
+    let current = element;
+    while (current && current.nodeType === 1) {
+      let index = 1;
+      let sibling = current.previousSibling;
+      while (sibling) {
+        if (sibling.nodeType === 1 && sibling.tagName === current.tagName) {
+          index++;
+        }
+        sibling = sibling.previousSibling;
+      }
+      if (current.tagName) {
+        parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
+      }
+      current = current.parentNode;
+    }
+    return "/" + parts.join("/");
+  }
+  splitHtmlByHeaders(htmlDoc) {
+    const sections = [];
+    const root = parse(htmlDoc);
+    const headers = Object.keys(this.headersToSplitOn);
+    const headerElements = root.querySelectorAll(headers.join(","));
+    headerElements.forEach((headerElement, index) => {
+      const header = headerElement.text?.trim() || "";
+      const tagName = headerElement.tagName;
+      const xpath = this.getXPath(headerElement);
+      let content = "";
+      let currentElement = headerElement.nextElementSibling;
+      const nextHeader = headerElements[index + 1];
+      while (currentElement && (!nextHeader || currentElement !== nextHeader)) {
+        if (currentElement.text) {
+          content += currentElement.text.trim() + " ";
+        }
+        currentElement = currentElement.nextElementSibling;
+      }
+      content = content.trim();
+      sections.push({
+        header,
+        content,
+        tagName,
+        xpath
+      });
+    });
+    return sections;
+  }
   async splitDocuments(documents) {
     const texts = [];
     const metadatas = [];
@@ -500,43 +579,6 @@ var HTMLSectionTransformer = class {
     }
     return documents;
   }
-  splitHtmlByHeaders(htmlDoc) {
-    const sections = [];
-    const dom = new JSDOM(htmlDoc);
-    const { document } = dom.window;
-    const headers = ["body", ...Object.keys(this.headersToSplitOn)];
-    const headerElements = Array.from(document.querySelectorAll(headers.join(",")));
-    for (let i = 0; i < headerElements.length; i++) {
-      const headerElement = headerElements[i];
-      let currentHeader;
-      let currentHeaderTag;
-      let sectionContent = [];
-      if (i === 0) {
-        currentHeader = "#TITLE#";
-        currentHeaderTag = "h1";
-      } else {
-        currentHeader = headerElement.textContent?.trim() || "";
-        currentHeaderTag = headerElement.tagName.toLowerCase();
-      }
-      let currentNode = headerElement.nextSibling;
-      const nextHeader = headerElements[i + 1];
-      while (currentNode && currentNode !== nextHeader) {
-        if (currentNode.textContent) {
-          sectionContent.push(currentNode.textContent);
-        }
-        currentNode = currentNode.nextSibling;
-      }
-      const content = sectionContent.join(" ").trim();
-      if (content) {
-        sections.push({
-          header: currentHeader,
-          content,
-          tagName: currentHeaderTag
-        });
-      }
-    }
-    return sections;
-  }
   transformDocuments(documents) {
     const texts = [];
     const metadatas = [];

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@mastra/rag",
-  "version": "0.1.0-alpha.83",
+  "version": "0.1.0-alpha.85",
   "description": "",
   "type": "module",
   "main": "dist/index.js",
@@ -21,17 +21,16 @@
     "@llamaindex/env": "^0.1.20",
     "@paralleldrive/cuid2": "^2.2.2",
     "js-tiktoken": "^1.0.15",
-    "jsdom": "^25.0.1",
     "llamaindex": "^0.8.15",
+    "node-html-better-parser": "^1.4.7",
     "pathe": "^2.0.2",
     "zod": "^3.24.1",
-    "@mastra/core": "^0.2.0-alpha.91"
+    "@mastra/core": "^0.2.0-alpha.92"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.26.0",
     "@babel/preset-typescript": "^7.26.0",
     "@tsconfig/recommended": "^1.0.7",
-    "@types/jsdom": "^21.1.7",
     "@types/node": "^22.9.0",
     "tsup": "^8.0.1",
     "vitest": "^3.0.4"

package/src/document/document.test.ts CHANGED Viewed

@@ -461,6 +461,177 @@ describe('MDocument', () => {
         expect(doc?.metadata).toHaveProperty('Header 1');
       });
     });
+    it('should handle empty or invalid HTML', async () => {
+      const emptyHtml = '';
+      const invalidHtml = '<unclosed>test';
+      const noHeadersHtml = '<div>test</div>';
+      const doc1 = MDocument.fromHTML(emptyHtml, { meta: 'data' });
+      const doc2 = MDocument.fromHTML(invalidHtml, { meta: 'data' });
+      const doc3 = MDocument.fromHTML(noHeadersHtml, { meta: 'data' });
+      await doc1.chunk({
+        strategy: 'html',
+        headers: [
+          ['h1', 'Header 1'],
+          ['h2', 'Header 2'],
+        ],
+      });
+      await doc2.chunk({
+        strategy: 'html',
+        headers: [
+          ['h1', 'Header 1'],
+          ['h2', 'Header 2'],
+        ],
+      });
+      await doc3.chunk({
+        strategy: 'html',
+        headers: [
+          ['h1', 'Header 1'],
+          ['h2', 'Header 2'],
+        ],
+      });
+      expect(doc1.getDocs()).toHaveLength(0);
+      expect(doc2.getDocs()).toHaveLength(0);
+      expect(doc3.getDocs()).toHaveLength(0);
+    });
+    it('should handle complex nested header hierarchies', async () => {
+      const html = `
+        <html>
+          <body>
+            <h1>Main Title</h1>
+            <p>Main content</p>
+            <h2>Section 1</h2>
+            <p>Section 1 content</p>
+            <h3>Subsection 1.1</h3>
+            <p>Subsection 1.1 content</p>
+            <h2>Section 2</h2>
+            <h3>Subsection 2.1</h3>
+            <p>Subsection 2.1 content</p>
+          </body>
+        </html>
+      `;
+      const doc = MDocument.fromHTML(html, { meta: 'data' });
+      await doc.chunk({
+        strategy: 'html',
+        headers: [
+          ['h1', 'Header 1'],
+          ['h2', 'Header 2'],
+          ['h3', 'Header 3'],
+        ],
+      });
+      const docs = doc.getDocs();
+      expect(docs.length).toBeGreaterThan(3);
+      expect(docs.some(d => d.metadata?.['Header 1'] === 'Main Title')).toBe(true);
+      expect(docs.some(d => d.metadata?.['Header 2'] === 'Section 1')).toBe(true);
+      expect(docs.some(d => d.metadata?.['Header 3'] === 'Subsection 1.1')).toBe(true);
+    });
+    it('should handle headers with mixed content and special characters', async () => {
+      const html = `
+        <html>
+          <body>
+            <h1>Title with <strong>bold</strong> &amp; <em>emphasis</em></h1>
+            <p>Content 1</p>
+            <h2>Section with &lt;tags&gt; &amp; symbols</h2>
+            <p>Content 2</p>
+          </body>
+        </html>
+      `;
+      const doc = MDocument.fromHTML(html, { meta: 'data' });
+      await doc.chunk({
+        strategy: 'html',
+        headers: [
+          ['h1', 'Header 1'],
+          ['h2', 'Header 2'],
+        ],
+      });
+      const docs = doc.getDocs();
+      expect(docs.length).toBeGreaterThan(1);
+      expect(docs[0]?.metadata?.['Header 1']).toContain('bold');
+      expect(docs[0]?.metadata?.['Header 1']).toContain('&');
+      expect(docs[0]?.metadata?.['Header 1']).toContain('emphasis');
+      expect(docs[1]?.metadata?.['Header 2']).toContain('<tags>');
+    });
+    it('should handle headers with no content or whitespace content', async () => {
+      const html = `
+        <html>
+          <body>
+            <h1>Empty Section</h1>
+            <h2>Whitespace Section</h2>
+            <h2>Valid Section</h2>
+            <p>Content</p>
+          </body>
+        </html>
+      `;
+      const doc = MDocument.fromHTML(html, { meta: 'data' });
+      await doc.chunk({
+        strategy: 'html',
+        headers: [
+          ['h1', 'Header 1'],
+          ['h2', 'Header 2'],
+        ],
+      });
+      const docs = doc.getDocs();
+      expect(docs.some(d => d.metadata?.['Header 1'] === 'Empty Section')).toBe(true);
+      expect(docs.some(d => d.metadata?.['Header 2'] === 'Valid Section')).toBe(true);
+      expect(docs.find(d => d.metadata?.['Header 2'] === 'Valid Section')?.text).toContain('Content');
+    });
+    it('should generate correct XPaths for deeply nested elements', async () => {
+      const html = `
+        <html>
+          <body>
+            <div class="container">
+              <section id="main">
+                <div>
+                  <h1>Deeply Nested Title</h1>
+                  <p>Content</p>
+                </div>
+                <div>
+                  <h1>Second Title</h1>
+                  <p>More Content</p>
+                </div>
+              </section>
+            </div>
+          </body>
+        </html>
+      `;
+      const doc = MDocument.fromHTML(html, { meta: 'data' });
+      await doc.chunk({
+        strategy: 'html',
+        headers: [['h1', 'Header 1']],
+      });
+      const docs = doc.getDocs();
+      expect(docs).toHaveLength(2);
+      // First h1
+      expect(docs[0]?.metadata?.['Header 1']).toBe('Deeply Nested Title');
+      const xpath1 = docs[0]?.metadata?.xpath as string;
+      expect(xpath1).toBeDefined();
+      expect(xpath1).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[1\]\/h1\[1\]$/);
+      // Second h1
+      expect(docs[1]?.metadata?.['Header 1']).toBe('Second Title');
+      const xpath2 = docs[1]?.metadata?.xpath as string;
+      expect(xpath2).toBeDefined();
+      expect(xpath2).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[2\]\/h1\[1\]$/);
+    });
   });
   describe('chunkJson', () => {

package/src/document/transformers/html.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { JSDOM } from 'jsdom';
 import { Document } from 'llamaindex';
+import { parse } from 'node-html-better-parser';
 import { RecursiveCharacterTransformer } from './character';
@@ -20,22 +20,37 @@ export class HTMLHeaderTransformer {
   }
   splitText({ text }: { text: string }): Document[] {
-    const dom = new JSDOM(text);
-    const { document } = dom.window;
+    const root = parse(text);
     const headerFilter = this.headersToSplitOn.map(([header]) => header);
     const headerMapping = Object.fromEntries(this.headersToSplitOn);
     const elements: ElementType[] = [];
-    const headers = document.querySelectorAll(headerFilter.join(','));
+    const headers = root.querySelectorAll(headerFilter.join(','));
     headers.forEach(header => {
       let content = '';
-      let nextElement = header.nextElementSibling;
+      const parentNode = header.parentNode;
+      if (parentNode && parentNode.childNodes) {
+        let foundHeader = false;
+        for (const node of parentNode.childNodes) {
+          // Start collecting content after we find our header
+          if (node === header) {
+            foundHeader = true;
+            continue;
+          }
+          // If we found our header and hit another header, stop
+          if (foundHeader && node.tagName && headerFilter.includes(node.tagName.toLowerCase())) {
+            break;
+          }
-      while (nextElement && !headerFilter.includes(nextElement.tagName.toLowerCase())) {
-        content += nextElement.textContent + ' ';
-        nextElement = nextElement.nextElementSibling;
+          // Collect content between headers
+          if (foundHeader) {
+            content += this.getTextContent(node) + ' ';
+          }
+        }
       }
       elements.push({
@@ -43,7 +58,7 @@ export class HTMLHeaderTransformer {
         xpath: this.getXPath(header),
         content: content.trim(),
         metadata: {
-          [headerMapping?.[header.tagName.toLowerCase()]!]: header.textContent?.trim() || '',
+          [headerMapping?.[header.tagName.toLowerCase()]!]: header.text || '',
         },
       });
     });
@@ -53,30 +68,62 @@ export class HTMLHeaderTransformer {
           el =>
             new Document({
               text: el.content,
-              metadata: el.metadata,
+              metadata: { ...el.metadata, xpath: el.xpath },
             }),
         )
       : this.aggregateElementsToChunks(elements);
   }
-  private getXPath(element: Element): string {
+  private getXPath(element: any): string {
+    if (!element) return '';
     const parts: string[] = [];
-    let current: Element | null = element;
+    let current = element;
-    while (current && current.nodeType === 1) {
+    while (current && current.tagName) {
       let index = 1;
-      for (let sibling = current.previousElementSibling; sibling; sibling = sibling.previousElementSibling) {
-        if (sibling.nodeName === current.nodeName) {
-          index++;
+      const parent = current.parentNode;
+      if (parent && parent.childNodes) {
+        // Count preceding siblings with same tag
+        for (const sibling of parent.childNodes) {
+          if (sibling === current) break;
+          if (sibling.tagName === current.tagName) {
+            index++;
+          }
         }
       }
       parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
-      current = current.parentElement;
+      current = current.parentNode;
     }
     return '/' + parts.join('/');
   }
+  private getTextContent(element: any): string {
+    if (!element) return '';
+    // For text nodes, return their content
+    if (!element.tagName) {
+      return element.text || '';
+    }
+    // For element nodes, combine their text with children's text
+    let content = element.text || '';
+    if (element.childNodes) {
+      for (const child of element.childNodes) {
+        const childText = this.getTextContent(child);
+        if (childText) {
+          content += ' ' + childText;
+        }
+      }
+    }
+    return content.trim();
+  }
   private aggregateElementsToChunks(elements: ElementType[]): Document[] {
     const aggregatedChunks: ElementType[] = [];
@@ -97,7 +144,7 @@ export class HTMLHeaderTransformer {
       chunk =>
         new Document({
           text: chunk.content,
-          metadata: chunk.metadata,
+          metadata: { ...chunk.metadata, xpath: chunk.xpath },
         }),
     );
   }
@@ -110,7 +157,6 @@ export class HTMLHeaderTransformer {
       const chunks = this.splitText({ text: texts[i]! });
       for (const chunk of chunks) {
         const metadata = { ...(_metadatas[i] || {}) };
         const chunkMetadata = chunk.metadata;
         if (chunkMetadata) {
@@ -151,7 +197,7 @@ export class HTMLSectionTransformer {
   private options: Record<string, any>;
   constructor(headersToSplitOn: [string, string][], options: Record<string, any> = {}) {
-    this.headersToSplitOn = Object.fromEntries(headersToSplitOn);
+    this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name]) => [tag.toLowerCase(), name]));
     this.options = options;
   }
@@ -163,12 +209,82 @@ export class HTMLSectionTransformer {
         new Document({
           text: section.content,
           metadata: {
-            [this.headersToSplitOn[section.tagName]!]: section.header,
+            [this.headersToSplitOn[section.tagName.toLowerCase()]!]: section.header,
+            xpath: section.xpath,
           },
         }),
     );
   }
+  private getXPath(element: any): string {
+    const parts: string[] = [];
+    let current = element;
+    while (current && current.nodeType === 1) {
+      let index = 1;
+      let sibling = current.previousSibling;
+      while (sibling) {
+        if (sibling.nodeType === 1 && sibling.tagName === current.tagName) {
+          index++;
+        }
+        sibling = sibling.previousSibling;
+      }
+      if (current.tagName) {
+        parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
+      }
+      current = current.parentNode;
+    }
+    return '/' + parts.join('/');
+  }
+  private splitHtmlByHeaders(htmlDoc: string): Array<{
+    header: string;
+    content: string;
+    tagName: string;
+    xpath: string;
+  }> {
+    const sections: Array<{
+      header: string;
+      content: string;
+      tagName: string;
+      xpath: string;
+    }> = [];
+    const root = parse(htmlDoc);
+    const headers = Object.keys(this.headersToSplitOn);
+    const headerElements = root.querySelectorAll(headers.join(','));
+    headerElements.forEach((headerElement, index) => {
+      const header = headerElement.text?.trim() || '';
+      const tagName = headerElement.tagName;
+      const xpath = this.getXPath(headerElement);
+      let content = '';
+      let currentElement = headerElement.nextElementSibling;
+      const nextHeader = headerElements[index + 1];
+      while (currentElement && (!nextHeader || currentElement !== nextHeader)) {
+        if (currentElement.text) {
+          content += currentElement.text.trim() + ' ';
+        }
+        currentElement = currentElement.nextElementSibling;
+      }
+      content = content.trim();
+      sections.push({
+        header,
+        content,
+        tagName,
+        xpath,
+      });
+    });
+    return sections;
+  }
   async splitDocuments(documents: Document[]): Promise<Document[]> {
     const texts: string[] = [];
     const metadatas: Record<string, any>[] = [];
@@ -214,61 +330,6 @@ export class HTMLSectionTransformer {
     return documents;
   }
-  private splitHtmlByHeaders(htmlDoc: string): Array<{
-    header: string;
-    content: string;
-    tagName: string;
-  }> {
-    const sections: Array<{
-      header: string;
-      content: string;
-      tagName: string;
-    }> = [];
-    const dom = new JSDOM(htmlDoc);
-    const { document } = dom.window;
-    const headers = ['body', ...Object.keys(this.headersToSplitOn)];
-    const headerElements = Array.from(document.querySelectorAll(headers.join(',')));
-    for (let i = 0; i < headerElements.length; i++) {
-      const headerElement = headerElements[i]!;
-      let currentHeader: string;
-      let currentHeaderTag: string;
-      let sectionContent: string[] = [];
-      if (i === 0) {
-        currentHeader = '#TITLE#';
-        currentHeaderTag = 'h1';
-      } else {
-        currentHeader = headerElement.textContent?.trim() || '';
-        currentHeaderTag = headerElement.tagName.toLowerCase();
-      }
-      // Get content until next header
-      let currentNode = headerElement.nextSibling;
-      const nextHeader = headerElements[i + 1];
-      while (currentNode && currentNode !== nextHeader) {
-        if (currentNode.textContent) {
-          sectionContent.push(currentNode.textContent);
-        }
-        currentNode = currentNode.nextSibling;
-      }
-      const content = sectionContent.join(' ').trim();
-      if (content) {
-        sections.push({
-          header: currentHeader,
-          content,
-          tagName: currentHeaderTag,
-        });
-      }
-    }
-    return sections;
-  }
   transformDocuments(documents: Document[]): Document[] {
     const texts: string[] = [];
     const metadatas: Record<string, any>[] = [];