npm - @mastra/rag - Versions diffs - 1.0.7 → 1.0.8-alpha.0 - Mend

@mastra/rag 1.0.7 → 1.0.8-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/.turbo/turbo-build.log +1 -1
package/CHANGELOG.md +11 -0
package/dist/document/document.d.ts +4 -1
package/dist/document/document.d.ts.map +1 -1
package/dist/document/transformers/semantic-markdown.d.ts +25 -0
package/dist/document/transformers/semantic-markdown.d.ts.map +1 -0
package/dist/document/types.d.ts +13 -1
package/dist/document/types.d.ts.map +1 -1
package/dist/document/validation.d.ts.map +1 -1
package/dist/index.cjs +197 -11
package/dist/index.cjs.map +1 -1
package/dist/index.js +197 -11
package/dist/index.js.map +1 -1
package/package.json +4 -4
package/src/document/document.test.ts +644 -1
package/src/document/document.ts +32 -12
package/src/document/transformers/semantic-markdown.ts +227 -0
package/src/document/types.ts +21 -2
package/src/document/validation.ts +11 -0

package/dist/index.js CHANGED Viewed

@@ -5644,6 +5644,168 @@ var MarkdownHeaderTransformer = class {
     return this.createDocuments(texts, metadatas);
   }
 };
+var SemanticMarkdownTransformer = class _SemanticMarkdownTransformer extends TextTransformer {
+  tokenizer;
+  joinThreshold;
+  allowedSpecial;
+  disallowedSpecial;
+  constructor({
+    joinThreshold = 500,
+    encodingName = "cl100k_base",
+    modelName,
+    allowedSpecial = /* @__PURE__ */ new Set(),
+    disallowedSpecial = "all",
+    ...baseOptions
+  } = {}) {
+    super(baseOptions);
+    this.joinThreshold = joinThreshold;
+    this.allowedSpecial = allowedSpecial;
+    this.disallowedSpecial = disallowedSpecial;
+    try {
+      this.tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
+    } catch {
+      throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
+    }
+  }
+  countTokens(text) {
+    const allowed = this.allowedSpecial === "all" ? "all" : Array.from(this.allowedSpecial);
+    const disallowed = this.disallowedSpecial === "all" ? "all" : Array.from(this.disallowedSpecial);
+    const processedText = this.stripWhitespace ? text.trim() : text;
+    return this.tokenizer.encode(processedText, allowed, disallowed).length;
+  }
+  splitMarkdownByHeaders(markdown) {
+    const sections = [];
+    const lines = markdown.split("\n");
+    let currentContent = "";
+    let currentTitle = "";
+    let currentDepth = 0;
+    let inCodeBlock = false;
+    const headerRegex = /^(#+)\s+(.+)$/;
+    for (let i = 0; i < lines.length; i++) {
+      const line = lines[i];
+      const headerMatch = line.match(headerRegex);
+      if (line.startsWith("```") || line.startsWith("~~~")) {
+        inCodeBlock = !inCodeBlock;
+      }
+      if (headerMatch && !inCodeBlock) {
+        if (currentContent.trim() !== "" || currentTitle && currentDepth > 0) {
+          sections.push({
+            title: currentTitle,
+            content: currentContent.trim(),
+            depth: currentDepth,
+            length: this.countTokens(currentContent.trim())
+          });
+        }
+        currentContent = "";
+        currentDepth = headerMatch[1].length;
+        currentTitle = headerMatch[2];
+      } else {
+        currentContent += line + "\n";
+      }
+    }
+    if (currentContent.trim() !== "") {
+      sections.push({
+        title: currentTitle,
+        content: currentContent.trim(),
+        depth: currentDepth,
+        length: this.countTokens(currentContent.trim())
+      });
+    }
+    if (sections.length > 1 && sections[0].title === "" && sections[0].content.trim() === "") {
+      sections.shift();
+    }
+    return sections;
+  }
+  mergeSemanticSections(sections) {
+    if (sections.length === 0) return sections;
+    const workingSections = [...sections];
+    const deepest = Math.max(...workingSections.map((s) => s.depth));
+    for (let depth = deepest; depth > 0; depth--) {
+      for (let j = 1; j < workingSections.length; j++) {
+        const current = workingSections[j];
+        if (current.depth === depth) {
+          const prev = workingSections[j - 1];
+          if (prev.length + current.length < this.joinThreshold && prev.depth <= current.depth) {
+            const title = `${"#".repeat(current.depth)} ${current.title}`;
+            const formattedTitle = `
+${title}`;
+            prev.content += `${formattedTitle}
+${current.content}`;
+            prev.length = this.countTokens(prev.content);
+            workingSections.splice(j, 1);
+            j--;
+          }
+        }
+      }
+    }
+    return workingSections;
+  }
+  splitText({ text }) {
+    if (!text.trim()) return [];
+    const initialSections = this.splitMarkdownByHeaders(text);
+    const mergedSections = this.mergeSemanticSections(initialSections);
+    return mergedSections.map((section) => {
+      if (section.title) {
+        const header = `${"#".repeat(section.depth)} ${section.title}`;
+        return `${header}
+${section.content}`;
+      }
+      return section.content;
+    });
+  }
+  createDocuments(texts, metadatas) {
+    const _metadatas = metadatas || Array(texts.length).fill({});
+    const documents = [];
+    texts.forEach((text, i) => {
+      this.splitText({ text }).forEach((chunk) => {
+        const metadata = {
+          ..._metadatas[i],
+          tokenCount: this.countTokens(chunk)
+        };
+        documents.push(
+          new Document({
+            text: chunk,
+            metadata
+          })
+        );
+      });
+    });
+    return documents;
+  }
+  transformDocuments(documents) {
+    const texts = [];
+    const metadatas = [];
+    for (const doc of documents) {
+      texts.push(doc.text);
+      metadatas.push(doc.metadata);
+    }
+    return this.createDocuments(texts, metadatas);
+  }
+  static fromTikToken({
+    encodingName = "cl100k_base",
+    modelName,
+    options = {}
+  }) {
+    let tokenizer;
+    try {
+      tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
+    } catch {
+      throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
+    }
+    const tikTokenCounter = (text) => {
+      const allowed = options.allowedSpecial === "all" ? "all" : options.allowedSpecial ? Array.from(options.allowedSpecial) : [];
+      const disallowed = options.disallowedSpecial === "all" ? "all" : options.disallowedSpecial ? Array.from(options.disallowedSpecial) : [];
+      return tokenizer.encode(text, allowed, disallowed).length;
+    };
+    return new _SemanticMarkdownTransformer({
+      ...options,
+      encodingName,
+      modelName,
+      lengthFunction: tikTokenCounter
+    });
+  }
+};
 // src/document/transformers/sentence.ts
 var SentenceTransformer = class extends TextTransformer {
@@ -6031,6 +6193,13 @@ var markdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
   returnEachLine: z.boolean().optional(),
   stripHeaders: z.boolean().optional()
 }).strict();
+var semanticMarkdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
+  joinThreshold: z.number().positive().optional(),
+  encodingName: z.string().optional(),
+  modelName: z.string().optional(),
+  allowedSpecial: setOrAllSchema,
+  disallowedSpecial: setOrAllSchema
+}).strict();
 var latexChunkOptionsSchema = baseChunkOptionsSchema.strict();
 var validationSchemas = {
   character: characterChunkOptionsSchema.transform(handleDeprecatedSize),
@@ -6040,6 +6209,7 @@ var validationSchemas = {
   json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
   html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
   markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
+  "semantic-markdown": semanticMarkdownChunkOptionsSchema.transform(handleDeprecatedSize),
   latex: latexChunkOptionsSchema.transform(handleDeprecatedSize)
 };
 function validateChunkParams(strategy, params) {
@@ -6169,18 +6339,25 @@ var MDocument = class _MDocument {
         return "recursive";
     }
   }
+  _strategyMap;
+  get strategyMap() {
+    if (!this._strategyMap) {
+      this._strategyMap = {
+        recursive: (options) => this.chunkRecursive(options),
+        character: (options) => this.chunkCharacter(options),
+        token: (options) => this.chunkToken(options),
+        markdown: (options) => this.chunkMarkdown(options),
+        html: (options) => this.chunkHTML(options),
+        json: (options) => this.chunkJSON(options),
+        latex: (options) => this.chunkLatex(options),
+        sentence: (options) => this.chunkSentence(options),
+        "semantic-markdown": (options) => this.chunkSemanticMarkdown(options)
+      };
+    }
+    return this._strategyMap;
+  }
   async chunkBy(strategy, options) {
-    const strategyMap = {
-      recursive: (options2) => this.chunkRecursive(options2),
-      character: (options2) => this.chunkCharacter(options2),
-      token: (options2) => this.chunkToken(options2),
-      markdown: (options2) => this.chunkMarkdown(options2),
-      html: (options2) => this.chunkHTML(options2),
-      json: (options2) => this.chunkJSON(options2),
-      latex: (options2) => this.chunkLatex(options2),
-      sentence: (options2) => this.chunkSentence(options2)
-    };
-    const chunkingFunc = strategyMap[strategy];
+    const chunkingFunc = this.strategyMap[strategy];
     if (chunkingFunc) {
       await chunkingFunc(options);
     } else {
@@ -6282,6 +6459,15 @@ var MDocument = class _MDocument {
     const textSplit = rt.transformDocuments(this.chunks);
     this.chunks = textSplit;
   }
+  async chunkSemanticMarkdown(options) {
+    const rt = SemanticMarkdownTransformer.fromTikToken({
+      options,
+      encodingName: options?.encodingName,
+      modelName: options?.modelName
+    });
+    const textSplit = rt.transformDocuments(this.chunks);
+    this.chunks = textSplit;
+  }
   async chunk(params) {
     const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
     const strategy = passedStrategy || this.defaultStrategy();