npm - @mastra/rag - Versions diffs - 1.0.6 → 1.0.7-alpha.0 - Mend

@mastra/rag 1.0.6 → 1.0.7-alpha.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.

Files changed (43)

package/.turbo/turbo-build.log +1 -1
package/CHANGELOG.md +12 -0
package/dist/document/document.d.ts +9 -8
package/dist/document/document.d.ts.map +1 -1
package/dist/document/transformers/character.d.ts +4 -26
package/dist/document/transformers/character.d.ts.map +1 -1
package/dist/document/transformers/html.d.ts +8 -3
package/dist/document/transformers/html.d.ts.map +1 -1
package/dist/document/transformers/json.d.ts +4 -4
package/dist/document/transformers/json.d.ts.map +1 -1
package/dist/document/transformers/latex.d.ts +2 -8
package/dist/document/transformers/latex.d.ts.map +1 -1
package/dist/document/transformers/markdown.d.ts +2 -8
package/dist/document/transformers/markdown.d.ts.map +1 -1
package/dist/document/transformers/sentence.d.ts +31 -0
package/dist/document/transformers/sentence.d.ts.map +1 -0
package/dist/document/transformers/text.d.ts +3 -3
package/dist/document/transformers/text.d.ts.map +1 -1
package/dist/document/transformers/token.d.ts +4 -15
package/dist/document/transformers/token.d.ts.map +1 -1
package/dist/document/types.d.ts +85 -14
package/dist/document/types.d.ts.map +1 -1
package/dist/document/validation.d.ts +3 -0
package/dist/document/validation.d.ts.map +1 -0
package/dist/index.cjs +414 -80
package/dist/index.cjs.map +1 -1
package/dist/index.js +414 -80
package/dist/index.js.map +1 -1
package/dist/tools/document-chunker.d.ts.map +1 -1
package/package.json +5 -5
package/src/document/document.test.ts +294 -39
package/src/document/document.ts +69 -41
package/src/document/transformers/character.ts +15 -43
package/src/document/transformers/html.ts +9 -9
package/src/document/transformers/json.ts +8 -3
package/src/document/transformers/latex.ts +3 -11
package/src/document/transformers/markdown.ts +3 -11
package/src/document/transformers/sentence.ts +314 -0
package/src/document/transformers/text.ts +10 -10
package/src/document/transformers/token.ts +6 -17
package/src/document/types.ts +66 -15
package/src/document/validation.ts +147 -0
package/src/tools/document-chunker.ts +12 -8

package/dist/index.cjs CHANGED

@@ -4477,24 +4477,24 @@ var Language = /* @__PURE__ */ ((Language2) => {
 // src/document/transformers/text.ts
 var TextTransformer = class {
-  size;
+  maxSize;
   overlap;
   lengthFunction;
   keepSeparator;
   addStartIndex;
   stripWhitespace;
   constructor({
-    size = 4e3,
+    maxSize = 4e3,
     overlap = 200,
     lengthFunction = (text) => text.length,
     keepSeparator = false,
     addStartIndex = false,
     stripWhitespace = true
   }) {
-    if (overlap > size) {
-      throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${size}), should be smaller.`);
+    if (overlap > maxSize) {
+      throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${maxSize}), should be smaller.`);
     }
-    this.size = size;
+    this.maxSize = maxSize;
     this.overlap = overlap;
     this.lengthFunction = lengthFunction;
     this.keepSeparator = keepSeparator;
@@ -4560,9 +4560,9 @@ var TextTransformer = class {
     for (const d of splits) {
       const len = this.lengthFunction(d);
       const separatorLen = separator ? this.lengthFunction(separator) : 0;
-      if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.size) {
-        if (total > this.size) {
-          console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.size}`);
+      if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.maxSize) {
+        if (total > this.maxSize) {
+          console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.maxSize}`);
         }
         if (currentDoc.length > 0) {
           const doc = this.joinDocs(currentDoc, separator);
@@ -4640,12 +4640,8 @@ function splitTextWithRegex(text, separator, keepSeparator) {
 var CharacterTransformer = class extends TextTransformer {
   separator;
   isSeparatorRegex;
-  constructor({
-    separator = "\n\n",
-    isSeparatorRegex = false,
-    options = {}
-  }) {
-    super(options);
+  constructor({ separator = "\n\n", isSeparatorRegex = false, ...baseOptions } = {}) {
+    super(baseOptions);
     this.separator = separator;
     this.isSeparatorRegex = isSeparatorRegex;
   }
@@ -4654,7 +4650,7 @@ var CharacterTransformer = class extends TextTransformer {
     const initialSplits = splitTextWithRegex(text, separator, this.keepSeparator);
     const chunks = [];
     for (const split of initialSplits) {
-      if (this.lengthFunction(split) <= this.size) {
+      if (this.lengthFunction(split) <= this.maxSize) {
         chunks.push(split);
       } else {
         const subChunks = this.__splitChunk(split);
@@ -4668,7 +4664,7 @@ var CharacterTransformer = class extends TextTransformer {
     let currentPosition = 0;
     while (currentPosition < text.length) {
       let chunkEnd = currentPosition;
-      while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.size) {
+      while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.maxSize) {
         chunkEnd++;
       }
       const currentChunk = text.slice(currentPosition, chunkEnd);
@@ -4683,12 +4679,8 @@ var CharacterTransformer = class extends TextTransformer {
 var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends TextTransformer {
   separators;
   isSeparatorRegex;
-  constructor({
-    separators,
-    isSeparatorRegex = false,
-    options = {}
-  }) {
-    super(options);
+  constructor({ separators, isSeparatorRegex = false, language, ...baseOptions } = {}) {
+    super(baseOptions);
     this.separators = separators || ["\n\n", "\n", " ", ""];
     this.isSeparatorRegex = isSeparatorRegex;
   }
@@ -4714,7 +4706,7 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
     const goodSplits = [];
     const mergeSeparator = this.keepSeparator ? "" : separator;
     for (const s of splits) {
-      if (this.lengthFunction(s) < this.size) {
+      if (this.lengthFunction(s) < this.maxSize) {
         goodSplits.push(s);
       } else {
         if (goodSplits.length > 0) {
@@ -4741,7 +4733,12 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
   }
   static fromLanguage(language, options = {}) {
     const separators = _RecursiveCharacterTransformer.getSeparatorsForLanguage(language);
-    return new _RecursiveCharacterTransformer({ separators, isSeparatorRegex: true, options });
+    return new _RecursiveCharacterTransformer({
+      ...options,
+      separators,
+      isSeparatorRegex: true,
+      language
+    });
   }
   static getSeparatorsForLanguage(language) {
     switch (language) {
@@ -4826,9 +4823,9 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
 var HTMLHeaderTransformer = class {
   headersToSplitOn;
   returnEachElement;
-  constructor(headersToSplitOn, returnEachElement = false) {
-    this.returnEachElement = returnEachElement;
-    this.headersToSplitOn = [...headersToSplitOn].sort();
+  constructor(options) {
+    this.returnEachElement = options.returnEachLine ?? false;
+    this.headersToSplitOn = [...options.headers].sort();
   }
   splitText({ text }) {
     const root = nodeHtmlBetterParser.parse(text);
@@ -4959,10 +4956,10 @@ var HTMLHeaderTransformer = class {
 };
 var HTMLSectionTransformer = class {
   headersToSplitOn;
-  options;
-  constructor(headersToSplitOn, options = {}) {
-    this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name14]) => [tag.toLowerCase(), name14]));
-    this.options = options;
+  textSplitter;
+  constructor(options) {
+    this.headersToSplitOn = Object.fromEntries(options.sections.map(([tag, name14]) => [tag.toLowerCase(), name14]));
+    this.textSplitter = new RecursiveCharacterTransformer(options);
   }
   splitText(text) {
     const sections = this.splitHtmlByHeaders(text);
@@ -5031,8 +5028,7 @@ var HTMLSectionTransformer = class {
       metadatas.push(doc.metadata);
     }
     const results = await this.createDocuments(texts, metadatas);
-    const textSplitter = new RecursiveCharacterTransformer({ options: this.options });
-    return textSplitter.splitDocuments(results);
+    return this.textSplitter.splitDocuments(results);
   }
   createDocuments(texts, metadatas) {
     const _metadatas = metadatas || Array(texts.length).fill({});
@@ -5074,9 +5070,13 @@ var HTMLSectionTransformer = class {
 var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
   maxSize;
   minSize;
-  constructor({ maxSize = 2e3, minSize }) {
+  ensureAscii;
+  convertLists;
+  constructor({ maxSize = 2e3, minSize, ensureAscii = false, convertLists = true }) {
     this.maxSize = maxSize;
     this.minSize = minSize ?? Math.max(maxSize - 200, 50);
+    this.ensureAscii = ensureAscii;
+    this.convertLists = convertLists;
   }
   static jsonSize(data) {
     const seen = /* @__PURE__ */ new WeakSet();
@@ -5208,7 +5208,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
    */
   isWithinSizeLimit(value, currentSize = 0) {
     const size = _RecursiveJsonTransformer.jsonSize(value);
-    return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize || currentSize < this.minSize;
+    return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize;
   }
   /**
    * Splits arrays into chunks based on size limits
@@ -5475,7 +5475,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
 var LatexTransformer = class extends RecursiveCharacterTransformer {
   constructor(options = {}) {
     const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage("latex" /* LATEX */);
-    super({ separators, isSeparatorRegex: true, options });
+    super({ ...options, separators, isSeparatorRegex: true });
   }
 };
@@ -5483,7 +5483,7 @@ var LatexTransformer = class extends RecursiveCharacterTransformer {
 var MarkdownTransformer = class extends RecursiveCharacterTransformer {
   constructor(options = {}) {
     const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage("markdown" /* MARKDOWN */);
-    super({ separators, isSeparatorRegex: true, options });
+    super({ ...options, separators, isSeparatorRegex: true });
   }
 };
 var MarkdownHeaderTransformer = class {
@@ -5650,6 +5650,239 @@ var MarkdownHeaderTransformer = class {
     return this.createDocuments(texts, metadatas);
   }
 };
+// src/document/transformers/sentence.ts
+var SentenceTransformer = class extends TextTransformer {
+  minSize;
+  maxSize;
+  targetSize;
+  sentenceEnders;
+  fallbackToWords;
+  fallbackToCharacters;
+  keepSeparator;
+  constructor(options) {
+    const parentOverlap = Math.min(options.overlap ?? 0, options.maxSize - 1);
+    const baseOptions = {
+      ...options,
+      overlap: parentOverlap
+      // Use adjusted overlap for parent
+    };
+    super(baseOptions);
+    this.maxSize = options.maxSize;
+    this.minSize = options.minSize ?? 50;
+    this.targetSize = options.targetSize ?? Math.floor(options.maxSize * 0.8);
+    this.sentenceEnders = options.sentenceEnders ?? [".", "!", "?"];
+    this.fallbackToWords = options.fallbackToWords ?? true;
+    this.fallbackToCharacters = options.fallbackToCharacters ?? true;
+    this.keepSeparator = options.keepSeparator ?? false;
+    this.overlap = options.overlap ?? 0;
+  }
+  detectSentenceBoundaries(text) {
+    if (!text) return [];
+    const sentences = [];
+    let currentSentence = "";
+    let i = 0;
+    while (i < text.length) {
+      const char = text[i];
+      if (!char) break;
+      currentSentence += char;
+      if (this.sentenceEnders.includes(char)) {
+        const remainingText = text.slice(i + 1);
+        if (this.isRealSentenceBoundary(currentSentence, remainingText)) {
+          sentences.push(currentSentence.trim());
+          currentSentence = "";
+        }
+      }
+      i++;
+    }
+    if (currentSentence.trim()) {
+      sentences.push(currentSentence.trim());
+    }
+    return sentences.filter((s) => s.length > 0);
+  }
+  isRealSentenceBoundary(currentSentence, remainingText) {
+    if (!remainingText.trim()) {
+      return true;
+    }
+    if (!/^\s+[A-Z]/.test(remainingText)) {
+      return false;
+    }
+    const words = currentSentence.trim().split(/\s+/);
+    const lastWord = words[words.length - 1] || "";
+    const baseWord = lastWord.slice(0, -1);
+    if (this.isCommonAbbreviation(baseWord)) {
+      return false;
+    }
+    return true;
+  }
+  isCommonAbbreviation(word) {
+    const titles = ["Dr", "Mr", "Mrs", "Ms", "Prof", "Sr", "Jr"];
+    if (titles.includes(word)) {
+      return true;
+    }
+    if (/^[A-Z](\.[A-Z])*$/.test(word) || /^[a-z](\.[a-z])*$/.test(word)) {
+      return true;
+    }
+    if (/^[A-Z]$/.test(word)) {
+      return true;
+    }
+    if (/^\d+$/.test(word)) {
+      return true;
+    }
+    if (/^[ap]\.?m$/i.test(word)) {
+      return true;
+    }
+    return false;
+  }
+  /**
+   * Group sentences into chunks with integrated overlap processing
+   */
+  groupSentencesIntoChunks(sentences) {
+    const chunks = [];
+    let currentChunk = [];
+    let currentSize = 0;
+    const separator = " ";
+    for (const sentence of sentences) {
+      const sentenceLength = this.lengthFunction(sentence);
+      const separatorLength = currentChunk.length > 0 ? this.lengthFunction(separator) : 0;
+      const totalLength = currentSize + sentenceLength + separatorLength;
+      if (sentenceLength > this.maxSize) {
+        if (currentChunk.length > 0) {
+          chunks.push(currentChunk.join(separator));
+          currentChunk = [];
+          currentSize = 0;
+        }
+        const fallbackChunks = this.handleOversizedSentence(sentence);
+        chunks.push(...fallbackChunks);
+        continue;
+      }
+      if (currentChunk.length > 0 && totalLength > this.maxSize) {
+        chunks.push(currentChunk.join(separator));
+        const overlapSentences = this.calculateSentenceOverlap(currentChunk);
+        currentChunk = overlapSentences;
+        currentSize = this.calculateChunkSize(currentChunk);
+      }
+      currentChunk.push(sentence);
+      currentSize += sentenceLength + separatorLength;
+      if (currentSize >= this.targetSize) {
+        chunks.push(currentChunk.join(separator));
+        const overlapSentences = this.calculateSentenceOverlap(currentChunk);
+        currentChunk = overlapSentences;
+        currentSize = this.calculateChunkSize(currentChunk);
+      }
+    }
+    if (currentChunk.length > 0) {
+      chunks.push(currentChunk.join(separator));
+    }
+    return chunks;
+  }
+  /**
+   * Handle oversized sentences with fallback strategies
+   */
+  handleOversizedSentence(sentence) {
+    if (this.fallbackToWords) {
+      const wordChunks = this.splitSentenceIntoWords(sentence);
+      if (wordChunks.length > 1) {
+        return wordChunks;
+      }
+    }
+    if (this.fallbackToCharacters) {
+      return this.splitSentenceIntoCharacters(sentence);
+    }
+    console.warn(
+      `Sentence exceeds maxSize (${this.maxSize}) and fallbacks are disabled: "${sentence.substring(0, 50)}..."`
+    );
+    return [sentence];
+  }
+  splitSentenceIntoWords(sentence) {
+    const words = sentence.split(/\s+/);
+    const chunks = [];
+    let currentChunk = "";
+    for (const word of words) {
+      const testChunk = currentChunk ? currentChunk + " " + word : word;
+      if (this.lengthFunction(testChunk) <= this.maxSize) {
+        currentChunk = testChunk;
+      } else {
+        if (currentChunk) {
+          chunks.push(currentChunk);
+        }
+        if (this.lengthFunction(word) > this.maxSize) {
+          if (this.fallbackToCharacters) {
+            chunks.push(...this.splitSentenceIntoCharacters(word));
+          } else {
+            chunks.push(word);
+          }
+          currentChunk = "";
+        } else {
+          currentChunk = word;
+        }
+      }
+    }
+    if (currentChunk) {
+      chunks.push(currentChunk);
+    }
+    return chunks;
+  }
+  splitSentenceIntoCharacters(text) {
+    const chunks = [];
+    let currentChunk = "";
+    for (const char of text) {
+      if (this.lengthFunction(currentChunk + char) <= this.maxSize) {
+        currentChunk += char;
+      } else {
+        if (currentChunk) {
+          chunks.push(currentChunk);
+        }
+        currentChunk = char;
+      }
+    }
+    if (currentChunk) {
+      chunks.push(currentChunk);
+    }
+    return chunks;
+  }
+  calculateSentenceOverlap(currentChunk) {
+    if (this.overlap === 0 || currentChunk.length === 0) {
+      return [];
+    }
+    const overlapSentences = [];
+    let overlapSize = 0;
+    const separator = " ";
+    for (let i = currentChunk.length - 1; i >= 0; i--) {
+      const sentence = currentChunk[i];
+      if (!sentence) continue;
+      const sentenceLength = this.lengthFunction(sentence);
+      const separatorLength = overlapSentences.length > 0 ? this.lengthFunction(separator) : 0;
+      if (overlapSize + sentenceLength + separatorLength > this.overlap) {
+        break;
+      }
+      overlapSentences.unshift(sentence);
+      overlapSize += sentenceLength + separatorLength;
+    }
+    return overlapSentences;
+  }
+  calculateChunkSize(sentences) {
+    if (!sentences || sentences.length === 0) {
+      return 0;
+    }
+    let totalSize = 0;
+    const separator = " ";
+    for (let i = 0; i < sentences.length; i++) {
+      const sentence = sentences[i];
+      totalSize += this.lengthFunction(sentence);
+      if (i < sentences.length - 1) {
+        totalSize += this.lengthFunction(separator);
+      }
+    }
+    return totalSize;
+  }
+  splitText({ text }) {
+    if (!text) return [];
+    const sentences = this.detectSentenceBoundaries(text);
+    const chunks = this.groupSentencesIntoChunks(sentences);
+    return chunks.filter((chunk) => chunk.trim().length > 0);
+  }
+};
 function splitTextOnTokens({ text, tokenizer }) {
   const splits = [];
   const inputIds = tokenizer.encode(text);
@@ -5700,7 +5933,7 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
     };
     const tokenizer = {
       overlap: this.overlap,
-      tokensPerChunk: this.size,
+      tokensPerChunk: this.maxSize,
       decode,
       encode
     };
@@ -5732,13 +5965,105 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
       allowedSpecial: options.allowedSpecial,
       disallowedSpecial: options.disallowedSpecial,
       options: {
-        size: options.size,
+        maxSize: options.maxSize,
         overlap: options.overlap,
         lengthFunction: tikTokenEncoder
       }
     });
   }
 };
+function handleDeprecatedSize(data) {
+  if (data.size !== void 0) {
+    console.warn(
+      "[DEPRECATION] `size` is deprecated. Use `maxSize` instead. This will be removed in the next major version."
+    );
+    if (data.maxSize === void 0) {
+      data.maxSize = data.size;
+    }
+  }
+  const { size, ...rest } = data;
+  return rest;
+}
+var baseChunkOptionsSchema = zod.z.object({
+  size: zod.z.number().positive().optional(),
+  maxSize: zod.z.number().positive().optional(),
+  overlap: zod.z.number().min(0).optional(),
+  lengthFunction: zod.z.function().optional(),
+  keepSeparator: zod.z.union([zod.z.boolean(), zod.z.literal("start"), zod.z.literal("end")]).optional(),
+  addStartIndex: zod.z.boolean().optional(),
+  stripWhitespace: zod.z.boolean().optional()
+});
+var characterChunkOptionsSchema = baseChunkOptionsSchema.extend({
+  separator: zod.z.string().optional(),
+  isSeparatorRegex: zod.z.boolean().optional()
+}).strict();
+var recursiveChunkOptionsSchema = baseChunkOptionsSchema.extend({
+  separators: zod.z.array(zod.z.string()).optional(),
+  isSeparatorRegex: zod.z.boolean().optional(),
+  language: zod.z.string().optional()
+}).strict();
+var sentenceChunkOptionsSchema = baseChunkOptionsSchema.extend({
+  maxSize: zod.z.number().positive(),
+  minSize: zod.z.number().positive().optional(),
+  targetSize: zod.z.number().positive().optional(),
+  sentenceEnders: zod.z.array(zod.z.string()).optional(),
+  fallbackToWords: zod.z.boolean().optional(),
+  fallbackToCharacters: zod.z.boolean().optional()
+}).strict();
+var isSetLike = (value) => {
+  return typeof value === "object" && value !== null && typeof value.has === "function" && typeof value.add === "function" && typeof value.delete === "function" && typeof value.clear === "function" && typeof value.size === "number";
+};
+var setOrAllSchema = zod.z.any().refine((value) => value === "all" || isSetLike(value), {
+  message: "Must be a Set object or the literal 'all'"
+}).optional();
+var tokenChunkOptionsSchema = baseChunkOptionsSchema.extend({
+  encodingName: zod.z.string().optional(),
+  modelName: zod.z.string().optional(),
+  allowedSpecial: setOrAllSchema,
+  disallowedSpecial: setOrAllSchema
+}).strict();
+var jsonChunkOptionsSchema = baseChunkOptionsSchema.extend({
+  minSize: zod.z.number().positive().optional(),
+  ensureAscii: zod.z.boolean().optional(),
+  convertLists: zod.z.boolean().optional()
+}).strict();
+var htmlChunkOptionsSchema = baseChunkOptionsSchema.extend({
+  headers: zod.z.array(zod.z.tuple([zod.z.string(), zod.z.string()])).optional(),
+  sections: zod.z.array(zod.z.tuple([zod.z.string(), zod.z.string()])).optional(),
+  returnEachLine: zod.z.boolean().optional()
+}).strict();
+var markdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
+  headers: zod.z.array(zod.z.tuple([zod.z.string(), zod.z.string()])).optional(),
+  returnEachLine: zod.z.boolean().optional(),
+  stripHeaders: zod.z.boolean().optional()
+}).strict();
+var latexChunkOptionsSchema = baseChunkOptionsSchema.strict();
+var validationSchemas = {
+  character: characterChunkOptionsSchema.transform(handleDeprecatedSize),
+  recursive: recursiveChunkOptionsSchema.transform(handleDeprecatedSize),
+  sentence: sentenceChunkOptionsSchema.transform(handleDeprecatedSize),
+  token: tokenChunkOptionsSchema.transform(handleDeprecatedSize),
+  json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
+  html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
+  markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
+  latex: latexChunkOptionsSchema.transform(handleDeprecatedSize)
+};
+function validateChunkParams(strategy, params) {
+  const schema = validationSchemas[strategy];
+  if (!schema) {
+    throw new Error(`Unknown chunking strategy: ${strategy}`);
+  }
+  const result = schema.safeParse(params);
+  if (!result.success) {
+    const unrecognizedError = result.error.errors.find((e) => e.code === "unrecognized_keys");
+    if (unrecognizedError && "keys" in unrecognizedError) {
+      const keys = unrecognizedError.keys.join(", ");
+      throw new Error(`Invalid parameters for ${strategy} strategy: '${keys}' not supported`);
+    }
+    const errorMessage = result.error.errors.map((e) => `${e.path.length > 0 ? e.path.join(".") : "parameter"}: ${e.message}`).join(", ");
+    throw new Error(`Invalid parameters for ${strategy} strategy: ${errorMessage}`);
+  }
+}
 // src/document/document.ts
 var MDocument = class _MDocument {
@@ -5851,30 +6176,21 @@ var MDocument = class _MDocument {
     }
   }
   async chunkBy(strategy, options) {
-    switch (strategy) {
-      case "recursive":
-        await this.chunkRecursive(options);
-        break;
-      case "character":
-        await this.chunkCharacter(options);
-        break;
-      case "token":
-        await this.chunkToken(options);
-        break;
-      case "markdown":
-        await this.chunkMarkdown(options);
-        break;
-      case "html":
-        await this.chunkHTML(options);
-        break;
-      case "json":
-        await this.chunkJSON(options);
-        break;
-      case "latex":
-        await this.chunkLatex(options);
-        break;
-      default:
-        throw new Error(`Unknown strategy: ${strategy}`);
+    const strategyMap = {
+      recursive: (options2) => this.chunkRecursive(options2),
+      character: (options2) => this.chunkCharacter(options2),
+      token: (options2) => this.chunkToken(options2),
+      markdown: (options2) => this.chunkMarkdown(options2),
+      html: (options2) => this.chunkHTML(options2),
+      json: (options2) => this.chunkJSON(options2),
+      latex: (options2) => this.chunkLatex(options2),
+      sentence: (options2) => this.chunkSentence(options2)
+    };
+    const chunkingFunc = strategyMap[strategy];
+    if (chunkingFunc) {
+      await chunkingFunc(options);
+    } else {
+      throw new Error(`Unknown strategy: ${strategy}`);
     }
   }
   async chunkRecursive(options) {
@@ -5884,32 +6200,28 @@ var MDocument = class _MDocument {
       this.chunks = textSplit2;
       return;
     }
-    const rt = new RecursiveCharacterTransformer({
-      separators: options?.separators,
-      isSeparatorRegex: options?.isSeparatorRegex,
-      options
-    });
+    const rt = new RecursiveCharacterTransformer(options);
     const textSplit = rt.transformDocuments(this.chunks);
     this.chunks = textSplit;
   }
   async chunkCharacter(options) {
     const rt = new CharacterTransformer({
+      ...options,
       separator: options?.separator,
-      isSeparatorRegex: options?.isSeparatorRegex,
-      options
+      isSeparatorRegex: options?.isSeparatorRegex
     });
     const textSplit = rt.transformDocuments(this.chunks);
     this.chunks = textSplit;
   }
   async chunkHTML(options) {
     if (options?.headers?.length) {
-      const rt = new HTMLHeaderTransformer(options.headers, options?.returnEachLine);
+      const rt = new HTMLHeaderTransformer(options);
       const textSplit = rt.transformDocuments(this.chunks);
       this.chunks = textSplit;
       return;
     }
     if (options?.sections?.length) {
-      const rt = new HTMLSectionTransformer(options.sections);
+      const rt = new HTMLSectionTransformer(options);
       const textSplit = rt.transformDocuments(this.chunks);
       this.chunks = textSplit;
       return;
@@ -5956,9 +6268,30 @@ var MDocument = class _MDocument {
     const textSplit = rt.transformDocuments(this.chunks);
     this.chunks = textSplit;
   }
+  async chunkSentence(options) {
+    if (!options?.maxSize) {
+      throw new Error("Sentence chunking requires maxSize to be specified");
+    }
+    const rt = new SentenceTransformer({
+      minSize: options?.minSize,
+      maxSize: options?.maxSize,
+      targetSize: options?.targetSize,
+      overlap: options?.overlap,
+      sentenceEnders: options?.sentenceEnders,
+      fallbackToWords: options?.fallbackToWords,
+      fallbackToCharacters: options?.fallbackToCharacters,
+      keepSeparator: options?.keepSeparator,
+      lengthFunction: options?.lengthFunction,
+      addStartIndex: options?.addStartIndex,
+      stripWhitespace: options?.stripWhitespace
+    });
+    const textSplit = rt.transformDocuments(this.chunks);
+    this.chunks = textSplit;
+  }
   async chunk(params) {
     const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
     const strategy = passedStrategy || this.defaultStrategy();
+    validateChunkParams(strategy, chunkOptions);
     await this.chunkBy(strategy, chunkOptions);
     if (extract) {
       await this.extractMetadata(extract);
@@ -6351,19 +6684,20 @@ var GraphRAG = class {
     }));
   }
 };
+var DEFAULT_CHUNK_PARAMS = {
+  strategy: "recursive",
+  maxSize: 512,
+  overlap: 50,
+  separators: ["\n"]
+};
 var createDocumentChunkerTool = ({
   doc,
-  params = {
-    strategy: "recursive",
-    size: 512,
-    overlap: 50,
-    separator: "\n"
-  }
+  params = DEFAULT_CHUNK_PARAMS
 }) => {
   return tools.createTool({
-    id: `Document Chunker ${params.strategy} ${params.size}`,
+    id: `Document Chunker ${params.strategy} ${params.maxSize}`,
     inputSchema: zod.z.object({}),
-    description: `Chunks document using ${params.strategy} strategy with size ${params.size} and ${params.overlap} overlap`,
+    description: `Chunks document using ${params.strategy} strategy with maxSize ${params.maxSize} and ${params.overlap || 0} overlap`,
     execute: async () => {
       const chunks = await doc.chunk(params);
       return {