@mastra/rag 1.0.6 → 1.0.7-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +12 -0
  3. package/dist/document/document.d.ts +9 -8
  4. package/dist/document/document.d.ts.map +1 -1
  5. package/dist/document/transformers/character.d.ts +4 -26
  6. package/dist/document/transformers/character.d.ts.map +1 -1
  7. package/dist/document/transformers/html.d.ts +8 -3
  8. package/dist/document/transformers/html.d.ts.map +1 -1
  9. package/dist/document/transformers/json.d.ts +4 -4
  10. package/dist/document/transformers/json.d.ts.map +1 -1
  11. package/dist/document/transformers/latex.d.ts +2 -8
  12. package/dist/document/transformers/latex.d.ts.map +1 -1
  13. package/dist/document/transformers/markdown.d.ts +2 -8
  14. package/dist/document/transformers/markdown.d.ts.map +1 -1
  15. package/dist/document/transformers/sentence.d.ts +31 -0
  16. package/dist/document/transformers/sentence.d.ts.map +1 -0
  17. package/dist/document/transformers/text.d.ts +3 -3
  18. package/dist/document/transformers/text.d.ts.map +1 -1
  19. package/dist/document/transformers/token.d.ts +4 -15
  20. package/dist/document/transformers/token.d.ts.map +1 -1
  21. package/dist/document/types.d.ts +85 -14
  22. package/dist/document/types.d.ts.map +1 -1
  23. package/dist/document/validation.d.ts +3 -0
  24. package/dist/document/validation.d.ts.map +1 -0
  25. package/dist/index.cjs +414 -80
  26. package/dist/index.cjs.map +1 -1
  27. package/dist/index.js +414 -80
  28. package/dist/index.js.map +1 -1
  29. package/dist/tools/document-chunker.d.ts.map +1 -1
  30. package/package.json +5 -5
  31. package/src/document/document.test.ts +294 -39
  32. package/src/document/document.ts +69 -41
  33. package/src/document/transformers/character.ts +15 -43
  34. package/src/document/transformers/html.ts +9 -9
  35. package/src/document/transformers/json.ts +8 -3
  36. package/src/document/transformers/latex.ts +3 -11
  37. package/src/document/transformers/markdown.ts +3 -11
  38. package/src/document/transformers/sentence.ts +314 -0
  39. package/src/document/transformers/text.ts +10 -10
  40. package/src/document/transformers/token.ts +6 -17
  41. package/src/document/types.ts +66 -15
  42. package/src/document/validation.ts +147 -0
  43. package/src/tools/document-chunker.ts +12 -8
package/dist/index.cjs CHANGED
@@ -4477,24 +4477,24 @@ var Language = /* @__PURE__ */ ((Language2) => {
4477
4477
 
4478
4478
  // src/document/transformers/text.ts
4479
4479
  var TextTransformer = class {
4480
- size;
4480
+ maxSize;
4481
4481
  overlap;
4482
4482
  lengthFunction;
4483
4483
  keepSeparator;
4484
4484
  addStartIndex;
4485
4485
  stripWhitespace;
4486
4486
  constructor({
4487
- size = 4e3,
4487
+ maxSize = 4e3,
4488
4488
  overlap = 200,
4489
4489
  lengthFunction = (text) => text.length,
4490
4490
  keepSeparator = false,
4491
4491
  addStartIndex = false,
4492
4492
  stripWhitespace = true
4493
4493
  }) {
4494
- if (overlap > size) {
4495
- throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${size}), should be smaller.`);
4494
+ if (overlap > maxSize) {
4495
+ throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${maxSize}), should be smaller.`);
4496
4496
  }
4497
- this.size = size;
4497
+ this.maxSize = maxSize;
4498
4498
  this.overlap = overlap;
4499
4499
  this.lengthFunction = lengthFunction;
4500
4500
  this.keepSeparator = keepSeparator;
@@ -4560,9 +4560,9 @@ var TextTransformer = class {
4560
4560
  for (const d of splits) {
4561
4561
  const len = this.lengthFunction(d);
4562
4562
  const separatorLen = separator ? this.lengthFunction(separator) : 0;
4563
- if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.size) {
4564
- if (total > this.size) {
4565
- console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.size}`);
4563
+ if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.maxSize) {
4564
+ if (total > this.maxSize) {
4565
+ console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.maxSize}`);
4566
4566
  }
4567
4567
  if (currentDoc.length > 0) {
4568
4568
  const doc = this.joinDocs(currentDoc, separator);
@@ -4640,12 +4640,8 @@ function splitTextWithRegex(text, separator, keepSeparator) {
4640
4640
  var CharacterTransformer = class extends TextTransformer {
4641
4641
  separator;
4642
4642
  isSeparatorRegex;
4643
- constructor({
4644
- separator = "\n\n",
4645
- isSeparatorRegex = false,
4646
- options = {}
4647
- }) {
4648
- super(options);
4643
+ constructor({ separator = "\n\n", isSeparatorRegex = false, ...baseOptions } = {}) {
4644
+ super(baseOptions);
4649
4645
  this.separator = separator;
4650
4646
  this.isSeparatorRegex = isSeparatorRegex;
4651
4647
  }
@@ -4654,7 +4650,7 @@ var CharacterTransformer = class extends TextTransformer {
4654
4650
  const initialSplits = splitTextWithRegex(text, separator, this.keepSeparator);
4655
4651
  const chunks = [];
4656
4652
  for (const split of initialSplits) {
4657
- if (this.lengthFunction(split) <= this.size) {
4653
+ if (this.lengthFunction(split) <= this.maxSize) {
4658
4654
  chunks.push(split);
4659
4655
  } else {
4660
4656
  const subChunks = this.__splitChunk(split);
@@ -4668,7 +4664,7 @@ var CharacterTransformer = class extends TextTransformer {
4668
4664
  let currentPosition = 0;
4669
4665
  while (currentPosition < text.length) {
4670
4666
  let chunkEnd = currentPosition;
4671
- while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.size) {
4667
+ while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.maxSize) {
4672
4668
  chunkEnd++;
4673
4669
  }
4674
4670
  const currentChunk = text.slice(currentPosition, chunkEnd);
@@ -4683,12 +4679,8 @@ var CharacterTransformer = class extends TextTransformer {
4683
4679
  var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends TextTransformer {
4684
4680
  separators;
4685
4681
  isSeparatorRegex;
4686
- constructor({
4687
- separators,
4688
- isSeparatorRegex = false,
4689
- options = {}
4690
- }) {
4691
- super(options);
4682
+ constructor({ separators, isSeparatorRegex = false, language, ...baseOptions } = {}) {
4683
+ super(baseOptions);
4692
4684
  this.separators = separators || ["\n\n", "\n", " ", ""];
4693
4685
  this.isSeparatorRegex = isSeparatorRegex;
4694
4686
  }
@@ -4714,7 +4706,7 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
4714
4706
  const goodSplits = [];
4715
4707
  const mergeSeparator = this.keepSeparator ? "" : separator;
4716
4708
  for (const s of splits) {
4717
- if (this.lengthFunction(s) < this.size) {
4709
+ if (this.lengthFunction(s) < this.maxSize) {
4718
4710
  goodSplits.push(s);
4719
4711
  } else {
4720
4712
  if (goodSplits.length > 0) {
@@ -4741,7 +4733,12 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
4741
4733
  }
4742
4734
  static fromLanguage(language, options = {}) {
4743
4735
  const separators = _RecursiveCharacterTransformer.getSeparatorsForLanguage(language);
4744
- return new _RecursiveCharacterTransformer({ separators, isSeparatorRegex: true, options });
4736
+ return new _RecursiveCharacterTransformer({
4737
+ ...options,
4738
+ separators,
4739
+ isSeparatorRegex: true,
4740
+ language
4741
+ });
4745
4742
  }
4746
4743
  static getSeparatorsForLanguage(language) {
4747
4744
  switch (language) {
@@ -4826,9 +4823,9 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
4826
4823
  var HTMLHeaderTransformer = class {
4827
4824
  headersToSplitOn;
4828
4825
  returnEachElement;
4829
- constructor(headersToSplitOn, returnEachElement = false) {
4830
- this.returnEachElement = returnEachElement;
4831
- this.headersToSplitOn = [...headersToSplitOn].sort();
4826
+ constructor(options) {
4827
+ this.returnEachElement = options.returnEachLine ?? false;
4828
+ this.headersToSplitOn = [...options.headers].sort();
4832
4829
  }
4833
4830
  splitText({ text }) {
4834
4831
  const root = nodeHtmlBetterParser.parse(text);
@@ -4959,10 +4956,10 @@ var HTMLHeaderTransformer = class {
4959
4956
  };
4960
4957
  var HTMLSectionTransformer = class {
4961
4958
  headersToSplitOn;
4962
- options;
4963
- constructor(headersToSplitOn, options = {}) {
4964
- this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name14]) => [tag.toLowerCase(), name14]));
4965
- this.options = options;
4959
+ textSplitter;
4960
+ constructor(options) {
4961
+ this.headersToSplitOn = Object.fromEntries(options.sections.map(([tag, name14]) => [tag.toLowerCase(), name14]));
4962
+ this.textSplitter = new RecursiveCharacterTransformer(options);
4966
4963
  }
4967
4964
  splitText(text) {
4968
4965
  const sections = this.splitHtmlByHeaders(text);
@@ -5031,8 +5028,7 @@ var HTMLSectionTransformer = class {
5031
5028
  metadatas.push(doc.metadata);
5032
5029
  }
5033
5030
  const results = await this.createDocuments(texts, metadatas);
5034
- const textSplitter = new RecursiveCharacterTransformer({ options: this.options });
5035
- return textSplitter.splitDocuments(results);
5031
+ return this.textSplitter.splitDocuments(results);
5036
5032
  }
5037
5033
  createDocuments(texts, metadatas) {
5038
5034
  const _metadatas = metadatas || Array(texts.length).fill({});
@@ -5074,9 +5070,13 @@ var HTMLSectionTransformer = class {
5074
5070
  var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
5075
5071
  maxSize;
5076
5072
  minSize;
5077
- constructor({ maxSize = 2e3, minSize }) {
5073
+ ensureAscii;
5074
+ convertLists;
5075
+ constructor({ maxSize = 2e3, minSize, ensureAscii = false, convertLists = true }) {
5078
5076
  this.maxSize = maxSize;
5079
5077
  this.minSize = minSize ?? Math.max(maxSize - 200, 50);
5078
+ this.ensureAscii = ensureAscii;
5079
+ this.convertLists = convertLists;
5080
5080
  }
5081
5081
  static jsonSize(data) {
5082
5082
  const seen = /* @__PURE__ */ new WeakSet();
@@ -5208,7 +5208,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
5208
5208
  */
5209
5209
  isWithinSizeLimit(value, currentSize = 0) {
5210
5210
  const size = _RecursiveJsonTransformer.jsonSize(value);
5211
- return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize || currentSize < this.minSize;
5211
+ return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize;
5212
5212
  }
5213
5213
  /**
5214
5214
  * Splits arrays into chunks based on size limits
@@ -5475,7 +5475,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
5475
5475
  var LatexTransformer = class extends RecursiveCharacterTransformer {
5476
5476
  constructor(options = {}) {
5477
5477
  const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage("latex" /* LATEX */);
5478
- super({ separators, isSeparatorRegex: true, options });
5478
+ super({ ...options, separators, isSeparatorRegex: true });
5479
5479
  }
5480
5480
  };
5481
5481
 
@@ -5483,7 +5483,7 @@ var LatexTransformer = class extends RecursiveCharacterTransformer {
5483
5483
  var MarkdownTransformer = class extends RecursiveCharacterTransformer {
5484
5484
  constructor(options = {}) {
5485
5485
  const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage("markdown" /* MARKDOWN */);
5486
- super({ separators, isSeparatorRegex: true, options });
5486
+ super({ ...options, separators, isSeparatorRegex: true });
5487
5487
  }
5488
5488
  };
5489
5489
  var MarkdownHeaderTransformer = class {
@@ -5650,6 +5650,239 @@ var MarkdownHeaderTransformer = class {
5650
5650
  return this.createDocuments(texts, metadatas);
5651
5651
  }
5652
5652
  };
5653
+
5654
+ // src/document/transformers/sentence.ts
5655
+ var SentenceTransformer = class extends TextTransformer {
5656
+ minSize;
5657
+ maxSize;
5658
+ targetSize;
5659
+ sentenceEnders;
5660
+ fallbackToWords;
5661
+ fallbackToCharacters;
5662
+ keepSeparator;
5663
+ constructor(options) {
5664
+ const parentOverlap = Math.min(options.overlap ?? 0, options.maxSize - 1);
5665
+ const baseOptions = {
5666
+ ...options,
5667
+ overlap: parentOverlap
5668
+ // Use adjusted overlap for parent
5669
+ };
5670
+ super(baseOptions);
5671
+ this.maxSize = options.maxSize;
5672
+ this.minSize = options.minSize ?? 50;
5673
+ this.targetSize = options.targetSize ?? Math.floor(options.maxSize * 0.8);
5674
+ this.sentenceEnders = options.sentenceEnders ?? [".", "!", "?"];
5675
+ this.fallbackToWords = options.fallbackToWords ?? true;
5676
+ this.fallbackToCharacters = options.fallbackToCharacters ?? true;
5677
+ this.keepSeparator = options.keepSeparator ?? false;
5678
+ this.overlap = options.overlap ?? 0;
5679
+ }
5680
+ detectSentenceBoundaries(text) {
5681
+ if (!text) return [];
5682
+ const sentences = [];
5683
+ let currentSentence = "";
5684
+ let i = 0;
5685
+ while (i < text.length) {
5686
+ const char = text[i];
5687
+ if (!char) break;
5688
+ currentSentence += char;
5689
+ if (this.sentenceEnders.includes(char)) {
5690
+ const remainingText = text.slice(i + 1);
5691
+ if (this.isRealSentenceBoundary(currentSentence, remainingText)) {
5692
+ sentences.push(currentSentence.trim());
5693
+ currentSentence = "";
5694
+ }
5695
+ }
5696
+ i++;
5697
+ }
5698
+ if (currentSentence.trim()) {
5699
+ sentences.push(currentSentence.trim());
5700
+ }
5701
+ return sentences.filter((s) => s.length > 0);
5702
+ }
5703
+ isRealSentenceBoundary(currentSentence, remainingText) {
5704
+ if (!remainingText.trim()) {
5705
+ return true;
5706
+ }
5707
+ if (!/^\s+[A-Z]/.test(remainingText)) {
5708
+ return false;
5709
+ }
5710
+ const words = currentSentence.trim().split(/\s+/);
5711
+ const lastWord = words[words.length - 1] || "";
5712
+ const baseWord = lastWord.slice(0, -1);
5713
+ if (this.isCommonAbbreviation(baseWord)) {
5714
+ return false;
5715
+ }
5716
+ return true;
5717
+ }
5718
+ isCommonAbbreviation(word) {
5719
+ const titles = ["Dr", "Mr", "Mrs", "Ms", "Prof", "Sr", "Jr"];
5720
+ if (titles.includes(word)) {
5721
+ return true;
5722
+ }
5723
+ if (/^[A-Z](\.[A-Z])*$/.test(word) || /^[a-z](\.[a-z])*$/.test(word)) {
5724
+ return true;
5725
+ }
5726
+ if (/^[A-Z]$/.test(word)) {
5727
+ return true;
5728
+ }
5729
+ if (/^\d+$/.test(word)) {
5730
+ return true;
5731
+ }
5732
+ if (/^[ap]\.?m$/i.test(word)) {
5733
+ return true;
5734
+ }
5735
+ return false;
5736
+ }
5737
+ /**
5738
+ * Group sentences into chunks with integrated overlap processing
5739
+ */
5740
+ groupSentencesIntoChunks(sentences) {
5741
+ const chunks = [];
5742
+ let currentChunk = [];
5743
+ let currentSize = 0;
5744
+ const separator = " ";
5745
+ for (const sentence of sentences) {
5746
+ const sentenceLength = this.lengthFunction(sentence);
5747
+ const separatorLength = currentChunk.length > 0 ? this.lengthFunction(separator) : 0;
5748
+ const totalLength = currentSize + sentenceLength + separatorLength;
5749
+ if (sentenceLength > this.maxSize) {
5750
+ if (currentChunk.length > 0) {
5751
+ chunks.push(currentChunk.join(separator));
5752
+ currentChunk = [];
5753
+ currentSize = 0;
5754
+ }
5755
+ const fallbackChunks = this.handleOversizedSentence(sentence);
5756
+ chunks.push(...fallbackChunks);
5757
+ continue;
5758
+ }
5759
+ if (currentChunk.length > 0 && totalLength > this.maxSize) {
5760
+ chunks.push(currentChunk.join(separator));
5761
+ const overlapSentences = this.calculateSentenceOverlap(currentChunk);
5762
+ currentChunk = overlapSentences;
5763
+ currentSize = this.calculateChunkSize(currentChunk);
5764
+ }
5765
+ currentChunk.push(sentence);
5766
+ currentSize += sentenceLength + separatorLength;
5767
+ if (currentSize >= this.targetSize) {
5768
+ chunks.push(currentChunk.join(separator));
5769
+ const overlapSentences = this.calculateSentenceOverlap(currentChunk);
5770
+ currentChunk = overlapSentences;
5771
+ currentSize = this.calculateChunkSize(currentChunk);
5772
+ }
5773
+ }
5774
+ if (currentChunk.length > 0) {
5775
+ chunks.push(currentChunk.join(separator));
5776
+ }
5777
+ return chunks;
5778
+ }
5779
+ /**
5780
+ * Handle oversized sentences with fallback strategies
5781
+ */
5782
+ handleOversizedSentence(sentence) {
5783
+ if (this.fallbackToWords) {
5784
+ const wordChunks = this.splitSentenceIntoWords(sentence);
5785
+ if (wordChunks.length > 1) {
5786
+ return wordChunks;
5787
+ }
5788
+ }
5789
+ if (this.fallbackToCharacters) {
5790
+ return this.splitSentenceIntoCharacters(sentence);
5791
+ }
5792
+ console.warn(
5793
+ `Sentence exceeds maxSize (${this.maxSize}) and fallbacks are disabled: "${sentence.substring(0, 50)}..."`
5794
+ );
5795
+ return [sentence];
5796
+ }
5797
+ splitSentenceIntoWords(sentence) {
5798
+ const words = sentence.split(/\s+/);
5799
+ const chunks = [];
5800
+ let currentChunk = "";
5801
+ for (const word of words) {
5802
+ const testChunk = currentChunk ? currentChunk + " " + word : word;
5803
+ if (this.lengthFunction(testChunk) <= this.maxSize) {
5804
+ currentChunk = testChunk;
5805
+ } else {
5806
+ if (currentChunk) {
5807
+ chunks.push(currentChunk);
5808
+ }
5809
+ if (this.lengthFunction(word) > this.maxSize) {
5810
+ if (this.fallbackToCharacters) {
5811
+ chunks.push(...this.splitSentenceIntoCharacters(word));
5812
+ } else {
5813
+ chunks.push(word);
5814
+ }
5815
+ currentChunk = "";
5816
+ } else {
5817
+ currentChunk = word;
5818
+ }
5819
+ }
5820
+ }
5821
+ if (currentChunk) {
5822
+ chunks.push(currentChunk);
5823
+ }
5824
+ return chunks;
5825
+ }
5826
+ splitSentenceIntoCharacters(text) {
5827
+ const chunks = [];
5828
+ let currentChunk = "";
5829
+ for (const char of text) {
5830
+ if (this.lengthFunction(currentChunk + char) <= this.maxSize) {
5831
+ currentChunk += char;
5832
+ } else {
5833
+ if (currentChunk) {
5834
+ chunks.push(currentChunk);
5835
+ }
5836
+ currentChunk = char;
5837
+ }
5838
+ }
5839
+ if (currentChunk) {
5840
+ chunks.push(currentChunk);
5841
+ }
5842
+ return chunks;
5843
+ }
5844
+ calculateSentenceOverlap(currentChunk) {
5845
+ if (this.overlap === 0 || currentChunk.length === 0) {
5846
+ return [];
5847
+ }
5848
+ const overlapSentences = [];
5849
+ let overlapSize = 0;
5850
+ const separator = " ";
5851
+ for (let i = currentChunk.length - 1; i >= 0; i--) {
5852
+ const sentence = currentChunk[i];
5853
+ if (!sentence) continue;
5854
+ const sentenceLength = this.lengthFunction(sentence);
5855
+ const separatorLength = overlapSentences.length > 0 ? this.lengthFunction(separator) : 0;
5856
+ if (overlapSize + sentenceLength + separatorLength > this.overlap) {
5857
+ break;
5858
+ }
5859
+ overlapSentences.unshift(sentence);
5860
+ overlapSize += sentenceLength + separatorLength;
5861
+ }
5862
+ return overlapSentences;
5863
+ }
5864
+ calculateChunkSize(sentences) {
5865
+ if (!sentences || sentences.length === 0) {
5866
+ return 0;
5867
+ }
5868
+ let totalSize = 0;
5869
+ const separator = " ";
5870
+ for (let i = 0; i < sentences.length; i++) {
5871
+ const sentence = sentences[i];
5872
+ totalSize += this.lengthFunction(sentence);
5873
+ if (i < sentences.length - 1) {
5874
+ totalSize += this.lengthFunction(separator);
5875
+ }
5876
+ }
5877
+ return totalSize;
5878
+ }
5879
+ splitText({ text }) {
5880
+ if (!text) return [];
5881
+ const sentences = this.detectSentenceBoundaries(text);
5882
+ const chunks = this.groupSentencesIntoChunks(sentences);
5883
+ return chunks.filter((chunk) => chunk.trim().length > 0);
5884
+ }
5885
+ };
5653
5886
  function splitTextOnTokens({ text, tokenizer }) {
5654
5887
  const splits = [];
5655
5888
  const inputIds = tokenizer.encode(text);
@@ -5700,7 +5933,7 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
5700
5933
  };
5701
5934
  const tokenizer = {
5702
5935
  overlap: this.overlap,
5703
- tokensPerChunk: this.size,
5936
+ tokensPerChunk: this.maxSize,
5704
5937
  decode,
5705
5938
  encode
5706
5939
  };
@@ -5732,13 +5965,105 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
5732
5965
  allowedSpecial: options.allowedSpecial,
5733
5966
  disallowedSpecial: options.disallowedSpecial,
5734
5967
  options: {
5735
- size: options.size,
5968
+ maxSize: options.maxSize,
5736
5969
  overlap: options.overlap,
5737
5970
  lengthFunction: tikTokenEncoder
5738
5971
  }
5739
5972
  });
5740
5973
  }
5741
5974
  };
5975
+ function handleDeprecatedSize(data) {
5976
+ if (data.size !== void 0) {
5977
+ console.warn(
5978
+ "[DEPRECATION] `size` is deprecated. Use `maxSize` instead. This will be removed in the next major version."
5979
+ );
5980
+ if (data.maxSize === void 0) {
5981
+ data.maxSize = data.size;
5982
+ }
5983
+ }
5984
+ const { size, ...rest } = data;
5985
+ return rest;
5986
+ }
5987
+ var baseChunkOptionsSchema = zod.z.object({
5988
+ size: zod.z.number().positive().optional(),
5989
+ maxSize: zod.z.number().positive().optional(),
5990
+ overlap: zod.z.number().min(0).optional(),
5991
+ lengthFunction: zod.z.function().optional(),
5992
+ keepSeparator: zod.z.union([zod.z.boolean(), zod.z.literal("start"), zod.z.literal("end")]).optional(),
5993
+ addStartIndex: zod.z.boolean().optional(),
5994
+ stripWhitespace: zod.z.boolean().optional()
5995
+ });
5996
+ var characterChunkOptionsSchema = baseChunkOptionsSchema.extend({
5997
+ separator: zod.z.string().optional(),
5998
+ isSeparatorRegex: zod.z.boolean().optional()
5999
+ }).strict();
6000
+ var recursiveChunkOptionsSchema = baseChunkOptionsSchema.extend({
6001
+ separators: zod.z.array(zod.z.string()).optional(),
6002
+ isSeparatorRegex: zod.z.boolean().optional(),
6003
+ language: zod.z.string().optional()
6004
+ }).strict();
6005
+ var sentenceChunkOptionsSchema = baseChunkOptionsSchema.extend({
6006
+ maxSize: zod.z.number().positive(),
6007
+ minSize: zod.z.number().positive().optional(),
6008
+ targetSize: zod.z.number().positive().optional(),
6009
+ sentenceEnders: zod.z.array(zod.z.string()).optional(),
6010
+ fallbackToWords: zod.z.boolean().optional(),
6011
+ fallbackToCharacters: zod.z.boolean().optional()
6012
+ }).strict();
6013
+ var isSetLike = (value) => {
6014
+ return typeof value === "object" && value !== null && typeof value.has === "function" && typeof value.add === "function" && typeof value.delete === "function" && typeof value.clear === "function" && typeof value.size === "number";
6015
+ };
6016
+ var setOrAllSchema = zod.z.any().refine((value) => value === "all" || isSetLike(value), {
6017
+ message: "Must be a Set object or the literal 'all'"
6018
+ }).optional();
6019
+ var tokenChunkOptionsSchema = baseChunkOptionsSchema.extend({
6020
+ encodingName: zod.z.string().optional(),
6021
+ modelName: zod.z.string().optional(),
6022
+ allowedSpecial: setOrAllSchema,
6023
+ disallowedSpecial: setOrAllSchema
6024
+ }).strict();
6025
+ var jsonChunkOptionsSchema = baseChunkOptionsSchema.extend({
6026
+ minSize: zod.z.number().positive().optional(),
6027
+ ensureAscii: zod.z.boolean().optional(),
6028
+ convertLists: zod.z.boolean().optional()
6029
+ }).strict();
6030
+ var htmlChunkOptionsSchema = baseChunkOptionsSchema.extend({
6031
+ headers: zod.z.array(zod.z.tuple([zod.z.string(), zod.z.string()])).optional(),
6032
+ sections: zod.z.array(zod.z.tuple([zod.z.string(), zod.z.string()])).optional(),
6033
+ returnEachLine: zod.z.boolean().optional()
6034
+ }).strict();
6035
+ var markdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
6036
+ headers: zod.z.array(zod.z.tuple([zod.z.string(), zod.z.string()])).optional(),
6037
+ returnEachLine: zod.z.boolean().optional(),
6038
+ stripHeaders: zod.z.boolean().optional()
6039
+ }).strict();
6040
+ var latexChunkOptionsSchema = baseChunkOptionsSchema.strict();
6041
+ var validationSchemas = {
6042
+ character: characterChunkOptionsSchema.transform(handleDeprecatedSize),
6043
+ recursive: recursiveChunkOptionsSchema.transform(handleDeprecatedSize),
6044
+ sentence: sentenceChunkOptionsSchema.transform(handleDeprecatedSize),
6045
+ token: tokenChunkOptionsSchema.transform(handleDeprecatedSize),
6046
+ json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
6047
+ html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
6048
+ markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
6049
+ latex: latexChunkOptionsSchema.transform(handleDeprecatedSize)
6050
+ };
6051
+ function validateChunkParams(strategy, params) {
6052
+ const schema = validationSchemas[strategy];
6053
+ if (!schema) {
6054
+ throw new Error(`Unknown chunking strategy: ${strategy}`);
6055
+ }
6056
+ const result = schema.safeParse(params);
6057
+ if (!result.success) {
6058
+ const unrecognizedError = result.error.errors.find((e) => e.code === "unrecognized_keys");
6059
+ if (unrecognizedError && "keys" in unrecognizedError) {
6060
+ const keys = unrecognizedError.keys.join(", ");
6061
+ throw new Error(`Invalid parameters for ${strategy} strategy: '${keys}' not supported`);
6062
+ }
6063
+ const errorMessage = result.error.errors.map((e) => `${e.path.length > 0 ? e.path.join(".") : "parameter"}: ${e.message}`).join(", ");
6064
+ throw new Error(`Invalid parameters for ${strategy} strategy: ${errorMessage}`);
6065
+ }
6066
+ }
5742
6067
 
5743
6068
  // src/document/document.ts
5744
6069
  var MDocument = class _MDocument {
@@ -5851,30 +6176,21 @@ var MDocument = class _MDocument {
5851
6176
  }
5852
6177
  }
5853
6178
  async chunkBy(strategy, options) {
5854
- switch (strategy) {
5855
- case "recursive":
5856
- await this.chunkRecursive(options);
5857
- break;
5858
- case "character":
5859
- await this.chunkCharacter(options);
5860
- break;
5861
- case "token":
5862
- await this.chunkToken(options);
5863
- break;
5864
- case "markdown":
5865
- await this.chunkMarkdown(options);
5866
- break;
5867
- case "html":
5868
- await this.chunkHTML(options);
5869
- break;
5870
- case "json":
5871
- await this.chunkJSON(options);
5872
- break;
5873
- case "latex":
5874
- await this.chunkLatex(options);
5875
- break;
5876
- default:
5877
- throw new Error(`Unknown strategy: ${strategy}`);
6179
+ const strategyMap = {
6180
+ recursive: (options2) => this.chunkRecursive(options2),
6181
+ character: (options2) => this.chunkCharacter(options2),
6182
+ token: (options2) => this.chunkToken(options2),
6183
+ markdown: (options2) => this.chunkMarkdown(options2),
6184
+ html: (options2) => this.chunkHTML(options2),
6185
+ json: (options2) => this.chunkJSON(options2),
6186
+ latex: (options2) => this.chunkLatex(options2),
6187
+ sentence: (options2) => this.chunkSentence(options2)
6188
+ };
6189
+ const chunkingFunc = strategyMap[strategy];
6190
+ if (chunkingFunc) {
6191
+ await chunkingFunc(options);
6192
+ } else {
6193
+ throw new Error(`Unknown strategy: ${strategy}`);
5878
6194
  }
5879
6195
  }
5880
6196
  async chunkRecursive(options) {
@@ -5884,32 +6200,28 @@ var MDocument = class _MDocument {
5884
6200
  this.chunks = textSplit2;
5885
6201
  return;
5886
6202
  }
5887
- const rt = new RecursiveCharacterTransformer({
5888
- separators: options?.separators,
5889
- isSeparatorRegex: options?.isSeparatorRegex,
5890
- options
5891
- });
6203
+ const rt = new RecursiveCharacterTransformer(options);
5892
6204
  const textSplit = rt.transformDocuments(this.chunks);
5893
6205
  this.chunks = textSplit;
5894
6206
  }
5895
6207
  async chunkCharacter(options) {
5896
6208
  const rt = new CharacterTransformer({
6209
+ ...options,
5897
6210
  separator: options?.separator,
5898
- isSeparatorRegex: options?.isSeparatorRegex,
5899
- options
6211
+ isSeparatorRegex: options?.isSeparatorRegex
5900
6212
  });
5901
6213
  const textSplit = rt.transformDocuments(this.chunks);
5902
6214
  this.chunks = textSplit;
5903
6215
  }
5904
6216
  async chunkHTML(options) {
5905
6217
  if (options?.headers?.length) {
5906
- const rt = new HTMLHeaderTransformer(options.headers, options?.returnEachLine);
6218
+ const rt = new HTMLHeaderTransformer(options);
5907
6219
  const textSplit = rt.transformDocuments(this.chunks);
5908
6220
  this.chunks = textSplit;
5909
6221
  return;
5910
6222
  }
5911
6223
  if (options?.sections?.length) {
5912
- const rt = new HTMLSectionTransformer(options.sections);
6224
+ const rt = new HTMLSectionTransformer(options);
5913
6225
  const textSplit = rt.transformDocuments(this.chunks);
5914
6226
  this.chunks = textSplit;
5915
6227
  return;
@@ -5956,9 +6268,30 @@ var MDocument = class _MDocument {
5956
6268
  const textSplit = rt.transformDocuments(this.chunks);
5957
6269
  this.chunks = textSplit;
5958
6270
  }
6271
+ async chunkSentence(options) {
6272
+ if (!options?.maxSize) {
6273
+ throw new Error("Sentence chunking requires maxSize to be specified");
6274
+ }
6275
+ const rt = new SentenceTransformer({
6276
+ minSize: options?.minSize,
6277
+ maxSize: options?.maxSize,
6278
+ targetSize: options?.targetSize,
6279
+ overlap: options?.overlap,
6280
+ sentenceEnders: options?.sentenceEnders,
6281
+ fallbackToWords: options?.fallbackToWords,
6282
+ fallbackToCharacters: options?.fallbackToCharacters,
6283
+ keepSeparator: options?.keepSeparator,
6284
+ lengthFunction: options?.lengthFunction,
6285
+ addStartIndex: options?.addStartIndex,
6286
+ stripWhitespace: options?.stripWhitespace
6287
+ });
6288
+ const textSplit = rt.transformDocuments(this.chunks);
6289
+ this.chunks = textSplit;
6290
+ }
5959
6291
  async chunk(params) {
5960
6292
  const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
5961
6293
  const strategy = passedStrategy || this.defaultStrategy();
6294
+ validateChunkParams(strategy, chunkOptions);
5962
6295
  await this.chunkBy(strategy, chunkOptions);
5963
6296
  if (extract) {
5964
6297
  await this.extractMetadata(extract);
@@ -6351,19 +6684,20 @@ var GraphRAG = class {
6351
6684
  }));
6352
6685
  }
6353
6686
  };
6687
+ var DEFAULT_CHUNK_PARAMS = {
6688
+ strategy: "recursive",
6689
+ maxSize: 512,
6690
+ overlap: 50,
6691
+ separators: ["\n"]
6692
+ };
6354
6693
  var createDocumentChunkerTool = ({
6355
6694
  doc,
6356
- params = {
6357
- strategy: "recursive",
6358
- size: 512,
6359
- overlap: 50,
6360
- separator: "\n"
6361
- }
6695
+ params = DEFAULT_CHUNK_PARAMS
6362
6696
  }) => {
6363
6697
  return tools.createTool({
6364
- id: `Document Chunker ${params.strategy} ${params.size}`,
6698
+ id: `Document Chunker ${params.strategy} ${params.maxSize}`,
6365
6699
  inputSchema: zod.z.object({}),
6366
- description: `Chunks document using ${params.strategy} strategy with size ${params.size} and ${params.overlap} overlap`,
6700
+ description: `Chunks document using ${params.strategy} strategy with maxSize ${params.maxSize} and ${params.overlap || 0} overlap`,
6367
6701
  execute: async () => {
6368
6702
  const chunks = await doc.chunk(params);
6369
6703
  return {