@mastra/rag 1.0.6 → 1.0.7-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +12 -0
  3. package/dist/document/document.d.ts +9 -8
  4. package/dist/document/document.d.ts.map +1 -1
  5. package/dist/document/transformers/character.d.ts +4 -26
  6. package/dist/document/transformers/character.d.ts.map +1 -1
  7. package/dist/document/transformers/html.d.ts +8 -3
  8. package/dist/document/transformers/html.d.ts.map +1 -1
  9. package/dist/document/transformers/json.d.ts +4 -4
  10. package/dist/document/transformers/json.d.ts.map +1 -1
  11. package/dist/document/transformers/latex.d.ts +2 -8
  12. package/dist/document/transformers/latex.d.ts.map +1 -1
  13. package/dist/document/transformers/markdown.d.ts +2 -8
  14. package/dist/document/transformers/markdown.d.ts.map +1 -1
  15. package/dist/document/transformers/sentence.d.ts +31 -0
  16. package/dist/document/transformers/sentence.d.ts.map +1 -0
  17. package/dist/document/transformers/text.d.ts +3 -3
  18. package/dist/document/transformers/text.d.ts.map +1 -1
  19. package/dist/document/transformers/token.d.ts +4 -15
  20. package/dist/document/transformers/token.d.ts.map +1 -1
  21. package/dist/document/types.d.ts +85 -14
  22. package/dist/document/types.d.ts.map +1 -1
  23. package/dist/document/validation.d.ts +3 -0
  24. package/dist/document/validation.d.ts.map +1 -0
  25. package/dist/index.cjs +414 -80
  26. package/dist/index.cjs.map +1 -1
  27. package/dist/index.js +414 -80
  28. package/dist/index.js.map +1 -1
  29. package/dist/tools/document-chunker.d.ts.map +1 -1
  30. package/package.json +5 -5
  31. package/src/document/document.test.ts +294 -39
  32. package/src/document/document.ts +69 -41
  33. package/src/document/transformers/character.ts +15 -43
  34. package/src/document/transformers/html.ts +9 -9
  35. package/src/document/transformers/json.ts +8 -3
  36. package/src/document/transformers/latex.ts +3 -11
  37. package/src/document/transformers/markdown.ts +3 -11
  38. package/src/document/transformers/sentence.ts +314 -0
  39. package/src/document/transformers/text.ts +10 -10
  40. package/src/document/transformers/token.ts +6 -17
  41. package/src/document/types.ts +66 -15
  42. package/src/document/validation.ts +147 -0
  43. package/src/tools/document-chunker.ts +12 -8
package/dist/index.js CHANGED
@@ -4471,24 +4471,24 @@ var Language = /* @__PURE__ */ ((Language2) => {
4471
4471
 
4472
4472
  // src/document/transformers/text.ts
4473
4473
  var TextTransformer = class {
4474
- size;
4474
+ maxSize;
4475
4475
  overlap;
4476
4476
  lengthFunction;
4477
4477
  keepSeparator;
4478
4478
  addStartIndex;
4479
4479
  stripWhitespace;
4480
4480
  constructor({
4481
- size = 4e3,
4481
+ maxSize = 4e3,
4482
4482
  overlap = 200,
4483
4483
  lengthFunction = (text) => text.length,
4484
4484
  keepSeparator = false,
4485
4485
  addStartIndex = false,
4486
4486
  stripWhitespace = true
4487
4487
  }) {
4488
- if (overlap > size) {
4489
- throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${size}), should be smaller.`);
4488
+ if (overlap > maxSize) {
4489
+ throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${maxSize}), should be smaller.`);
4490
4490
  }
4491
- this.size = size;
4491
+ this.maxSize = maxSize;
4492
4492
  this.overlap = overlap;
4493
4493
  this.lengthFunction = lengthFunction;
4494
4494
  this.keepSeparator = keepSeparator;
@@ -4554,9 +4554,9 @@ var TextTransformer = class {
4554
4554
  for (const d of splits) {
4555
4555
  const len = this.lengthFunction(d);
4556
4556
  const separatorLen = separator ? this.lengthFunction(separator) : 0;
4557
- if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.size) {
4558
- if (total > this.size) {
4559
- console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.size}`);
4557
+ if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.maxSize) {
4558
+ if (total > this.maxSize) {
4559
+ console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.maxSize}`);
4560
4560
  }
4561
4561
  if (currentDoc.length > 0) {
4562
4562
  const doc = this.joinDocs(currentDoc, separator);
@@ -4634,12 +4634,8 @@ function splitTextWithRegex(text, separator, keepSeparator) {
4634
4634
  var CharacterTransformer = class extends TextTransformer {
4635
4635
  separator;
4636
4636
  isSeparatorRegex;
4637
- constructor({
4638
- separator = "\n\n",
4639
- isSeparatorRegex = false,
4640
- options = {}
4641
- }) {
4642
- super(options);
4637
+ constructor({ separator = "\n\n", isSeparatorRegex = false, ...baseOptions } = {}) {
4638
+ super(baseOptions);
4643
4639
  this.separator = separator;
4644
4640
  this.isSeparatorRegex = isSeparatorRegex;
4645
4641
  }
@@ -4648,7 +4644,7 @@ var CharacterTransformer = class extends TextTransformer {
4648
4644
  const initialSplits = splitTextWithRegex(text, separator, this.keepSeparator);
4649
4645
  const chunks = [];
4650
4646
  for (const split of initialSplits) {
4651
- if (this.lengthFunction(split) <= this.size) {
4647
+ if (this.lengthFunction(split) <= this.maxSize) {
4652
4648
  chunks.push(split);
4653
4649
  } else {
4654
4650
  const subChunks = this.__splitChunk(split);
@@ -4662,7 +4658,7 @@ var CharacterTransformer = class extends TextTransformer {
4662
4658
  let currentPosition = 0;
4663
4659
  while (currentPosition < text.length) {
4664
4660
  let chunkEnd = currentPosition;
4665
- while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.size) {
4661
+ while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.maxSize) {
4666
4662
  chunkEnd++;
4667
4663
  }
4668
4664
  const currentChunk = text.slice(currentPosition, chunkEnd);
@@ -4677,12 +4673,8 @@ var CharacterTransformer = class extends TextTransformer {
4677
4673
  var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends TextTransformer {
4678
4674
  separators;
4679
4675
  isSeparatorRegex;
4680
- constructor({
4681
- separators,
4682
- isSeparatorRegex = false,
4683
- options = {}
4684
- }) {
4685
- super(options);
4676
+ constructor({ separators, isSeparatorRegex = false, language, ...baseOptions } = {}) {
4677
+ super(baseOptions);
4686
4678
  this.separators = separators || ["\n\n", "\n", " ", ""];
4687
4679
  this.isSeparatorRegex = isSeparatorRegex;
4688
4680
  }
@@ -4708,7 +4700,7 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
4708
4700
  const goodSplits = [];
4709
4701
  const mergeSeparator = this.keepSeparator ? "" : separator;
4710
4702
  for (const s of splits) {
4711
- if (this.lengthFunction(s) < this.size) {
4703
+ if (this.lengthFunction(s) < this.maxSize) {
4712
4704
  goodSplits.push(s);
4713
4705
  } else {
4714
4706
  if (goodSplits.length > 0) {
@@ -4735,7 +4727,12 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
4735
4727
  }
4736
4728
  static fromLanguage(language, options = {}) {
4737
4729
  const separators = _RecursiveCharacterTransformer.getSeparatorsForLanguage(language);
4738
- return new _RecursiveCharacterTransformer({ separators, isSeparatorRegex: true, options });
4730
+ return new _RecursiveCharacterTransformer({
4731
+ ...options,
4732
+ separators,
4733
+ isSeparatorRegex: true,
4734
+ language
4735
+ });
4739
4736
  }
4740
4737
  static getSeparatorsForLanguage(language) {
4741
4738
  switch (language) {
@@ -4820,9 +4817,9 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
4820
4817
  var HTMLHeaderTransformer = class {
4821
4818
  headersToSplitOn;
4822
4819
  returnEachElement;
4823
- constructor(headersToSplitOn, returnEachElement = false) {
4824
- this.returnEachElement = returnEachElement;
4825
- this.headersToSplitOn = [...headersToSplitOn].sort();
4820
+ constructor(options) {
4821
+ this.returnEachElement = options.returnEachLine ?? false;
4822
+ this.headersToSplitOn = [...options.headers].sort();
4826
4823
  }
4827
4824
  splitText({ text }) {
4828
4825
  const root = parse(text);
@@ -4953,10 +4950,10 @@ var HTMLHeaderTransformer = class {
4953
4950
  };
4954
4951
  var HTMLSectionTransformer = class {
4955
4952
  headersToSplitOn;
4956
- options;
4957
- constructor(headersToSplitOn, options = {}) {
4958
- this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name14]) => [tag.toLowerCase(), name14]));
4959
- this.options = options;
4953
+ textSplitter;
4954
+ constructor(options) {
4955
+ this.headersToSplitOn = Object.fromEntries(options.sections.map(([tag, name14]) => [tag.toLowerCase(), name14]));
4956
+ this.textSplitter = new RecursiveCharacterTransformer(options);
4960
4957
  }
4961
4958
  splitText(text) {
4962
4959
  const sections = this.splitHtmlByHeaders(text);
@@ -5025,8 +5022,7 @@ var HTMLSectionTransformer = class {
5025
5022
  metadatas.push(doc.metadata);
5026
5023
  }
5027
5024
  const results = await this.createDocuments(texts, metadatas);
5028
- const textSplitter = new RecursiveCharacterTransformer({ options: this.options });
5029
- return textSplitter.splitDocuments(results);
5025
+ return this.textSplitter.splitDocuments(results);
5030
5026
  }
5031
5027
  createDocuments(texts, metadatas) {
5032
5028
  const _metadatas = metadatas || Array(texts.length).fill({});
@@ -5068,9 +5064,13 @@ var HTMLSectionTransformer = class {
5068
5064
  var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
5069
5065
  maxSize;
5070
5066
  minSize;
5071
- constructor({ maxSize = 2e3, minSize }) {
5067
+ ensureAscii;
5068
+ convertLists;
5069
+ constructor({ maxSize = 2e3, minSize, ensureAscii = false, convertLists = true }) {
5072
5070
  this.maxSize = maxSize;
5073
5071
  this.minSize = minSize ?? Math.max(maxSize - 200, 50);
5072
+ this.ensureAscii = ensureAscii;
5073
+ this.convertLists = convertLists;
5074
5074
  }
5075
5075
  static jsonSize(data) {
5076
5076
  const seen = /* @__PURE__ */ new WeakSet();
@@ -5202,7 +5202,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
5202
5202
  */
5203
5203
  isWithinSizeLimit(value, currentSize = 0) {
5204
5204
  const size = _RecursiveJsonTransformer.jsonSize(value);
5205
- return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize || currentSize < this.minSize;
5205
+ return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize;
5206
5206
  }
5207
5207
  /**
5208
5208
  * Splits arrays into chunks based on size limits
@@ -5469,7 +5469,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
5469
5469
  var LatexTransformer = class extends RecursiveCharacterTransformer {
5470
5470
  constructor(options = {}) {
5471
5471
  const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage("latex" /* LATEX */);
5472
- super({ separators, isSeparatorRegex: true, options });
5472
+ super({ ...options, separators, isSeparatorRegex: true });
5473
5473
  }
5474
5474
  };
5475
5475
 
@@ -5477,7 +5477,7 @@ var LatexTransformer = class extends RecursiveCharacterTransformer {
5477
5477
  var MarkdownTransformer = class extends RecursiveCharacterTransformer {
5478
5478
  constructor(options = {}) {
5479
5479
  const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage("markdown" /* MARKDOWN */);
5480
- super({ separators, isSeparatorRegex: true, options });
5480
+ super({ ...options, separators, isSeparatorRegex: true });
5481
5481
  }
5482
5482
  };
5483
5483
  var MarkdownHeaderTransformer = class {
@@ -5644,6 +5644,239 @@ var MarkdownHeaderTransformer = class {
5644
5644
  return this.createDocuments(texts, metadatas);
5645
5645
  }
5646
5646
  };
5647
+
5648
+ // src/document/transformers/sentence.ts
5649
/**
 * Splits text into sentences and groups them into chunks.
 *
 * A chunk is closed as soon as its size reaches `targetSize`, or before a
 * sentence that would push it past `maxSize`. When `overlap` is non-zero, the
 * trailing sentences of a closed chunk (up to `overlap` length units) are
 * carried into the next chunk. Sizes are measured with the inherited
 * `lengthFunction`.
 *
 * Options read by this class:
 *  - maxSize              upper bound for a chunk (callers are expected to supply it)
 *  - minSize              stored, defaults to 50; not read by the methods defined here
 *  - targetSize           size at which a chunk is closed, defaults to floor(maxSize * 0.8)
 *  - overlap              amount of trailing text carried into the next chunk, defaults to 0
 *  - sentenceEnders       single characters that may end a sentence, defaults to . ! ?
 *  - fallbackToWords      split oversized sentences on whitespace, defaults to true
 *  - fallbackToCharacters split oversized words/sentences per character, defaults to true
 *  - keepSeparator        stored, defaults to false; not read by the methods defined here
 */
var SentenceTransformer = class extends TextTransformer {
  minSize;
  maxSize;
  targetSize;
  sentenceEnders;
  fallbackToWords;
  fallbackToCharacters;
  keepSeparator;
  constructor(options) {
    // The parent constructor throws when overlap > maxSize, so it is handed a
    // clamped value; the caller's overlap is restored on the instance below.
    const parentOverlap = Math.min(options.overlap ?? 0, options.maxSize - 1);
    super({ ...options, overlap: parentOverlap });
    this.maxSize = options.maxSize;
    this.minSize = options.minSize ?? 50;
    this.targetSize = options.targetSize ?? Math.floor(options.maxSize * 0.8);
    this.sentenceEnders = options.sentenceEnders ?? [".", "!", "?"];
    this.fallbackToWords = options.fallbackToWords ?? true;
    this.fallbackToCharacters = options.fallbackToCharacters ?? true;
    this.keepSeparator = options.keepSeparator ?? false;
    this.overlap = options.overlap ?? 0;
  }
  /**
   * Split `text` into trimmed, non-empty sentences.
   * A sentence ends at a character listed in `sentenceEnders` that
   * `isRealSentenceBoundary` accepts; any trailing text becomes the last sentence.
   */
  detectSentenceBoundaries(text) {
    if (!text) return [];
    const sentences = [];
    let currentSentence = "";
    for (let i = 0; i < text.length; i++) {
      const char = text[i];
      currentSentence += char;
      if (!this.sentenceEnders.includes(char)) continue;
      if (this.isRealSentenceBoundary(currentSentence, text.slice(i + 1))) {
        sentences.push(currentSentence.trim());
        currentSentence = "";
      }
    }
    const tail = currentSentence.trim();
    if (tail) {
      sentences.push(tail);
    }
    return sentences.filter((s) => s.length > 0);
  }
  /**
   * Decide whether the ender that terminates `currentSentence` is a true boundary.
   * True at end of input; otherwise the ender must be followed by whitespace and
   * an ASCII capital letter, and the word before it must not look like an
   * abbreviation.
   */
  isRealSentenceBoundary(currentSentence, remainingText) {
    if (!remainingText.trim()) {
      return true;
    }
    if (!/^\s+[A-Z]/.test(remainingText)) {
      return false;
    }
    const words = currentSentence.trim().split(/\s+/);
    const lastWord = words[words.length - 1] || "";
    // Drop the ender itself before testing the word.
    const baseWord = lastWord.slice(0, -1);
    return !this.isCommonAbbreviation(baseWord);
  }
  /**
   * True when `word` (without its trailing ender) is a title (Dr, Mr, ...),
   * a dotted or single-letter initialism in one case (U.S, e.g, A), a bare
   * number, or am/pm.
   */
  isCommonAbbreviation(word) {
    const titles = ["Dr", "Mr", "Mrs", "Ms", "Prof", "Sr", "Jr"];
    if (titles.includes(word)) {
      return true;
    }
    // Also covers a single letter, so no separate single-capital check is needed.
    if (/^[A-Z](\.[A-Z])*$/.test(word) || /^[a-z](\.[a-z])*$/.test(word)) {
      return true;
    }
    if (/^\d+$/.test(word)) {
      return true;
    }
    return /^[ap]\.?m$/i.test(word);
  }
  /**
   * Group sentences into chunks with integrated overlap processing.
   *
   * Guarantees beyond simple greedy packing:
   *  - a buffer that holds only carried-over overlap sentences is never emitted
   *    on its own (it would duplicate the tail of the previous chunk);
   *  - carried-over sentences are dropped from the front until the next
   *    sentence fits within `maxSize`;
   *  - the running size only counts a separator when the buffer is non-empty
   *    at the moment the sentence is appended.
   */
  groupSentencesIntoChunks(sentences) {
    const separator = " ";
    const separatorLength = this.lengthFunction(separator);
    const chunks = [];
    let currentChunk = [];
    let currentSize = 0;
    // True once the buffer contains a sentence that has not been emitted yet.
    let hasNewSentences = false;
    const emit = (carryOverlap) => {
      if (hasNewSentences) {
        chunks.push(currentChunk.join(separator));
      }
      currentChunk = carryOverlap ? this.calculateSentenceOverlap(currentChunk) : [];
      currentSize = this.calculateChunkSize(currentChunk);
      hasNewSentences = false;
    };
    for (const sentence of sentences) {
      const sentenceLength = this.lengthFunction(sentence);
      if (sentenceLength > this.maxSize) {
        // Oversized sentences are split on their own and never share a chunk.
        emit(false);
        chunks.push(...this.handleOversizedSentence(sentence));
        continue;
      }
      if (currentChunk.length > 0 && currentSize + separatorLength + sentenceLength > this.maxSize) {
        emit(true);
        // Shrink the carried overlap until the incoming sentence fits.
        while (currentChunk.length > 0 && currentSize + separatorLength + sentenceLength > this.maxSize) {
          currentChunk.shift();
          currentSize = this.calculateChunkSize(currentChunk);
        }
      }
      const joinCost = currentChunk.length > 0 ? separatorLength : 0;
      currentChunk.push(sentence);
      currentSize += sentenceLength + joinCost;
      hasNewSentences = true;
      if (currentSize >= this.targetSize) {
        emit(true);
      }
    }
    if (hasNewSentences) {
      chunks.push(currentChunk.join(separator));
    }
    return chunks;
  }
  /**
   * Handle oversized sentences with fallback strategies: word splitting first
   * (when enabled and it yields more than one piece), then character splitting
   * (when enabled). With both unavailable a warning is logged and the sentence
   * is returned unsplit.
   */
  handleOversizedSentence(sentence) {
    if (this.fallbackToWords) {
      const wordChunks = this.splitSentenceIntoWords(sentence);
      if (wordChunks.length > 1) {
        return wordChunks;
      }
    }
    if (this.fallbackToCharacters) {
      return this.splitSentenceIntoCharacters(sentence);
    }
    console.warn(
      `Sentence exceeds maxSize (${this.maxSize}) and fallbacks are disabled: "${sentence.substring(0, 50)}..."`
    );
    return [sentence];
  }
  /**
   * Greedily pack whitespace-separated words into pieces of at most `maxSize`.
   * A single word longer than `maxSize` is split per character when
   * `fallbackToCharacters` is on, otherwise emitted as-is.
   */
  splitSentenceIntoWords(sentence) {
    const words = sentence.split(/\s+/);
    const chunks = [];
    let currentChunk = "";
    for (const word of words) {
      const testChunk = currentChunk ? `${currentChunk} ${word}` : word;
      if (this.lengthFunction(testChunk) <= this.maxSize) {
        currentChunk = testChunk;
        continue;
      }
      if (currentChunk) {
        chunks.push(currentChunk);
      }
      if (this.lengthFunction(word) > this.maxSize) {
        if (this.fallbackToCharacters) {
          chunks.push(...this.splitSentenceIntoCharacters(word));
        } else {
          chunks.push(word);
        }
        currentChunk = "";
      } else {
        currentChunk = word;
      }
    }
    if (currentChunk) {
      chunks.push(currentChunk);
    }
    return chunks;
  }
  /**
   * Greedily pack the characters of `text` (iterated by code point) into
   * pieces of at most `maxSize`.
   */
  splitSentenceIntoCharacters(text) {
    const chunks = [];
    let currentChunk = "";
    for (const char of text) {
      if (this.lengthFunction(currentChunk + char) <= this.maxSize) {
        currentChunk += char;
      } else {
        if (currentChunk) {
          chunks.push(currentChunk);
        }
        currentChunk = char;
      }
    }
    if (currentChunk) {
      chunks.push(currentChunk);
    }
    return chunks;
  }
  /**
   * Return the longest run of trailing sentences of `currentChunk` whose joined
   * size does not exceed `overlap`. Returns a new array; the input is not modified.
   */
  calculateSentenceOverlap(currentChunk) {
    if (this.overlap === 0 || currentChunk.length === 0) {
      return [];
    }
    const overlapSentences = [];
    let overlapSize = 0;
    const separator = " ";
    for (let i = currentChunk.length - 1; i >= 0; i--) {
      const sentence = currentChunk[i];
      if (!sentence) continue;
      const sentenceLength = this.lengthFunction(sentence);
      const separatorLength = overlapSentences.length > 0 ? this.lengthFunction(separator) : 0;
      if (overlapSize + sentenceLength + separatorLength > this.overlap) {
        break;
      }
      overlapSentences.unshift(sentence);
      overlapSize += sentenceLength + separatorLength;
    }
    return overlapSentences;
  }
  /**
   * Size of `sentences` once joined with a single-space separator, measured
   * piecewise with `lengthFunction`. Returns 0 for an empty or missing list.
   */
  calculateChunkSize(sentences) {
    if (!sentences || sentences.length === 0) {
      return 0;
    }
    let totalSize = 0;
    const separator = " ";
    for (let i = 0; i < sentences.length; i++) {
      totalSize += this.lengthFunction(sentences[i]);
      if (i < sentences.length - 1) {
        totalSize += this.lengthFunction(separator);
      }
    }
    return totalSize;
  }
  /**
   * Split `text` into sentence-aligned chunks; blank chunks are dropped.
   * Returns an empty array for empty input.
   */
  splitText({ text }) {
    if (!text) return [];
    const sentences = this.detectSentenceBoundaries(text);
    const chunks = this.groupSentencesIntoChunks(sentences);
    return chunks.filter((chunk) => chunk.trim().length > 0);
  }
};
5647
5880
  function splitTextOnTokens({ text, tokenizer }) {
5648
5881
  const splits = [];
5649
5882
  const inputIds = tokenizer.encode(text);
@@ -5694,7 +5927,7 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
5694
5927
  };
5695
5928
  const tokenizer = {
5696
5929
  overlap: this.overlap,
5697
- tokensPerChunk: this.size,
5930
+ tokensPerChunk: this.maxSize,
5698
5931
  decode,
5699
5932
  encode
5700
5933
  };
@@ -5726,13 +5959,105 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
5726
5959
  allowedSpecial: options.allowedSpecial,
5727
5960
  disallowedSpecial: options.disallowedSpecial,
5728
5961
  options: {
5729
- size: options.size,
5962
+ maxSize: options.maxSize,
5730
5963
  overlap: options.overlap,
5731
5964
  lengthFunction: tikTokenEncoder
5732
5965
  }
5733
5966
  });
5734
5967
  }
5735
5968
  };
5969
/**
 * Normalize chunk options that still use the deprecated `size` key.
 *
 * When `size` is present a deprecation warning is logged, and its value is
 * used as `maxSize` unless `maxSize` is already set. The input object is left
 * untouched.
 *
 * @param {object} data - Chunk options, possibly containing `size`.
 * @returns {object} A copy of `data` without the `size` key.
 */
function handleDeprecatedSize(data) {
  const { size, ...rest } = data;
  if (size === void 0) {
    return rest;
  }
  console.warn(
    "[DEPRECATION] `size` is deprecated. Use `maxSize` instead. This will be removed in the next major version."
  );
  // An explicit maxSize always wins over the deprecated alias.
  return rest.maxSize === void 0 ? { ...rest, maxSize: size } : rest;
}
5981
// Factories for the optional field schemas that recur below. Each call builds
// a fresh schema instance.
const optionalPositiveNumber = () => z.number().positive().optional();
const optionalBoolean = () => z.boolean().optional();
const optionalString = () => z.string().optional();
// List of [first, second] string pairs, as used by `headers` / `sections`.
const optionalStringPairList = () => z.array(z.tuple([z.string(), z.string()])).optional();

// Options accepted by every strategy. `size` is the deprecated spelling of `maxSize`.
var baseChunkOptionsSchema = z.object({
  size: optionalPositiveNumber(),
  maxSize: optionalPositiveNumber(),
  overlap: z.number().min(0).optional(),
  lengthFunction: z.function().optional(),
  keepSeparator: z.union([z.boolean(), z.literal("start"), z.literal("end")]).optional(),
  addStartIndex: optionalBoolean(),
  stripWhitespace: optionalBoolean()
});

// Per-strategy schemas. All are strict, so unknown keys are rejected.
var characterChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    separator: optionalString(),
    isSeparatorRegex: optionalBoolean()
  })
  .strict();
var recursiveChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    separators: z.array(z.string()).optional(),
    isSeparatorRegex: optionalBoolean(),
    language: optionalString()
  })
  .strict();
var sentenceChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    // Unlike the base schema, sentence chunking requires maxSize.
    maxSize: z.number().positive(),
    minSize: optionalPositiveNumber(),
    targetSize: optionalPositiveNumber(),
    sentenceEnders: z.array(z.string()).optional(),
    fallbackToWords: optionalBoolean(),
    fallbackToCharacters: optionalBoolean()
  })
  .strict();

// Duck-typed Set check: has/add/delete/clear methods plus a numeric size.
var isSetLike = (value) => {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const hasSetMethods = ["has", "add", "delete", "clear"].every((method) => typeof value[method] === "function");
  return hasSetMethods && typeof value.size === "number";
};
var setOrAllSchema = z
  .any()
  .refine((value) => value === "all" || isSetLike(value), {
    message: "Must be a Set object or the literal 'all'"
  })
  .optional();
var tokenChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    encodingName: optionalString(),
    modelName: optionalString(),
    allowedSpecial: setOrAllSchema,
    disallowedSpecial: setOrAllSchema
  })
  .strict();
var jsonChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    minSize: optionalPositiveNumber(),
    ensureAscii: optionalBoolean(),
    convertLists: optionalBoolean()
  })
  .strict();
var htmlChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    headers: optionalStringPairList(),
    sections: optionalStringPairList(),
    returnEachLine: optionalBoolean()
  })
  .strict();
var markdownChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    headers: optionalStringPairList(),
    returnEachLine: optionalBoolean(),
    stripHeaders: optionalBoolean()
  })
  .strict();
var latexChunkOptionsSchema = baseChunkOptionsSchema.strict();

// Strategy name -> schema, each post-processed by handleDeprecatedSize.
var validationSchemas = Object.fromEntries(
  Object.entries({
    character: characterChunkOptionsSchema,
    recursive: recursiveChunkOptionsSchema,
    sentence: sentenceChunkOptionsSchema,
    token: tokenChunkOptionsSchema,
    json: jsonChunkOptionsSchema,
    html: htmlChunkOptionsSchema,
    markdown: markdownChunkOptionsSchema,
    latex: latexChunkOptionsSchema
  }).map(([strategyName, schema]) => [strategyName, schema.transform(handleDeprecatedSize)])
);
6045
/**
 * Validate chunking options against the schema registered for `strategy`.
 *
 * @param {string} strategy - Key into `validationSchemas`.
 * @param {object} params - Chunk options supplied by the caller.
 * @returns {object} The parsed options produced by the schema. Callers that
 *   ignore the return value keep working; callers that use it receive the
 *   schema's transformed output rather than the raw input.
 * @throws {Error} When the strategy is unknown, when unsupported keys are
 *   present, or when any option fails validation.
 */
function validateChunkParams(strategy, params) {
  // Own-property check so inherited keys such as "toString" are not mistaken for strategies.
  const schema = Object.hasOwn(validationSchemas, strategy) ? validationSchemas[strategy] : void 0;
  if (!schema) {
    throw new Error(`Unknown chunking strategy: ${strategy}`);
  }
  const result = schema.safeParse(params);
  if (result.success) {
    return result.data;
  }
  const issues = result.error.issues;
  // Unknown keys get a dedicated, shorter message.
  const unrecognizedIssue = issues.find((issue) => issue.code === "unrecognized_keys");
  if (unrecognizedIssue && "keys" in unrecognizedIssue) {
    const keys = unrecognizedIssue.keys.join(", ");
    throw new Error(`Invalid parameters for ${strategy} strategy: '${keys}' not supported`);
  }
  const errorMessage = issues
    .map((issue) => `${issue.path.length > 0 ? issue.path.join(".") : "parameter"}: ${issue.message}`)
    .join(", ");
  throw new Error(`Invalid parameters for ${strategy} strategy: ${errorMessage}`);
}
5736
6061
 
5737
6062
  // src/document/document.ts
5738
6063
  var MDocument = class _MDocument {
@@ -5845,30 +6170,21 @@ var MDocument = class _MDocument {
5845
6170
  }
5846
6171
  }
5847
6172
  async chunkBy(strategy, options) {
5848
- switch (strategy) {
5849
- case "recursive":
5850
- await this.chunkRecursive(options);
5851
- break;
5852
- case "character":
5853
- await this.chunkCharacter(options);
5854
- break;
5855
- case "token":
5856
- await this.chunkToken(options);
5857
- break;
5858
- case "markdown":
5859
- await this.chunkMarkdown(options);
5860
- break;
5861
- case "html":
5862
- await this.chunkHTML(options);
5863
- break;
5864
- case "json":
5865
- await this.chunkJSON(options);
5866
- break;
5867
- case "latex":
5868
- await this.chunkLatex(options);
5869
- break;
5870
- default:
5871
- throw new Error(`Unknown strategy: ${strategy}`);
6173
+ const strategyMap = {
6174
+ recursive: (options2) => this.chunkRecursive(options2),
6175
+ character: (options2) => this.chunkCharacter(options2),
6176
+ token: (options2) => this.chunkToken(options2),
6177
+ markdown: (options2) => this.chunkMarkdown(options2),
6178
+ html: (options2) => this.chunkHTML(options2),
6179
+ json: (options2) => this.chunkJSON(options2),
6180
+ latex: (options2) => this.chunkLatex(options2),
6181
+ sentence: (options2) => this.chunkSentence(options2)
6182
+ };
6183
+ const chunkingFunc = strategyMap[strategy];
6184
+ if (chunkingFunc) {
6185
+ await chunkingFunc(options);
6186
+ } else {
6187
+ throw new Error(`Unknown strategy: ${strategy}`);
5872
6188
  }
5873
6189
  }
5874
6190
  async chunkRecursive(options) {
@@ -5878,32 +6194,28 @@ var MDocument = class _MDocument {
5878
6194
  this.chunks = textSplit2;
5879
6195
  return;
5880
6196
  }
5881
- const rt = new RecursiveCharacterTransformer({
5882
- separators: options?.separators,
5883
- isSeparatorRegex: options?.isSeparatorRegex,
5884
- options
5885
- });
6197
+ const rt = new RecursiveCharacterTransformer(options);
5886
6198
  const textSplit = rt.transformDocuments(this.chunks);
5887
6199
  this.chunks = textSplit;
5888
6200
  }
5889
6201
  async chunkCharacter(options) {
5890
6202
  const rt = new CharacterTransformer({
6203
+ ...options,
5891
6204
  separator: options?.separator,
5892
- isSeparatorRegex: options?.isSeparatorRegex,
5893
- options
6205
+ isSeparatorRegex: options?.isSeparatorRegex
5894
6206
  });
5895
6207
  const textSplit = rt.transformDocuments(this.chunks);
5896
6208
  this.chunks = textSplit;
5897
6209
  }
5898
6210
  async chunkHTML(options) {
5899
6211
  if (options?.headers?.length) {
5900
- const rt = new HTMLHeaderTransformer(options.headers, options?.returnEachLine);
6212
+ const rt = new HTMLHeaderTransformer(options);
5901
6213
  const textSplit = rt.transformDocuments(this.chunks);
5902
6214
  this.chunks = textSplit;
5903
6215
  return;
5904
6216
  }
5905
6217
  if (options?.sections?.length) {
5906
- const rt = new HTMLSectionTransformer(options.sections);
6218
+ const rt = new HTMLSectionTransformer(options);
5907
6219
  const textSplit = rt.transformDocuments(this.chunks);
5908
6220
  this.chunks = textSplit;
5909
6221
  return;
@@ -5950,9 +6262,30 @@ var MDocument = class _MDocument {
5950
6262
  const textSplit = rt.transformDocuments(this.chunks);
5951
6263
  this.chunks = textSplit;
5952
6264
  }
6265
+ async chunkSentence(options) {
6266
+ if (!options?.maxSize) {
6267
+ throw new Error("Sentence chunking requires maxSize to be specified");
6268
+ }
6269
+ const rt = new SentenceTransformer({
6270
+ minSize: options?.minSize,
6271
+ maxSize: options?.maxSize,
6272
+ targetSize: options?.targetSize,
6273
+ overlap: options?.overlap,
6274
+ sentenceEnders: options?.sentenceEnders,
6275
+ fallbackToWords: options?.fallbackToWords,
6276
+ fallbackToCharacters: options?.fallbackToCharacters,
6277
+ keepSeparator: options?.keepSeparator,
6278
+ lengthFunction: options?.lengthFunction,
6279
+ addStartIndex: options?.addStartIndex,
6280
+ stripWhitespace: options?.stripWhitespace
6281
+ });
6282
+ const textSplit = rt.transformDocuments(this.chunks);
6283
+ this.chunks = textSplit;
6284
+ }
5953
6285
  async chunk(params) {
5954
6286
  const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
5955
6287
  const strategy = passedStrategy || this.defaultStrategy();
6288
+ validateChunkParams(strategy, chunkOptions);
5956
6289
  await this.chunkBy(strategy, chunkOptions);
5957
6290
  if (extract) {
5958
6291
  await this.extractMetadata(extract);
@@ -6345,19 +6678,20 @@ var GraphRAG = class {
6345
6678
  }));
6346
6679
  }
6347
6680
  };
6681
// Default parameters for the document chunker tool. This single object is
// shared by every tool created without explicit `params`, so it is frozen to
// keep one caller from changing the defaults seen by another. The freeze is
// shallow: the nested `separators` array itself is not frozen.
var DEFAULT_CHUNK_PARAMS = Object.freeze({
  strategy: "recursive",
  maxSize: 512,
  overlap: 50,
  separators: ["\n"]
});
6348
6687
  var createDocumentChunkerTool = ({
6349
6688
  doc,
6350
- params = {
6351
- strategy: "recursive",
6352
- size: 512,
6353
- overlap: 50,
6354
- separator: "\n"
6355
- }
6689
+ params = DEFAULT_CHUNK_PARAMS
6356
6690
  }) => {
6357
6691
  return createTool({
6358
- id: `Document Chunker ${params.strategy} ${params.size}`,
6692
+ id: `Document Chunker ${params.strategy} ${params.maxSize}`,
6359
6693
  inputSchema: z.object({}),
6360
- description: `Chunks document using ${params.strategy} strategy with size ${params.size} and ${params.overlap} overlap`,
6694
+ description: `Chunks document using ${params.strategy} strategy with maxSize ${params.maxSize} and ${params.overlap || 0} overlap`,
6361
6695
  execute: async () => {
6362
6696
  const chunks = await doc.chunk(params);
6363
6697
  return {