@mastra/rag 1.0.6 → 1.0.7-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +12 -0
- package/dist/document/document.d.ts +9 -8
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/transformers/character.d.ts +4 -26
- package/dist/document/transformers/character.d.ts.map +1 -1
- package/dist/document/transformers/html.d.ts +8 -3
- package/dist/document/transformers/html.d.ts.map +1 -1
- package/dist/document/transformers/json.d.ts +4 -4
- package/dist/document/transformers/json.d.ts.map +1 -1
- package/dist/document/transformers/latex.d.ts +2 -8
- package/dist/document/transformers/latex.d.ts.map +1 -1
- package/dist/document/transformers/markdown.d.ts +2 -8
- package/dist/document/transformers/markdown.d.ts.map +1 -1
- package/dist/document/transformers/sentence.d.ts +31 -0
- package/dist/document/transformers/sentence.d.ts.map +1 -0
- package/dist/document/transformers/text.d.ts +3 -3
- package/dist/document/transformers/text.d.ts.map +1 -1
- package/dist/document/transformers/token.d.ts +4 -15
- package/dist/document/transformers/token.d.ts.map +1 -1
- package/dist/document/types.d.ts +85 -14
- package/dist/document/types.d.ts.map +1 -1
- package/dist/document/validation.d.ts +3 -0
- package/dist/document/validation.d.ts.map +1 -0
- package/dist/index.cjs +414 -80
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +414 -80
- package/dist/index.js.map +1 -1
- package/dist/tools/document-chunker.d.ts.map +1 -1
- package/package.json +5 -5
- package/src/document/document.test.ts +294 -39
- package/src/document/document.ts +69 -41
- package/src/document/transformers/character.ts +15 -43
- package/src/document/transformers/html.ts +9 -9
- package/src/document/transformers/json.ts +8 -3
- package/src/document/transformers/latex.ts +3 -11
- package/src/document/transformers/markdown.ts +3 -11
- package/src/document/transformers/sentence.ts +314 -0
- package/src/document/transformers/text.ts +10 -10
- package/src/document/transformers/token.ts +6 -17
- package/src/document/types.ts +66 -15
- package/src/document/validation.ts +147 -0
- package/src/tools/document-chunker.ts +12 -8
package/dist/index.js
CHANGED
|
@@ -4471,24 +4471,24 @@ var Language = /* @__PURE__ */ ((Language2) => {
|
|
|
4471
4471
|
|
|
4472
4472
|
// src/document/transformers/text.ts
|
|
4473
4473
|
var TextTransformer = class {
|
|
4474
|
-
|
|
4474
|
+
maxSize;
|
|
4475
4475
|
overlap;
|
|
4476
4476
|
lengthFunction;
|
|
4477
4477
|
keepSeparator;
|
|
4478
4478
|
addStartIndex;
|
|
4479
4479
|
stripWhitespace;
|
|
4480
4480
|
constructor({
|
|
4481
|
-
|
|
4481
|
+
maxSize = 4e3,
|
|
4482
4482
|
overlap = 200,
|
|
4483
4483
|
lengthFunction = (text) => text.length,
|
|
4484
4484
|
keepSeparator = false,
|
|
4485
4485
|
addStartIndex = false,
|
|
4486
4486
|
stripWhitespace = true
|
|
4487
4487
|
}) {
|
|
4488
|
-
if (overlap >
|
|
4489
|
-
throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${
|
|
4488
|
+
if (overlap > maxSize) {
|
|
4489
|
+
throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${maxSize}), should be smaller.`);
|
|
4490
4490
|
}
|
|
4491
|
-
this.
|
|
4491
|
+
this.maxSize = maxSize;
|
|
4492
4492
|
this.overlap = overlap;
|
|
4493
4493
|
this.lengthFunction = lengthFunction;
|
|
4494
4494
|
this.keepSeparator = keepSeparator;
|
|
@@ -4554,9 +4554,9 @@ var TextTransformer = class {
|
|
|
4554
4554
|
for (const d of splits) {
|
|
4555
4555
|
const len = this.lengthFunction(d);
|
|
4556
4556
|
const separatorLen = separator ? this.lengthFunction(separator) : 0;
|
|
4557
|
-
if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.
|
|
4558
|
-
if (total > this.
|
|
4559
|
-
console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.
|
|
4557
|
+
if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.maxSize) {
|
|
4558
|
+
if (total > this.maxSize) {
|
|
4559
|
+
console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.maxSize}`);
|
|
4560
4560
|
}
|
|
4561
4561
|
if (currentDoc.length > 0) {
|
|
4562
4562
|
const doc = this.joinDocs(currentDoc, separator);
|
|
@@ -4634,12 +4634,8 @@ function splitTextWithRegex(text, separator, keepSeparator) {
|
|
|
4634
4634
|
var CharacterTransformer = class extends TextTransformer {
|
|
4635
4635
|
separator;
|
|
4636
4636
|
isSeparatorRegex;
|
|
4637
|
-
constructor({
|
|
4638
|
-
|
|
4639
|
-
isSeparatorRegex = false,
|
|
4640
|
-
options = {}
|
|
4641
|
-
}) {
|
|
4642
|
-
super(options);
|
|
4637
|
+
constructor({ separator = "\n\n", isSeparatorRegex = false, ...baseOptions } = {}) {
|
|
4638
|
+
super(baseOptions);
|
|
4643
4639
|
this.separator = separator;
|
|
4644
4640
|
this.isSeparatorRegex = isSeparatorRegex;
|
|
4645
4641
|
}
|
|
@@ -4648,7 +4644,7 @@ var CharacterTransformer = class extends TextTransformer {
|
|
|
4648
4644
|
const initialSplits = splitTextWithRegex(text, separator, this.keepSeparator);
|
|
4649
4645
|
const chunks = [];
|
|
4650
4646
|
for (const split of initialSplits) {
|
|
4651
|
-
if (this.lengthFunction(split) <= this.
|
|
4647
|
+
if (this.lengthFunction(split) <= this.maxSize) {
|
|
4652
4648
|
chunks.push(split);
|
|
4653
4649
|
} else {
|
|
4654
4650
|
const subChunks = this.__splitChunk(split);
|
|
@@ -4662,7 +4658,7 @@ var CharacterTransformer = class extends TextTransformer {
|
|
|
4662
4658
|
let currentPosition = 0;
|
|
4663
4659
|
while (currentPosition < text.length) {
|
|
4664
4660
|
let chunkEnd = currentPosition;
|
|
4665
|
-
while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.
|
|
4661
|
+
while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.maxSize) {
|
|
4666
4662
|
chunkEnd++;
|
|
4667
4663
|
}
|
|
4668
4664
|
const currentChunk = text.slice(currentPosition, chunkEnd);
|
|
@@ -4677,12 +4673,8 @@ var CharacterTransformer = class extends TextTransformer {
|
|
|
4677
4673
|
var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends TextTransformer {
|
|
4678
4674
|
separators;
|
|
4679
4675
|
isSeparatorRegex;
|
|
4680
|
-
constructor({
|
|
4681
|
-
|
|
4682
|
-
isSeparatorRegex = false,
|
|
4683
|
-
options = {}
|
|
4684
|
-
}) {
|
|
4685
|
-
super(options);
|
|
4676
|
+
constructor({ separators, isSeparatorRegex = false, language, ...baseOptions } = {}) {
|
|
4677
|
+
super(baseOptions);
|
|
4686
4678
|
this.separators = separators || ["\n\n", "\n", " ", ""];
|
|
4687
4679
|
this.isSeparatorRegex = isSeparatorRegex;
|
|
4688
4680
|
}
|
|
@@ -4708,7 +4700,7 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
|
|
|
4708
4700
|
const goodSplits = [];
|
|
4709
4701
|
const mergeSeparator = this.keepSeparator ? "" : separator;
|
|
4710
4702
|
for (const s of splits) {
|
|
4711
|
-
if (this.lengthFunction(s) < this.
|
|
4703
|
+
if (this.lengthFunction(s) < this.maxSize) {
|
|
4712
4704
|
goodSplits.push(s);
|
|
4713
4705
|
} else {
|
|
4714
4706
|
if (goodSplits.length > 0) {
|
|
@@ -4735,7 +4727,12 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
|
|
|
4735
4727
|
}
|
|
4736
4728
|
static fromLanguage(language, options = {}) {
|
|
4737
4729
|
const separators = _RecursiveCharacterTransformer.getSeparatorsForLanguage(language);
|
|
4738
|
-
return new _RecursiveCharacterTransformer({
|
|
4730
|
+
return new _RecursiveCharacterTransformer({
|
|
4731
|
+
...options,
|
|
4732
|
+
separators,
|
|
4733
|
+
isSeparatorRegex: true,
|
|
4734
|
+
language
|
|
4735
|
+
});
|
|
4739
4736
|
}
|
|
4740
4737
|
static getSeparatorsForLanguage(language) {
|
|
4741
4738
|
switch (language) {
|
|
@@ -4820,9 +4817,9 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
|
|
|
4820
4817
|
var HTMLHeaderTransformer = class {
|
|
4821
4818
|
headersToSplitOn;
|
|
4822
4819
|
returnEachElement;
|
|
4823
|
-
constructor(
|
|
4824
|
-
this.returnEachElement =
|
|
4825
|
-
this.headersToSplitOn = [...
|
|
4820
|
+
constructor(options) {
|
|
4821
|
+
this.returnEachElement = options.returnEachLine ?? false;
|
|
4822
|
+
this.headersToSplitOn = [...options.headers].sort();
|
|
4826
4823
|
}
|
|
4827
4824
|
splitText({ text }) {
|
|
4828
4825
|
const root = parse(text);
|
|
@@ -4953,10 +4950,10 @@ var HTMLHeaderTransformer = class {
|
|
|
4953
4950
|
};
|
|
4954
4951
|
var HTMLSectionTransformer = class {
|
|
4955
4952
|
headersToSplitOn;
|
|
4956
|
-
|
|
4957
|
-
constructor(
|
|
4958
|
-
this.headersToSplitOn = Object.fromEntries(
|
|
4959
|
-
this.
|
|
4953
|
+
textSplitter;
|
|
4954
|
+
constructor(options) {
|
|
4955
|
+
this.headersToSplitOn = Object.fromEntries(options.sections.map(([tag, name14]) => [tag.toLowerCase(), name14]));
|
|
4956
|
+
this.textSplitter = new RecursiveCharacterTransformer(options);
|
|
4960
4957
|
}
|
|
4961
4958
|
splitText(text) {
|
|
4962
4959
|
const sections = this.splitHtmlByHeaders(text);
|
|
@@ -5025,8 +5022,7 @@ var HTMLSectionTransformer = class {
|
|
|
5025
5022
|
metadatas.push(doc.metadata);
|
|
5026
5023
|
}
|
|
5027
5024
|
const results = await this.createDocuments(texts, metadatas);
|
|
5028
|
-
|
|
5029
|
-
return textSplitter.splitDocuments(results);
|
|
5025
|
+
return this.textSplitter.splitDocuments(results);
|
|
5030
5026
|
}
|
|
5031
5027
|
createDocuments(texts, metadatas) {
|
|
5032
5028
|
const _metadatas = metadatas || Array(texts.length).fill({});
|
|
@@ -5068,9 +5064,13 @@ var HTMLSectionTransformer = class {
|
|
|
5068
5064
|
var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
|
|
5069
5065
|
maxSize;
|
|
5070
5066
|
minSize;
|
|
5071
|
-
|
|
5067
|
+
ensureAscii;
|
|
5068
|
+
convertLists;
|
|
5069
|
+
constructor({ maxSize = 2e3, minSize, ensureAscii = false, convertLists = true }) {
|
|
5072
5070
|
this.maxSize = maxSize;
|
|
5073
5071
|
this.minSize = minSize ?? Math.max(maxSize - 200, 50);
|
|
5072
|
+
this.ensureAscii = ensureAscii;
|
|
5073
|
+
this.convertLists = convertLists;
|
|
5074
5074
|
}
|
|
5075
5075
|
static jsonSize(data) {
|
|
5076
5076
|
const seen = /* @__PURE__ */ new WeakSet();
|
|
@@ -5202,7 +5202,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
|
|
|
5202
5202
|
*/
|
|
5203
5203
|
isWithinSizeLimit(value, currentSize = 0) {
|
|
5204
5204
|
const size = _RecursiveJsonTransformer.jsonSize(value);
|
|
5205
|
-
return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize
|
|
5205
|
+
return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize;
|
|
5206
5206
|
}
|
|
5207
5207
|
/**
|
|
5208
5208
|
* Splits arrays into chunks based on size limits
|
|
@@ -5469,7 +5469,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
|
|
|
5469
5469
|
/**
 * Recursive character splitter preconfigured with the separator list that
 * `RecursiveCharacterTransformer.getSeparatorsForLanguage` returns for "latex".
 */
var LatexTransformer = class extends RecursiveCharacterTransformer {
  /**
   * @param {object} [options] Options forwarded to RecursiveCharacterTransformer.
   *   Any `separators` / `isSeparatorRegex` supplied here are overridden.
   */
  constructor(options = {}) {
    const latexSeparators = RecursiveCharacterTransformer.getSeparatorsForLanguage("latex" /* LATEX */);
    // Later keys win, so the LaTeX separators always replace caller-supplied ones.
    const mergedOptions = {
      ...options,
      separators: latexSeparators,
      isSeparatorRegex: true
    };
    super(mergedOptions);
  }
};
|
|
5475
5475
|
|
|
@@ -5477,7 +5477,7 @@ var LatexTransformer = class extends RecursiveCharacterTransformer {
|
|
|
5477
5477
|
/**
 * Recursive character splitter preconfigured with the separator list that
 * `RecursiveCharacterTransformer.getSeparatorsForLanguage` returns for "markdown".
 */
var MarkdownTransformer = class extends RecursiveCharacterTransformer {
  /**
   * @param {object} [options] Options forwarded to RecursiveCharacterTransformer.
   *   Any `separators` / `isSeparatorRegex` supplied here are overridden.
   */
  constructor(options = {}) {
    const markdownSeparators = RecursiveCharacterTransformer.getSeparatorsForLanguage("markdown" /* MARKDOWN */);
    // Later keys win, so the Markdown separators always replace caller-supplied ones.
    const mergedOptions = {
      ...options,
      separators: markdownSeparators,
      isSeparatorRegex: true
    };
    super(mergedOptions);
  }
};
|
|
5483
5483
|
var MarkdownHeaderTransformer = class {
|
|
@@ -5644,6 +5644,239 @@ var MarkdownHeaderTransformer = class {
|
|
|
5644
5644
|
return this.createDocuments(texts, metadatas);
|
|
5645
5645
|
}
|
|
5646
5646
|
};
|
|
5647
|
+
|
|
5648
|
+
// src/document/transformers/sentence.ts
|
|
5649
|
+
/**
 * Sentence-aware text splitter.
 *
 * Text is first cut into sentences (a terminator followed by whitespace and an
 * upper-case ASCII letter, with common abbreviations excluded). The sentences
 * are then packed into chunks no larger than `maxSize`, flushing a chunk as
 * soon as it reaches `targetSize`. A single sentence longer than `maxSize`
 * falls back to word-level and then character-level splitting.
 *
 * All sizes are measured with the inherited `lengthFunction`.
 */
var SentenceTransformer = class extends TextTransformer {
  minSize;
  maxSize;
  targetSize;
  sentenceEnders;
  fallbackToWords;
  fallbackToCharacters;
  keepSeparator;
  /**
   * @param {object} options
   * @param {number} options.maxSize Hard upper bound for a chunk.
   * @param {number} [options.minSize=50] Stored on the instance (not read by this class).
   * @param {number} [options.targetSize] Size at which a chunk is flushed;
   *   defaults to `Math.floor(maxSize * 0.8)`.
   * @param {number} [options.overlap=0] Budget for sentences repeated at the
   *   start of the next chunk.
   * @param {string[]} [options.sentenceEnders] Single characters that may end
   *   a sentence; defaults to `.`, `!` and `?`.
   * @param {boolean} [options.fallbackToWords=true] Split oversized sentences on whitespace.
   * @param {boolean} [options.fallbackToCharacters=true] Split oversized words/sentences by character.
   * @param {boolean|string} [options.keepSeparator=false] Stored on the instance.
   */
  constructor(options) {
    // The base class receives an overlap clamped below maxSize; the sentence
    // logic below keeps using the caller's original overlap value.
    const parentOverlap = Math.min(options.overlap ?? 0, options.maxSize - 1);
    super({
      ...options,
      overlap: parentOverlap
    });
    this.maxSize = options.maxSize;
    this.minSize = options.minSize ?? 50;
    this.targetSize = options.targetSize ?? Math.floor(options.maxSize * 0.8);
    this.sentenceEnders = options.sentenceEnders ?? [".", "!", "?"];
    this.fallbackToWords = options.fallbackToWords ?? true;
    this.fallbackToCharacters = options.fallbackToCharacters ?? true;
    this.keepSeparator = options.keepSeparator ?? false;
    this.overlap = options.overlap ?? 0;
  }
  /**
   * Splits `text` into trimmed sentences.
   * @param {string} text
   * @returns {string[]} Non-empty sentences in document order.
   */
  detectSentenceBoundaries(text) {
    if (!text) return [];
    const sentences = [];
    let currentSentence = "";
    for (let i = 0; i < text.length; i++) {
      const char = text[i];
      currentSentence += char;
      if (!this.sentenceEnders.includes(char)) continue;
      if (this.isRealSentenceBoundary(currentSentence, text.slice(i + 1))) {
        sentences.push(currentSentence.trim());
        currentSentence = "";
      }
    }
    // Whatever follows the last terminator still counts as a sentence.
    const tail = currentSentence.trim();
    if (tail) {
      sentences.push(tail);
    }
    return sentences.filter((s) => s.length > 0);
  }
  /**
   * Decides whether the terminator that ends `currentSentence` is a real
   * sentence boundary.
   * @param {string} currentSentence Text accumulated so far, terminator included.
   * @param {string} remainingText Everything after the terminator.
   * @returns {boolean}
   */
  isRealSentenceBoundary(currentSentence, remainingText) {
    // End of input always closes the sentence.
    if (!remainingText.trim()) {
      return true;
    }
    // A new sentence must start with whitespace followed by an upper-case letter.
    if (!/^\s+[A-Z]/.test(remainingText)) {
      return false;
    }
    const words = currentSentence.trim().split(/\s+/);
    const lastWord = words[words.length - 1] || "";
    // Drop the one-character terminator before the abbreviation lookup.
    const baseWord = lastWord.slice(0, -1);
    return !this.isCommonAbbreviation(baseWord);
  }
  /**
   * @param {string} word Last word of the sentence without its trailing terminator.
   * @returns {boolean} True when a terminator after `word` should not end the sentence.
   */
  isCommonAbbreviation(word) {
    const titles = ["Dr", "Mr", "Mrs", "Ms", "Prof", "Sr", "Jr"];
    if (titles.includes(word)) {
      return true;
    }
    // Single letters and dotted initialisms of one case, e.g. "A", "U.S", "e.g".
    if (/^[A-Z](\.[A-Z])*$/.test(word) || /^[a-z](\.[a-z])*$/.test(word)) {
      return true;
    }
    // Bare numbers, e.g. the "1" of "1. Introduction".
    if (/^\d+$/.test(word)) {
      return true;
    }
    // "am" / "pm" with or without the inner dot, any case.
    if (/^[ap]\.?m$/i.test(word)) {
      return true;
    }
    return false;
  }
  /**
   * Group sentences into chunks with integrated overlap processing.
   *
   * Guarantees for sentences that individually fit into `maxSize`:
   *  - no emitted chunk exceeds `maxSize`; carried-over overlap sentences are
   *    dropped (oldest first) when they would push the chunk over the limit;
   *  - a chunk is only emitted when it contains at least one sentence that has
   *    not been emitted before, so overlap is never emitted on its own.
   *
   * @param {string[]} sentences
   * @returns {string[]}
   */
  groupSentencesIntoChunks(sentences) {
    const separator = " ";
    const separatorSize = this.lengthFunction(separator);
    const chunks = [];
    let currentChunk = [];
    let currentSize = 0;
    // Sentences in currentChunk that are new, i.e. not carried-over overlap.
    let pendingCount = 0;
    const emitCurrent = () => {
      if (pendingCount > 0) {
        chunks.push(currentChunk.join(separator));
      }
    };
    const carryOverlap = () => {
      currentChunk = this.calculateSentenceOverlap(currentChunk);
      currentSize = this.calculateChunkSize(currentChunk);
      pendingCount = 0;
    };
    for (const sentence of sentences) {
      const sentenceLength = this.lengthFunction(sentence);
      if (sentenceLength > this.maxSize) {
        emitCurrent();
        currentChunk = [];
        currentSize = 0;
        pendingCount = 0;
        chunks.push(...this.handleOversizedSentence(sentence));
        continue;
      }
      // Size of the chunk if `sentence` were appended right now; evaluated
      // lazily because the chunk may be reset or trimmed below.
      const sizeWithSentence = () => currentSize + (currentChunk.length > 0 ? separatorSize : 0) + sentenceLength;
      if (pendingCount > 0 && sizeWithSentence() > this.maxSize) {
        emitCurrent();
        carryOverlap();
      }
      // Only overlap sentences can remain here when the sentence does not fit;
      // drop the oldest ones until it does.
      while (currentChunk.length > 0 && sizeWithSentence() > this.maxSize) {
        currentChunk.shift();
        currentSize = this.calculateChunkSize(currentChunk);
      }
      currentSize = sizeWithSentence();
      currentChunk.push(sentence);
      pendingCount += 1;
      if (currentSize >= this.targetSize) {
        emitCurrent();
        carryOverlap();
      }
    }
    emitCurrent();
    return chunks;
  }
  /**
   * Handle oversized sentences with fallback strategies
   * @param {string} sentence A sentence longer than `maxSize`.
   * @returns {string[]} Word-level pieces, else character-level pieces, else
   *   the untouched sentence (with a warning) when both fallbacks are disabled.
   */
  handleOversizedSentence(sentence) {
    if (this.fallbackToWords) {
      const wordChunks = this.splitSentenceIntoWords(sentence);
      // A single piece means word splitting did not help.
      if (wordChunks.length > 1) {
        return wordChunks;
      }
    }
    if (this.fallbackToCharacters) {
      return this.splitSentenceIntoCharacters(sentence);
    }
    console.warn(
      `Sentence exceeds maxSize (${this.maxSize}) and fallbacks are disabled: "${sentence.substring(0, 50)}..."`
    );
    return [sentence];
  }
  /**
   * Greedily packs whitespace-separated words into pieces of at most `maxSize`.
   * @param {string} sentence
   * @returns {string[]}
   */
  splitSentenceIntoWords(sentence) {
    const words = sentence.split(/\s+/);
    const chunks = [];
    let currentChunk = "";
    for (const word of words) {
      const testChunk = currentChunk ? currentChunk + " " + word : word;
      if (this.lengthFunction(testChunk) <= this.maxSize) {
        currentChunk = testChunk;
        continue;
      }
      if (currentChunk) {
        chunks.push(currentChunk);
      }
      if (this.lengthFunction(word) > this.maxSize) {
        // A single word can itself be too long; split it by character if allowed.
        if (this.fallbackToCharacters) {
          chunks.push(...this.splitSentenceIntoCharacters(word));
        } else {
          chunks.push(word);
        }
        currentChunk = "";
      } else {
        currentChunk = word;
      }
    }
    if (currentChunk) {
      chunks.push(currentChunk);
    }
    return chunks;
  }
  /**
   * Cuts `text` into consecutive pieces of at most `maxSize`, one character at a time.
   * @param {string} text
   * @returns {string[]}
   */
  splitSentenceIntoCharacters(text) {
    const chunks = [];
    let currentChunk = "";
    for (const char of text) {
      if (this.lengthFunction(currentChunk + char) <= this.maxSize) {
        currentChunk += char;
      } else {
        if (currentChunk) {
          chunks.push(currentChunk);
        }
        currentChunk = char;
      }
    }
    if (currentChunk) {
      chunks.push(currentChunk);
    }
    return chunks;
  }
  /**
   * Picks the trailing sentences of `currentChunk` that fit into the overlap budget.
   * @param {string[]} currentChunk
   * @returns {string[]} A new array, in original order; empty when overlap is 0.
   */
  calculateSentenceOverlap(currentChunk) {
    if (this.overlap === 0 || currentChunk.length === 0) {
      return [];
    }
    const overlapSentences = [];
    let overlapSize = 0;
    const separator = " ";
    // Walk backwards so the sentences closest to the chunk end are kept first.
    for (let i = currentChunk.length - 1; i >= 0; i--) {
      const sentence = currentChunk[i];
      if (!sentence) continue;
      const sentenceLength = this.lengthFunction(sentence);
      const separatorLength = overlapSentences.length > 0 ? this.lengthFunction(separator) : 0;
      if (overlapSize + sentenceLength + separatorLength > this.overlap) {
        break;
      }
      overlapSentences.unshift(sentence);
      overlapSize += sentenceLength + separatorLength;
    }
    return overlapSentences;
  }
  /**
   * @param {string[]} sentences
   * @returns {number} Size of the sentences joined by single spaces.
   */
  calculateChunkSize(sentences) {
    if (!sentences || sentences.length === 0) {
      return 0;
    }
    const separatorSize = this.lengthFunction(" ");
    let totalSize = separatorSize * (sentences.length - 1);
    for (const sentence of sentences) {
      totalSize += this.lengthFunction(sentence);
    }
    return totalSize;
  }
  /**
   * @param {{ text: string }} param0
   * @returns {string[]} Non-blank chunks for `text`.
   */
  splitText({ text }) {
    if (!text) return [];
    const sentences = this.detectSentenceBoundaries(text);
    const chunks = this.groupSentencesIntoChunks(sentences);
    return chunks.filter((chunk) => chunk.trim().length > 0);
  }
};
|
|
5647
5880
|
function splitTextOnTokens({ text, tokenizer }) {
|
|
5648
5881
|
const splits = [];
|
|
5649
5882
|
const inputIds = tokenizer.encode(text);
|
|
@@ -5694,7 +5927,7 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
|
|
|
5694
5927
|
};
|
|
5695
5928
|
const tokenizer = {
|
|
5696
5929
|
overlap: this.overlap,
|
|
5697
|
-
tokensPerChunk: this.
|
|
5930
|
+
tokensPerChunk: this.maxSize,
|
|
5698
5931
|
decode,
|
|
5699
5932
|
encode
|
|
5700
5933
|
};
|
|
@@ -5726,13 +5959,105 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
|
|
|
5726
5959
|
allowedSpecial: options.allowedSpecial,
|
|
5727
5960
|
disallowedSpecial: options.disallowedSpecial,
|
|
5728
5961
|
options: {
|
|
5729
|
-
|
|
5962
|
+
maxSize: options.maxSize,
|
|
5730
5963
|
overlap: options.overlap,
|
|
5731
5964
|
lengthFunction: tikTokenEncoder
|
|
5732
5965
|
}
|
|
5733
5966
|
});
|
|
5734
5967
|
}
|
|
5735
5968
|
};
|
|
5969
|
+
/**
 * Maps the deprecated `size` option onto `maxSize`.
 *
 * Emits a deprecation warning whenever `size` is present. An explicit
 * `maxSize` always wins over `size`. The input object is never modified.
 *
 * @param {object} data Chunk options that may contain `size` and/or `maxSize`.
 * @returns {object} A copy of `data` without the `size` key.
 */
function handleDeprecatedSize(data) {
  const { size, ...rest } = data;
  if (size === void 0) {
    return rest;
  }
  console.warn(
    "[DEPRECATION] `size` is deprecated. Use `maxSize` instead. This will be removed in the next major version."
  );
  // Build a new object instead of writing `maxSize` back onto the caller's data.
  return rest.maxSize === void 0 ? { ...rest, maxSize: size } : rest;
}
|
|
5981
|
+
// Options accepted by every chunking strategy. `size` is kept only as the
// deprecated spelling of `maxSize` (see handleDeprecatedSize).
var baseChunkOptionsSchema = z.object({
  size: z.number().positive().optional(),
  maxSize: z.number().positive().optional(),
  overlap: z.number().min(0).optional(),
  lengthFunction: z.function().optional(),
  keepSeparator: z.union([z.boolean(), z.literal("start"), z.literal("end")]).optional(),
  addStartIndex: z.boolean().optional(),
  stripWhitespace: z.boolean().optional()
});

// "character" strategy: one separator, optionally a regex.
var characterChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    separator: z.string().optional(),
    isSeparatorRegex: z.boolean().optional()
  })
  .strict();

// "recursive" strategy: ordered separator list and/or a language preset.
var recursiveChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    separators: z.array(z.string()).optional(),
    isSeparatorRegex: z.boolean().optional(),
    language: z.string().optional()
  })
  .strict();

// "sentence" strategy: the only one where `maxSize` is mandatory.
var sentenceChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    maxSize: z.number().positive(),
    minSize: z.number().positive().optional(),
    targetSize: z.number().positive().optional(),
    sentenceEnders: z.array(z.string()).optional(),
    fallbackToWords: z.boolean().optional(),
    fallbackToCharacters: z.boolean().optional()
  })
  .strict();

// Duck-typed Set check: has/add/delete/clear methods plus a numeric `size`.
var isSetLike = (value) => {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const hasSetMethods = ["has", "add", "delete", "clear"].every((method) => typeof value[method] === "function");
  return hasSetMethods && typeof value.size === "number";
};

// Either the literal "all" or anything that passes isSetLike.
var setOrAllSchema = z
  .any()
  .refine((value) => value === "all" || isSetLike(value), {
    message: "Must be a Set object or the literal 'all'"
  })
  .optional();

// "token" strategy.
var tokenChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    encodingName: z.string().optional(),
    modelName: z.string().optional(),
    allowedSpecial: setOrAllSchema,
    disallowedSpecial: setOrAllSchema
  })
  .strict();

// "json" strategy.
var jsonChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    minSize: z.number().positive().optional(),
    ensureAscii: z.boolean().optional(),
    convertLists: z.boolean().optional()
  })
  .strict();

// "html" strategy: `headers` and `sections` are lists of [string, string] pairs.
var htmlChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    headers: z.array(z.tuple([z.string(), z.string()])).optional(),
    sections: z.array(z.tuple([z.string(), z.string()])).optional(),
    returnEachLine: z.boolean().optional()
  })
  .strict();

// "markdown" strategy.
var markdownChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    headers: z.array(z.tuple([z.string(), z.string()])).optional(),
    returnEachLine: z.boolean().optional(),
    stripHeaders: z.boolean().optional()
  })
  .strict();

// "latex" strategy: base options only.
var latexChunkOptionsSchema = baseChunkOptionsSchema.strict();

// Strategy name -> schema; every schema additionally runs handleDeprecatedSize
// on the parsed value.
var validationSchemas = Object.fromEntries(
  Object.entries({
    character: characterChunkOptionsSchema,
    recursive: recursiveChunkOptionsSchema,
    sentence: sentenceChunkOptionsSchema,
    token: tokenChunkOptionsSchema,
    json: jsonChunkOptionsSchema,
    html: htmlChunkOptionsSchema,
    markdown: markdownChunkOptionsSchema,
    latex: latexChunkOptionsSchema
  }).map(([strategyName, optionsSchema]) => [strategyName, optionsSchema.transform(handleDeprecatedSize)])
);
|
|
6045
|
+
/**
 * Validates chunking options against the schema registered for `strategy`.
 *
 * @param {string} strategy Key of `validationSchemas`.
 * @param {object} params Options to validate.
 * @returns {object} The parsed options as produced by the schema (including
 *   its transform). Callers that only need validation can ignore the result.
 * @throws {Error} When the strategy is unknown or the options are invalid.
 */
function validateChunkParams(strategy, params) {
  // Own-property lookup so inherited keys such as "constructor" are reported
  // as unknown strategies instead of being treated as schemas.
  const schema = Object.hasOwn(validationSchemas, strategy) ? validationSchemas[strategy] : void 0;
  if (!schema) {
    throw new Error(`Unknown chunking strategy: ${strategy}`);
  }
  const result = schema.safeParse(params);
  if (result.success) {
    return result.data;
  }
  // `issues` is the canonical list on a ZodError.
  const issues = result.error.issues;
  // Unknown keys get a dedicated, shorter message.
  const unrecognizedError = issues.find((issue) => issue.code === "unrecognized_keys");
  if (unrecognizedError && "keys" in unrecognizedError) {
    const keys = unrecognizedError.keys.join(", ");
    throw new Error(`Invalid parameters for ${strategy} strategy: '${keys}' not supported`);
  }
  const errorMessage = issues
    .map((issue) => `${issue.path.length > 0 ? issue.path.join(".") : "parameter"}: ${issue.message}`)
    .join(", ");
  throw new Error(`Invalid parameters for ${strategy} strategy: ${errorMessage}`);
}
|
|
5736
6061
|
|
|
5737
6062
|
// src/document/document.ts
|
|
5738
6063
|
var MDocument = class _MDocument {
|
|
@@ -5845,30 +6170,21 @@ var MDocument = class _MDocument {
|
|
|
5845
6170
|
}
|
|
5846
6171
|
}
|
|
5847
6172
|
async chunkBy(strategy, options) {
|
|
5848
|
-
|
|
5849
|
-
|
|
5850
|
-
|
|
5851
|
-
|
|
5852
|
-
|
|
5853
|
-
|
|
5854
|
-
|
|
5855
|
-
|
|
5856
|
-
|
|
5857
|
-
|
|
5858
|
-
|
|
5859
|
-
|
|
5860
|
-
|
|
5861
|
-
|
|
5862
|
-
|
|
5863
|
-
break;
|
|
5864
|
-
case "json":
|
|
5865
|
-
await this.chunkJSON(options);
|
|
5866
|
-
break;
|
|
5867
|
-
case "latex":
|
|
5868
|
-
await this.chunkLatex(options);
|
|
5869
|
-
break;
|
|
5870
|
-
default:
|
|
5871
|
-
throw new Error(`Unknown strategy: ${strategy}`);
|
|
6173
|
+
const strategyMap = {
|
|
6174
|
+
recursive: (options2) => this.chunkRecursive(options2),
|
|
6175
|
+
character: (options2) => this.chunkCharacter(options2),
|
|
6176
|
+
token: (options2) => this.chunkToken(options2),
|
|
6177
|
+
markdown: (options2) => this.chunkMarkdown(options2),
|
|
6178
|
+
html: (options2) => this.chunkHTML(options2),
|
|
6179
|
+
json: (options2) => this.chunkJSON(options2),
|
|
6180
|
+
latex: (options2) => this.chunkLatex(options2),
|
|
6181
|
+
sentence: (options2) => this.chunkSentence(options2)
|
|
6182
|
+
};
|
|
6183
|
+
const chunkingFunc = strategyMap[strategy];
|
|
6184
|
+
if (chunkingFunc) {
|
|
6185
|
+
await chunkingFunc(options);
|
|
6186
|
+
} else {
|
|
6187
|
+
throw new Error(`Unknown strategy: ${strategy}`);
|
|
5872
6188
|
}
|
|
5873
6189
|
}
|
|
5874
6190
|
async chunkRecursive(options) {
|
|
@@ -5878,32 +6194,28 @@ var MDocument = class _MDocument {
|
|
|
5878
6194
|
this.chunks = textSplit2;
|
|
5879
6195
|
return;
|
|
5880
6196
|
}
|
|
5881
|
-
const rt = new RecursiveCharacterTransformer(
|
|
5882
|
-
separators: options?.separators,
|
|
5883
|
-
isSeparatorRegex: options?.isSeparatorRegex,
|
|
5884
|
-
options
|
|
5885
|
-
});
|
|
6197
|
+
const rt = new RecursiveCharacterTransformer(options);
|
|
5886
6198
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5887
6199
|
this.chunks = textSplit;
|
|
5888
6200
|
}
|
|
5889
6201
|
async chunkCharacter(options) {
|
|
5890
6202
|
const rt = new CharacterTransformer({
|
|
6203
|
+
...options,
|
|
5891
6204
|
separator: options?.separator,
|
|
5892
|
-
isSeparatorRegex: options?.isSeparatorRegex
|
|
5893
|
-
options
|
|
6205
|
+
isSeparatorRegex: options?.isSeparatorRegex
|
|
5894
6206
|
});
|
|
5895
6207
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5896
6208
|
this.chunks = textSplit;
|
|
5897
6209
|
}
|
|
5898
6210
|
async chunkHTML(options) {
|
|
5899
6211
|
if (options?.headers?.length) {
|
|
5900
|
-
const rt = new HTMLHeaderTransformer(options
|
|
6212
|
+
const rt = new HTMLHeaderTransformer(options);
|
|
5901
6213
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5902
6214
|
this.chunks = textSplit;
|
|
5903
6215
|
return;
|
|
5904
6216
|
}
|
|
5905
6217
|
if (options?.sections?.length) {
|
|
5906
|
-
const rt = new HTMLSectionTransformer(options
|
|
6218
|
+
const rt = new HTMLSectionTransformer(options);
|
|
5907
6219
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5908
6220
|
this.chunks = textSplit;
|
|
5909
6221
|
return;
|
|
@@ -5950,9 +6262,30 @@ var MDocument = class _MDocument {
|
|
|
5950
6262
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5951
6263
|
this.chunks = textSplit;
|
|
5952
6264
|
}
|
|
6265
|
+
async chunkSentence(options) {
|
|
6266
|
+
if (!options?.maxSize) {
|
|
6267
|
+
throw new Error("Sentence chunking requires maxSize to be specified");
|
|
6268
|
+
}
|
|
6269
|
+
const rt = new SentenceTransformer({
|
|
6270
|
+
minSize: options?.minSize,
|
|
6271
|
+
maxSize: options?.maxSize,
|
|
6272
|
+
targetSize: options?.targetSize,
|
|
6273
|
+
overlap: options?.overlap,
|
|
6274
|
+
sentenceEnders: options?.sentenceEnders,
|
|
6275
|
+
fallbackToWords: options?.fallbackToWords,
|
|
6276
|
+
fallbackToCharacters: options?.fallbackToCharacters,
|
|
6277
|
+
keepSeparator: options?.keepSeparator,
|
|
6278
|
+
lengthFunction: options?.lengthFunction,
|
|
6279
|
+
addStartIndex: options?.addStartIndex,
|
|
6280
|
+
stripWhitespace: options?.stripWhitespace
|
|
6281
|
+
});
|
|
6282
|
+
const textSplit = rt.transformDocuments(this.chunks);
|
|
6283
|
+
this.chunks = textSplit;
|
|
6284
|
+
}
|
|
5953
6285
|
async chunk(params) {
|
|
5954
6286
|
const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
|
|
5955
6287
|
const strategy = passedStrategy || this.defaultStrategy();
|
|
6288
|
+
validateChunkParams(strategy, chunkOptions);
|
|
5956
6289
|
await this.chunkBy(strategy, chunkOptions);
|
|
5957
6290
|
if (extract) {
|
|
5958
6291
|
await this.extractMetadata(extract);
|
|
@@ -6345,19 +6678,20 @@ var GraphRAG = class {
|
|
|
6345
6678
|
}));
|
|
6346
6679
|
}
|
|
6347
6680
|
};
|
|
6681
|
+
// Chunk parameters used by createDocumentChunkerTool when the caller passes no
// `params`. Frozen (shallowly) because this single object is shared by every
// tool created that way, so an accidental write would leak into all of them.
var DEFAULT_CHUNK_PARAMS = Object.freeze({
  strategy: "recursive",
  maxSize: 512,
  overlap: 50,
  separators: ["\n"]
});
|
|
6348
6687
|
var createDocumentChunkerTool = ({
|
|
6349
6688
|
doc,
|
|
6350
|
-
params =
|
|
6351
|
-
strategy: "recursive",
|
|
6352
|
-
size: 512,
|
|
6353
|
-
overlap: 50,
|
|
6354
|
-
separator: "\n"
|
|
6355
|
-
}
|
|
6689
|
+
params = DEFAULT_CHUNK_PARAMS
|
|
6356
6690
|
}) => {
|
|
6357
6691
|
return createTool({
|
|
6358
|
-
id: `Document Chunker ${params.strategy} ${params.
|
|
6692
|
+
id: `Document Chunker ${params.strategy} ${params.maxSize}`,
|
|
6359
6693
|
inputSchema: z.object({}),
|
|
6360
|
-
description: `Chunks document using ${params.strategy} strategy with
|
|
6694
|
+
description: `Chunks document using ${params.strategy} strategy with maxSize ${params.maxSize} and ${params.overlap || 0} overlap`,
|
|
6361
6695
|
execute: async () => {
|
|
6362
6696
|
const chunks = await doc.chunk(params);
|
|
6363
6697
|
return {
|