@mastra/rag 1.0.6 → 1.0.7-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +25 -0
- package/dist/document/document.d.ts +10 -9
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/extractors/base.d.ts +1 -1
- package/dist/document/extractors/index.d.ts +5 -5
- package/dist/document/extractors/keywords.d.ts +4 -4
- package/dist/document/extractors/questions.d.ts +4 -4
- package/dist/document/extractors/summary.d.ts +4 -4
- package/dist/document/extractors/title.d.ts +4 -4
- package/dist/document/extractors/types.d.ts +1 -1
- package/dist/document/index.d.ts +2 -2
- package/dist/document/prompts/base.d.ts +1 -1
- package/dist/document/prompts/index.d.ts +3 -3
- package/dist/document/prompts/prompt.d.ts +1 -1
- package/dist/document/schema/index.d.ts +3 -3
- package/dist/document/schema/node.d.ts +2 -2
- package/dist/document/transformers/character.d.ts +6 -28
- package/dist/document/transformers/character.d.ts.map +1 -1
- package/dist/document/transformers/html.d.ts +9 -4
- package/dist/document/transformers/html.d.ts.map +1 -1
- package/dist/document/transformers/json.d.ts +5 -5
- package/dist/document/transformers/json.d.ts.map +1 -1
- package/dist/document/transformers/latex.d.ts +3 -9
- package/dist/document/transformers/latex.d.ts.map +1 -1
- package/dist/document/transformers/markdown.d.ts +4 -10
- package/dist/document/transformers/markdown.d.ts.map +1 -1
- package/dist/document/transformers/sentence.d.ts +31 -0
- package/dist/document/transformers/sentence.d.ts.map +1 -0
- package/dist/document/transformers/text.d.ts +5 -5
- package/dist/document/transformers/text.d.ts.map +1 -1
- package/dist/document/transformers/token.d.ts +5 -16
- package/dist/document/transformers/token.d.ts.map +1 -1
- package/dist/document/transformers/transformer.d.ts +1 -1
- package/dist/document/types.d.ts +86 -15
- package/dist/document/types.d.ts.map +1 -1
- package/dist/document/validation.d.ts +3 -0
- package/dist/document/validation.d.ts.map +1 -0
- package/dist/index.cjs +414 -80
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +8 -8
- package/dist/index.js +414 -80
- package/dist/index.js.map +1 -1
- package/dist/rerank/relevance/index.d.ts +3 -3
- package/dist/tools/document-chunker.d.ts +1 -1
- package/dist/tools/document-chunker.d.ts.map +1 -1
- package/dist/tools/graph-rag.d.ts +2 -2
- package/dist/tools/index.d.ts +3 -3
- package/dist/tools/types.d.ts +1 -1
- package/dist/tools/vector-query.d.ts +2 -2
- package/dist/utils/convert-sources.d.ts +2 -2
- package/dist/utils/index.d.ts +3 -3
- package/dist/utils/vector-search.d.ts +1 -1
- package/package.json +8 -7
- package/src/document/document.test.ts +294 -39
- package/src/document/document.ts +69 -41
- package/src/document/transformers/character.ts +15 -43
- package/src/document/transformers/html.ts +9 -9
- package/src/document/transformers/json.ts +8 -3
- package/src/document/transformers/latex.ts +3 -11
- package/src/document/transformers/markdown.ts +3 -11
- package/src/document/transformers/sentence.ts +314 -0
- package/src/document/transformers/text.ts +10 -10
- package/src/document/transformers/token.ts +6 -17
- package/src/document/types.ts +66 -15
- package/src/document/validation.ts +147 -0
- package/src/tools/document-chunker.ts +12 -8
- package/tsup.config.ts +2 -7
package/dist/index.cjs
CHANGED
|
@@ -4477,24 +4477,24 @@ var Language = /* @__PURE__ */ ((Language2) => {
|
|
|
4477
4477
|
|
|
4478
4478
|
// src/document/transformers/text.ts
|
|
4479
4479
|
var TextTransformer = class {
|
|
4480
|
-
|
|
4480
|
+
maxSize;
|
|
4481
4481
|
overlap;
|
|
4482
4482
|
lengthFunction;
|
|
4483
4483
|
keepSeparator;
|
|
4484
4484
|
addStartIndex;
|
|
4485
4485
|
stripWhitespace;
|
|
4486
4486
|
constructor({
|
|
4487
|
-
|
|
4487
|
+
maxSize = 4e3,
|
|
4488
4488
|
overlap = 200,
|
|
4489
4489
|
lengthFunction = (text) => text.length,
|
|
4490
4490
|
keepSeparator = false,
|
|
4491
4491
|
addStartIndex = false,
|
|
4492
4492
|
stripWhitespace = true
|
|
4493
4493
|
}) {
|
|
4494
|
-
if (overlap >
|
|
4495
|
-
throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${
|
|
4494
|
+
if (overlap > maxSize) {
|
|
4495
|
+
throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size (${maxSize}), should be smaller.`);
|
|
4496
4496
|
}
|
|
4497
|
-
this.
|
|
4497
|
+
this.maxSize = maxSize;
|
|
4498
4498
|
this.overlap = overlap;
|
|
4499
4499
|
this.lengthFunction = lengthFunction;
|
|
4500
4500
|
this.keepSeparator = keepSeparator;
|
|
@@ -4560,9 +4560,9 @@ var TextTransformer = class {
|
|
|
4560
4560
|
for (const d of splits) {
|
|
4561
4561
|
const len = this.lengthFunction(d);
|
|
4562
4562
|
const separatorLen = separator ? this.lengthFunction(separator) : 0;
|
|
4563
|
-
if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.
|
|
4564
|
-
if (total > this.
|
|
4565
|
-
console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.
|
|
4563
|
+
if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.maxSize) {
|
|
4564
|
+
if (total > this.maxSize) {
|
|
4565
|
+
console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.maxSize}`);
|
|
4566
4566
|
}
|
|
4567
4567
|
if (currentDoc.length > 0) {
|
|
4568
4568
|
const doc = this.joinDocs(currentDoc, separator);
|
|
@@ -4640,12 +4640,8 @@ function splitTextWithRegex(text, separator, keepSeparator) {
|
|
|
4640
4640
|
var CharacterTransformer = class extends TextTransformer {
|
|
4641
4641
|
separator;
|
|
4642
4642
|
isSeparatorRegex;
|
|
4643
|
-
constructor({
|
|
4644
|
-
|
|
4645
|
-
isSeparatorRegex = false,
|
|
4646
|
-
options = {}
|
|
4647
|
-
}) {
|
|
4648
|
-
super(options);
|
|
4643
|
+
constructor({ separator = "\n\n", isSeparatorRegex = false, ...baseOptions } = {}) {
|
|
4644
|
+
super(baseOptions);
|
|
4649
4645
|
this.separator = separator;
|
|
4650
4646
|
this.isSeparatorRegex = isSeparatorRegex;
|
|
4651
4647
|
}
|
|
@@ -4654,7 +4650,7 @@ var CharacterTransformer = class extends TextTransformer {
|
|
|
4654
4650
|
const initialSplits = splitTextWithRegex(text, separator, this.keepSeparator);
|
|
4655
4651
|
const chunks = [];
|
|
4656
4652
|
for (const split of initialSplits) {
|
|
4657
|
-
if (this.lengthFunction(split) <= this.
|
|
4653
|
+
if (this.lengthFunction(split) <= this.maxSize) {
|
|
4658
4654
|
chunks.push(split);
|
|
4659
4655
|
} else {
|
|
4660
4656
|
const subChunks = this.__splitChunk(split);
|
|
@@ -4668,7 +4664,7 @@ var CharacterTransformer = class extends TextTransformer {
|
|
|
4668
4664
|
let currentPosition = 0;
|
|
4669
4665
|
while (currentPosition < text.length) {
|
|
4670
4666
|
let chunkEnd = currentPosition;
|
|
4671
|
-
while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.
|
|
4667
|
+
while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.maxSize) {
|
|
4672
4668
|
chunkEnd++;
|
|
4673
4669
|
}
|
|
4674
4670
|
const currentChunk = text.slice(currentPosition, chunkEnd);
|
|
@@ -4683,12 +4679,8 @@ var CharacterTransformer = class extends TextTransformer {
|
|
|
4683
4679
|
var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends TextTransformer {
|
|
4684
4680
|
separators;
|
|
4685
4681
|
isSeparatorRegex;
|
|
4686
|
-
constructor({
|
|
4687
|
-
|
|
4688
|
-
isSeparatorRegex = false,
|
|
4689
|
-
options = {}
|
|
4690
|
-
}) {
|
|
4691
|
-
super(options);
|
|
4682
|
+
constructor({ separators, isSeparatorRegex = false, language, ...baseOptions } = {}) {
|
|
4683
|
+
super(baseOptions);
|
|
4692
4684
|
this.separators = separators || ["\n\n", "\n", " ", ""];
|
|
4693
4685
|
this.isSeparatorRegex = isSeparatorRegex;
|
|
4694
4686
|
}
|
|
@@ -4714,7 +4706,7 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
|
|
|
4714
4706
|
const goodSplits = [];
|
|
4715
4707
|
const mergeSeparator = this.keepSeparator ? "" : separator;
|
|
4716
4708
|
for (const s of splits) {
|
|
4717
|
-
if (this.lengthFunction(s) < this.
|
|
4709
|
+
if (this.lengthFunction(s) < this.maxSize) {
|
|
4718
4710
|
goodSplits.push(s);
|
|
4719
4711
|
} else {
|
|
4720
4712
|
if (goodSplits.length > 0) {
|
|
@@ -4741,7 +4733,12 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
|
|
|
4741
4733
|
}
|
|
4742
4734
|
static fromLanguage(language, options = {}) {
|
|
4743
4735
|
const separators = _RecursiveCharacterTransformer.getSeparatorsForLanguage(language);
|
|
4744
|
-
return new _RecursiveCharacterTransformer({
|
|
4736
|
+
return new _RecursiveCharacterTransformer({
|
|
4737
|
+
...options,
|
|
4738
|
+
separators,
|
|
4739
|
+
isSeparatorRegex: true,
|
|
4740
|
+
language
|
|
4741
|
+
});
|
|
4745
4742
|
}
|
|
4746
4743
|
static getSeparatorsForLanguage(language) {
|
|
4747
4744
|
switch (language) {
|
|
@@ -4826,9 +4823,9 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
|
|
|
4826
4823
|
var HTMLHeaderTransformer = class {
|
|
4827
4824
|
headersToSplitOn;
|
|
4828
4825
|
returnEachElement;
|
|
4829
|
-
constructor(
|
|
4830
|
-
this.returnEachElement =
|
|
4831
|
-
this.headersToSplitOn = [...
|
|
4826
|
+
constructor(options) {
|
|
4827
|
+
this.returnEachElement = options.returnEachLine ?? false;
|
|
4828
|
+
this.headersToSplitOn = [...options.headers].sort();
|
|
4832
4829
|
}
|
|
4833
4830
|
splitText({ text }) {
|
|
4834
4831
|
const root = nodeHtmlBetterParser.parse(text);
|
|
@@ -4959,10 +4956,10 @@ var HTMLHeaderTransformer = class {
|
|
|
4959
4956
|
};
|
|
4960
4957
|
var HTMLSectionTransformer = class {
|
|
4961
4958
|
headersToSplitOn;
|
|
4962
|
-
|
|
4963
|
-
constructor(
|
|
4964
|
-
this.headersToSplitOn = Object.fromEntries(
|
|
4965
|
-
this.
|
|
4959
|
+
textSplitter;
|
|
4960
|
+
constructor(options) {
|
|
4961
|
+
this.headersToSplitOn = Object.fromEntries(options.sections.map(([tag, name14]) => [tag.toLowerCase(), name14]));
|
|
4962
|
+
this.textSplitter = new RecursiveCharacterTransformer(options);
|
|
4966
4963
|
}
|
|
4967
4964
|
splitText(text) {
|
|
4968
4965
|
const sections = this.splitHtmlByHeaders(text);
|
|
@@ -5031,8 +5028,7 @@ var HTMLSectionTransformer = class {
|
|
|
5031
5028
|
metadatas.push(doc.metadata);
|
|
5032
5029
|
}
|
|
5033
5030
|
const results = await this.createDocuments(texts, metadatas);
|
|
5034
|
-
|
|
5035
|
-
return textSplitter.splitDocuments(results);
|
|
5031
|
+
return this.textSplitter.splitDocuments(results);
|
|
5036
5032
|
}
|
|
5037
5033
|
createDocuments(texts, metadatas) {
|
|
5038
5034
|
const _metadatas = metadatas || Array(texts.length).fill({});
|
|
@@ -5074,9 +5070,13 @@ var HTMLSectionTransformer = class {
|
|
|
5074
5070
|
var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
|
|
5075
5071
|
maxSize;
|
|
5076
5072
|
minSize;
|
|
5077
|
-
|
|
5073
|
+
ensureAscii;
|
|
5074
|
+
convertLists;
|
|
5075
|
+
constructor({ maxSize = 2e3, minSize, ensureAscii = false, convertLists = true }) {
|
|
5078
5076
|
this.maxSize = maxSize;
|
|
5079
5077
|
this.minSize = minSize ?? Math.max(maxSize - 200, 50);
|
|
5078
|
+
this.ensureAscii = ensureAscii;
|
|
5079
|
+
this.convertLists = convertLists;
|
|
5080
5080
|
}
|
|
5081
5081
|
static jsonSize(data) {
|
|
5082
5082
|
const seen = /* @__PURE__ */ new WeakSet();
|
|
@@ -5208,7 +5208,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
|
|
|
5208
5208
|
*/
|
|
5209
5209
|
isWithinSizeLimit(value, currentSize = 0) {
|
|
5210
5210
|
const size = _RecursiveJsonTransformer.jsonSize(value);
|
|
5211
|
-
return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize
|
|
5211
|
+
return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize;
|
|
5212
5212
|
}
|
|
5213
5213
|
/**
|
|
5214
5214
|
* Splits arrays into chunks based on size limits
|
|
@@ -5475,7 +5475,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
|
|
|
5475
5475
|
var LatexTransformer = class extends RecursiveCharacterTransformer {
|
|
5476
5476
|
constructor(options = {}) {
|
|
5477
5477
|
const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage("latex" /* LATEX */);
|
|
5478
|
-
super({ separators, isSeparatorRegex: true
|
|
5478
|
+
super({ ...options, separators, isSeparatorRegex: true });
|
|
5479
5479
|
}
|
|
5480
5480
|
};
|
|
5481
5481
|
|
|
@@ -5483,7 +5483,7 @@ var LatexTransformer = class extends RecursiveCharacterTransformer {
|
|
|
5483
5483
|
var MarkdownTransformer = class extends RecursiveCharacterTransformer {
|
|
5484
5484
|
constructor(options = {}) {
|
|
5485
5485
|
const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage("markdown" /* MARKDOWN */);
|
|
5486
|
-
super({ separators, isSeparatorRegex: true
|
|
5486
|
+
super({ ...options, separators, isSeparatorRegex: true });
|
|
5487
5487
|
}
|
|
5488
5488
|
};
|
|
5489
5489
|
var MarkdownHeaderTransformer = class {
|
|
@@ -5650,6 +5650,239 @@ var MarkdownHeaderTransformer = class {
|
|
|
5650
5650
|
return this.createDocuments(texts, metadatas);
|
|
5651
5651
|
}
|
|
5652
5652
|
};
|
|
5653
|
+
|
|
5654
|
+
// src/document/transformers/sentence.ts
|
|
5655
|
+
var SentenceTransformer = class extends TextTransformer {
|
|
5656
|
+
minSize;
|
|
5657
|
+
maxSize;
|
|
5658
|
+
targetSize;
|
|
5659
|
+
sentenceEnders;
|
|
5660
|
+
fallbackToWords;
|
|
5661
|
+
fallbackToCharacters;
|
|
5662
|
+
keepSeparator;
|
|
5663
|
+
constructor(options) {
|
|
5664
|
+
const parentOverlap = Math.min(options.overlap ?? 0, options.maxSize - 1);
|
|
5665
|
+
const baseOptions = {
|
|
5666
|
+
...options,
|
|
5667
|
+
overlap: parentOverlap
|
|
5668
|
+
// Use adjusted overlap for parent
|
|
5669
|
+
};
|
|
5670
|
+
super(baseOptions);
|
|
5671
|
+
this.maxSize = options.maxSize;
|
|
5672
|
+
this.minSize = options.minSize ?? 50;
|
|
5673
|
+
this.targetSize = options.targetSize ?? Math.floor(options.maxSize * 0.8);
|
|
5674
|
+
this.sentenceEnders = options.sentenceEnders ?? [".", "!", "?"];
|
|
5675
|
+
this.fallbackToWords = options.fallbackToWords ?? true;
|
|
5676
|
+
this.fallbackToCharacters = options.fallbackToCharacters ?? true;
|
|
5677
|
+
this.keepSeparator = options.keepSeparator ?? false;
|
|
5678
|
+
this.overlap = options.overlap ?? 0;
|
|
5679
|
+
}
|
|
5680
|
+
detectSentenceBoundaries(text) {
|
|
5681
|
+
if (!text) return [];
|
|
5682
|
+
const sentences = [];
|
|
5683
|
+
let currentSentence = "";
|
|
5684
|
+
let i = 0;
|
|
5685
|
+
while (i < text.length) {
|
|
5686
|
+
const char = text[i];
|
|
5687
|
+
if (!char) break;
|
|
5688
|
+
currentSentence += char;
|
|
5689
|
+
if (this.sentenceEnders.includes(char)) {
|
|
5690
|
+
const remainingText = text.slice(i + 1);
|
|
5691
|
+
if (this.isRealSentenceBoundary(currentSentence, remainingText)) {
|
|
5692
|
+
sentences.push(currentSentence.trim());
|
|
5693
|
+
currentSentence = "";
|
|
5694
|
+
}
|
|
5695
|
+
}
|
|
5696
|
+
i++;
|
|
5697
|
+
}
|
|
5698
|
+
if (currentSentence.trim()) {
|
|
5699
|
+
sentences.push(currentSentence.trim());
|
|
5700
|
+
}
|
|
5701
|
+
return sentences.filter((s) => s.length > 0);
|
|
5702
|
+
}
|
|
5703
|
+
isRealSentenceBoundary(currentSentence, remainingText) {
|
|
5704
|
+
if (!remainingText.trim()) {
|
|
5705
|
+
return true;
|
|
5706
|
+
}
|
|
5707
|
+
if (!/^\s+[A-Z]/.test(remainingText)) {
|
|
5708
|
+
return false;
|
|
5709
|
+
}
|
|
5710
|
+
const words = currentSentence.trim().split(/\s+/);
|
|
5711
|
+
const lastWord = words[words.length - 1] || "";
|
|
5712
|
+
const baseWord = lastWord.slice(0, -1);
|
|
5713
|
+
if (this.isCommonAbbreviation(baseWord)) {
|
|
5714
|
+
return false;
|
|
5715
|
+
}
|
|
5716
|
+
return true;
|
|
5717
|
+
}
|
|
5718
|
+
isCommonAbbreviation(word) {
|
|
5719
|
+
const titles = ["Dr", "Mr", "Mrs", "Ms", "Prof", "Sr", "Jr"];
|
|
5720
|
+
if (titles.includes(word)) {
|
|
5721
|
+
return true;
|
|
5722
|
+
}
|
|
5723
|
+
if (/^[A-Z](\.[A-Z])*$/.test(word) || /^[a-z](\.[a-z])*$/.test(word)) {
|
|
5724
|
+
return true;
|
|
5725
|
+
}
|
|
5726
|
+
if (/^[A-Z]$/.test(word)) {
|
|
5727
|
+
return true;
|
|
5728
|
+
}
|
|
5729
|
+
if (/^\d+$/.test(word)) {
|
|
5730
|
+
return true;
|
|
5731
|
+
}
|
|
5732
|
+
if (/^[ap]\.?m$/i.test(word)) {
|
|
5733
|
+
return true;
|
|
5734
|
+
}
|
|
5735
|
+
return false;
|
|
5736
|
+
}
|
|
5737
|
+
/**
|
|
5738
|
+
* Group sentences into chunks with integrated overlap processing
|
|
5739
|
+
*/
|
|
5740
|
+
groupSentencesIntoChunks(sentences) {
|
|
5741
|
+
const chunks = [];
|
|
5742
|
+
let currentChunk = [];
|
|
5743
|
+
let currentSize = 0;
|
|
5744
|
+
const separator = " ";
|
|
5745
|
+
for (const sentence of sentences) {
|
|
5746
|
+
const sentenceLength = this.lengthFunction(sentence);
|
|
5747
|
+
const separatorLength = currentChunk.length > 0 ? this.lengthFunction(separator) : 0;
|
|
5748
|
+
const totalLength = currentSize + sentenceLength + separatorLength;
|
|
5749
|
+
if (sentenceLength > this.maxSize) {
|
|
5750
|
+
if (currentChunk.length > 0) {
|
|
5751
|
+
chunks.push(currentChunk.join(separator));
|
|
5752
|
+
currentChunk = [];
|
|
5753
|
+
currentSize = 0;
|
|
5754
|
+
}
|
|
5755
|
+
const fallbackChunks = this.handleOversizedSentence(sentence);
|
|
5756
|
+
chunks.push(...fallbackChunks);
|
|
5757
|
+
continue;
|
|
5758
|
+
}
|
|
5759
|
+
if (currentChunk.length > 0 && totalLength > this.maxSize) {
|
|
5760
|
+
chunks.push(currentChunk.join(separator));
|
|
5761
|
+
const overlapSentences = this.calculateSentenceOverlap(currentChunk);
|
|
5762
|
+
currentChunk = overlapSentences;
|
|
5763
|
+
currentSize = this.calculateChunkSize(currentChunk);
|
|
5764
|
+
}
|
|
5765
|
+
currentChunk.push(sentence);
|
|
5766
|
+
currentSize += sentenceLength + separatorLength;
|
|
5767
|
+
if (currentSize >= this.targetSize) {
|
|
5768
|
+
chunks.push(currentChunk.join(separator));
|
|
5769
|
+
const overlapSentences = this.calculateSentenceOverlap(currentChunk);
|
|
5770
|
+
currentChunk = overlapSentences;
|
|
5771
|
+
currentSize = this.calculateChunkSize(currentChunk);
|
|
5772
|
+
}
|
|
5773
|
+
}
|
|
5774
|
+
if (currentChunk.length > 0) {
|
|
5775
|
+
chunks.push(currentChunk.join(separator));
|
|
5776
|
+
}
|
|
5777
|
+
return chunks;
|
|
5778
|
+
}
|
|
5779
|
+
/**
|
|
5780
|
+
* Handle oversized sentences with fallback strategies
|
|
5781
|
+
*/
|
|
5782
|
+
handleOversizedSentence(sentence) {
|
|
5783
|
+
if (this.fallbackToWords) {
|
|
5784
|
+
const wordChunks = this.splitSentenceIntoWords(sentence);
|
|
5785
|
+
if (wordChunks.length > 1) {
|
|
5786
|
+
return wordChunks;
|
|
5787
|
+
}
|
|
5788
|
+
}
|
|
5789
|
+
if (this.fallbackToCharacters) {
|
|
5790
|
+
return this.splitSentenceIntoCharacters(sentence);
|
|
5791
|
+
}
|
|
5792
|
+
console.warn(
|
|
5793
|
+
`Sentence exceeds maxSize (${this.maxSize}) and fallbacks are disabled: "${sentence.substring(0, 50)}..."`
|
|
5794
|
+
);
|
|
5795
|
+
return [sentence];
|
|
5796
|
+
}
|
|
5797
|
+
splitSentenceIntoWords(sentence) {
|
|
5798
|
+
const words = sentence.split(/\s+/);
|
|
5799
|
+
const chunks = [];
|
|
5800
|
+
let currentChunk = "";
|
|
5801
|
+
for (const word of words) {
|
|
5802
|
+
const testChunk = currentChunk ? currentChunk + " " + word : word;
|
|
5803
|
+
if (this.lengthFunction(testChunk) <= this.maxSize) {
|
|
5804
|
+
currentChunk = testChunk;
|
|
5805
|
+
} else {
|
|
5806
|
+
if (currentChunk) {
|
|
5807
|
+
chunks.push(currentChunk);
|
|
5808
|
+
}
|
|
5809
|
+
if (this.lengthFunction(word) > this.maxSize) {
|
|
5810
|
+
if (this.fallbackToCharacters) {
|
|
5811
|
+
chunks.push(...this.splitSentenceIntoCharacters(word));
|
|
5812
|
+
} else {
|
|
5813
|
+
chunks.push(word);
|
|
5814
|
+
}
|
|
5815
|
+
currentChunk = "";
|
|
5816
|
+
} else {
|
|
5817
|
+
currentChunk = word;
|
|
5818
|
+
}
|
|
5819
|
+
}
|
|
5820
|
+
}
|
|
5821
|
+
if (currentChunk) {
|
|
5822
|
+
chunks.push(currentChunk);
|
|
5823
|
+
}
|
|
5824
|
+
return chunks;
|
|
5825
|
+
}
|
|
5826
|
+
splitSentenceIntoCharacters(text) {
|
|
5827
|
+
const chunks = [];
|
|
5828
|
+
let currentChunk = "";
|
|
5829
|
+
for (const char of text) {
|
|
5830
|
+
if (this.lengthFunction(currentChunk + char) <= this.maxSize) {
|
|
5831
|
+
currentChunk += char;
|
|
5832
|
+
} else {
|
|
5833
|
+
if (currentChunk) {
|
|
5834
|
+
chunks.push(currentChunk);
|
|
5835
|
+
}
|
|
5836
|
+
currentChunk = char;
|
|
5837
|
+
}
|
|
5838
|
+
}
|
|
5839
|
+
if (currentChunk) {
|
|
5840
|
+
chunks.push(currentChunk);
|
|
5841
|
+
}
|
|
5842
|
+
return chunks;
|
|
5843
|
+
}
|
|
5844
|
+
calculateSentenceOverlap(currentChunk) {
|
|
5845
|
+
if (this.overlap === 0 || currentChunk.length === 0) {
|
|
5846
|
+
return [];
|
|
5847
|
+
}
|
|
5848
|
+
const overlapSentences = [];
|
|
5849
|
+
let overlapSize = 0;
|
|
5850
|
+
const separator = " ";
|
|
5851
|
+
for (let i = currentChunk.length - 1; i >= 0; i--) {
|
|
5852
|
+
const sentence = currentChunk[i];
|
|
5853
|
+
if (!sentence) continue;
|
|
5854
|
+
const sentenceLength = this.lengthFunction(sentence);
|
|
5855
|
+
const separatorLength = overlapSentences.length > 0 ? this.lengthFunction(separator) : 0;
|
|
5856
|
+
if (overlapSize + sentenceLength + separatorLength > this.overlap) {
|
|
5857
|
+
break;
|
|
5858
|
+
}
|
|
5859
|
+
overlapSentences.unshift(sentence);
|
|
5860
|
+
overlapSize += sentenceLength + separatorLength;
|
|
5861
|
+
}
|
|
5862
|
+
return overlapSentences;
|
|
5863
|
+
}
|
|
5864
|
+
calculateChunkSize(sentences) {
|
|
5865
|
+
if (!sentences || sentences.length === 0) {
|
|
5866
|
+
return 0;
|
|
5867
|
+
}
|
|
5868
|
+
let totalSize = 0;
|
|
5869
|
+
const separator = " ";
|
|
5870
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
5871
|
+
const sentence = sentences[i];
|
|
5872
|
+
totalSize += this.lengthFunction(sentence);
|
|
5873
|
+
if (i < sentences.length - 1) {
|
|
5874
|
+
totalSize += this.lengthFunction(separator);
|
|
5875
|
+
}
|
|
5876
|
+
}
|
|
5877
|
+
return totalSize;
|
|
5878
|
+
}
|
|
5879
|
+
splitText({ text }) {
|
|
5880
|
+
if (!text) return [];
|
|
5881
|
+
const sentences = this.detectSentenceBoundaries(text);
|
|
5882
|
+
const chunks = this.groupSentencesIntoChunks(sentences);
|
|
5883
|
+
return chunks.filter((chunk) => chunk.trim().length > 0);
|
|
5884
|
+
}
|
|
5885
|
+
};
|
|
5653
5886
|
function splitTextOnTokens({ text, tokenizer }) {
|
|
5654
5887
|
const splits = [];
|
|
5655
5888
|
const inputIds = tokenizer.encode(text);
|
|
@@ -5700,7 +5933,7 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
|
|
|
5700
5933
|
};
|
|
5701
5934
|
const tokenizer = {
|
|
5702
5935
|
overlap: this.overlap,
|
|
5703
|
-
tokensPerChunk: this.
|
|
5936
|
+
tokensPerChunk: this.maxSize,
|
|
5704
5937
|
decode,
|
|
5705
5938
|
encode
|
|
5706
5939
|
};
|
|
@@ -5732,13 +5965,105 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
|
|
|
5732
5965
|
allowedSpecial: options.allowedSpecial,
|
|
5733
5966
|
disallowedSpecial: options.disallowedSpecial,
|
|
5734
5967
|
options: {
|
|
5735
|
-
|
|
5968
|
+
maxSize: options.maxSize,
|
|
5736
5969
|
overlap: options.overlap,
|
|
5737
5970
|
lengthFunction: tikTokenEncoder
|
|
5738
5971
|
}
|
|
5739
5972
|
});
|
|
5740
5973
|
}
|
|
5741
5974
|
};
|
|
5975
|
+
function handleDeprecatedSize(data) {
|
|
5976
|
+
if (data.size !== void 0) {
|
|
5977
|
+
console.warn(
|
|
5978
|
+
"[DEPRECATION] `size` is deprecated. Use `maxSize` instead. This will be removed in the next major version."
|
|
5979
|
+
);
|
|
5980
|
+
if (data.maxSize === void 0) {
|
|
5981
|
+
data.maxSize = data.size;
|
|
5982
|
+
}
|
|
5983
|
+
}
|
|
5984
|
+
const { size, ...rest } = data;
|
|
5985
|
+
return rest;
|
|
5986
|
+
}
|
|
5987
|
+
var baseChunkOptionsSchema = zod.z.object({
|
|
5988
|
+
size: zod.z.number().positive().optional(),
|
|
5989
|
+
maxSize: zod.z.number().positive().optional(),
|
|
5990
|
+
overlap: zod.z.number().min(0).optional(),
|
|
5991
|
+
lengthFunction: zod.z.function().optional(),
|
|
5992
|
+
keepSeparator: zod.z.union([zod.z.boolean(), zod.z.literal("start"), zod.z.literal("end")]).optional(),
|
|
5993
|
+
addStartIndex: zod.z.boolean().optional(),
|
|
5994
|
+
stripWhitespace: zod.z.boolean().optional()
|
|
5995
|
+
});
|
|
5996
|
+
var characterChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
5997
|
+
separator: zod.z.string().optional(),
|
|
5998
|
+
isSeparatorRegex: zod.z.boolean().optional()
|
|
5999
|
+
}).strict();
|
|
6000
|
+
var recursiveChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
6001
|
+
separators: zod.z.array(zod.z.string()).optional(),
|
|
6002
|
+
isSeparatorRegex: zod.z.boolean().optional(),
|
|
6003
|
+
language: zod.z.string().optional()
|
|
6004
|
+
}).strict();
|
|
6005
|
+
var sentenceChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
6006
|
+
maxSize: zod.z.number().positive(),
|
|
6007
|
+
minSize: zod.z.number().positive().optional(),
|
|
6008
|
+
targetSize: zod.z.number().positive().optional(),
|
|
6009
|
+
sentenceEnders: zod.z.array(zod.z.string()).optional(),
|
|
6010
|
+
fallbackToWords: zod.z.boolean().optional(),
|
|
6011
|
+
fallbackToCharacters: zod.z.boolean().optional()
|
|
6012
|
+
}).strict();
|
|
6013
|
+
var isSetLike = (value) => {
|
|
6014
|
+
return typeof value === "object" && value !== null && typeof value.has === "function" && typeof value.add === "function" && typeof value.delete === "function" && typeof value.clear === "function" && typeof value.size === "number";
|
|
6015
|
+
};
|
|
6016
|
+
var setOrAllSchema = zod.z.any().refine((value) => value === "all" || isSetLike(value), {
|
|
6017
|
+
message: "Must be a Set object or the literal 'all'"
|
|
6018
|
+
}).optional();
|
|
6019
|
+
var tokenChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
6020
|
+
encodingName: zod.z.string().optional(),
|
|
6021
|
+
modelName: zod.z.string().optional(),
|
|
6022
|
+
allowedSpecial: setOrAllSchema,
|
|
6023
|
+
disallowedSpecial: setOrAllSchema
|
|
6024
|
+
}).strict();
|
|
6025
|
+
var jsonChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
6026
|
+
minSize: zod.z.number().positive().optional(),
|
|
6027
|
+
ensureAscii: zod.z.boolean().optional(),
|
|
6028
|
+
convertLists: zod.z.boolean().optional()
|
|
6029
|
+
}).strict();
|
|
6030
|
+
var htmlChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
6031
|
+
headers: zod.z.array(zod.z.tuple([zod.z.string(), zod.z.string()])).optional(),
|
|
6032
|
+
sections: zod.z.array(zod.z.tuple([zod.z.string(), zod.z.string()])).optional(),
|
|
6033
|
+
returnEachLine: zod.z.boolean().optional()
|
|
6034
|
+
}).strict();
|
|
6035
|
+
var markdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
6036
|
+
headers: zod.z.array(zod.z.tuple([zod.z.string(), zod.z.string()])).optional(),
|
|
6037
|
+
returnEachLine: zod.z.boolean().optional(),
|
|
6038
|
+
stripHeaders: zod.z.boolean().optional()
|
|
6039
|
+
}).strict();
|
|
6040
|
+
var latexChunkOptionsSchema = baseChunkOptionsSchema.strict();
|
|
6041
|
+
var validationSchemas = {
|
|
6042
|
+
character: characterChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6043
|
+
recursive: recursiveChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6044
|
+
sentence: sentenceChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6045
|
+
token: tokenChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6046
|
+
json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6047
|
+
html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6048
|
+
markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6049
|
+
latex: latexChunkOptionsSchema.transform(handleDeprecatedSize)
|
|
6050
|
+
};
|
|
6051
|
+
function validateChunkParams(strategy, params) {
|
|
6052
|
+
const schema = validationSchemas[strategy];
|
|
6053
|
+
if (!schema) {
|
|
6054
|
+
throw new Error(`Unknown chunking strategy: ${strategy}`);
|
|
6055
|
+
}
|
|
6056
|
+
const result = schema.safeParse(params);
|
|
6057
|
+
if (!result.success) {
|
|
6058
|
+
const unrecognizedError = result.error.errors.find((e) => e.code === "unrecognized_keys");
|
|
6059
|
+
if (unrecognizedError && "keys" in unrecognizedError) {
|
|
6060
|
+
const keys = unrecognizedError.keys.join(", ");
|
|
6061
|
+
throw new Error(`Invalid parameters for ${strategy} strategy: '${keys}' not supported`);
|
|
6062
|
+
}
|
|
6063
|
+
const errorMessage = result.error.errors.map((e) => `${e.path.length > 0 ? e.path.join(".") : "parameter"}: ${e.message}`).join(", ");
|
|
6064
|
+
throw new Error(`Invalid parameters for ${strategy} strategy: ${errorMessage}`);
|
|
6065
|
+
}
|
|
6066
|
+
}
|
|
5742
6067
|
|
|
5743
6068
|
// src/document/document.ts
|
|
5744
6069
|
var MDocument = class _MDocument {
|
|
@@ -5851,30 +6176,21 @@ var MDocument = class _MDocument {
|
|
|
5851
6176
|
}
|
|
5852
6177
|
}
|
|
5853
6178
|
async chunkBy(strategy, options) {
|
|
5854
|
-
|
|
5855
|
-
|
|
5856
|
-
|
|
5857
|
-
|
|
5858
|
-
|
|
5859
|
-
|
|
5860
|
-
|
|
5861
|
-
|
|
5862
|
-
|
|
5863
|
-
|
|
5864
|
-
|
|
5865
|
-
|
|
5866
|
-
|
|
5867
|
-
|
|
5868
|
-
|
|
5869
|
-
break;
|
|
5870
|
-
case "json":
|
|
5871
|
-
await this.chunkJSON(options);
|
|
5872
|
-
break;
|
|
5873
|
-
case "latex":
|
|
5874
|
-
await this.chunkLatex(options);
|
|
5875
|
-
break;
|
|
5876
|
-
default:
|
|
5877
|
-
throw new Error(`Unknown strategy: ${strategy}`);
|
|
6179
|
+
const strategyMap = {
|
|
6180
|
+
recursive: (options2) => this.chunkRecursive(options2),
|
|
6181
|
+
character: (options2) => this.chunkCharacter(options2),
|
|
6182
|
+
token: (options2) => this.chunkToken(options2),
|
|
6183
|
+
markdown: (options2) => this.chunkMarkdown(options2),
|
|
6184
|
+
html: (options2) => this.chunkHTML(options2),
|
|
6185
|
+
json: (options2) => this.chunkJSON(options2),
|
|
6186
|
+
latex: (options2) => this.chunkLatex(options2),
|
|
6187
|
+
sentence: (options2) => this.chunkSentence(options2)
|
|
6188
|
+
};
|
|
6189
|
+
const chunkingFunc = strategyMap[strategy];
|
|
6190
|
+
if (chunkingFunc) {
|
|
6191
|
+
await chunkingFunc(options);
|
|
6192
|
+
} else {
|
|
6193
|
+
throw new Error(`Unknown strategy: ${strategy}`);
|
|
5878
6194
|
}
|
|
5879
6195
|
}
|
|
5880
6196
|
async chunkRecursive(options) {
|
|
@@ -5884,32 +6200,28 @@ var MDocument = class _MDocument {
|
|
|
5884
6200
|
this.chunks = textSplit2;
|
|
5885
6201
|
return;
|
|
5886
6202
|
}
|
|
5887
|
-
const rt = new RecursiveCharacterTransformer(
|
|
5888
|
-
separators: options?.separators,
|
|
5889
|
-
isSeparatorRegex: options?.isSeparatorRegex,
|
|
5890
|
-
options
|
|
5891
|
-
});
|
|
6203
|
+
const rt = new RecursiveCharacterTransformer(options);
|
|
5892
6204
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5893
6205
|
this.chunks = textSplit;
|
|
5894
6206
|
}
|
|
5895
6207
|
async chunkCharacter(options) {
|
|
5896
6208
|
const rt = new CharacterTransformer({
|
|
6209
|
+
...options,
|
|
5897
6210
|
separator: options?.separator,
|
|
5898
|
-
isSeparatorRegex: options?.isSeparatorRegex
|
|
5899
|
-
options
|
|
6211
|
+
isSeparatorRegex: options?.isSeparatorRegex
|
|
5900
6212
|
});
|
|
5901
6213
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5902
6214
|
this.chunks = textSplit;
|
|
5903
6215
|
}
|
|
5904
6216
|
async chunkHTML(options) {
|
|
5905
6217
|
if (options?.headers?.length) {
|
|
5906
|
-
const rt = new HTMLHeaderTransformer(options
|
|
6218
|
+
const rt = new HTMLHeaderTransformer(options);
|
|
5907
6219
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5908
6220
|
this.chunks = textSplit;
|
|
5909
6221
|
return;
|
|
5910
6222
|
}
|
|
5911
6223
|
if (options?.sections?.length) {
|
|
5912
|
-
const rt = new HTMLSectionTransformer(options
|
|
6224
|
+
const rt = new HTMLSectionTransformer(options);
|
|
5913
6225
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5914
6226
|
this.chunks = textSplit;
|
|
5915
6227
|
return;
|
|
@@ -5956,9 +6268,30 @@ var MDocument = class _MDocument {
|
|
|
5956
6268
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
5957
6269
|
this.chunks = textSplit;
|
|
5958
6270
|
}
|
|
6271
|
+
async chunkSentence(options) {
|
|
6272
|
+
if (!options?.maxSize) {
|
|
6273
|
+
throw new Error("Sentence chunking requires maxSize to be specified");
|
|
6274
|
+
}
|
|
6275
|
+
const rt = new SentenceTransformer({
|
|
6276
|
+
minSize: options?.minSize,
|
|
6277
|
+
maxSize: options?.maxSize,
|
|
6278
|
+
targetSize: options?.targetSize,
|
|
6279
|
+
overlap: options?.overlap,
|
|
6280
|
+
sentenceEnders: options?.sentenceEnders,
|
|
6281
|
+
fallbackToWords: options?.fallbackToWords,
|
|
6282
|
+
fallbackToCharacters: options?.fallbackToCharacters,
|
|
6283
|
+
keepSeparator: options?.keepSeparator,
|
|
6284
|
+
lengthFunction: options?.lengthFunction,
|
|
6285
|
+
addStartIndex: options?.addStartIndex,
|
|
6286
|
+
stripWhitespace: options?.stripWhitespace
|
|
6287
|
+
});
|
|
6288
|
+
const textSplit = rt.transformDocuments(this.chunks);
|
|
6289
|
+
this.chunks = textSplit;
|
|
6290
|
+
}
|
|
5959
6291
|
async chunk(params) {
|
|
5960
6292
|
const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
|
|
5961
6293
|
const strategy = passedStrategy || this.defaultStrategy();
|
|
6294
|
+
validateChunkParams(strategy, chunkOptions);
|
|
5962
6295
|
await this.chunkBy(strategy, chunkOptions);
|
|
5963
6296
|
if (extract) {
|
|
5964
6297
|
await this.extractMetadata(extract);
|
|
@@ -6351,19 +6684,20 @@ var GraphRAG = class {
|
|
|
6351
6684
|
}));
|
|
6352
6685
|
}
|
|
6353
6686
|
};
|
|
6687
|
+
var DEFAULT_CHUNK_PARAMS = {
|
|
6688
|
+
strategy: "recursive",
|
|
6689
|
+
maxSize: 512,
|
|
6690
|
+
overlap: 50,
|
|
6691
|
+
separators: ["\n"]
|
|
6692
|
+
};
|
|
6354
6693
|
var createDocumentChunkerTool = ({
|
|
6355
6694
|
doc,
|
|
6356
|
-
params =
|
|
6357
|
-
strategy: "recursive",
|
|
6358
|
-
size: 512,
|
|
6359
|
-
overlap: 50,
|
|
6360
|
-
separator: "\n"
|
|
6361
|
-
}
|
|
6695
|
+
params = DEFAULT_CHUNK_PARAMS
|
|
6362
6696
|
}) => {
|
|
6363
6697
|
return tools.createTool({
|
|
6364
|
-
id: `Document Chunker ${params.strategy} ${params.
|
|
6698
|
+
id: `Document Chunker ${params.strategy} ${params.maxSize}`,
|
|
6365
6699
|
inputSchema: zod.z.object({}),
|
|
6366
|
-
description: `Chunks document using ${params.strategy} strategy with
|
|
6700
|
+
description: `Chunks document using ${params.strategy} strategy with maxSize ${params.maxSize} and ${params.overlap || 0} overlap`,
|
|
6367
6701
|
execute: async () => {
|
|
6368
6702
|
const chunks = await doc.chunk(params);
|
|
6369
6703
|
return {
|