@mastra/rag 1.0.6 → 1.0.7-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +12 -0
- package/dist/document/document.d.ts +9 -8
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/transformers/character.d.ts +4 -26
- package/dist/document/transformers/character.d.ts.map +1 -1
- package/dist/document/transformers/html.d.ts +8 -3
- package/dist/document/transformers/html.d.ts.map +1 -1
- package/dist/document/transformers/json.d.ts +4 -4
- package/dist/document/transformers/json.d.ts.map +1 -1
- package/dist/document/transformers/latex.d.ts +2 -8
- package/dist/document/transformers/latex.d.ts.map +1 -1
- package/dist/document/transformers/markdown.d.ts +2 -8
- package/dist/document/transformers/markdown.d.ts.map +1 -1
- package/dist/document/transformers/sentence.d.ts +31 -0
- package/dist/document/transformers/sentence.d.ts.map +1 -0
- package/dist/document/transformers/text.d.ts +3 -3
- package/dist/document/transformers/text.d.ts.map +1 -1
- package/dist/document/transformers/token.d.ts +4 -15
- package/dist/document/transformers/token.d.ts.map +1 -1
- package/dist/document/types.d.ts +85 -14
- package/dist/document/types.d.ts.map +1 -1
- package/dist/document/validation.d.ts +3 -0
- package/dist/document/validation.d.ts.map +1 -0
- package/dist/index.cjs +414 -80
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +414 -80
- package/dist/index.js.map +1 -1
- package/dist/tools/document-chunker.d.ts.map +1 -1
- package/package.json +5 -5
- package/src/document/document.test.ts +294 -39
- package/src/document/document.ts +69 -41
- package/src/document/transformers/character.ts +15 -43
- package/src/document/transformers/html.ts +9 -9
- package/src/document/transformers/json.ts +8 -3
- package/src/document/transformers/latex.ts +3 -11
- package/src/document/transformers/markdown.ts +3 -11
- package/src/document/transformers/sentence.ts +314 -0
- package/src/document/transformers/text.ts +10 -10
- package/src/document/transformers/token.ts +6 -17
- package/src/document/types.ts +66 -15
- package/src/document/validation.ts +147 -0
- package/src/tools/document-chunker.ts +12 -8
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
import type { SentenceChunkOptions } from '../types';
|
|
2
|
+
import { TextTransformer } from './text';
|
|
3
|
+
|
|
4
|
+
/**
 * Splits text into chunks along sentence boundaries.
 *
 * The text is first cut into sentences: a sentence ender that is followed by whitespace
 * and an ASCII capital letter (or by nothing but whitespace), unless the word before it
 * looks like an abbreviation. Sentences are then packed into chunks of at most `maxSize`,
 * and a chunk is closed as soon as it reaches `targetSize`. Trailing sentences of a closed
 * chunk are repeated at the start of the next one, up to `overlap`. A single sentence
 * larger than `maxSize` is broken up by words and/or characters, depending on the
 * fallback flags.
 *
 * All sizes are measured with the inherited `lengthFunction`.
 */
export class SentenceTransformer extends TextTransformer {
  /** Titles whose trailing period does not end a sentence. */
  private static readonly TITLES: readonly string[] = ['Dr', 'Mr', 'Mrs', 'Ms', 'Prof', 'Sr', 'Jr'];

  /** String placed between sentences when they are joined into a chunk. */
  private static readonly SENTENCE_SEPARATOR = ' ';

  /** Stored from the options (default 50); no method in this class reads it. */
  protected minSize: number;
  /** Hard upper bound for a chunk. */
  protected maxSize: number;
  /** Size at which a chunk is closed; defaults to 80% of `maxSize`, rounded down. */
  protected targetSize: number;
  /** Single characters that may end a sentence; defaults to `.`, `!` and `?`. */
  protected sentenceEnders: string[];
  /** Whether an oversized sentence may be split between words (default true). */
  protected fallbackToWords: boolean;
  /** Whether an oversized sentence or word may be split between characters (default true). */
  protected fallbackToCharacters: boolean;
  protected keepSeparator: boolean | 'start' | 'end';

  /**
   * @param options - Sentence strategy options; `maxSize` is required.
   */
  constructor(options: SentenceChunkOptions) {
    // The parent rejects overlap > maxSize, so it is given a clamped value; the
    // caller's original overlap is restored below for the sentence-level logic.
    super({
      ...options,
      overlap: Math.min(options.overlap ?? 0, options.maxSize - 1),
    });

    this.maxSize = options.maxSize;
    this.minSize = options.minSize ?? 50;
    this.targetSize = options.targetSize ?? Math.floor(options.maxSize * 0.8);
    this.sentenceEnders = options.sentenceEnders ?? ['.', '!', '?'];
    this.fallbackToWords = options.fallbackToWords ?? true;
    this.fallbackToCharacters = options.fallbackToCharacters ?? true;
    this.keepSeparator = options.keepSeparator ?? false;

    this.overlap = options.overlap ?? 0;
  }

  /**
   * Cuts `text` into trimmed, non-empty sentences.
   *
   * Each entry of `sentenceEnders` is compared against one UTF-16 code unit at a time,
   * so an ender longer than one code unit never matches.
   */
  private detectSentenceBoundaries(text: string): string[] {
    if (!text) return [];

    const sentences: string[] = [];
    let sentenceStart = 0;

    for (let i = 0; i < text.length; i++) {
      if (!this.sentenceEnders.includes(text.charAt(i))) continue;

      const candidate = text.slice(sentenceStart, i + 1);
      if (!this.isRealSentenceBoundary(candidate, text.slice(i + 1))) continue;

      const sentence = candidate.trim();
      if (sentence) sentences.push(sentence);
      sentenceStart = i + 1;
    }

    // Whatever follows the last accepted boundary is the final sentence.
    const tail = text.slice(sentenceStart).trim();
    if (tail) sentences.push(tail);

    return sentences;
  }

  /**
   * Decides whether the ender that terminates `currentSentence` really ends a sentence.
   *
   * @param currentSentence - Text accumulated so far, including the ender as its last character.
   * @param remainingText - Everything after the ender.
   */
  private isRealSentenceBoundary(currentSentence: string, remainingText: string): boolean {
    // An ender at the very end of the text always closes the sentence.
    if (!remainingText.trim()) return true;

    // Otherwise the next sentence must start with whitespace plus an ASCII capital.
    if (!/^\s+[A-Z]/.test(remainingText)) return false;

    const words = currentSentence.trim().split(/\s+/);
    const lastWord = words[words.length - 1] || '';
    // Drop the ender itself before the abbreviation check.
    return !this.isCommonAbbreviation(lastWord.slice(0, -1));
  }

  /**
   * Reports whether `word` (already stripped of its trailing ender) should be treated
   * as an abbreviation rather than the last word of a sentence.
   */
  private isCommonAbbreviation(word: string): boolean {
    return (
      SentenceTransformer.TITLES.includes(word) ||
      // Dotted letter sequences and single letters: "U.S.A", "J", "a.m", "e"
      /^[A-Z](\.[A-Z])*$/.test(word) ||
      /^[a-z](\.[a-z])*$/.test(word) ||
      // Bare numbers
      /^\d+$/.test(word) ||
      // Time markers with or without the inner dot, any letter case: "am", "P.m"
      /^[ap]\.?m$/i.test(word)
    );
  }

  /**
   * Group sentences into chunks with integrated overlap processing.
   *
   * Guarantees for every sentence that is not itself larger than `maxSize`:
   * - a chunk never exceeds `maxSize`; carried-over overlap is trimmed from the front
   *   when it would push the next sentence over the limit;
   * - every emitted chunk contains at least one sentence that was not already emitted
   *   as part of an earlier chunk, so overlap never becomes a chunk on its own.
   */
  private groupSentencesIntoChunks(sentences: string[]): string[] {
    const separator = SentenceTransformer.SENTENCE_SEPARATOR;
    const separatorLength = this.lengthFunction(separator);
    const chunks: string[] = [];

    let currentChunk: string[] = [];
    let currentSize = 0;
    // Number of leading sentences in currentChunk that are overlap from the previous chunk.
    let carriedCount = 0;

    const emitCurrentChunk = (): void => {
      if (currentChunk.length > carriedCount) {
        chunks.push(currentChunk.join(separator));
      }
    };

    const startChunkWith = (seed: string[]): void => {
      currentChunk = seed;
      carriedCount = seed.length;
      currentSize = this.calculateChunkSize(seed);
    };

    for (const sentence of sentences) {
      const sentenceLength = this.lengthFunction(sentence);

      // Handle oversized sentences with fallback strategies; no overlap is carried across them.
      if (sentenceLength > this.maxSize) {
        emitCurrentChunk();
        startChunkWith([]);
        chunks.push(...this.handleOversizedSentence(sentence));
        continue;
      }

      const joinCost = currentChunk.length > 0 ? separatorLength : 0;
      if (currentChunk.length > 0 && currentSize + joinCost + sentenceLength > this.maxSize) {
        // Close the chunk if it holds anything new, then seed the next one with overlap.
        let seed = currentChunk.slice();
        if (currentChunk.length > carriedCount) {
          chunks.push(currentChunk.join(separator));
          seed = this.calculateSentenceOverlap(currentChunk);
        }

        // Drop the oldest overlap sentences until the incoming sentence fits.
        while (seed.length > 0 && this.calculateChunkSize(seed) + separatorLength + sentenceLength > this.maxSize) {
          seed = seed.slice(1);
        }
        startChunkWith(seed);
      }

      // The separator is charged only when the chunk is non-empty *now*, i.e. after any reset above.
      if (currentChunk.length > 0) {
        currentSize += separatorLength;
      }
      currentChunk.push(sentence);
      currentSize += sentenceLength;

      // Once the target size is reached the chunk is closed.
      if (currentSize >= this.targetSize) {
        emitCurrentChunk();
        startChunkWith(this.calculateSentenceOverlap(currentChunk));
      }
    }

    emitCurrentChunk();
    return chunks;
  }

  /**
   * Handle oversized sentences with fallback strategies.
   *
   * Tries a word split first, then a character split. With both disabled (or the word
   * split producing a single piece and the character split disabled) the sentence is
   * returned unchanged after logging a warning.
   */
  private handleOversizedSentence(sentence: string): string[] {
    if (this.fallbackToWords) {
      const byWords = this.splitSentenceIntoWords(sentence);
      // A single piece means the word split achieved nothing.
      if (byWords.length > 1) return byWords;
    }

    if (this.fallbackToCharacters) {
      return this.splitSentenceIntoCharacters(sentence);
    }

    console.warn(
      `Sentence exceeds maxSize (${this.maxSize}) and fallbacks are disabled: "${sentence.substring(0, 50)}..."`,
    );
    return [sentence];
  }

  /**
   * Packs the whitespace-separated words of `sentence` into pieces of at most `maxSize`.
   * A word that is itself too large is split by characters when that fallback is
   * enabled, otherwise it is emitted as-is.
   */
  private splitSentenceIntoWords(sentence: string): string[] {
    const pieces: string[] = [];
    let pending = '';

    for (const word of sentence.split(/\s+/)) {
      const candidate = pending ? pending + ' ' + word : word;
      if (this.lengthFunction(candidate) <= this.maxSize) {
        pending = candidate;
        continue;
      }

      if (pending) pieces.push(pending);

      if (this.lengthFunction(word) <= this.maxSize) {
        pending = word;
        continue;
      }

      if (this.fallbackToCharacters) {
        pieces.push(...this.splitSentenceIntoCharacters(word));
      } else {
        pieces.push(word);
      }
      pending = '';
    }

    if (pending) pieces.push(pending);
    return pieces;
  }

  /** Cuts `text` into consecutive pieces, each as long as possible without exceeding `maxSize`. */
  private splitSentenceIntoCharacters(text: string): string[] {
    const pieces: string[] = [];
    let pending = '';

    // for..of walks code points, so surrogate pairs are never split.
    for (const char of text) {
      if (this.lengthFunction(pending + char) <= this.maxSize) {
        pending += char;
        continue;
      }
      if (pending) pieces.push(pending);
      pending = char;
    }

    if (pending) pieces.push(pending);
    return pieces;
  }

  /**
   * Returns the longest run of trailing sentences of `currentChunk` whose joined size
   * does not exceed `overlap`, in their original order.
   */
  private calculateSentenceOverlap(currentChunk: string[]): string[] {
    if (this.overlap === 0 || currentChunk.length === 0) {
      return [];
    }

    const separatorLength = this.lengthFunction(SentenceTransformer.SENTENCE_SEPARATOR);
    const carried: string[] = [];
    let carriedSize = 0;

    // Work backwards through sentences to build overlap.
    for (let i = currentChunk.length - 1; i >= 0; i--) {
      const sentence = currentChunk[i];
      if (!sentence) continue;

      const cost = this.lengthFunction(sentence) + (carried.length > 0 ? separatorLength : 0);
      if (carriedSize + cost > this.overlap) break;

      carried.unshift(sentence);
      carriedSize += cost;
    }

    return carried;
  }

  /** Size of `sentences` once joined with the sentence separator. */
  private calculateChunkSize(sentences: string[]): number {
    if (!sentences || sentences.length === 0) {
      return 0;
    }

    const separatorsSize = (sentences.length - 1) * this.lengthFunction(SentenceTransformer.SENTENCE_SEPARATOR);
    return sentences.reduce((total, sentence) => total + this.lengthFunction(sentence), separatorsSize);
  }

  /**
   * Splits `text` into sentence-aligned chunks.
   *
   * @returns The chunks in document order; empty for empty input. Whitespace-only chunks are dropped.
   */
  splitText({ text }: { text: string }): string[] {
    if (!text) return [];

    const sentences = this.detectSentenceBoundaries(text);
    return this.groupSentencesIntoChunks(sentences).filter(chunk => chunk.trim().length > 0);
  }
}
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import { Document } from '../schema';
|
|
2
2
|
|
|
3
|
-
import type {
|
|
3
|
+
import type { BaseChunkOptions } from '../types';
|
|
4
4
|
|
|
5
5
|
import type { Transformer } from './transformer';
|
|
6
6
|
|
|
7
7
|
export abstract class TextTransformer implements Transformer {
|
|
8
|
-
protected
|
|
8
|
+
protected maxSize: number;
|
|
9
9
|
protected overlap: number;
|
|
10
10
|
protected lengthFunction: (text: string) => number;
|
|
11
11
|
protected keepSeparator: boolean | 'start' | 'end';
|
|
@@ -13,17 +13,17 @@ export abstract class TextTransformer implements Transformer {
|
|
|
13
13
|
protected stripWhitespace: boolean;
|
|
14
14
|
|
|
15
15
|
constructor({
|
|
16
|
-
|
|
16
|
+
maxSize = 4000,
|
|
17
17
|
overlap = 200,
|
|
18
18
|
lengthFunction = (text: string) => text.length,
|
|
19
19
|
keepSeparator = false,
|
|
20
20
|
addStartIndex = false,
|
|
21
21
|
stripWhitespace = true,
|
|
22
|
-
}:
|
|
23
|
-
if (overlap >
|
|
24
|
-
throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size ` + `(${
|
|
22
|
+
}: BaseChunkOptions) {
|
|
23
|
+
if (overlap > maxSize) {
|
|
24
|
+
throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size ` + `(${maxSize}), should be smaller.`);
|
|
25
25
|
}
|
|
26
|
-
this.
|
|
26
|
+
this.maxSize = maxSize;
|
|
27
27
|
this.overlap = overlap;
|
|
28
28
|
this.lengthFunction = lengthFunction;
|
|
29
29
|
this.keepSeparator = keepSeparator;
|
|
@@ -104,9 +104,9 @@ export abstract class TextTransformer implements Transformer {
|
|
|
104
104
|
const len = this.lengthFunction(d);
|
|
105
105
|
const separatorLen = separator ? this.lengthFunction(separator) : 0;
|
|
106
106
|
|
|
107
|
-
if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.
|
|
108
|
-
if (total > this.
|
|
109
|
-
console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.
|
|
107
|
+
if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.maxSize) {
|
|
108
|
+
if (total > this.maxSize) {
|
|
109
|
+
console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.maxSize}`);
|
|
110
110
|
}
|
|
111
111
|
|
|
112
112
|
if (currentDoc.length > 0) {
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import type { TiktokenModel, TiktokenEncoding, Tiktoken } from 'js-tiktoken';
|
|
2
2
|
import { encodingForModel, getEncoding } from 'js-tiktoken';
|
|
3
|
+
import type { TokenChunkOptions } from '../types';
|
|
3
4
|
|
|
4
5
|
import { TextTransformer } from './text';
|
|
5
6
|
|
|
@@ -42,18 +43,11 @@ export class TokenTransformer extends TextTransformer {
|
|
|
42
43
|
disallowedSpecial = 'all',
|
|
43
44
|
options = {},
|
|
44
45
|
}: {
|
|
45
|
-
encodingName
|
|
46
|
+
encodingName?: TiktokenEncoding;
|
|
46
47
|
modelName?: TiktokenModel;
|
|
47
48
|
allowedSpecial?: Set<string> | 'all';
|
|
48
49
|
disallowedSpecial?: Set<string> | 'all';
|
|
49
|
-
options:
|
|
50
|
-
size?: number;
|
|
51
|
-
overlap?: number;
|
|
52
|
-
lengthFunction?: (text: string) => number;
|
|
53
|
-
keepSeparator?: boolean | 'start' | 'end';
|
|
54
|
-
addStartIndex?: boolean;
|
|
55
|
-
stripWhitespace?: boolean;
|
|
56
|
-
};
|
|
50
|
+
options: TokenChunkOptions;
|
|
57
51
|
}) {
|
|
58
52
|
super(options);
|
|
59
53
|
|
|
@@ -85,7 +79,7 @@ export class TokenTransformer extends TextTransformer {
|
|
|
85
79
|
|
|
86
80
|
const tokenizer: Tokenizer = {
|
|
87
81
|
overlap: this.overlap,
|
|
88
|
-
tokensPerChunk: this.
|
|
82
|
+
tokensPerChunk: this.maxSize,
|
|
89
83
|
decode,
|
|
90
84
|
encode,
|
|
91
85
|
};
|
|
@@ -100,12 +94,7 @@ export class TokenTransformer extends TextTransformer {
|
|
|
100
94
|
}: {
|
|
101
95
|
encodingName?: TiktokenEncoding;
|
|
102
96
|
modelName?: TiktokenModel;
|
|
103
|
-
options?:
|
|
104
|
-
size?: number;
|
|
105
|
-
overlap?: number;
|
|
106
|
-
allowedSpecial?: Set<string> | 'all';
|
|
107
|
-
disallowedSpecial?: Set<string> | 'all';
|
|
108
|
-
};
|
|
97
|
+
options?: TokenChunkOptions;
|
|
109
98
|
}): TokenTransformer {
|
|
110
99
|
let tokenizer: Tiktoken;
|
|
111
100
|
|
|
@@ -139,7 +128,7 @@ export class TokenTransformer extends TextTransformer {
|
|
|
139
128
|
allowedSpecial: options.allowedSpecial,
|
|
140
129
|
disallowedSpecial: options.disallowedSpecial,
|
|
141
130
|
options: {
|
|
142
|
-
|
|
131
|
+
maxSize: options.maxSize,
|
|
143
132
|
overlap: options.overlap,
|
|
144
133
|
lengthFunction: tikTokenEncoder,
|
|
145
134
|
},
|
package/src/document/types.ts
CHANGED
|
@@ -42,34 +42,85 @@ export type ExtractParams = {
|
|
|
42
42
|
keywords?: KeywordExtractArgs | boolean;
|
|
43
43
|
};
|
|
44
44
|
|
|
45
|
-
export type
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
separator?: string;
|
|
50
|
-
separators?: string[];
|
|
51
|
-
isSeparatorRegex?: boolean;
|
|
45
|
+
/**
 * Options shared by every chunking strategy.
 *
 * The defaults quoted below are the ones `TextTransformer`'s constructor applies.
 */
export type BaseChunkOptions = {
  /**
   * @deprecated Use `maxSize` instead. Will be removed in next major version.
   */
  size?: number;
  /** Largest allowed chunk, measured with `lengthFunction`. Default: 4000. */
  maxSize?: number;
  /** Overlap between chunks; `TextTransformer` throws when it exceeds `maxSize`. Default: 200. */
  overlap?: number;
  /** Measures a piece of text. Default: the string's `length`. */
  lengthFunction?: (text: string) => number;
  /** Default: `false`. */
  keepSeparator?: boolean | 'start' | 'end';
  /** Default: `false`. */
  addStartIndex?: boolean;
  /** Default: `true`. */
  stripWhitespace?: boolean;
};
|
|
57
|
+
|
|
58
|
+
/** Options for the `character` strategy: the shared options plus separator settings. */
export type CharacterChunkOptions = BaseChunkOptions & {
  // NOTE(review): presumably the string (or pattern, see below) to split on; confirm against CharacterTransformer.
  separator?: string;
  // NOTE(review): presumably makes `separator` a regular expression; confirm against CharacterTransformer.
  isSeparatorRegex?: boolean;
};
|
|
62
|
+
|
|
63
|
+
/** Options for the `recursive` strategy: the shared options plus separator and language settings. */
export type RecursiveChunkOptions = BaseChunkOptions & {
  // NOTE(review): presumably tried in order when splitting; confirm against the recursive transformer.
  separators?: string[];
  isSeparatorRegex?: boolean;
  language?: Language;
};
|
|
68
|
+
|
|
69
|
+
/** Options for the `token` strategy: the shared options plus js-tiktoken settings. */
export type TokenChunkOptions = BaseChunkOptions & {
  /** js-tiktoken encoding name. */
  encodingName?: TiktokenEncoding;
  /** js-tiktoken model name. */
  modelName?: TiktokenModel;
  /** A set of special tokens, or `'all'`. */
  allowedSpecial?: Set<string> | 'all';
  /** A set of special tokens, or `'all'`. */
  disallowedSpecial?: Set<string> | 'all';
};
|
|
75
|
+
|
|
76
|
+
/** Options for the `markdown` strategy: the shared options plus header handling. */
export type MarkdownChunkOptions = BaseChunkOptions & {
  /** Pairs of strings describing the headers to split on. */
  headers?: [string, string][];
  returnEachLine?: boolean;
  stripHeaders?: boolean;
};
|
|
69
81
|
|
|
70
|
-
export type
|
|
82
|
+
/**
 * Options for the `html` strategy.
 *
 * Exactly one of `headers` or `sections` must be supplied; the `never` members make
 * passing both a type error. `returnEachLine` is accepted with either.
 */
export type HTMLChunkOptions = BaseChunkOptions &
  (
    | { headers: [string, string][]; sections?: never }
    | { sections: [string, string][]; headers?: never }
  ) & { returnEachLine?: boolean };
|
|
71
87
|
|
|
72
|
-
export
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
88
|
+
/** Options for the `json` strategy: the shared options plus JSON-specific settings. */
export type JsonChunkOptions = BaseChunkOptions & {
  minSize?: number;
  // NOTE(review): presumably controls escaping of non-ASCII characters; confirm against the JSON transformer.
  ensureAscii?: boolean;
  // NOTE(review): presumably controls how arrays are handled; confirm against the JSON transformer.
  convertLists?: boolean;
};
|
|
93
|
+
|
|
94
|
+
/** Options for the `latex` strategy; it adds nothing to the shared options. */
export type LatexChunkOptions = BaseChunkOptions;
|
|
95
|
+
|
|
96
|
+
/**
 * Options for the `sentence` strategy.
 *
 * The defaults quoted below are the ones `SentenceTransformer`'s constructor applies.
 */
export type SentenceChunkOptions = BaseChunkOptions & {
  /** Largest allowed chunk. Required for this strategy, unlike in `BaseChunkOptions`. */
  maxSize: number;
  /** Default: 50. */
  minSize?: number;
  /** Size at which a chunk is closed. Default: `Math.floor(maxSize * 0.8)`. */
  targetSize?: number;
  /** Characters that may end a sentence. Default: `['.', '!', '?']`. */
  sentenceEnders?: string[];
  /** Allow splitting an oversized sentence between words. Default: `true`. */
  fallbackToWords?: boolean;
  /** Allow splitting an oversized sentence or word between characters. Default: `true`. */
  fallbackToCharacters?: boolean;
};
|
|
104
|
+
|
|
105
|
+
/** Maps each chunking strategy name to the options type that strategy accepts. */
export type StrategyOptions = {
  recursive: RecursiveChunkOptions;
  character: CharacterChunkOptions;
  token: TokenChunkOptions;
  markdown: MarkdownChunkOptions;
  html: HTMLChunkOptions;
  json: JsonChunkOptions;
  latex: LatexChunkOptions;
  sentence: SentenceChunkOptions;
};
|
|
115
|
+
|
|
116
|
+
/**
 * Name of a chunking strategy: 'recursive' | 'character' | 'token' | 'markdown' |
 * 'html' | 'json' | 'latex' | 'sentence'. Derived from `StrategyOptions` so the two
 * cannot drift apart.
 */
export type ChunkStrategy = keyof StrategyOptions;
|
|
117
|
+
|
|
118
|
+
/** Adds the optional `extract` settings to a strategy's parameter shape. */
type WithExtract<TOptions> = TOptions & { extract?: ExtractParams };

/**
 * Parameters for chunking, discriminated by `strategy`.
 *
 * `strategy` may be omitted only for the `character` variant; every other variant
 * must name its strategy explicitly.
 */
export type ChunkParams =
  | WithExtract<{ strategy?: 'character' } & CharacterChunkOptions>
  | WithExtract<{ strategy: 'recursive' } & RecursiveChunkOptions>
  | WithExtract<{ strategy: 'token' } & TokenChunkOptions>
  | WithExtract<{ strategy: 'markdown' } & MarkdownChunkOptions>
  | WithExtract<{ strategy: 'html' } & HTMLChunkOptions>
  | WithExtract<{ strategy: 'json' } & JsonChunkOptions>
  | WithExtract<{ strategy: 'latex' } & LatexChunkOptions>
  | WithExtract<{ strategy: 'sentence' } & SentenceChunkOptions>;
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import type { ChunkStrategy } from './types';
|
|
3
|
+
|
|
4
|
+
/**
 * Translates the deprecated `size` option into `maxSize`.
 *
 * Logs a deprecation warning whenever `size` is present. `size` is copied to `maxSize`
 * only when `maxSize` is not already set; an explicit `maxSize` always wins.
 *
 * @param data - Chunk options that may still use `size`. Never modified.
 * @returns A new object with the same options but without the `size` key.
 */
function handleDeprecatedSize<T extends { size?: number; maxSize?: number }>(data: T): Omit<T, 'size'> {
  const { size, ...rest } = data;

  if (size === undefined) {
    return rest;
  }

  console.warn(
    '[DEPRECATION] `size` is deprecated. Use `maxSize` instead. This will be removed in the next major version.',
  );

  // Build a fresh object instead of assigning onto `data`, so the caller's value is left untouched.
  return data.maxSize === undefined ? { ...rest, maxSize: size } : rest;
}
|
|
18
|
+
|
|
19
|
+
// Building blocks reused by the schemas below.
const optionalPositiveNumber = z.number().positive().optional();
const optionalBoolean = z.boolean().optional();

// Options accepted by every strategy. `size` is the deprecated spelling of `maxSize`.
const baseChunkOptionsSchema = z.object({
  size: optionalPositiveNumber,
  maxSize: optionalPositiveNumber,
  overlap: z.number().min(0).optional(),
  lengthFunction: z.function().optional(),
  keepSeparator: z.union([z.boolean(), z.literal('start'), z.literal('end')]).optional(),
  addStartIndex: optionalBoolean,
  stripWhitespace: optionalBoolean,
});

// Per-strategy schemas. Each one is strict, so keys it does not list are rejected.

const characterChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    separator: z.string().optional(),
    isSeparatorRegex: optionalBoolean,
  })
  .strict();

const recursiveChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    separators: z.array(z.string()).optional(),
    isSeparatorRegex: optionalBoolean,
    language: z.string().optional(),
  })
  .strict();

// Unlike the base schema, `maxSize` is mandatory for the sentence strategy.
const sentenceChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    maxSize: z.number().positive(),
    minSize: optionalPositiveNumber,
    targetSize: optionalPositiveNumber,
    sentenceEnders: z.array(z.string()).optional(),
    fallbackToWords: optionalBoolean,
    fallbackToCharacters: optionalBoolean,
  })
  .strict();
|
|
56
|
+
|
|
57
|
+
// Duck-typed Set check: accepts any non-null object exposing function-valued `has`,
// `add`, `delete` and `clear` plus a numeric `size`.
const isSetLike = (value: unknown): value is Set<any> => {
  if (typeof value !== 'object' || value === null) {
    return false;
  }

  const candidate = value as Record<string, unknown>;
  const hasSetMethods = ['has', 'add', 'delete', 'clear'].every(method => typeof candidate[method] === 'function');
  return hasSetMethods && typeof candidate.size === 'number';
};
|
|
69
|
+
|
|
70
|
+
// Accepts a Set-like object or the literal 'all'; the field itself is optional.
const setOrAllSchema = z
  .any()
  .refine(value => value === 'all' || isSetLike(value), {
    message: "Must be a Set object or the literal 'all'",
  })
  .optional();

// Optional list of [string, string] pairs, as used by `headers` and `sections`.
const optionalStringPairs = z.array(z.tuple([z.string(), z.string()])).optional();

const tokenChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    encodingName: z.string().optional(),
    modelName: z.string().optional(),
    allowedSpecial: setOrAllSchema,
    disallowedSpecial: setOrAllSchema,
  })
  .strict();

const jsonChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    minSize: z.number().positive().optional(),
    ensureAscii: z.boolean().optional(),
    convertLists: z.boolean().optional(),
  })
  .strict();

// Both `headers` and `sections` are optional here; the schema does not require either one.
const htmlChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    headers: optionalStringPairs,
    sections: optionalStringPairs,
    returnEachLine: z.boolean().optional(),
  })
  .strict();

const markdownChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    headers: optionalStringPairs,
    returnEachLine: z.boolean().optional(),
    stripHeaders: z.boolean().optional(),
  })
  .strict();

// The latex strategy takes the base options only.
const latexChunkOptionsSchema = baseChunkOptionsSchema.strict();

// Schema used to validate each strategy's options. After a successful parse every
// schema runs `handleDeprecatedSize` on the parsed value.
const validationSchemas = {
  character: characterChunkOptionsSchema.transform(handleDeprecatedSize),
  recursive: recursiveChunkOptionsSchema.transform(handleDeprecatedSize),
  sentence: sentenceChunkOptionsSchema.transform(handleDeprecatedSize),
  token: tokenChunkOptionsSchema.transform(handleDeprecatedSize),
  json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
  html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
  markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
  latex: latexChunkOptionsSchema.transform(handleDeprecatedSize),
} as const;
|
|
124
|
+
|
|
125
|
+
/**
 * Validates the options supplied for a chunking strategy.
 *
 * Parses `params` with the schema registered for `strategy`. The parsed value is
 * discarded: the function either returns normally or throws.
 *
 * @param strategy - Name of the chunking strategy whose schema is applied.
 * @param params - Caller-supplied options to check.
 * @throws Error when no schema is registered for `strategy`.
 * @throws Error naming the offending keys when `params` contains keys the strategy does not accept.
 * @throws Error listing each failing path and message for any other validation failure.
 */
export function validateChunkParams(strategy: ChunkStrategy, params: any): void {
  // Own-property lookup: a bare index would resolve inherited names such as "toString"
  // to a function, skipping the unknown-strategy error for untyped callers.
  const schema = Object.prototype.hasOwnProperty.call(validationSchemas, strategy)
    ? validationSchemas[strategy]
    : undefined;
  if (!schema) {
    throw new Error(`Unknown chunking strategy: ${strategy}`);
  }

  const result = schema.safeParse(params);
  if (result.success) {
    return;
  }

  // `issues` is the canonical issue list on ZodError; `errors` is only an alias for it.
  const issues = result.error.issues;

  // Unsupported keys get a dedicated, shorter message.
  const unrecognizedIssue = issues.find((issue: any) => issue.code === 'unrecognized_keys');
  if (unrecognizedIssue && 'keys' in unrecognizedIssue) {
    const keys = (unrecognizedIssue as any).keys.join(', ');
    throw new Error(`Invalid parameters for ${strategy} strategy: '${keys}' not supported`);
  }

  // Any other failure: report every issue as "<path>: <message>".
  const errorMessage = issues
    .map((issue: any) => `${issue.path.length > 0 ? issue.path.join('.') : 'parameter'}: ${issue.message}`)
    .join(', ');

  throw new Error(`Invalid parameters for ${strategy} strategy: ${errorMessage}`);
}
|