@mastra/rag 1.0.6 → 1.0.7-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +12 -0
  3. package/dist/document/document.d.ts +9 -8
  4. package/dist/document/document.d.ts.map +1 -1
  5. package/dist/document/transformers/character.d.ts +4 -26
  6. package/dist/document/transformers/character.d.ts.map +1 -1
  7. package/dist/document/transformers/html.d.ts +8 -3
  8. package/dist/document/transformers/html.d.ts.map +1 -1
  9. package/dist/document/transformers/json.d.ts +4 -4
  10. package/dist/document/transformers/json.d.ts.map +1 -1
  11. package/dist/document/transformers/latex.d.ts +2 -8
  12. package/dist/document/transformers/latex.d.ts.map +1 -1
  13. package/dist/document/transformers/markdown.d.ts +2 -8
  14. package/dist/document/transformers/markdown.d.ts.map +1 -1
  15. package/dist/document/transformers/sentence.d.ts +31 -0
  16. package/dist/document/transformers/sentence.d.ts.map +1 -0
  17. package/dist/document/transformers/text.d.ts +3 -3
  18. package/dist/document/transformers/text.d.ts.map +1 -1
  19. package/dist/document/transformers/token.d.ts +4 -15
  20. package/dist/document/transformers/token.d.ts.map +1 -1
  21. package/dist/document/types.d.ts +85 -14
  22. package/dist/document/types.d.ts.map +1 -1
  23. package/dist/document/validation.d.ts +3 -0
  24. package/dist/document/validation.d.ts.map +1 -0
  25. package/dist/index.cjs +414 -80
  26. package/dist/index.cjs.map +1 -1
  27. package/dist/index.js +414 -80
  28. package/dist/index.js.map +1 -1
  29. package/dist/tools/document-chunker.d.ts.map +1 -1
  30. package/package.json +5 -5
  31. package/src/document/document.test.ts +294 -39
  32. package/src/document/document.ts +69 -41
  33. package/src/document/transformers/character.ts +15 -43
  34. package/src/document/transformers/html.ts +9 -9
  35. package/src/document/transformers/json.ts +8 -3
  36. package/src/document/transformers/latex.ts +3 -11
  37. package/src/document/transformers/markdown.ts +3 -11
  38. package/src/document/transformers/sentence.ts +314 -0
  39. package/src/document/transformers/text.ts +10 -10
  40. package/src/document/transformers/token.ts +6 -17
  41. package/src/document/types.ts +66 -15
  42. package/src/document/validation.ts +147 -0
  43. package/src/tools/document-chunker.ts +12 -8
@@ -0,0 +1,314 @@
1
+ import type { SentenceChunkOptions } from '../types';
2
+ import { TextTransformer } from './text';
3
+
4
/**
 * Text splitter that builds chunks out of whole sentences.
 *
 * The text is first cut into sentences at the configured `sentenceEnders`
 * (skipping abbreviations, initials and numbers), then consecutive sentences
 * are joined with a single space until a chunk reaches `targetSize` or the
 * next sentence would push it past `maxSize`. When `overlap` is non-zero the
 * trailing sentences of an emitted chunk (up to `overlap` in size) are
 * repeated at the start of the next chunk.
 *
 * All sizes are measured with the inherited `lengthFunction`, and a chunk's
 * size is estimated as the sum of its sentence sizes plus one separator
 * between each pair of sentences.
 */
export class SentenceTransformer extends TextTransformer {
  /** Titles that are followed by a period without ending the sentence. */
  private static readonly TITLE_ABBREVIATIONS: ReadonlySet<string> = new Set([
    'Dr',
    'Mr',
    'Mrs',
    'Ms',
    'Prof',
    'Sr',
    'Jr',
  ]);

  /** String placed between sentences when they are joined into a chunk. */
  private static readonly SENTENCE_SEPARATOR = ' ';

  // NOTE(review): `minSize` is stored but no method in this class reads it.
  protected minSize: number;
  protected maxSize: number;
  protected targetSize: number;
  protected sentenceEnders: string[];
  protected fallbackToWords: boolean;
  protected fallbackToCharacters: boolean;
  protected keepSeparator: boolean | 'start' | 'end';

  /**
   * @param options Sentence chunking options. `maxSize` is required;
   *   `minSize` defaults to 50, `targetSize` to 80% of `maxSize` (rounded
   *   down), `sentenceEnders` to `.`, `!`, `?`, both fallbacks to `true`,
   *   `keepSeparator` to `false` and `overlap` to 0.
   */
  constructor(options: SentenceChunkOptions) {
    // The parent constructor rejects an overlap larger than maxSize, so it is
    // handed a clamped value; the caller's value is restored below for the
    // sentence-level overlap logic.
    const parentOverlap = Math.min(options.overlap ?? 0, options.maxSize - 1);

    const baseOptions = {
      ...options,
      overlap: parentOverlap,
    };

    super(baseOptions);

    this.maxSize = options.maxSize;
    this.minSize = options.minSize ?? 50;
    this.targetSize = options.targetSize ?? Math.floor(options.maxSize * 0.8);
    this.sentenceEnders = options.sentenceEnders ?? ['.', '!', '?'];
    this.fallbackToWords = options.fallbackToWords ?? true;
    this.fallbackToCharacters = options.fallbackToCharacters ?? true;
    this.keepSeparator = options.keepSeparator ?? false;

    // Use the caller's original overlap for sentence grouping.
    this.overlap = options.overlap ?? 0;
  }

  /**
   * Cut `text` into trimmed, non-empty sentences.
   *
   * A sentence ends at a character listed in `sentenceEnders` when
   * `isRealSentenceBoundary` accepts it; whatever follows the last accepted
   * boundary becomes the final sentence.
   */
  private detectSentenceBoundaries(text: string): string[] {
    if (!text) return [];

    const sentences: string[] = [];
    let sentenceStart = 0;

    for (let i = 0; i < text.length; i++) {
      if (!this.sentenceEnders.includes(text.charAt(i))) continue;

      const candidate = text.slice(sentenceStart, i + 1);
      if (this.isRealSentenceBoundary(candidate, text.slice(i + 1))) {
        const trimmed = candidate.trim();
        if (trimmed) sentences.push(trimmed);
        sentenceStart = i + 1;
      }
    }

    const tail = text.slice(sentenceStart).trim();
    if (tail) sentences.push(tail);

    return sentences;
  }

  /**
   * Decide whether the ender that terminates `currentSentence` really closes
   * a sentence.
   *
   * @param currentSentence Text accumulated so far, ending with the ender.
   * @param remainingText Everything after the ender.
   * @returns `true` at end of input, or when the ender is followed by
   *   whitespace and an ASCII capital letter and the word before the ender is
   *   not a recognised abbreviation.
   */
  private isRealSentenceBoundary(currentSentence: string, remainingText: string): boolean {
    if (!remainingText.trim()) {
      return true;
    }

    if (!/^\s+[A-Z]/.test(remainingText)) {
      return false;
    }

    const lastWord = currentSentence.trim().split(/\s+/).pop() ?? '';
    // Drop the trailing ender character before the abbreviation lookup.
    const baseWord = lastWord.slice(0, -1);

    return !this.isCommonAbbreviation(baseWord);
  }

  /**
   * Report whether `word` (without its trailing period) is something that is
   * normally followed by a period without ending the sentence.
   */
  private isCommonAbbreviation(word: string): boolean {
    if (SentenceTransformer.TITLE_ABBREVIATIONS.has(word)) {
      return true;
    }

    // Dotted initials of one case (U.S.A, a.m, p.m). A lone letter also
    // matches, which covers single-letter initials such as "J".
    if (/^[A-Z](\.[A-Z])*$/.test(word) || /^[a-z](\.[a-z])*$/.test(word)) {
      return true;
    }

    // Numbers (versions, decimals).
    if (/^\d+$/.test(word)) {
      return true;
    }

    // Time abbreviations in any letter case (am, PM, a.m).
    return /^[ap]\.?m$/i.test(word);
  }

  /**
   * Group sentences into chunks, carrying overlap sentences between chunks.
   *
   * A chunk is emitted once it reaches `targetSize`, or earlier when the next
   * sentence would not fit within `maxSize`. Sentences that are themselves
   * larger than `maxSize` are delegated to `handleOversizedSentence`.
   */
  private groupSentencesIntoChunks(sentences: string[]): string[] {
    const separator = SentenceTransformer.SENTENCE_SEPARATOR;
    const separatorLength = this.lengthFunction(separator);

    const chunks: string[] = [];
    let currentChunk: string[] = [];
    let currentSize = 0;
    // Sentences in currentChunk that have not appeared in an emitted chunk
    // yet. Zero means the chunk holds nothing but carried-over overlap, and
    // emitting it would only duplicate the tail of the previous chunk.
    let freshSentences = 0;

    for (const sentence of sentences) {
      const sentenceLength = this.lengthFunction(sentence);

      // Oversized sentences bypass grouping and go through the fallbacks.
      if (sentenceLength > this.maxSize) {
        if (freshSentences > 0) {
          chunks.push(currentChunk.join(separator));
        }
        currentChunk = [];
        currentSize = 0;
        freshSentences = 0;

        chunks.push(...this.handleOversizedSentence(sentence));
        continue;
      }

      const sizeWithSentence = currentSize + sentenceLength + (currentChunk.length > 0 ? separatorLength : 0);

      if (currentChunk.length > 0 && sizeWithSentence > this.maxSize) {
        if (freshSentences > 0) {
          chunks.push(currentChunk.join(separator));
          currentChunk = this.calculateSentenceOverlap(currentChunk);
          freshSentences = 0;
        }

        // Shed the oldest overlap sentences until the incoming one fits, so
        // overlap never pushes a chunk past maxSize.
        while (
          currentChunk.length > 0 &&
          this.calculateChunkSize(currentChunk) + separatorLength + sentenceLength > this.maxSize
        ) {
          currentChunk.shift();
        }
        currentSize = this.calculateChunkSize(currentChunk);
      }

      // The separator is only counted when something precedes the sentence in
      // the chunk as it stands now (i.e. after any flush above).
      currentSize += sentenceLength + (currentChunk.length > 0 ? separatorLength : 0);
      currentChunk.push(sentence);
      freshSentences++;

      if (currentSize >= this.targetSize) {
        chunks.push(currentChunk.join(separator));
        currentChunk = this.calculateSentenceOverlap(currentChunk);
        currentSize = this.calculateChunkSize(currentChunk);
        freshSentences = 0;
      }
    }

    if (freshSentences > 0) {
      chunks.push(currentChunk.join(separator));
    }

    return chunks;
  }

  /**
   * Break a sentence that exceeds `maxSize` into smaller pieces.
   *
   * Tries a word-level split first (when enabled and it yields more than one
   * piece), then a character-level split (when enabled). With both fallbacks
   * unavailable the sentence is returned unchanged after a console warning.
   */
  private handleOversizedSentence(sentence: string): string[] {
    if (this.fallbackToWords) {
      const wordChunks = this.splitSentenceIntoWords(sentence);
      if (wordChunks.length > 1) {
        return wordChunks;
      }
    }

    if (this.fallbackToCharacters) {
      return this.splitSentenceIntoCharacters(sentence);
    }

    console.warn(
      `Sentence exceeds maxSize (${this.maxSize}) and fallbacks are disabled: "${sentence.substring(0, 50)}..."`,
    );
    return [sentence];
  }

  /**
   * Pack the whitespace-separated words of `sentence` into pieces of at most
   * `maxSize`. A single word larger than `maxSize` is split by characters
   * when `fallbackToCharacters` is on, otherwise it is kept whole.
   */
  private splitSentenceIntoWords(sentence: string): string[] {
    const pieces: string[] = [];
    let current = '';

    for (const word of sentence.split(/\s+/)) {
      const candidate = current ? current + ' ' + word : word;

      if (this.lengthFunction(candidate) <= this.maxSize) {
        current = candidate;
        continue;
      }

      if (current) {
        pieces.push(current);
      }

      if (this.lengthFunction(word) <= this.maxSize) {
        current = word;
        continue;
      }

      if (this.fallbackToCharacters) {
        pieces.push(...this.splitSentenceIntoCharacters(word));
      } else {
        pieces.push(word);
      }
      current = '';
    }

    if (current) {
      pieces.push(current);
    }

    return pieces;
  }

  /**
   * Split `text` into consecutive pieces whose measured length does not
   * exceed `maxSize`, walking the string with `for...of` one character at a
   * time.
   */
  private splitSentenceIntoCharacters(text: string): string[] {
    const pieces: string[] = [];
    let current = '';

    for (const char of text) {
      if (this.lengthFunction(current + char) <= this.maxSize) {
        current += char;
        continue;
      }

      if (current) {
        pieces.push(current);
      }
      current = char;
    }

    if (current) {
      pieces.push(current);
    }

    return pieces;
  }

  /**
   * Select the trailing sentences of `currentChunk` to repeat in the next
   * chunk: as many whole sentences from the end as fit within `overlap`,
   * returned in their original order. Returns a new array.
   */
  private calculateSentenceOverlap(currentChunk: string[]): string[] {
    if (this.overlap === 0 || currentChunk.length === 0) {
      return [];
    }

    const separatorLength = this.lengthFunction(SentenceTransformer.SENTENCE_SEPARATOR);
    const overlapSentences: string[] = [];
    let overlapSize = 0;

    // Walk backwards so the sentences closest to the chunk end win.
    for (let i = currentChunk.length - 1; i >= 0; i--) {
      const sentence = currentChunk[i];
      if (!sentence) continue;

      const addedSize = this.lengthFunction(sentence) + (overlapSentences.length > 0 ? separatorLength : 0);
      if (overlapSize + addedSize > this.overlap) {
        break;
      }

      overlapSentences.unshift(sentence);
      overlapSize += addedSize;
    }

    return overlapSentences;
  }

  /**
   * Estimated size of `sentences` once joined: the sum of the sentence sizes
   * plus one separator between each adjacent pair. Empty input yields 0.
   */
  private calculateChunkSize(sentences: string[]): number {
    if (!sentences || sentences.length === 0) {
      return 0;
    }

    const separatorLength = this.lengthFunction(SentenceTransformer.SENTENCE_SEPARATOR);
    const sentenceTotal = sentences.reduce((total, sentence) => total + this.lengthFunction(sentence), 0);

    return sentenceTotal + separatorLength * (sentences.length - 1);
  }

  /**
   * Split `text` into sentence-aligned chunks.
   *
   * @returns The chunks in document order, with whitespace-only chunks
   *   removed; an empty array for empty input.
   */
  splitText({ text }: { text: string }): string[] {
    if (!text) return [];

    const sentences = this.detectSentenceBoundaries(text);
    const chunks = this.groupSentencesIntoChunks(sentences);

    return chunks.filter(chunk => chunk.trim().length > 0);
  }
}
@@ -1,11 +1,11 @@
1
1
  import { Document } from '../schema';
2
2
 
3
- import type { ChunkOptions } from '../types';
3
+ import type { BaseChunkOptions } from '../types';
4
4
 
5
5
  import type { Transformer } from './transformer';
6
6
 
7
7
  export abstract class TextTransformer implements Transformer {
8
- protected size: number;
8
+ protected maxSize: number;
9
9
  protected overlap: number;
10
10
  protected lengthFunction: (text: string) => number;
11
11
  protected keepSeparator: boolean | 'start' | 'end';
@@ -13,17 +13,17 @@ export abstract class TextTransformer implements Transformer {
13
13
  protected stripWhitespace: boolean;
14
14
 
15
15
  constructor({
16
- size = 4000,
16
+ maxSize = 4000,
17
17
  overlap = 200,
18
18
  lengthFunction = (text: string) => text.length,
19
19
  keepSeparator = false,
20
20
  addStartIndex = false,
21
21
  stripWhitespace = true,
22
- }: ChunkOptions) {
23
- if (overlap > size) {
24
- throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size ` + `(${size}), should be smaller.`);
22
+ }: BaseChunkOptions) {
23
+ if (overlap > maxSize) {
24
+ throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size ` + `(${maxSize}), should be smaller.`);
25
25
  }
26
- this.size = size;
26
+ this.maxSize = maxSize;
27
27
  this.overlap = overlap;
28
28
  this.lengthFunction = lengthFunction;
29
29
  this.keepSeparator = keepSeparator;
@@ -104,9 +104,9 @@ export abstract class TextTransformer implements Transformer {
104
104
  const len = this.lengthFunction(d);
105
105
  const separatorLen = separator ? this.lengthFunction(separator) : 0;
106
106
 
107
- if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.size) {
108
- if (total > this.size) {
109
- console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.size}`);
107
+ if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.maxSize) {
108
+ if (total > this.maxSize) {
109
+ console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.maxSize}`);
110
110
  }
111
111
 
112
112
  if (currentDoc.length > 0) {
@@ -1,5 +1,6 @@
1
1
  import type { TiktokenModel, TiktokenEncoding, Tiktoken } from 'js-tiktoken';
2
2
  import { encodingForModel, getEncoding } from 'js-tiktoken';
3
+ import type { TokenChunkOptions } from '../types';
3
4
 
4
5
  import { TextTransformer } from './text';
5
6
 
@@ -42,18 +43,11 @@ export class TokenTransformer extends TextTransformer {
42
43
  disallowedSpecial = 'all',
43
44
  options = {},
44
45
  }: {
45
- encodingName: TiktokenEncoding;
46
+ encodingName?: TiktokenEncoding;
46
47
  modelName?: TiktokenModel;
47
48
  allowedSpecial?: Set<string> | 'all';
48
49
  disallowedSpecial?: Set<string> | 'all';
49
- options: {
50
- size?: number;
51
- overlap?: number;
52
- lengthFunction?: (text: string) => number;
53
- keepSeparator?: boolean | 'start' | 'end';
54
- addStartIndex?: boolean;
55
- stripWhitespace?: boolean;
56
- };
50
+ options: TokenChunkOptions;
57
51
  }) {
58
52
  super(options);
59
53
 
@@ -85,7 +79,7 @@ export class TokenTransformer extends TextTransformer {
85
79
 
86
80
  const tokenizer: Tokenizer = {
87
81
  overlap: this.overlap,
88
- tokensPerChunk: this.size,
82
+ tokensPerChunk: this.maxSize,
89
83
  decode,
90
84
  encode,
91
85
  };
@@ -100,12 +94,7 @@ export class TokenTransformer extends TextTransformer {
100
94
  }: {
101
95
  encodingName?: TiktokenEncoding;
102
96
  modelName?: TiktokenModel;
103
- options?: {
104
- size?: number;
105
- overlap?: number;
106
- allowedSpecial?: Set<string> | 'all';
107
- disallowedSpecial?: Set<string> | 'all';
108
- };
97
+ options?: TokenChunkOptions;
109
98
  }): TokenTransformer {
110
99
  let tokenizer: Tiktoken;
111
100
 
@@ -139,7 +128,7 @@ export class TokenTransformer extends TextTransformer {
139
128
  allowedSpecial: options.allowedSpecial,
140
129
  disallowedSpecial: options.disallowedSpecial,
141
130
  options: {
142
- size: options.size,
131
+ maxSize: options.maxSize,
143
132
  overlap: options.overlap,
144
133
  lengthFunction: tikTokenEncoder,
145
134
  },
@@ -42,34 +42,85 @@ export type ExtractParams = {
42
42
  keywords?: KeywordExtractArgs | boolean;
43
43
  };
44
44
 
45
- export type ChunkOptions = {
46
- headers?: [string, string][];
47
- returnEachLine?: boolean;
48
- sections?: [string, string][];
49
- separator?: string;
50
- separators?: string[];
51
- isSeparatorRegex?: boolean;
45
+ export type BaseChunkOptions = {
46
+ /**
47
+ * @deprecated Use `maxSize` instead. Will be removed in next major version.
48
+ */
52
49
  size?: number;
53
50
  maxSize?: number;
54
- minSize?: number;
55
51
  overlap?: number;
56
52
  lengthFunction?: (text: string) => number;
57
53
  keepSeparator?: boolean | 'start' | 'end';
58
54
  addStartIndex?: boolean;
59
55
  stripWhitespace?: boolean;
56
+ };
57
+
58
+ export type CharacterChunkOptions = BaseChunkOptions & {
59
+ separator?: string;
60
+ isSeparatorRegex?: boolean;
61
+ };
62
+
63
+ export type RecursiveChunkOptions = BaseChunkOptions & {
64
+ separators?: string[];
65
+ isSeparatorRegex?: boolean;
60
66
  language?: Language;
61
- ensureAscii?: boolean;
62
- convertLists?: boolean;
67
+ };
68
+
69
+ export type TokenChunkOptions = BaseChunkOptions & {
63
70
  encodingName?: TiktokenEncoding;
64
71
  modelName?: TiktokenModel;
65
72
  allowedSpecial?: Set<string> | 'all';
66
73
  disallowedSpecial?: Set<string> | 'all';
74
+ };
75
+
76
+ export type MarkdownChunkOptions = BaseChunkOptions & {
77
+ headers?: [string, string][];
78
+ returnEachLine?: boolean;
67
79
  stripHeaders?: boolean;
68
80
  };
69
81
 
70
- export type ChunkStrategy = 'recursive' | 'character' | 'token' | 'markdown' | 'html' | 'json' | 'latex';
82
+ export type HTMLChunkOptions = BaseChunkOptions &
83
+ (
84
+ | { headers: [string, string][]; sections?: never; returnEachLine?: boolean }
85
+ | { sections: [string, string][]; headers?: never }
86
+ ) & { returnEachLine?: boolean };
71
87
 
72
- export interface ChunkParams extends ChunkOptions {
73
- strategy?: ChunkStrategy;
74
- extract?: ExtractParams;
75
- }
88
+ export type JsonChunkOptions = BaseChunkOptions & {
89
+ minSize?: number;
90
+ ensureAscii?: boolean;
91
+ convertLists?: boolean;
92
+ };
93
+
94
+ export type LatexChunkOptions = BaseChunkOptions & {};
95
+
96
+ export type SentenceChunkOptions = BaseChunkOptions & {
97
+ maxSize: number; // Override to make required for sentence strategy
98
+ minSize?: number;
99
+ targetSize?: number;
100
+ sentenceEnders?: string[];
101
+ fallbackToWords?: boolean;
102
+ fallbackToCharacters?: boolean;
103
+ };
104
+
105
+ export type StrategyOptions = {
106
+ recursive: RecursiveChunkOptions;
107
+ character: CharacterChunkOptions;
108
+ token: TokenChunkOptions;
109
+ markdown: MarkdownChunkOptions;
110
+ html: HTMLChunkOptions;
111
+ json: JsonChunkOptions;
112
+ latex: LatexChunkOptions;
113
+ sentence: SentenceChunkOptions;
114
+ };
115
+
116
+ export type ChunkStrategy = 'recursive' | 'character' | 'token' | 'markdown' | 'html' | 'json' | 'latex' | 'sentence';
117
+
118
+ export type ChunkParams =
119
+ | ({ strategy?: 'character' } & CharacterChunkOptions & { extract?: ExtractParams })
120
+ | ({ strategy: 'recursive' } & RecursiveChunkOptions & { extract?: ExtractParams })
121
+ | ({ strategy: 'token' } & TokenChunkOptions & { extract?: ExtractParams })
122
+ | ({ strategy: 'markdown' } & MarkdownChunkOptions & { extract?: ExtractParams })
123
+ | ({ strategy: 'html' } & HTMLChunkOptions & { extract?: ExtractParams })
124
+ | ({ strategy: 'json' } & JsonChunkOptions & { extract?: ExtractParams })
125
+ | ({ strategy: 'latex' } & LatexChunkOptions & { extract?: ExtractParams })
126
+ | ({ strategy: 'sentence' } & SentenceChunkOptions & { extract?: ExtractParams });
@@ -0,0 +1,147 @@
1
+ import { z } from 'zod';
2
+ import type { ChunkStrategy } from './types';
3
+
4
/**
 * Map the deprecated `size` option onto `maxSize`.
 *
 * When `size` is set a deprecation warning is written with `console.warn`,
 * and `size` becomes `maxSize` unless `maxSize` was given explicitly.
 * The argument itself is left untouched.
 *
 * @param data Chunk options that may still carry `size`.
 * @returns A copy of `data` without the `size` key.
 */
function handleDeprecatedSize<T extends { size?: number; maxSize?: number }>(data: T): Omit<T, 'size'> {
  const { size, ...rest } = data;

  if (size === undefined) {
    return rest;
  }

  console.warn(
    '[DEPRECATION] `size` is deprecated. Use `maxSize` instead. This will be removed in the next major version.',
  );

  // An explicit maxSize always wins over the deprecated alias.
  if (data.maxSize !== undefined) {
    return rest;
  }

  return { ...rest, maxSize: size };
}
18
+
19
// Field validators shared by several keys below.
const positiveNumberField = z.number().positive().optional();
const booleanField = z.boolean().optional();

/** Options accepted by every chunking strategy. */
const baseChunkOptionsSchema = z.object({
  size: positiveNumberField,
  maxSize: positiveNumberField,
  overlap: z.number().min(0).optional(),
  lengthFunction: z.function().optional(),
  keepSeparator: z.union([z.boolean(), z.literal('start'), z.literal('end')]).optional(),
  addStartIndex: booleanField,
  stripWhitespace: booleanField,
});

// Strategy-specific schemas. Each one is strict, so keys outside the base
// options and the strategy's own options are rejected.

/** `character` strategy: a single separator, optionally a regex. */
const characterChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    separator: z.string().optional(),
    isSeparatorRegex: booleanField,
  })
  .strict();

/** `recursive` strategy: a list of separators plus an optional language. */
const recursiveChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    separators: z.array(z.string()).optional(),
    isSeparatorRegex: booleanField,
    language: z.string().optional(),
  })
  .strict();

/** `sentence` strategy: `maxSize` is mandatory here, unlike in the base schema. */
const sentenceChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    maxSize: z.number().positive(),
    minSize: positiveNumberField,
    targetSize: positiveNumberField,
    sentenceEnders: z.array(z.string()).optional(),
    fallbackToWords: booleanField,
    fallbackToCharacters: booleanField,
  })
  .strict();
56
+
57
/**
 * Structural check for Set-like objects.
 *
 * Accepts any non-null object exposing `has`, `add`, `delete` and `clear`
 * functions together with a numeric `size`; it does not require the value to
 * be an actual `Set` instance.
 */
const isSetLike = (value: unknown): value is Set<unknown> => {
  if (typeof value !== 'object' || value === null) {
    return false;
  }

  // Single widening view of the object; every member is still checked below.
  const candidate = value as Record<string, unknown>;
  const requiredMethods = ['has', 'add', 'delete', 'clear'];

  return (
    requiredMethods.every(method => typeof candidate[method] === 'function') && typeof candidate.size === 'number'
  );
};
69
+
70
/** Optional value that must be either the literal 'all' or a Set-like object. */
const setOrAllSchema = z
  .any()
  .refine(candidate => candidate === 'all' || isSetLike(candidate), {
    message: "Must be a Set object or the literal 'all'",
  })
  .optional();

// Optional list of two-string tuples, used by the `headers` and `sections` keys.
const stringPairListField = z.array(z.tuple([z.string(), z.string()])).optional();

/** `token` strategy: tokenizer selection and special-token handling. */
const tokenChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    encodingName: z.string().optional(),
    modelName: z.string().optional(),
    allowedSpecial: setOrAllSchema,
    disallowedSpecial: setOrAllSchema,
  })
  .strict();

/** `json` strategy. */
const jsonChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    minSize: z.number().positive().optional(),
    ensureAscii: z.boolean().optional(),
    convertLists: z.boolean().optional(),
  })
  .strict();

/** `html` strategy: `headers` and `sections` are both optional at this level. */
const htmlChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    headers: stringPairListField,
    sections: stringPairListField,
    returnEachLine: z.boolean().optional(),
  })
  .strict();

/** `markdown` strategy. */
const markdownChunkOptionsSchema = baseChunkOptionsSchema
  .extend({
    headers: stringPairListField,
    returnEachLine: z.boolean().optional(),
    stripHeaders: z.boolean().optional(),
  })
  .strict();

/** `latex` strategy: base options only, unknown keys rejected. */
const latexChunkOptionsSchema = baseChunkOptionsSchema.strict();
112
+
113
// Validation schema per strategy. Every schema also runs the deprecated
// `size` -> `maxSize` migration as a transform.
const validationSchemas = {
  character: characterChunkOptionsSchema.transform(handleDeprecatedSize),
  recursive: recursiveChunkOptionsSchema.transform(handleDeprecatedSize),
  sentence: sentenceChunkOptionsSchema.transform(handleDeprecatedSize),
  token: tokenChunkOptionsSchema.transform(handleDeprecatedSize),
  json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
  html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
  markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
  latex: latexChunkOptionsSchema.transform(handleDeprecatedSize),
} as const;

/**
 * Validate chunking parameters against the schema of the chosen strategy.
 *
 * @param strategy Chunking strategy whose schema is applied.
 * @param params Options supplied by the caller.
 * @throws Error when `strategy` has no schema, when `params` contains keys
 *   the strategy does not support, or when any value fails validation.
 */
export function validateChunkParams(strategy: ChunkStrategy, params: unknown): void {
  // Own-property lookup: an untyped caller passing e.g. "constructor" must get
  // the "unknown strategy" error rather than an inherited Object member.
  const schema = Object.prototype.hasOwnProperty.call(validationSchemas, strategy)
    ? validationSchemas[strategy]
    : undefined;
  if (!schema) {
    throw new Error(`Unknown chunking strategy: ${strategy}`);
  }

  const result = schema.safeParse(params);
  if (result.success) {
    return;
  }

  // Unsupported keys get a dedicated, shorter message.
  const unrecognizedError = result.error.errors.find((e: any) => e.code === 'unrecognized_keys');
  if (unrecognizedError && 'keys' in unrecognizedError) {
    const keys = (unrecognizedError as any).keys.join(', ');
    throw new Error(`Invalid parameters for ${strategy} strategy: '${keys}' not supported`);
  }

  // Any other issue: list every problem with its path.
  const errorMessage = result.error.errors
    .map((e: any) => `${e.path.length > 0 ? e.path.join('.') : 'parameter'}: ${e.message}`)
    .join(', ');

  throw new Error(`Invalid parameters for ${strategy} strategy: ${errorMessage}`);
}