@mastra/rag 1.0.6 → 1.0.7-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +12 -0
  3. package/dist/document/document.d.ts +9 -8
  4. package/dist/document/document.d.ts.map +1 -1
  5. package/dist/document/transformers/character.d.ts +4 -26
  6. package/dist/document/transformers/character.d.ts.map +1 -1
  7. package/dist/document/transformers/html.d.ts +8 -3
  8. package/dist/document/transformers/html.d.ts.map +1 -1
  9. package/dist/document/transformers/json.d.ts +4 -4
  10. package/dist/document/transformers/json.d.ts.map +1 -1
  11. package/dist/document/transformers/latex.d.ts +2 -8
  12. package/dist/document/transformers/latex.d.ts.map +1 -1
  13. package/dist/document/transformers/markdown.d.ts +2 -8
  14. package/dist/document/transformers/markdown.d.ts.map +1 -1
  15. package/dist/document/transformers/sentence.d.ts +31 -0
  16. package/dist/document/transformers/sentence.d.ts.map +1 -0
  17. package/dist/document/transformers/text.d.ts +3 -3
  18. package/dist/document/transformers/text.d.ts.map +1 -1
  19. package/dist/document/transformers/token.d.ts +4 -15
  20. package/dist/document/transformers/token.d.ts.map +1 -1
  21. package/dist/document/types.d.ts +85 -14
  22. package/dist/document/types.d.ts.map +1 -1
  23. package/dist/document/validation.d.ts +3 -0
  24. package/dist/document/validation.d.ts.map +1 -0
  25. package/dist/index.cjs +414 -80
  26. package/dist/index.cjs.map +1 -1
  27. package/dist/index.js +414 -80
  28. package/dist/index.js.map +1 -1
  29. package/dist/tools/document-chunker.d.ts.map +1 -1
  30. package/package.json +5 -5
  31. package/src/document/document.test.ts +294 -39
  32. package/src/document/document.ts +69 -41
  33. package/src/document/transformers/character.ts +15 -43
  34. package/src/document/transformers/html.ts +9 -9
  35. package/src/document/transformers/json.ts +8 -3
  36. package/src/document/transformers/latex.ts +3 -11
  37. package/src/document/transformers/markdown.ts +3 -11
  38. package/src/document/transformers/sentence.ts +314 -0
  39. package/src/document/transformers/text.ts +10 -10
  40. package/src/document/transformers/token.ts +6 -17
  41. package/src/document/types.ts +66 -15
  42. package/src/document/validation.ts +147 -0
  43. package/src/tools/document-chunker.ts +12 -8
@@ -7,8 +7,23 @@ import { HTMLHeaderTransformer, HTMLSectionTransformer } from './transformers/ht
7
7
  import { RecursiveJsonTransformer } from './transformers/json';
8
8
  import { LatexTransformer } from './transformers/latex';
9
9
  import { MarkdownHeaderTransformer, MarkdownTransformer } from './transformers/markdown';
10
+ import { SentenceTransformer } from './transformers/sentence';
10
11
  import { TokenTransformer } from './transformers/token';
11
- import type { ChunkOptions, ChunkParams, ChunkStrategy, ExtractParams } from './types';
12
+ import type {
13
+ ChunkParams,
14
+ ChunkStrategy,
15
+ ExtractParams,
16
+ HTMLChunkOptions,
17
+ RecursiveChunkOptions,
18
+ CharacterChunkOptions,
19
+ TokenChunkOptions,
20
+ MarkdownChunkOptions,
21
+ JsonChunkOptions,
22
+ LatexChunkOptions,
23
+ SentenceChunkOptions,
24
+ StrategyOptions,
25
+ } from './types';
26
+ import { validateChunkParams } from './validation';
12
27
 
13
28
  export class MDocument {
14
29
  private chunks: Chunk[];
@@ -135,35 +150,27 @@ export class MDocument {
135
150
  }
136
151
  }
137
152
 
138
- private async chunkBy(strategy: ChunkStrategy, options?: ChunkOptions): Promise<void> {
139
- switch (strategy) {
140
- case 'recursive':
141
- await this.chunkRecursive(options);
142
- break;
143
- case 'character':
144
- await this.chunkCharacter(options);
145
- break;
146
- case 'token':
147
- await this.chunkToken(options);
148
- break;
149
- case 'markdown':
150
- await this.chunkMarkdown(options);
151
- break;
152
- case 'html':
153
- await this.chunkHTML(options);
154
- break;
155
- case 'json':
156
- await this.chunkJSON(options);
157
- break;
158
- case 'latex':
159
- await this.chunkLatex(options);
160
- break;
161
- default:
162
- throw new Error(`Unknown strategy: ${strategy}`);
153
+ private async chunkBy<K extends ChunkStrategy>(strategy: K, options?: StrategyOptions[K]): Promise<void> {
154
+ const strategyMap: { [S in ChunkStrategy]: (options?: StrategyOptions[S]) => Promise<void> } = {
155
+ recursive: options => this.chunkRecursive(options),
156
+ character: options => this.chunkCharacter(options),
157
+ token: options => this.chunkToken(options),
158
+ markdown: options => this.chunkMarkdown(options),
159
+ html: options => this.chunkHTML(options),
160
+ json: options => this.chunkJSON(options),
161
+ latex: options => this.chunkLatex(options),
162
+ sentence: options => this.chunkSentence(options),
163
+ };
164
+
165
+ const chunkingFunc = strategyMap[strategy];
166
+ if (chunkingFunc) {
167
+ await chunkingFunc(options);
168
+ } else {
169
+ throw new Error(`Unknown strategy: ${strategy}`);
163
170
  }
164
171
  }
165
172
 
166
- async chunkRecursive(options?: ChunkOptions): Promise<void> {
173
+ async chunkRecursive(options?: RecursiveChunkOptions): Promise<void> {
167
174
  if (options?.language) {
168
175
  const rt = RecursiveCharacterTransformer.fromLanguage(options.language, options);
169
176
  const textSplit = rt.transformDocuments(this.chunks);
@@ -171,28 +178,24 @@ export class MDocument {
171
178
  return;
172
179
  }
173
180
 
174
- const rt = new RecursiveCharacterTransformer({
175
- separators: options?.separators,
176
- isSeparatorRegex: options?.isSeparatorRegex,
177
- options,
178
- });
181
+ const rt = new RecursiveCharacterTransformer(options);
179
182
  const textSplit = rt.transformDocuments(this.chunks);
180
183
  this.chunks = textSplit;
181
184
  }
182
185
 
183
- async chunkCharacter(options?: ChunkOptions): Promise<void> {
186
+ async chunkCharacter(options?: CharacterChunkOptions): Promise<void> {
184
187
  const rt = new CharacterTransformer({
188
+ ...options,
185
189
  separator: options?.separator,
186
190
  isSeparatorRegex: options?.isSeparatorRegex,
187
- options,
188
191
  });
189
192
  const textSplit = rt.transformDocuments(this.chunks);
190
193
  this.chunks = textSplit;
191
194
  }
192
195
 
193
- async chunkHTML(options?: ChunkOptions): Promise<void> {
196
+ async chunkHTML(options?: HTMLChunkOptions): Promise<void> {
194
197
  if (options?.headers?.length) {
195
- const rt = new HTMLHeaderTransformer(options.headers, options?.returnEachLine);
198
+ const rt = new HTMLHeaderTransformer(options as HTMLChunkOptions & { headers: [string, string][] });
196
199
 
197
200
  const textSplit = rt.transformDocuments(this.chunks);
198
201
  this.chunks = textSplit;
@@ -200,7 +203,7 @@ export class MDocument {
200
203
  }
201
204
 
202
205
  if (options?.sections?.length) {
203
- const rt = new HTMLSectionTransformer(options.sections);
206
+ const rt = new HTMLSectionTransformer(options as HTMLChunkOptions & { sections: [string, string][] });
204
207
 
205
208
  const textSplit = rt.transformDocuments(this.chunks);
206
209
  this.chunks = textSplit;
@@ -210,7 +213,7 @@ export class MDocument {
210
213
  throw new Error('HTML chunking requires either headers or sections to be specified');
211
214
  }
212
215
 
213
- async chunkJSON(options?: ChunkOptions): Promise<void> {
216
+ async chunkJSON(options?: JsonChunkOptions): Promise<void> {
214
217
  if (!options?.maxSize) {
215
218
  throw new Error('JSON chunking requires maxSize to be specified');
216
219
  }
@@ -229,13 +232,13 @@ export class MDocument {
229
232
  this.chunks = textSplit;
230
233
  }
231
234
 
232
- async chunkLatex(options?: ChunkOptions): Promise<void> {
235
+ async chunkLatex(options?: LatexChunkOptions): Promise<void> {
233
236
  const rt = new LatexTransformer(options);
234
237
  const textSplit = rt.transformDocuments(this.chunks);
235
238
  this.chunks = textSplit;
236
239
  }
237
240
 
238
- async chunkToken(options?: ChunkOptions): Promise<void> {
241
+ async chunkToken(options?: TokenChunkOptions): Promise<void> {
239
242
  const rt = TokenTransformer.fromTikToken({
240
243
  options,
241
244
  encodingName: options?.encodingName,
@@ -245,7 +248,7 @@ export class MDocument {
245
248
  this.chunks = textSplit;
246
249
  }
247
250
 
248
- async chunkMarkdown(options?: ChunkOptions): Promise<void> {
251
+ async chunkMarkdown(options?: MarkdownChunkOptions): Promise<void> {
249
252
  if (options?.headers) {
250
253
  const rt = new MarkdownHeaderTransformer(options.headers, options?.returnEachLine, options?.stripHeaders);
251
254
  const textSplit = rt.transformDocuments(this.chunks);
@@ -258,11 +261,36 @@ export class MDocument {
258
261
  this.chunks = textSplit;
259
262
  }
260
263
 
264
+ async chunkSentence(options?: SentenceChunkOptions): Promise<void> {
265
+ if (!options?.maxSize) {
266
+ throw new Error('Sentence chunking requires maxSize to be specified');
267
+ }
268
+
269
+ const rt = new SentenceTransformer({
270
+ minSize: options?.minSize,
271
+ maxSize: options?.maxSize,
272
+ targetSize: options?.targetSize,
273
+ overlap: options?.overlap,
274
+ sentenceEnders: options?.sentenceEnders,
275
+ fallbackToWords: options?.fallbackToWords,
276
+ fallbackToCharacters: options?.fallbackToCharacters,
277
+ keepSeparator: options?.keepSeparator,
278
+ lengthFunction: options?.lengthFunction,
279
+ addStartIndex: options?.addStartIndex,
280
+ stripWhitespace: options?.stripWhitespace,
281
+ });
282
+
283
+ const textSplit = rt.transformDocuments(this.chunks);
284
+ this.chunks = textSplit;
285
+ }
286
+
261
287
  async chunk(params?: ChunkParams): Promise<Chunk[]> {
262
288
  const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
263
289
  // Determine the default strategy based on type if not specified
264
290
  const strategy = passedStrategy || this.defaultStrategy();
265
291
 
292
+ validateChunkParams(strategy, chunkOptions);
293
+
266
294
  // Apply the appropriate chunking strategy
267
295
  await this.chunkBy(strategy, chunkOptions);
268
296
 
@@ -1,5 +1,5 @@
1
1
  import { Language } from '../types';
2
- import type { ChunkOptions } from '../types';
2
+ import type { BaseChunkOptions, CharacterChunkOptions, RecursiveChunkOptions } from '../types';
3
3
 
4
4
  import { TextTransformer } from './text';
5
5
 
@@ -52,23 +52,8 @@ export class CharacterTransformer extends TextTransformer {
52
52
  protected separator: string;
53
53
  protected isSeparatorRegex: boolean;
54
54
 
55
- constructor({
56
- separator = '\n\n',
57
- isSeparatorRegex = false,
58
- options = {},
59
- }: {
60
- separator?: string;
61
- isSeparatorRegex?: boolean;
62
- options?: {
63
- size?: number;
64
- overlap?: number;
65
- lengthFunction?: (text: string) => number;
66
- keepSeparator?: boolean | 'start' | 'end';
67
- addStartIndex?: boolean;
68
- stripWhitespace?: boolean;
69
- };
70
- }) {
71
- super(options);
55
+ constructor({ separator = '\n\n', isSeparatorRegex = false, ...baseOptions }: CharacterChunkOptions = {}) {
56
+ super(baseOptions);
72
57
  this.separator = separator;
73
58
  this.isSeparatorRegex = isSeparatorRegex;
74
59
  }
@@ -82,7 +67,7 @@ export class CharacterTransformer extends TextTransformer {
82
67
  // If length of any split is greater than chunk size, perform additional splitting
83
68
  const chunks: string[] = [];
84
69
  for (const split of initialSplits) {
85
- if (this.lengthFunction(split) <= this.size) {
70
+ if (this.lengthFunction(split) <= this.maxSize) {
86
71
  chunks.push(split);
87
72
  } else {
88
73
  // If a single split is too large, split it further with overlap
@@ -102,7 +87,7 @@ export class CharacterTransformer extends TextTransformer {
102
87
  let chunkEnd = currentPosition;
103
88
 
104
89
  // Build chunk up to max size
105
- while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.size) {
90
+ while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.maxSize) {
106
91
  chunkEnd++;
107
92
  }
108
93
 
@@ -125,16 +110,8 @@ export class RecursiveCharacterTransformer extends TextTransformer {
125
110
  protected separators: string[];
126
111
  protected isSeparatorRegex: boolean;
127
112
 
128
- constructor({
129
- separators,
130
- isSeparatorRegex = false,
131
- options = {},
132
- }: {
133
- separators?: string[];
134
- isSeparatorRegex?: boolean;
135
- options?: ChunkOptions;
136
- }) {
137
- super(options);
113
+ constructor({ separators, isSeparatorRegex = false, language, ...baseOptions }: RecursiveChunkOptions = {}) {
114
+ super(baseOptions);
138
115
  this.separators = separators || ['\n\n', '\n', ' ', ''];
139
116
  this.isSeparatorRegex = isSeparatorRegex;
140
117
  }
@@ -169,7 +146,7 @@ export class RecursiveCharacterTransformer extends TextTransformer {
169
146
  const mergeSeparator = this.keepSeparator ? '' : separator;
170
147
 
171
148
  for (const s of splits) {
172
- if (this.lengthFunction(s) < this.size) {
149
+ if (this.lengthFunction(s) < this.maxSize) {
173
150
  goodSplits.push(s);
174
151
  } else {
175
152
  if (goodSplits.length > 0) {
@@ -198,19 +175,14 @@ export class RecursiveCharacterTransformer extends TextTransformer {
198
175
  return this._splitText(text, this.separators);
199
176
  }
200
177
 
201
- static fromLanguage(
202
- language: Language,
203
- options: {
204
- size?: number;
205
- chunkOverlap?: number;
206
- lengthFunction?: (text: string) => number;
207
- keepSeparator?: boolean | 'start' | 'end';
208
- addStartIndex?: boolean;
209
- stripWhitespace?: boolean;
210
- } = {},
211
- ): RecursiveCharacterTransformer {
178
+ static fromLanguage(language: Language, options: BaseChunkOptions = {}): RecursiveCharacterTransformer {
212
179
  const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage(language);
213
- return new RecursiveCharacterTransformer({ separators, isSeparatorRegex: true, options });
180
+ return new RecursiveCharacterTransformer({
181
+ ...options,
182
+ separators,
183
+ isSeparatorRegex: true,
184
+ language,
185
+ });
214
186
  }
215
187
 
216
188
  static getSeparatorsForLanguage(language: Language): string[] {
@@ -1,5 +1,6 @@
1
1
  import { parse } from 'node-html-better-parser';
2
2
  import { Document } from '../schema';
3
+ import type { HTMLChunkOptions } from '../types';
3
4
 
4
5
  import { RecursiveCharacterTransformer } from './character';
5
6
 
@@ -14,9 +15,9 @@ export class HTMLHeaderTransformer {
14
15
  private headersToSplitOn: [string, string][];
15
16
  private returnEachElement: boolean;
16
17
 
17
- constructor(headersToSplitOn: [string, string][], returnEachElement: boolean = false) {
18
- this.returnEachElement = returnEachElement;
19
- this.headersToSplitOn = [...headersToSplitOn].sort();
18
+ constructor(options: HTMLChunkOptions & { headers: [string, string][] }) {
19
+ this.returnEachElement = options.returnEachLine ?? false;
20
+ this.headersToSplitOn = [...options.headers].sort();
20
21
  }
21
22
 
22
23
  splitText({ text }: { text: string }): Document[] {
@@ -195,11 +196,11 @@ export class HTMLHeaderTransformer {
195
196
 
196
197
  export class HTMLSectionTransformer {
197
198
  private headersToSplitOn: Record<string, string>;
198
- private options: Record<string, any>;
199
+ private textSplitter: RecursiveCharacterTransformer;
199
200
 
200
- constructor(headersToSplitOn: [string, string][], options: Record<string, any> = {}) {
201
- this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name]) => [tag.toLowerCase(), name]));
202
- this.options = options;
201
+ constructor(options: HTMLChunkOptions & { sections: [string, string][] }) {
202
+ this.headersToSplitOn = Object.fromEntries(options.sections.map(([tag, name]) => [tag.toLowerCase(), name]));
203
+ this.textSplitter = new RecursiveCharacterTransformer(options);
203
204
  }
204
205
 
205
206
  splitText(text: string): Document[] {
@@ -296,9 +297,8 @@ export class HTMLSectionTransformer {
296
297
  metadatas.push(doc.metadata);
297
298
  }
298
299
  const results = await this.createDocuments(texts, metadatas);
299
- const textSplitter = new RecursiveCharacterTransformer({ options: this.options });
300
300
 
301
- return textSplitter.splitDocuments(results);
301
+ return this.textSplitter.splitDocuments(results);
302
302
  }
303
303
 
304
304
  createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[] {
@@ -1,12 +1,17 @@
1
1
  import { Document } from '../schema';
2
+ import type { JsonChunkOptions } from '../types';
2
3
 
3
4
  export class RecursiveJsonTransformer {
4
5
  private maxSize: number;
5
6
  private minSize: number;
7
+ private ensureAscii: boolean;
8
+ private convertLists: boolean;
6
9
 
7
- constructor({ maxSize = 2000, minSize }: { maxSize: number; minSize?: number }) {
10
+ constructor({ maxSize = 2000, minSize, ensureAscii = false, convertLists = true }: JsonChunkOptions) {
8
11
  this.maxSize = maxSize;
9
12
  this.minSize = minSize ?? Math.max(maxSize - 200, 50);
13
+ this.ensureAscii = ensureAscii;
14
+ this.convertLists = convertLists;
10
15
  }
11
16
 
12
17
  private static jsonSize(data: Record<string, any>): number {
@@ -170,8 +175,8 @@ export class RecursiveJsonTransformer {
170
175
  private isWithinSizeLimit(value: any, currentSize: number = 0): boolean {
171
176
  const size = RecursiveJsonTransformer.jsonSize(value);
172
177
  // If this is a new chunk (currentSize = 0), allow items smaller than maxSize
173
- // If adding to existing chunk, ensure we're above minSize before splitting
174
- return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize || currentSize < this.minSize;
178
+ // If adding to existing chunk, ensure total size doesn't exceed maxSize
179
+ return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize;
175
180
  }
176
181
 
177
182
  /**
@@ -1,19 +1,11 @@
1
1
  import { Language } from '../types';
2
+ import type { BaseChunkOptions } from '../types';
2
3
 
3
4
  import { RecursiveCharacterTransformer } from './character';
4
5
 
5
6
  export class LatexTransformer extends RecursiveCharacterTransformer {
6
- constructor(
7
- options: {
8
- size?: number;
9
- overlap?: number;
10
- lengthFunction?: (text: string) => number;
11
- keepSeparator?: boolean | 'start' | 'end';
12
- addStartIndex?: boolean;
13
- stripWhitespace?: boolean;
14
- } = {},
15
- ) {
7
+ constructor(options: BaseChunkOptions = {}) {
16
8
  const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage(Language.LATEX);
17
- super({ separators, isSeparatorRegex: true, options });
9
+ super({ ...options, separators, isSeparatorRegex: true });
18
10
  }
19
11
  }
@@ -1,6 +1,7 @@
1
1
  import { Document } from '../schema';
2
2
 
3
3
  import { Language } from '../types';
4
+ import type { BaseChunkOptions } from '../types';
4
5
 
5
6
  import { RecursiveCharacterTransformer } from './character';
6
7
 
@@ -16,18 +17,9 @@ interface HeaderType {
16
17
  }
17
18
 
18
19
  export class MarkdownTransformer extends RecursiveCharacterTransformer {
19
- constructor(
20
- options: {
21
- chunkSize?: number;
22
- chunkOverlap?: number;
23
- lengthFunction?: (text: string) => number;
24
- keepSeparator?: boolean | 'start' | 'end';
25
- addStartIndex?: boolean;
26
- stripWhitespace?: boolean;
27
- } = {},
28
- ) {
20
+ constructor(options: BaseChunkOptions = {}) {
29
21
  const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage(Language.MARKDOWN);
30
- super({ separators, isSeparatorRegex: true, options });
22
+ super({ ...options, separators, isSeparatorRegex: true });
31
23
  }
32
24
  }
33
25