@mastra/rag 1.2.2 → 1.2.3-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/CHANGELOG.md +22 -0
  2. package/dist/index.cjs +25 -9
  3. package/dist/index.cjs.map +1 -1
  4. package/dist/index.js +25 -9
  5. package/dist/index.js.map +1 -1
  6. package/dist/tools/graph-rag.d.ts.map +1 -1
  7. package/dist/tools/types.d.ts +18 -5
  8. package/dist/tools/types.d.ts.map +1 -1
  9. package/dist/tools/vector-query.d.ts.map +1 -1
  10. package/dist/utils/vector-search.d.ts +6 -7
  11. package/dist/utils/vector-search.d.ts.map +1 -1
  12. package/package.json +19 -6
  13. package/.turbo/turbo-build.log +0 -4
  14. package/docker-compose.yaml +0 -22
  15. package/eslint.config.js +0 -6
  16. package/src/document/document.test.ts +0 -2975
  17. package/src/document/document.ts +0 -335
  18. package/src/document/extractors/base.ts +0 -30
  19. package/src/document/extractors/index.ts +0 -5
  20. package/src/document/extractors/keywords.test.ts +0 -125
  21. package/src/document/extractors/keywords.ts +0 -126
  22. package/src/document/extractors/questions.test.ts +0 -120
  23. package/src/document/extractors/questions.ts +0 -111
  24. package/src/document/extractors/summary.test.ts +0 -107
  25. package/src/document/extractors/summary.ts +0 -122
  26. package/src/document/extractors/title.test.ts +0 -121
  27. package/src/document/extractors/title.ts +0 -185
  28. package/src/document/extractors/types.ts +0 -40
  29. package/src/document/index.ts +0 -2
  30. package/src/document/prompts/base.ts +0 -77
  31. package/src/document/prompts/format.ts +0 -9
  32. package/src/document/prompts/index.ts +0 -15
  33. package/src/document/prompts/prompt.ts +0 -60
  34. package/src/document/prompts/types.ts +0 -29
  35. package/src/document/schema/index.ts +0 -3
  36. package/src/document/schema/node.ts +0 -187
  37. package/src/document/schema/types.ts +0 -40
  38. package/src/document/transformers/character.ts +0 -267
  39. package/src/document/transformers/html.ts +0 -346
  40. package/src/document/transformers/json.ts +0 -536
  41. package/src/document/transformers/latex.ts +0 -11
  42. package/src/document/transformers/markdown.ts +0 -239
  43. package/src/document/transformers/semantic-markdown.ts +0 -227
  44. package/src/document/transformers/sentence.ts +0 -314
  45. package/src/document/transformers/text.ts +0 -158
  46. package/src/document/transformers/token.ts +0 -137
  47. package/src/document/transformers/transformer.ts +0 -5
  48. package/src/document/types.ts +0 -145
  49. package/src/document/validation.ts +0 -158
  50. package/src/graph-rag/index.test.ts +0 -235
  51. package/src/graph-rag/index.ts +0 -306
  52. package/src/index.ts +0 -8
  53. package/src/rerank/index.test.ts +0 -150
  54. package/src/rerank/index.ts +0 -198
  55. package/src/rerank/relevance/cohere/index.ts +0 -56
  56. package/src/rerank/relevance/index.ts +0 -3
  57. package/src/rerank/relevance/mastra-agent/index.ts +0 -32
  58. package/src/rerank/relevance/zeroentropy/index.ts +0 -26
  59. package/src/tools/README.md +0 -153
  60. package/src/tools/document-chunker.ts +0 -34
  61. package/src/tools/graph-rag.test.ts +0 -115
  62. package/src/tools/graph-rag.ts +0 -154
  63. package/src/tools/index.ts +0 -3
  64. package/src/tools/types.ts +0 -110
  65. package/src/tools/vector-query-database-config.test.ts +0 -190
  66. package/src/tools/vector-query.test.ts +0 -418
  67. package/src/tools/vector-query.ts +0 -169
  68. package/src/utils/convert-sources.ts +0 -43
  69. package/src/utils/default-settings.ts +0 -38
  70. package/src/utils/index.ts +0 -3
  71. package/src/utils/tool-schemas.ts +0 -38
  72. package/src/utils/vector-prompts.ts +0 -832
  73. package/src/utils/vector-search.ts +0 -117
  74. package/tsconfig.build.json +0 -9
  75. package/tsconfig.json +0 -5
  76. package/tsup.config.ts +0 -17
  77. package/vitest.config.ts +0 -8
@@ -1,314 +0,0 @@
1
- import type { SentenceChunkOptions } from '../types';
2
- import { TextTransformer } from './text';
3
-
4
/**
 * Chunks text along sentence boundaries.
 *
 * Text is first split into sentences (treating common abbreviations, initials
 * and numbers as non-terminal), then sentences are packed into chunks that aim
 * for `targetSize` and never exceed `maxSize`. Sentences that are longer than
 * `maxSize` on their own are broken up by words and/or characters, depending on
 * the configured fallbacks. When `overlap` is non-zero, whole trailing sentences
 * of an emitted chunk are repeated at the start of the following chunk.
 */
export class SentenceTransformer extends TextTransformer {
  /** Titles whose trailing period does not end a sentence. */
  private static readonly TITLE_ABBREVIATIONS: readonly string[] = ['Dr', 'Mr', 'Mrs', 'Ms', 'Prof', 'Sr', 'Jr'];

  protected minSize: number;
  protected maxSize: number;
  protected targetSize: number;
  protected sentenceEnders: string[];
  protected fallbackToWords: boolean;
  protected fallbackToCharacters: boolean;
  protected keepSeparator: boolean | 'start' | 'end';

  /**
   * @param options Sentence chunking options. `maxSize` is read directly; the
   * remaining fields fall back to: `minSize` 50, `targetSize` 80% of `maxSize`
   * (rounded down), `sentenceEnders` `.`, `!`, `?`, both fallbacks enabled,
   * `keepSeparator` false and `overlap` 0.
   */
  constructor(options: SentenceChunkOptions) {
    // The value handed to the parent is clamped below maxSize so that the
    // parent's own overlap validation cannot reject it.
    super({
      ...options,
      overlap: Math.min(options.overlap ?? 0, options.maxSize - 1),
    });

    this.maxSize = options.maxSize;
    this.minSize = options.minSize ?? 50;
    this.targetSize = options.targetSize ?? Math.floor(options.maxSize * 0.8);
    this.sentenceEnders = options.sentenceEnders ?? ['.', '!', '?'];
    this.fallbackToWords = options.fallbackToWords ?? true;
    this.fallbackToCharacters = options.fallbackToCharacters ?? true;
    this.keepSeparator = options.keepSeparator ?? false;

    // The sentence logic below works with the caller's original overlap,
    // not the clamped value that was passed to the parent.
    this.overlap = options.overlap ?? 0;
  }

  /**
   * Splits `text` into trimmed, non-empty sentences.
   *
   * A sentence ends at one of `sentenceEnders` when `isRealSentenceBoundary`
   * agrees; any trailing text without a terminator becomes the last sentence.
   */
  private detectSentenceBoundaries(text: string): string[] {
    if (!text) return [];

    const sentences: string[] = [];
    let currentSentence = '';

    for (let i = 0; i < text.length; i++) {
      const char = text[i];
      if (!char) break; // Safety check for unchecked index access

      currentSentence += char;

      if (!this.sentenceEnders.includes(char)) continue;

      if (this.isRealSentenceBoundary(currentSentence, text.slice(i + 1))) {
        sentences.push(currentSentence.trim());
        currentSentence = '';
      }
    }

    const trailing = currentSentence.trim();
    if (trailing) {
      sentences.push(trailing);
    }

    return sentences.filter(s => s.length > 0);
  }

  /**
   * Decides whether the sentence ender that terminates `currentSentence` is a
   * genuine sentence boundary.
   *
   * @param currentSentence Text accumulated so far, ending with the ender.
   * @param remainingText Everything after the ender.
   * @returns `true` at end of input, or when the ender is followed by
   * whitespace and an ASCII capital letter and the word before the ender is
   * not a known abbreviation.
   */
  private isRealSentenceBoundary(currentSentence: string, remainingText: string): boolean {
    if (!remainingText.trim()) {
      return true;
    }

    if (!/^\s+[A-Z]/.test(remainingText)) {
      return false;
    }

    const words = currentSentence.trim().split(/\s+/);
    const lastWord = words[words.length - 1] || '';

    // Drop the trailing ender so "Dr." is compared as "Dr".
    const baseWord = lastWord.slice(0, -1);

    return !this.isCommonAbbreviation(baseWord);
  }

  /**
   * Reports whether `word` (with its trailing ender already removed) looks
   * like an abbreviation rather than the last word of a sentence.
   */
  private isCommonAbbreviation(word: string): boolean {
    // Common titles
    if (SentenceTransformer.TITLE_ABBREVIATIONS.includes(word)) {
      return true;
    }

    // Single letters (initials) and dotted abbreviations of one case,
    // e.g. "U.S.A" or "a.m" once the final period has been stripped.
    if (/^[A-Z](\.[A-Z])*$/.test(word) || /^[a-z](\.[a-z])*$/.test(word)) {
      return true;
    }

    // Numbers (versions, decimals)
    if (/^\d+$/.test(word)) {
      return true;
    }

    // Time abbreviations
    return /^[ap]\.?m$/i.test(word);
  }

  /**
   * Group sentences into chunks with integrated overlap processing.
   *
   * Invariants maintained here:
   * - a chunk is only emitted when it contains at least one sentence that has
   *   not been emitted before, so carried-over overlap is never flushed as a
   *   duplicate chunk on its own;
   * - carried-over overlap is trimmed from the front until the next sentence
   *   fits, so overlap never pushes a chunk past `maxSize`;
   * - `currentSize` always equals the size of `currentChunk` joined with the
   *   separator.
   */
  private groupSentencesIntoChunks(sentences: string[]): string[] {
    const separator = ' ';
    const separatorLength = this.lengthFunction(separator);

    const chunks: string[] = [];
    let currentChunk: string[] = [];
    let currentSize = 0;
    // Number of sentences in currentChunk that are new, i.e. not overlap
    // carried over from the previously emitted chunk.
    let freshCount = 0;

    for (const sentence of sentences) {
      const sentenceLength = this.lengthFunction(sentence);

      // Handle oversized sentences with fallback strategies
      if (sentenceLength > this.maxSize) {
        if (freshCount > 0) {
          chunks.push(currentChunk.join(separator));
        }
        currentChunk = [];
        currentSize = 0;
        freshCount = 0;

        chunks.push(...this.handleOversizedSentence(sentence));
        continue;
      }

      // If adding this sentence would exceed maxSize, finalize current chunk
      if (currentChunk.length > 0 && currentSize + separatorLength + sentenceLength > this.maxSize) {
        if (freshCount > 0) {
          chunks.push(currentChunk.join(separator));
          currentChunk = this.calculateSentenceOverlap(currentChunk);
          freshCount = 0;
        }

        // Drop the oldest overlap sentences until the new sentence fits.
        while (
          currentChunk.length > 0 &&
          this.calculateChunkSize(currentChunk) + separatorLength + sentenceLength > this.maxSize
        ) {
          currentChunk.shift();
        }
        currentSize = this.calculateChunkSize(currentChunk);
      }

      currentSize += sentenceLength + (currentChunk.length > 0 ? separatorLength : 0);
      currentChunk.push(sentence);
      freshCount++;

      // Once the target size is reached, emit the chunk and seed the next one
      // with the overlap sentences.
      if (currentSize >= this.targetSize) {
        chunks.push(currentChunk.join(separator));
        currentChunk = this.calculateSentenceOverlap(currentChunk);
        currentSize = this.calculateChunkSize(currentChunk);
        freshCount = 0;
      }
    }

    if (freshCount > 0) {
      chunks.push(currentChunk.join(separator));
    }

    return chunks;
  }

  /**
   * Handle oversized sentences with fallback strategies.
   *
   * Tries word-level splitting first (when enabled and it yields more than one
   * piece), then character-level splitting (when enabled). With both fallbacks
   * unavailable the sentence is returned unsplit and a warning is logged.
   */
  private handleOversizedSentence(sentence: string): string[] {
    // First fallback
    if (this.fallbackToWords) {
      const wordChunks = this.splitSentenceIntoWords(sentence);
      if (wordChunks.length > 1) {
        return wordChunks;
      }
    }

    // Second fallback
    if (this.fallbackToCharacters) {
      return this.splitSentenceIntoCharacters(sentence);
    }

    // Last resort
    console.warn(
      `Sentence exceeds maxSize (${this.maxSize}) and fallbacks are disabled: "${sentence.substring(0, 50)}..."`,
    );
    return [sentence];
  }

  /**
   * Packs the whitespace-separated words of `sentence` into pieces of at most
   * `maxSize`. A single word longer than `maxSize` is split by characters when
   * that fallback is enabled, otherwise it is emitted as-is.
   */
  private splitSentenceIntoWords(sentence: string): string[] {
    const chunks: string[] = [];
    let currentChunk = '';

    for (const word of sentence.split(/\s+/)) {
      const candidate = currentChunk ? currentChunk + ' ' + word : word;

      if (this.lengthFunction(candidate) <= this.maxSize) {
        currentChunk = candidate;
        continue;
      }

      if (currentChunk) {
        chunks.push(currentChunk);
      }

      if (this.lengthFunction(word) > this.maxSize) {
        if (this.fallbackToCharacters) {
          chunks.push(...this.splitSentenceIntoCharacters(word));
        } else {
          chunks.push(word);
        }
        currentChunk = '';
      } else {
        currentChunk = word;
      }
    }

    if (currentChunk) {
      chunks.push(currentChunk);
    }

    return chunks;
  }

  /**
   * Splits `text` into consecutive pieces, starting a new piece whenever
   * appending the next character would take the piece past `maxSize`.
   */
  private splitSentenceIntoCharacters(text: string): string[] {
    const chunks: string[] = [];
    let currentChunk = '';

    for (const char of text) {
      if (this.lengthFunction(currentChunk + char) <= this.maxSize) {
        currentChunk += char;
        continue;
      }

      if (currentChunk) {
        chunks.push(currentChunk);
      }
      currentChunk = char;
    }

    if (currentChunk) {
      chunks.push(currentChunk);
    }

    return chunks;
  }

  /**
   * Selects the trailing sentences of `currentChunk` to repeat at the start of
   * the next chunk.
   *
   * @returns A new array holding the longest run of whole trailing sentences
   * whose joined size (including separators) does not exceed `overlap`; empty
   * when overlap is 0 or not even the last sentence fits.
   */
  private calculateSentenceOverlap(currentChunk: string[]): string[] {
    if (this.overlap === 0 || currentChunk.length === 0) {
      return [];
    }

    const overlapSentences: string[] = [];
    let overlapSize = 0;
    const separator = ' ';

    // Work backwards through sentences to build overlap
    for (let i = currentChunk.length - 1; i >= 0; i--) {
      const sentence = currentChunk[i];
      if (!sentence) continue;

      const addedSize =
        this.lengthFunction(sentence) + (overlapSentences.length > 0 ? this.lengthFunction(separator) : 0);

      if (overlapSize + addedSize > this.overlap) {
        break;
      }

      overlapSentences.unshift(sentence);
      overlapSize += addedSize;
    }

    return overlapSentences;
  }

  /**
   * Returns the size of `sentences` once joined with a single-space separator,
   * measured with `lengthFunction` (0 for an empty list).
   */
  private calculateChunkSize(sentences: string[]): number {
    if (!sentences || sentences.length === 0) {
      return 0;
    }

    const separatorLength = this.lengthFunction(' ');
    let totalSize = 0;

    sentences.forEach((sentence, i) => {
      totalSize += this.lengthFunction(sentence);

      // Add separator length for all but the last sentence
      if (i < sentences.length - 1) {
        totalSize += separatorLength;
      }
    });

    return totalSize;
  }

  /**
   * Splits `text` into sentence-aligned chunks.
   *
   * @returns The chunks in document order, with whitespace-only chunks
   * removed; an empty array for empty input.
   */
  splitText({ text }: { text: string }): string[] {
    if (!text) return [];

    const sentences = this.detectSentenceBoundaries(text);
    const chunks = this.groupSentencesIntoChunks(sentences);

    return chunks.filter(chunk => chunk.trim().length > 0);
  }
}
@@ -1,158 +0,0 @@
1
- import { Document } from '../schema';
2
-
3
- import type { BaseChunkOptions } from '../types';
4
-
5
- import type { Transformer } from './transformer';
6
-
7
/**
 * Base class for text chunkers.
 *
 * Subclasses implement `splitText`; this class turns the resulting chunks into
 * `Document`s and offers `mergeSplits` for packing small pieces into chunks of
 * at most `maxSize` with an optional overlap between consecutive chunks.
 */
export abstract class TextTransformer implements Transformer {
  protected maxSize: number;
  protected overlap: number;
  protected lengthFunction: (text: string) => number;
  protected keepSeparator: boolean | 'start' | 'end';
  protected addStartIndex: boolean;
  protected stripWhitespace: boolean;

  /**
   * @param options Chunking options. Defaults: `maxSize` 4000, `overlap` 200,
   * `lengthFunction` string length, `keepSeparator` false, `addStartIndex`
   * false, `stripWhitespace` true.
   * @throws Error when `overlap` is greater than `maxSize`.
   */
  constructor({
    maxSize = 4000,
    overlap = 200,
    lengthFunction = (text: string) => text.length,
    keepSeparator = false,
    addStartIndex = false,
    stripWhitespace = true,
  }: BaseChunkOptions) {
    if (overlap > maxSize) {
      throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size ` + `(${maxSize}), should be smaller.`);
    }
    this.maxSize = maxSize;
    this.overlap = overlap;
    this.lengthFunction = lengthFunction;
    this.keepSeparator = keepSeparator;
    this.addStartIndex = addStartIndex;
    this.stripWhitespace = stripWhitespace;
  }

  /** Enables or disables recording of `startIndex` in chunk metadata. */
  setAddStartIndex(value: boolean): void {
    this.addStartIndex = value;
  }

  /** Splits a single text into chunks. */
  abstract splitText({ text }: { text: string }): string[];

  /**
   * Splits every text and wraps each chunk in a `Document`.
   *
   * @param texts Texts to split.
   * @param metadatas Optional metadata per text (matched by index). Each chunk
   * receives a shallow copy of its text's metadata.
   * @returns One document per chunk, in input order. When `addStartIndex` is
   * set, `metadata.startIndex` holds the result of searching for the chunk in
   * the source text (which is -1 if the chunk is not found verbatim).
   */
  createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[] {
    const documents: Document[] = [];

    texts.forEach((text, i) => {
      let index = 0;
      let previousChunkLen = 0;

      for (const chunk of this.splitText({ text })) {
        const metadata: Record<string, any> = { ...metadatas?.[i] };

        if (this.addStartIndex) {
          // Start searching just before the end of the previous chunk so that
          // overlapping chunks are still found.
          const offset = index + previousChunkLen - this.overlap;
          index = text.indexOf(chunk, Math.max(0, offset));
          metadata.startIndex = index;
          previousChunkLen = chunk.length;
        }

        documents.push(new Document({ text: chunk, metadata }));
      }
    });

    return documents;
  }

  /** Splits the given documents into chunk documents, carrying metadata over. */
  splitDocuments(documents: Document[]): Document[] {
    return this.createDocuments(
      documents.map(doc => doc.text),
      documents.map(doc => doc.metadata),
    );
  }

  /** `Transformer` entry point; behaves exactly like `splitDocuments`. */
  transformDocuments(documents: Document[]): Document[] {
    return this.createDocuments(
      documents.map(doc => doc.text),
      documents.map(doc => doc.metadata),
    );
  }

  /**
   * Joins pieces with `separator`, trimming the result when `stripWhitespace`
   * is enabled.
   *
   * @returns The joined text, or `null` when it is empty.
   */
  protected joinDocs(docs: string[], separator: string): string | null {
    const joined = docs.join(separator);
    const text = this.stripWhitespace ? joined.trim() : joined;
    return text === '' ? null : text;
  }

  /**
   * Packs `splits` into chunks no larger than `maxSize` (as measured by
   * `lengthFunction`), joining pieces with `separator`.
   *
   * When `overlap` is positive, trailing pieces of an emitted chunk are
   * repeated at the start of the next one, but only as long as the carried
   * pieces stay within `overlap` and still leave room for the incoming split.
   * A single split that is larger than `maxSize` is emitted as its own
   * oversize chunk.
   */
  protected mergeSplits(splits: string[], separator: string): string[] {
    const docs: string[] = [];
    const separatorLen = separator ? this.lengthFunction(separator) : 0;
    let currentDoc: string[] = [];
    let total = 0;

    for (const d of splits) {
      const len = this.lengthFunction(d);

      if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.maxSize) {
        if (total > this.maxSize) {
          console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.maxSize}`);
        }

        if (currentDoc.length > 0) {
          const doc = this.joinDocs(currentDoc, separator);
          if (doc !== null) {
            docs.push(doc);
          }

          const overlapContent: string[] = [];
          let overlapSize = 0;

          // Handle overlap: keep enough content from the end of current chunk
          if (this.overlap > 0) {
            // Work backwards through currentDoc until we have enough overlap
            for (let i = currentDoc.length - 1; i >= 0; i--) {
              const piece = currentDoc[i]!;
              const pieceLen = this.lengthFunction(piece);

              if (overlapSize + pieceLen > this.overlap) {
                break;
              }

              const sizeWithPiece = overlapSize + pieceLen + (overlapContent.length > 0 ? separatorLen : 0);

              // Never carry so much overlap that appending the incoming split
              // would push the next chunk past maxSize.
              if (sizeWithPiece + separatorLen + len > this.maxSize) {
                break;
              }

              overlapContent.unshift(piece);
              overlapSize = sizeWithPiece;
            }
          }

          currentDoc = overlapContent;
          total = overlapSize;
        }
      }

      currentDoc.push(d);
      total += len + (currentDoc.length > 1 ? separatorLen : 0);
    }

    if (currentDoc.length > 0) {
      const doc = this.joinDocs(currentDoc, separator);
      if (doc !== null) {
        docs.push(doc);
      }
    }

    return docs;
  }
}
@@ -1,137 +0,0 @@
1
- import type { TiktokenModel, TiktokenEncoding, Tiktoken } from 'js-tiktoken';
2
- import { encodingForModel, getEncoding } from 'js-tiktoken';
3
- import type { TokenChunkOptions } from '../types';
4
-
5
- import { TextTransformer } from './text';
6
-
7
/** Minimal tokenizer contract consumed by `splitTextOnTokens`. */
interface Tokenizer {
  /** Number of tokens shared between consecutive chunks. */
  overlap: number;
  /** Maximum number of tokens in a chunk. */
  tokensPerChunk: number;
  /** Converts token ids back into text. */
  decode: (tokens: number[]) => string;
  /** Converts text into token ids. */
  encode: (text: string) => number[];
}

/**
 * Splits `text` into chunks of at most `tokenizer.tokensPerChunk` tokens, where
 * each chunk starts `tokensPerChunk - overlap` tokens after the previous one.
 *
 * @returns The decoded chunks in order; an empty array when the text encodes
 * to no tokens.
 * @throws Error when more than one chunk is needed but `overlap` is not smaller
 * than `tokensPerChunk`, because the window could never advance (this used to
 * loop forever). Inputs that fit into a single chunk are unaffected.
 */
export function splitTextOnTokens({ text, tokenizer }: { text: string; tokenizer: Tokenizer }): string[] {
  const { tokensPerChunk, overlap } = tokenizer;
  const inputIds = tokenizer.encode(text);
  const stride = tokensPerChunk - overlap;
  const splits: string[] = [];

  let startIdx = 0;
  while (startIdx < inputIds.length) {
    const endIdx = Math.min(startIdx + tokensPerChunk, inputIds.length);
    splits.push(tokenizer.decode(inputIds.slice(startIdx, endIdx)));

    if (endIdx === inputIds.length) {
      break;
    }

    if (stride <= 0) {
      throw new Error(
        `Token overlap (${overlap}) must be smaller than tokensPerChunk (${tokensPerChunk}) to split text into multiple chunks.`,
      );
    }

    startIdx += stride;
  }

  return splits;
}
33
-
34
/**
 * Chunks text by token count using a js-tiktoken encoding.
 *
 * `maxSize` is used as the number of tokens per chunk and `overlap` as the
 * number of tokens shared between consecutive chunks.
 */
export class TokenTransformer extends TextTransformer {
  private tokenizer: Tiktoken;
  private allowedSpecial: Set<string> | 'all';
  private disallowedSpecial: Set<string> | 'all';

  /**
   * @param encodingName Encoding to load when `modelName` is not given
   * (default `cl100k_base`).
   * @param modelName Model whose encoding should be used; takes precedence
   * over `encodingName`.
   * @param allowedSpecial Special tokens passed to the tokenizer as allowed
   * (default: none).
   * @param disallowedSpecial Special tokens passed to the tokenizer as
   * disallowed (default: `'all'`).
   * @param options Base chunking options forwarded to `TextTransformer`.
   * @throws Error when the encoding cannot be loaded.
   */
  constructor({
    encodingName = 'cl100k_base',
    modelName,
    allowedSpecial = new Set(),
    disallowedSpecial = 'all',
    options = {},
  }: {
    encodingName?: TiktokenEncoding;
    modelName?: TiktokenModel;
    allowedSpecial?: Set<string> | 'all';
    disallowedSpecial?: Set<string> | 'all';
    options?: TokenChunkOptions;
  }) {
    super(options);

    this.tokenizer = TokenTransformer.loadTokenizer(encodingName, modelName);
    this.allowedSpecial = allowedSpecial;
    this.disallowedSpecial = disallowedSpecial;
  }

  /**
   * Loads the encoding for `modelName` when given, otherwise the encoding
   * named `encodingName`.
   *
   * @throws Error when js-tiktoken fails to provide the encoding.
   */
  private static loadTokenizer(encodingName: TiktokenEncoding, modelName?: TiktokenModel): Tiktoken {
    try {
      return modelName ? encodingForModel(modelName) : getEncoding(encodingName);
    } catch {
      throw new Error('Could not load tiktoken encoding. ' + 'Please install it with `npm install js-tiktoken`.');
    }
  }

  /**
   * Splits `text` into chunks of at most `maxSize` tokens with `overlap`
   * tokens shared between neighbours. When `stripWhitespace` is enabled the
   * input is trimmed before encoding and every decoded chunk is trimmed.
   */
  splitText({ text }: { text: string }): string[] {
    const toTokenizerArg = (special: Set<string> | 'all'): string[] | 'all' =>
      special === 'all' ? 'all' : Array.from(special);

    const encode = (input: string): number[] => {
      // If stripWhitespace is enabled, trim the text before encoding
      const processedText = this.stripWhitespace ? input.trim() : input;
      return Array.from(
        this.tokenizer.encode(
          processedText,
          toTokenizerArg(this.allowedSpecial),
          toTokenizerArg(this.disallowedSpecial),
        ),
      );
    };

    const decode = (tokens: number[]): string => {
      const decoded = this.tokenizer.decode(tokens);
      return this.stripWhitespace ? decoded.trim() : decoded;
    };

    const tokenizer: Tokenizer = {
      overlap: this.overlap,
      tokensPerChunk: this.maxSize,
      decode,
      encode,
    };

    return splitTextOnTokens({ text, tokenizer });
  }

  /**
   * Creates a `TokenTransformer` whose `lengthFunction` counts tokens with the
   * selected encoding.
   *
   * All caller-supplied `options` are forwarded to the constructor (previously
   * only `maxSize` and `overlap` were, so other chunking options set by the
   * caller were silently ignored); `lengthFunction` is always replaced by the
   * token counter.
   *
   * @throws Error when the encoding cannot be loaded.
   */
  static fromTikToken({
    encodingName = 'cl100k_base',
    modelName,
    options = {},
  }: {
    encodingName?: TiktokenEncoding;
    modelName?: TiktokenModel;
    options?: TokenChunkOptions;
  }): TokenTransformer {
    const tokenizer = TokenTransformer.loadTokenizer(encodingName, modelName);

    const tikTokenEncoder = (text: string): number => {
      const allowed =
        options.allowedSpecial === 'all' ? 'all' : options.allowedSpecial ? Array.from(options.allowedSpecial) : [];

      const disallowed =
        options.disallowedSpecial === 'all'
          ? 'all'
          : options.disallowedSpecial
            ? Array.from(options.disallowedSpecial)
            : [];

      return tokenizer.encode(text, allowed, disallowed).length;
    };

    return new TokenTransformer({
      encodingName,
      modelName,
      allowedSpecial: options.allowedSpecial,
      disallowedSpecial: options.disallowedSpecial,
      options: {
        ...options,
        lengthFunction: tikTokenEncoder,
      },
    });
  }
}
@@ -1,5 +0,0 @@
1
- import type { Document } from '../schema';
2
-
3
/**
 * Contract for anything that maps a list of documents to another list of
 * documents (for example, a chunker that splits each input into pieces).
 */
export interface Transformer {
  /**
   * Produces the transformed documents for `documents`.
   *
   * @param documents Documents to transform.
   * @returns The resulting documents.
   */
  transformDocuments(documents: Document[]): Document[];
}