@mastra/rag 1.2.2 → 1.2.3-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +22 -0
- package/dist/index.cjs +25 -9
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +25 -9
- package/dist/index.js.map +1 -1
- package/dist/tools/graph-rag.d.ts.map +1 -1
- package/dist/tools/types.d.ts +18 -5
- package/dist/tools/types.d.ts.map +1 -1
- package/dist/tools/vector-query.d.ts.map +1 -1
- package/dist/utils/vector-search.d.ts +6 -7
- package/dist/utils/vector-search.d.ts.map +1 -1
- package/package.json +19 -6
- package/.turbo/turbo-build.log +0 -4
- package/docker-compose.yaml +0 -22
- package/eslint.config.js +0 -6
- package/src/document/document.test.ts +0 -2975
- package/src/document/document.ts +0 -335
- package/src/document/extractors/base.ts +0 -30
- package/src/document/extractors/index.ts +0 -5
- package/src/document/extractors/keywords.test.ts +0 -125
- package/src/document/extractors/keywords.ts +0 -126
- package/src/document/extractors/questions.test.ts +0 -120
- package/src/document/extractors/questions.ts +0 -111
- package/src/document/extractors/summary.test.ts +0 -107
- package/src/document/extractors/summary.ts +0 -122
- package/src/document/extractors/title.test.ts +0 -121
- package/src/document/extractors/title.ts +0 -185
- package/src/document/extractors/types.ts +0 -40
- package/src/document/index.ts +0 -2
- package/src/document/prompts/base.ts +0 -77
- package/src/document/prompts/format.ts +0 -9
- package/src/document/prompts/index.ts +0 -15
- package/src/document/prompts/prompt.ts +0 -60
- package/src/document/prompts/types.ts +0 -29
- package/src/document/schema/index.ts +0 -3
- package/src/document/schema/node.ts +0 -187
- package/src/document/schema/types.ts +0 -40
- package/src/document/transformers/character.ts +0 -267
- package/src/document/transformers/html.ts +0 -346
- package/src/document/transformers/json.ts +0 -536
- package/src/document/transformers/latex.ts +0 -11
- package/src/document/transformers/markdown.ts +0 -239
- package/src/document/transformers/semantic-markdown.ts +0 -227
- package/src/document/transformers/sentence.ts +0 -314
- package/src/document/transformers/text.ts +0 -158
- package/src/document/transformers/token.ts +0 -137
- package/src/document/transformers/transformer.ts +0 -5
- package/src/document/types.ts +0 -145
- package/src/document/validation.ts +0 -158
- package/src/graph-rag/index.test.ts +0 -235
- package/src/graph-rag/index.ts +0 -306
- package/src/index.ts +0 -8
- package/src/rerank/index.test.ts +0 -150
- package/src/rerank/index.ts +0 -198
- package/src/rerank/relevance/cohere/index.ts +0 -56
- package/src/rerank/relevance/index.ts +0 -3
- package/src/rerank/relevance/mastra-agent/index.ts +0 -32
- package/src/rerank/relevance/zeroentropy/index.ts +0 -26
- package/src/tools/README.md +0 -153
- package/src/tools/document-chunker.ts +0 -34
- package/src/tools/graph-rag.test.ts +0 -115
- package/src/tools/graph-rag.ts +0 -154
- package/src/tools/index.ts +0 -3
- package/src/tools/types.ts +0 -110
- package/src/tools/vector-query-database-config.test.ts +0 -190
- package/src/tools/vector-query.test.ts +0 -418
- package/src/tools/vector-query.ts +0 -169
- package/src/utils/convert-sources.ts +0 -43
- package/src/utils/default-settings.ts +0 -38
- package/src/utils/index.ts +0 -3
- package/src/utils/tool-schemas.ts +0 -38
- package/src/utils/vector-prompts.ts +0 -832
- package/src/utils/vector-search.ts +0 -117
- package/tsconfig.build.json +0 -9
- package/tsconfig.json +0 -5
- package/tsup.config.ts +0 -17
- package/vitest.config.ts +0 -8
|
@@ -1,314 +0,0 @@
|
|
|
1
|
-
import type { SentenceChunkOptions } from '../types';
|
|
2
|
-
import { TextTransformer } from './text';
|
|
3
|
-
|
|
4
|
-
/**
 * Chunks text along sentence boundaries.
 *
 * Sentences are detected with a punctuation heuristic and then packed greedily into
 * chunks of at most `maxSize` (as measured by `lengthFunction`). A chunk is closed early
 * once it reaches `targetSize`. When a chunk is closed, its trailing sentences - up to
 * `overlap` in total size - are repeated at the start of the next chunk.
 */
export class SentenceTransformer extends TextTransformer {
  /** Titles whose trailing period does not end a sentence. */
  private static readonly TITLE_ABBREVIATIONS: readonly string[] = ['Dr', 'Mr', 'Mrs', 'Ms', 'Prof', 'Sr', 'Jr'];

  /**
   * Patterns (tested against the last word with its final character removed) that mark
   * the word as an abbreviation rather than the end of a sentence.
   */
  private static readonly ABBREVIATION_PATTERNS: readonly RegExp[] = [
    // Dotted or single-letter abbreviations and initials: "U.S.A", "J", "a", "e.g"
    /^[A-Z](\.[A-Z])*$/,
    /^[a-z](\.[a-z])*$/,
    // Purely numeric words (original intent: versions, decimals)
    /^\d+$/,
    // Time abbreviations: "am", "pm", "a.m", "p.m" in any case
    /^[ap]\.?m$/i,
  ];

  /** String used to join sentences inside a chunk. */
  private static readonly SENTENCE_SEPARATOR = ' ';

  // Stored from the options; no method of this class currently consults it.
  protected minSize: number;
  protected maxSize: number;
  protected targetSize: number;
  protected sentenceEnders: string[];
  protected fallbackToWords: boolean;
  protected fallbackToCharacters: boolean;
  protected keepSeparator: boolean | 'start' | 'end';

  /**
   * @param options Sentence chunking options. `maxSize` is required; the defaults applied
   * here are `minSize` 50, `targetSize` 80% of `maxSize` (rounded down), sentence enders
   * `.`, `!`, `?`, both fallbacks enabled, `keepSeparator` false and `overlap` 0.
   */
  constructor(options: SentenceChunkOptions) {
    // The base constructor throws when overlap > maxSize, so it receives a clamped value.
    const parentOverlap = Math.min(options.overlap ?? 0, options.maxSize - 1);
    super({ ...options, overlap: parentOverlap });

    this.maxSize = options.maxSize;
    this.minSize = options.minSize ?? 50;
    this.targetSize = options.targetSize ?? Math.floor(options.maxSize * 0.8);
    this.sentenceEnders = options.sentenceEnders ?? ['.', '!', '?'];
    this.fallbackToWords = options.fallbackToWords ?? true;
    this.fallbackToCharacters = options.fallbackToCharacters ?? true;
    this.keepSeparator = options.keepSeparator ?? false;

    // The sentence logic below works with the caller's original, unclamped overlap.
    this.overlap = options.overlap ?? 0;
  }

  /**
   * Splits `text` into trimmed, non-empty sentences.
   *
   * A sentence-ender character closes a sentence when only whitespace follows it, or when
   * it is followed by whitespace plus an ASCII capital letter and the word it terminates
   * is not recognised as an abbreviation. Any text left after the last boundary becomes
   * the final sentence.
   */
  private detectSentenceBoundaries(text: string): string[] {
    if (!text) return [];

    const sentences: string[] = [];
    // One past the last non-whitespace character: an ender at or after this point has
    // nothing but whitespace behind it. Computed once instead of re-trimming the
    // remaining text at every ender.
    const contentEnd = text.trimEnd().length;
    // Sticky so the match is anchored exactly at `lastIndex` without slicing `text`.
    const capitalAhead = /\s+[A-Z]/y;
    let sentenceStart = 0;

    for (let i = 0; i < text.length; i++) {
      if (!this.sentenceEnders.includes(text.charAt(i))) continue;

      const restIsBlank = i + 1 >= contentEnd;
      if (!restIsBlank) {
        capitalAhead.lastIndex = i + 1;
        if (!capitalAhead.test(text)) continue;
        if (this.endsWithAbbreviation(text.slice(sentenceStart, i + 1))) continue;
      }

      sentences.push(text.slice(sentenceStart, i + 1).trim());
      sentenceStart = i + 1;
    }

    const tail = text.slice(sentenceStart).trim();
    if (tail) {
      sentences.push(tail);
    }

    return sentences.filter(s => s.length > 0);
  }

  /**
   * Reports whether the last whitespace-delimited word of `sentence`, with its final
   * character (the sentence ender) removed, is a known abbreviation.
   */
  private endsWithAbbreviation(sentence: string): boolean {
    const words = sentence.trim().split(/\s+/);
    const lastWord = words[words.length - 1] ?? '';
    return this.isCommonAbbreviation(lastWord.slice(0, -1));
  }

  /**
   * Reports whether `word` (already stripped of its trailing ender) is one of the known
   * titles or matches one of the abbreviation patterns.
   */
  private isCommonAbbreviation(word: string): boolean {
    if (SentenceTransformer.TITLE_ABBREVIATIONS.includes(word)) {
      return true;
    }
    return SentenceTransformer.ABBREVIATION_PATTERNS.some(pattern => pattern.test(word));
  }

  /**
   * Group sentences into chunks with integrated overlap processing.
   *
   * Guarantees:
   * - every emitted chunk contains at least one sentence that was not emitted before, so
   *   a chunk never consists solely of overlap carried from the previous chunk;
   * - carried-over overlap is trimmed from the front until the next sentence fits, so
   *   overlap never pushes a chunk past `maxSize`;
   * - a sentence longer than `maxSize` closes the pending chunk and is emitted through
   *   the fallback strategies without overlap.
   */
  private groupSentencesIntoChunks(sentences: string[]): string[] {
    const separator = SentenceTransformer.SENTENCE_SEPARATOR;
    const separatorLength = this.lengthFunction(separator);
    const chunks: string[] = [];

    let currentChunk: string[] = [];
    let currentSize = 0;
    // Number of sentences in `currentChunk` that no emitted chunk contains yet.
    let unemitted = 0;

    for (const sentence of sentences) {
      const sentenceLength = this.lengthFunction(sentence);

      // Handle oversized sentences with fallback strategies
      if (sentenceLength > this.maxSize) {
        if (unemitted > 0) {
          chunks.push(currentChunk.join(separator));
        }
        currentChunk = [];
        currentSize = 0;
        unemitted = 0;

        chunks.push(...this.handleOversizedSentence(sentence));
        continue;
      }

      const joinedLength = currentSize + (currentChunk.length > 0 ? separatorLength : 0) + sentenceLength;

      // The sentence does not fit: close the pending chunk and start a new one.
      if (currentChunk.length > 0 && joinedLength > this.maxSize) {
        if (unemitted > 0) {
          chunks.push(currentChunk.join(separator));
          currentChunk = this.calculateSentenceOverlap(currentChunk);
          unemitted = 0;
        }

        // Drop the oldest overlap sentences until the new sentence fits within maxSize.
        while (
          currentChunk.length > 0 &&
          this.calculateChunkSize(currentChunk) + separatorLength + sentenceLength > this.maxSize
        ) {
          currentChunk.shift();
        }
        currentSize = this.calculateChunkSize(currentChunk);
      }

      // The separator is only counted when something actually precedes the sentence.
      currentSize += sentenceLength + (currentChunk.length > 0 ? separatorLength : 0);
      currentChunk.push(sentence);
      unemitted++;

      // Close the chunk as soon as the target size is reached.
      if (currentSize >= this.targetSize) {
        chunks.push(currentChunk.join(separator));
        currentChunk = this.calculateSentenceOverlap(currentChunk);
        currentSize = this.calculateChunkSize(currentChunk);
        unemitted = 0;
      }
    }

    if (unemitted > 0) {
      chunks.push(currentChunk.join(separator));
    }

    return chunks;
  }

  /**
   * Handle oversized sentences with fallback strategies.
   *
   * Tries word-level splitting first (used only if it yields more than one chunk), then
   * character-level splitting. With both fallbacks unavailable the sentence is returned
   * unchanged, after a console warning.
   */
  private handleOversizedSentence(sentence: string): string[] {
    // First fallback
    if (this.fallbackToWords) {
      const wordChunks = this.splitSentenceIntoWords(sentence);
      if (wordChunks.length > 1) {
        return wordChunks;
      }
    }

    // Second fallback
    if (this.fallbackToCharacters) {
      return this.splitSentenceIntoCharacters(sentence);
    }

    // Last resort
    console.warn(
      `Sentence exceeds maxSize (${this.maxSize}) and fallbacks are disabled: "${sentence.substring(0, 50)}..."`,
    );
    return [sentence];
  }

  /**
   * Packs the whitespace-separated words of `sentence` into chunks of at most `maxSize`.
   * A single word longer than `maxSize` is split by characters when
   * `fallbackToCharacters` is enabled, and emitted as-is otherwise.
   */
  private splitSentenceIntoWords(sentence: string): string[] {
    const chunks: string[] = [];
    let pending = '';

    for (const word of sentence.split(/\s+/)) {
      const candidate = pending ? pending + ' ' + word : word;

      if (this.lengthFunction(candidate) <= this.maxSize) {
        pending = candidate;
        continue;
      }

      if (pending) {
        chunks.push(pending);
      }

      if (this.lengthFunction(word) > this.maxSize) {
        if (this.fallbackToCharacters) {
          chunks.push(...this.splitSentenceIntoCharacters(word));
        } else {
          chunks.push(word);
        }
        pending = '';
      } else {
        pending = word;
      }
    }

    if (pending) {
      chunks.push(pending);
    }

    return chunks;
  }

  /**
   * Splits `text` into consecutive pieces of at most `maxSize`, iterating with `for...of`
   * so a surrogate pair is never cut in half.
   */
  private splitSentenceIntoCharacters(text: string): string[] {
    const chunks: string[] = [];
    let pending = '';

    for (const char of text) {
      if (this.lengthFunction(pending + char) <= this.maxSize) {
        pending += char;
        continue;
      }
      if (pending) {
        chunks.push(pending);
      }
      pending = char;
    }

    if (pending) {
      chunks.push(pending);
    }

    return chunks;
  }

  /**
   * Returns the longest run of trailing sentences of `currentChunk` whose combined size,
   * separators included, does not exceed `overlap`. Returns an empty array when overlap
   * is disabled or the chunk is empty.
   */
  private calculateSentenceOverlap(currentChunk: string[]): string[] {
    if (this.overlap === 0 || currentChunk.length === 0) {
      return [];
    }

    const separatorLength = this.lengthFunction(SentenceTransformer.SENTENCE_SEPARATOR);
    const overlapSentences: string[] = [];
    let overlapSize = 0;

    // Work backwards through sentences to build overlap
    for (let i = currentChunk.length - 1; i >= 0; i--) {
      const sentence = currentChunk[i];
      if (!sentence) continue;

      const addedLength = this.lengthFunction(sentence) + (overlapSentences.length > 0 ? separatorLength : 0);
      if (overlapSize + addedLength > this.overlap) {
        break;
      }

      overlapSentences.unshift(sentence);
      overlapSize += addedLength;
    }

    return overlapSentences;
  }

  /**
   * Size of `sentences` once joined: the sum of each sentence's length plus one separator
   * between every adjacent pair.
   */
  private calculateChunkSize(sentences: string[]): number {
    if (!sentences || sentences.length === 0) {
      return 0;
    }

    const separatorLength = this.lengthFunction(SentenceTransformer.SENTENCE_SEPARATOR);
    const textSize = sentences.reduce((sum, sentence) => sum + this.lengthFunction(sentence), 0);

    return textSize + separatorLength * (sentences.length - 1);
  }

  /**
   * Splits `text` into sentence-aligned chunks.
   *
   * @returns The chunks in document order; chunks that are empty after trimming are
   * dropped. Empty input yields an empty array.
   */
  splitText({ text }: { text: string }): string[] {
    if (!text) return [];

    const sentences = this.detectSentenceBoundaries(text);
    const chunks = this.groupSentencesIntoChunks(sentences);

    return chunks.filter(chunk => chunk.trim().length > 0);
  }
}
|
|
@@ -1,158 +0,0 @@
|
|
|
1
|
-
import { Document } from '../schema';
|
|
2
|
-
|
|
3
|
-
import type { BaseChunkOptions } from '../types';
|
|
4
|
-
|
|
5
|
-
import type { Transformer } from './transformer';
|
|
6
|
-
|
|
7
|
-
/**
 * Base class for text chunkers.
 *
 * Subclasses implement `splitText`; this class turns the resulting chunks into
 * `Document`s and provides `mergeSplits`, which packs small pieces into chunks of at most
 * `maxSize` with up to `overlap` of trailing content repeated between neighbours.
 */
export abstract class TextTransformer implements Transformer {
  protected maxSize: number;
  protected overlap: number;
  protected lengthFunction: (text: string) => number;
  protected keepSeparator: boolean | 'start' | 'end';
  protected addStartIndex: boolean;
  protected stripWhitespace: boolean;

  /**
   * @param options Chunking options. Defaults: `maxSize` 4000, `overlap` 200,
   * `lengthFunction` = string length, `keepSeparator` false, `addStartIndex` false,
   * `stripWhitespace` true.
   * @throws Error when `overlap` is greater than `maxSize`.
   */
  constructor({
    maxSize = 4000,
    overlap = 200,
    lengthFunction = (text: string) => text.length,
    keepSeparator = false,
    addStartIndex = false,
    stripWhitespace = true,
  }: BaseChunkOptions) {
    if (overlap > maxSize) {
      throw new Error(`Got a larger chunk overlap (${overlap}) than chunk size ` + `(${maxSize}), should be smaller.`);
    }
    this.maxSize = maxSize;
    this.overlap = overlap;
    this.lengthFunction = lengthFunction;
    this.keepSeparator = keepSeparator;
    this.addStartIndex = addStartIndex;
    this.stripWhitespace = stripWhitespace;
  }

  /** Enables or disables recording `startIndex` in the metadata of created documents. */
  setAddStartIndex(value: boolean): void {
    this.addStartIndex = value;
  }

  /** Splits `text` into chunks; implemented by each concrete transformer. */
  abstract splitText({ text }: { text: string }): string[];

  /**
   * Splits every text and wraps each chunk in a `Document`.
   *
   * @param texts Texts to split.
   * @param metadatas Optional metadata per text, matched by position; each chunk gets its
   * own shallow copy.
   * @returns One document per chunk, in order. With `addStartIndex` enabled,
   * `metadata.startIndex` is the position where the chunk was found in its source text,
   * or -1 when the chunk does not occur verbatim.
   */
  createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[] {
    const documents: Document[] = [];

    texts.forEach((text, i) => {
      let index = 0;
      let previousChunkLen = 0;

      for (const chunk of this.splitText({ text })) {
        const metadata: Record<string, any> = { ...metadatas?.[i] };

        if (this.addStartIndex) {
          // Search from where the previous chunk ended, backing up by the overlap.
          const offset = index + previousChunkLen - this.overlap;
          index = text.indexOf(chunk, Math.max(0, offset));
          metadata.startIndex = index;
          previousChunkLen = chunk.length;
        }

        documents.push(
          new Document({
            text: chunk,
            metadata,
          }),
        );
      }
    });

    return documents;
  }

  /** Re-chunks `documents`, carrying each document's metadata over to its chunks. */
  splitDocuments(documents: Document[]): Document[] {
    const texts = documents.map(doc => doc.text);
    const metadatas: Record<string, any>[] = documents.map(doc => doc.metadata);
    return this.createDocuments(texts, metadatas);
  }

  /** Same operation as `splitDocuments`, exposed under a second name. */
  transformDocuments(documents: Document[]): Document[] {
    return this.splitDocuments(documents);
  }

  /**
   * Joins `docs` with `separator`, trimming the result when `stripWhitespace` is set.
   *
   * @returns The joined text, or `null` when it is empty.
   */
  protected joinDocs(docs: string[], separator: string): string | null {
    const joined = docs.join(separator);
    const text = this.stripWhitespace ? joined.trim() : joined;
    return text === '' ? null : text;
  }

  /**
   * Packs `splits` into chunks no larger than `maxSize`, joined by `separator`.
   *
   * When a chunk is closed, trailing pieces totalling at most `overlap` are carried into
   * the next chunk - but only as many as still leave room for the incoming piece, so the
   * carried overlap can never push a chunk past `maxSize`. A single piece that is itself
   * longer than `maxSize` is still emitted whole, with a warning once it is detected.
   */
  protected mergeSplits(splits: string[], separator: string): string[] {
    const docs: string[] = [];
    const separatorLen = separator ? this.lengthFunction(separator) : 0;
    let currentDoc: string[] = [];
    let total = 0;

    const flush = (): void => {
      const doc = this.joinDocs(currentDoc, separator);
      if (doc !== null) {
        docs.push(doc);
      }
    };

    for (const d of splits) {
      const len = this.lengthFunction(d);

      if (total + len + (currentDoc.length > 0 ? separatorLen : 0) > this.maxSize) {
        if (total > this.maxSize) {
          console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.maxSize}`);
        }

        if (currentDoc.length > 0) {
          flush();

          // Handle overlap: keep enough content from the end of current chunk
          const overlapContent: string[] = [];
          let overlapSize = 0;

          if (this.overlap > 0) {
            // Work backwards through currentDoc until we have enough overlap
            for (let i = currentDoc.length - 1; i >= 0; i--) {
              const piece = currentDoc[i];
              if (piece === undefined) continue;
              const pieceLen = this.lengthFunction(piece);

              if (overlapSize + pieceLen > this.overlap) {
                break;
              }

              const grown = overlapSize + pieceLen + (overlapContent.length > 0 ? separatorLen : 0);
              // Stop before the overlap would leave no room for the incoming piece.
              if (grown + separatorLen + len > this.maxSize) {
                break;
              }

              overlapContent.unshift(piece);
              overlapSize = grown;
            }
          }

          currentDoc = overlapContent;
          total = overlapSize;
        }
      }

      currentDoc.push(d);
      total += len + (currentDoc.length > 1 ? separatorLen : 0);
    }

    if (currentDoc.length > 0) {
      flush();
    }

    return docs;
  }
}
|
|
@@ -1,137 +0,0 @@
|
|
|
1
|
-
import type { TiktokenModel, TiktokenEncoding, Tiktoken } from 'js-tiktoken';
|
|
2
|
-
import { encodingForModel, getEncoding } from 'js-tiktoken';
|
|
3
|
-
import type { TokenChunkOptions } from '../types';
|
|
4
|
-
|
|
5
|
-
import { TextTransformer } from './text';
|
|
6
|
-
|
|
7
|
-
/** Minimal tokenizer contract needed to split text on token boundaries. */
interface Tokenizer {
  /** Number of tokens shared between two consecutive chunks. */
  overlap: number;
  /** Maximum number of tokens in one chunk. */
  tokensPerChunk: number;
  /** Converts token ids back into text. */
  decode: (tokens: number[]) => string;
  /** Converts text into token ids. */
  encode: (text: string) => number[];
}

/**
 * Splits `text` into chunks of at most `tokenizer.tokensPerChunk` tokens, where each chunk
 * starts `tokensPerChunk - overlap` tokens after the previous one.
 *
 * @param text Text to split.
 * @param tokenizer Encoder/decoder pair plus the chunk size and overlap, in tokens.
 * @returns The decoded chunks in order; an empty array when `text` encodes to no tokens.
 * @throws Error when the text needs more than one chunk but `overlap` is not smaller than
 * `tokensPerChunk` - the window could never advance, which previously looped forever.
 */
export function splitTextOnTokens({ text, tokenizer }: { text: string; tokenizer: Tokenizer }): string[] {
  const inputIds = tokenizer.encode(text);
  const { tokensPerChunk, overlap } = tokenizer;
  const stride = tokensPerChunk - overlap;
  const splits: string[] = [];

  let startIdx = 0;
  while (startIdx < inputIds.length) {
    const endIdx = Math.min(startIdx + tokensPerChunk, inputIds.length);
    splits.push(tokenizer.decode(inputIds.slice(startIdx, endIdx)));

    if (endIdx === inputIds.length) {
      break;
    }

    // Checked only when the window really has to move, so inputs that fit in a single
    // chunk keep working whatever the overlap is.
    if (stride <= 0) {
      throw new Error(
        `Token overlap (${overlap}) must be smaller than tokens per chunk (${tokensPerChunk}) to split text into multiple chunks.`,
      );
    }

    startIdx += stride;
  }

  return splits;
}
|
|
33
|
-
|
|
34
|
-
/**
 * Chunks text by token count using a js-tiktoken encoding.
 *
 * `maxSize` and `overlap` are interpreted as numbers of tokens: `splitText` encodes the
 * text, slides a window over the token ids and decodes each window back to text.
 */
export class TokenTransformer extends TextTransformer {
  private tokenizer: Tiktoken;
  private allowedSpecial: Set<string> | 'all';
  private disallowedSpecial: Set<string> | 'all';

  /**
   * @param encodingName Encoding to load when `modelName` is not given; defaults to
   * `cl100k_base`.
   * @param modelName Model whose encoding should be used; takes precedence over
   * `encodingName`.
   * @param allowedSpecial Special tokens passed as "allowed" to the encoder; defaults to
   * an empty set.
   * @param disallowedSpecial Special tokens passed as "disallowed" to the encoder;
   * defaults to `'all'`.
   * @param options Base chunking options forwarded to `TextTransformer`.
   * @throws Error when the encoding cannot be loaded.
   */
  constructor({
    encodingName = 'cl100k_base',
    modelName,
    allowedSpecial = new Set(),
    disallowedSpecial = 'all',
    options = {},
  }: {
    encodingName?: TiktokenEncoding;
    modelName?: TiktokenModel;
    allowedSpecial?: Set<string> | 'all';
    disallowedSpecial?: Set<string> | 'all';
    options: TokenChunkOptions;
  }) {
    super(options);

    try {
      this.tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
    } catch {
      throw new Error('Could not load tiktoken encoding. ' + 'Please install it with `npm install js-tiktoken`.');
    }

    this.allowedSpecial = allowedSpecial;
    this.disallowedSpecial = disallowedSpecial;
  }

  /**
   * Splits `text` into chunks of at most `maxSize` tokens with `overlap` tokens shared
   * between neighbours. With `stripWhitespace` enabled the input is trimmed before
   * encoding and every decoded chunk is trimmed as well.
   */
  splitText({ text }: { text: string }): string[] {
    const encode = (input: string): number[] => {
      const allowed = this.allowedSpecial === 'all' ? 'all' : Array.from(this.allowedSpecial);
      const disallowed = this.disallowedSpecial === 'all' ? 'all' : Array.from(this.disallowedSpecial);

      // If stripWhitespace is enabled, trim the text before encoding
      const processedText = this.stripWhitespace ? input.trim() : input;
      return Array.from(this.tokenizer.encode(processedText, allowed, disallowed));
    };

    const decode = (tokens: number[]): string => {
      const decoded = this.tokenizer.decode(tokens);
      return this.stripWhitespace ? decoded.trim() : decoded;
    };

    const tokenizer: Tokenizer = {
      overlap: this.overlap,
      tokensPerChunk: this.maxSize,
      decode,
      encode,
    };

    return splitTextOnTokens({ text, tokenizer });
  }

  /**
   * Creates a transformer whose `lengthFunction` counts tokens with the same encoding the
   * transformer splits with.
   *
   * Only `maxSize`, `overlap`, `allowedSpecial` and `disallowedSpecial` are read from
   * `options`. The token counter reuses the encoder owned by the new instance, so the
   * encoding is built once rather than twice.
   *
   * NOTE(review): when `options.disallowedSpecial` is omitted, the token counter passes an
   * empty list while `splitText` uses the constructor default `'all'` - confirm whether
   * that difference is intended.
   *
   * @throws Error when the encoding cannot be loaded.
   */
  static fromTikToken({
    encodingName = 'cl100k_base',
    modelName,
    options = {},
  }: {
    encodingName?: TiktokenEncoding;
    modelName?: TiktokenModel;
    options?: TokenChunkOptions;
  }): TokenTransformer {
    // Invoked only after construction has finished, so `transformer` is initialised by
    // the time the closure reads it.
    const countTokens = (text: string): number => {
      const allowed =
        options.allowedSpecial === 'all' ? 'all' : options.allowedSpecial ? Array.from(options.allowedSpecial) : [];

      const disallowed =
        options.disallowedSpecial === 'all'
          ? 'all'
          : options.disallowedSpecial
            ? Array.from(options.disallowedSpecial)
            : [];

      return transformer.tokenizer.encode(text, allowed, disallowed).length;
    };

    const transformer: TokenTransformer = new TokenTransformer({
      encodingName,
      modelName,
      allowedSpecial: options.allowedSpecial,
      disallowedSpecial: options.disallowedSpecial,
      options: {
        maxSize: options.maxSize,
        overlap: options.overlap,
        lengthFunction: countTokens,
      },
    });

    return transformer;
  }
}
|