@mastra/rag 1.2.2 → 1.2.3-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +22 -0
- package/dist/index.cjs +25 -9
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +25 -9
- package/dist/index.js.map +1 -1
- package/dist/tools/graph-rag.d.ts.map +1 -1
- package/dist/tools/types.d.ts +18 -5
- package/dist/tools/types.d.ts.map +1 -1
- package/dist/tools/vector-query.d.ts.map +1 -1
- package/dist/utils/vector-search.d.ts +6 -7
- package/dist/utils/vector-search.d.ts.map +1 -1
- package/package.json +19 -6
- package/.turbo/turbo-build.log +0 -4
- package/docker-compose.yaml +0 -22
- package/eslint.config.js +0 -6
- package/src/document/document.test.ts +0 -2975
- package/src/document/document.ts +0 -335
- package/src/document/extractors/base.ts +0 -30
- package/src/document/extractors/index.ts +0 -5
- package/src/document/extractors/keywords.test.ts +0 -125
- package/src/document/extractors/keywords.ts +0 -126
- package/src/document/extractors/questions.test.ts +0 -120
- package/src/document/extractors/questions.ts +0 -111
- package/src/document/extractors/summary.test.ts +0 -107
- package/src/document/extractors/summary.ts +0 -122
- package/src/document/extractors/title.test.ts +0 -121
- package/src/document/extractors/title.ts +0 -185
- package/src/document/extractors/types.ts +0 -40
- package/src/document/index.ts +0 -2
- package/src/document/prompts/base.ts +0 -77
- package/src/document/prompts/format.ts +0 -9
- package/src/document/prompts/index.ts +0 -15
- package/src/document/prompts/prompt.ts +0 -60
- package/src/document/prompts/types.ts +0 -29
- package/src/document/schema/index.ts +0 -3
- package/src/document/schema/node.ts +0 -187
- package/src/document/schema/types.ts +0 -40
- package/src/document/transformers/character.ts +0 -267
- package/src/document/transformers/html.ts +0 -346
- package/src/document/transformers/json.ts +0 -536
- package/src/document/transformers/latex.ts +0 -11
- package/src/document/transformers/markdown.ts +0 -239
- package/src/document/transformers/semantic-markdown.ts +0 -227
- package/src/document/transformers/sentence.ts +0 -314
- package/src/document/transformers/text.ts +0 -158
- package/src/document/transformers/token.ts +0 -137
- package/src/document/transformers/transformer.ts +0 -5
- package/src/document/types.ts +0 -145
- package/src/document/validation.ts +0 -158
- package/src/graph-rag/index.test.ts +0 -235
- package/src/graph-rag/index.ts +0 -306
- package/src/index.ts +0 -8
- package/src/rerank/index.test.ts +0 -150
- package/src/rerank/index.ts +0 -198
- package/src/rerank/relevance/cohere/index.ts +0 -56
- package/src/rerank/relevance/index.ts +0 -3
- package/src/rerank/relevance/mastra-agent/index.ts +0 -32
- package/src/rerank/relevance/zeroentropy/index.ts +0 -26
- package/src/tools/README.md +0 -153
- package/src/tools/document-chunker.ts +0 -34
- package/src/tools/graph-rag.test.ts +0 -115
- package/src/tools/graph-rag.ts +0 -154
- package/src/tools/index.ts +0 -3
- package/src/tools/types.ts +0 -110
- package/src/tools/vector-query-database-config.test.ts +0 -190
- package/src/tools/vector-query.test.ts +0 -418
- package/src/tools/vector-query.ts +0 -169
- package/src/utils/convert-sources.ts +0 -43
- package/src/utils/default-settings.ts +0 -38
- package/src/utils/index.ts +0 -3
- package/src/utils/tool-schemas.ts +0 -38
- package/src/utils/vector-prompts.ts +0 -832
- package/src/utils/vector-search.ts +0 -117
- package/tsconfig.build.json +0 -9
- package/tsconfig.json +0 -5
- package/tsup.config.ts +0 -17
- package/vitest.config.ts +0 -8
|
@@ -1,536 +0,0 @@
|
|
|
1
|
-
import { Document } from '../schema';
|
|
2
|
-
import type { JsonChunkOptions } from '../types';
|
|
3
|
-
|
|
4
|
-
/**
 * Splits a JSON value into smaller JSON chunks whose serialized size is
 * bounded by `maxSize`, keeping the nesting path of every emitted fragment.
 *
 * All sizes are measured as the length of the `JSON.stringify` output.
 */
export class RecursiveJsonTransformer {
  private maxSize: number;
  private minSize: number;
  // NOTE(review): `ensureAscii` and `convertLists` are stored here but no method
  // in this class reads them; `splitJson` / `splitText` / `createDocuments` use
  // their own per-call arguments and defaults instead. Confirm whether the
  // instance values were meant to act as the defaults.
  private ensureAscii: boolean;
  private convertLists: boolean;

  /**
   * @param options.maxSize - Upper bound for the serialized size of a chunk (default 2000).
   * @param options.minSize - Lower bound used when flushing accumulated groups;
   *   defaults to `max(maxSize - 200, 50)`.
   * @param options.ensureAscii - Stored on the instance (default false).
   * @param options.convertLists - Stored on the instance (default true).
   */
  constructor({ maxSize = 2000, minSize, ensureAscii = false, convertLists = true }: JsonChunkOptions) {
    this.maxSize = maxSize;
    this.minSize = minSize ?? Math.max(maxSize - 200, 50);
    this.ensureAscii = ensureAscii;
    this.convertLists = convertLists;
  }

  /**
   * Defines `key` as an own, enumerable data property of `target`.
   *
   * Plain assignment (`target[key] = value`) must not be used for keys coming
   * from user data: for the key `__proto__` it invokes the prototype setter
   * instead of creating a property, which silently drops the value.
   */
  private static setOwn(target: Record<string, any>, key: string, value: any): void {
    Object.defineProperty(target, key, { value, writable: true, enumerable: true, configurable: true });
  }

  /**
   * Returns a deep copy of `value` in which every reference back to one of its
   * own ancestors is replaced by the string `'[Circular]'`.
   *
   * Only the current ancestor chain is tracked, so an object that is merely
   * referenced from several places (no cycle) is copied in full each time.
   */
  private static toAcyclic(value: any, ancestors: WeakSet<object>): any {
    if (value === null || typeof value !== 'object') {
      return value;
    }

    if (ancestors.has(value)) {
      return '[Circular]';
    }

    ancestors.add(value);

    let copy: any;
    if (Array.isArray(value)) {
      copy = value.map(item => RecursiveJsonTransformer.toAcyclic(item, ancestors));
    } else {
      copy = {};
      for (const key of Object.keys(value)) {
        RecursiveJsonTransformer.setOwn(copy, key, RecursiveJsonTransformer.toAcyclic(value[key], ancestors));
      }
    }

    // Leaving this subtree: the object is no longer an ancestor of what follows.
    ancestors.delete(value);
    return copy;
  }

  /**
   * Length of the JSON serialization of `data`, tolerating circular references.
   * Returns 0 when the value has no JSON representation (e.g. `undefined`).
   */
  private static jsonSize(data: Record<string, any>): number {
    const json: unknown = JSON.stringify(RecursiveJsonTransformer.toAcyclic(data, new WeakSet()));
    return typeof json === 'string' ? json.length : 0;
  }

  /**
   * Transform JSON data while handling circular references.
   *
   * @returns An object with `size` (serialized length of `data`) and `data`
   *   (a deep copy in which circular references are replaced by `'[Circular]'`).
   */
  public transform(data: Record<string, any>): Record<string, any> {
    return {
      size: RecursiveJsonTransformer.jsonSize(data),
      data: RecursiveJsonTransformer.toAcyclic(data, new WeakSet()),
    };
  }

  /**
   * Set a value in a nested dictionary based on the given path, creating
   * intermediate objects as needed. An empty path leaves `d` untouched.
   *
   * Intermediate and final keys are always created as own properties, so path
   * segments such as `__proto__` can neither replace an object's prototype nor
   * reach `Object.prototype`.
   */
  private static setNestedDict(d: Record<string, any>, path: string[], value: any): void {
    if (path.length === 0) {
      return;
    }

    let current = d;
    for (const key of path.slice(0, -1)) {
      const existing = Object.prototype.hasOwnProperty.call(current, key) ? current[key] : undefined;
      if (existing !== null && typeof existing === 'object') {
        current = existing;
      } else {
        const next: Record<string, any> = {};
        RecursiveJsonTransformer.setOwn(current, key, next);
        current = next;
      }
    }

    RecursiveJsonTransformer.setOwn(current, path[path.length - 1] as string, value);
  }

  /**
   * Convert lists in the JSON structure to dictionaries with index-based keys,
   * e.g. `['a', 'b']` becomes `{ '0': 'a', '1': 'b' }`. Applied recursively.
   */
  private listToDictPreprocessing(data: any): any {
    if (data && typeof data === 'object') {
      if (Array.isArray(data)) {
        return Object.fromEntries(data.map((item, index) => [String(index), this.listToDictPreprocessing(item)]));
      }
      return Object.fromEntries(Object.entries(data).map(([k, v]) => [k, this.listToDictPreprocessing(v)]));
    }
    return data;
  }

  /**
   * Handles primitive values (strings, numbers, etc) by either adding them to
   * the current chunk or creating new chunks if they don't fit.
   *
   * - If `{ key: value }` fits in `maxSize`, it is merged into `currentChunk`
   *   when the merged chunk still fits; otherwise `currentChunk` is emitted and
   *   a new one is started with this entry.
   * - An oversized string is split with `splitLongString`; each piece is
   *   wrapped at `fullPath`, and pieces whose wrapped chunk still exceeds
   *   `maxSize` are discarded.
   * - Any other oversized value is emitted only if its wrapped chunk fits.
   *
   * @returns The (possibly replaced) current chunk and the updated chunk list.
   */
  private handlePrimitiveValue(
    value: any,
    key: string,
    currentChunk: Record<string, any>,
    chunks: Record<string, any>[],
    fullPath: string[],
  ): { currentChunk: Record<string, any>; chunks: Record<string, any>[] } {
    const testValue = { [key]: value };

    if (RecursiveJsonTransformer.jsonSize(testValue) <= this.maxSize) {
      const merged = { ...currentChunk, ...testValue };
      if (RecursiveJsonTransformer.jsonSize(merged) <= this.maxSize) {
        return { currentChunk: merged, chunks };
      }
      return { currentChunk: testValue, chunks: [...chunks, currentChunk] };
    }

    if (typeof value === 'string') {
      const newChunks = this.splitLongString(value)
        .map(piece => this.createChunk(piece, fullPath))
        .filter(chunk => RecursiveJsonTransformer.jsonSize(chunk) <= this.maxSize);

      return { currentChunk, chunks: [...chunks, ...newChunks] };
    }

    const newChunk = this.createChunk(value, fullPath);
    return {
      currentChunk,
      chunks: RecursiveJsonTransformer.jsonSize(newChunk) <= this.maxSize ? [...chunks, newChunk] : chunks,
    };
  }

  /**
   * Creates a nested dictionary chunk from a value and path
   * e.g., path ['a', 'b'], value 'c' becomes { a: { b: 'c' } }
   *
   * If the resulting chunk has a truthy `root` entry, that entry is returned
   * instead of the wrapper (callers use `'root'` as a placeholder first path
   * segment).
   * NOTE(review): a user key literally named `root` is unwrapped the same way —
   * confirm this collision is acceptable.
   */
  private createChunk(value: any, path: string[]): Record<string, any> {
    const chunk: Record<string, any> = {};
    RecursiveJsonTransformer.setNestedDict(chunk, path, value);
    return chunk.root ? chunk.root : chunk;
  }

  /**
   * Checks if value is within size limits.
   *
   * With `currentSize === 0` the value alone must fit in `maxSize`; otherwise
   * the value plus `currentSize` must fit.
   */
  private isWithinSizeLimit(value: any, currentSize: number = 0): boolean {
    const size = RecursiveJsonTransformer.jsonSize(value);
    return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize;
  }

  /**
   * Splits arrays into chunks based on size limits.
   * Handles nested objects by recursing into handleNestedObject.
   *
   * The array is emitted whole when it fits. Otherwise items are accumulated
   * into groups that fit in `maxSize`; object items that do not fit even on
   * their own are split further via `handleNestedObject`.
   */
  private handleArray(
    value: any[],
    key: string,
    currentPath: string[],
    depth: number,
    maxDepth: number,
  ): Record<string, any>[] {
    const path = currentPath.length ? [...currentPath, key] : ['root', key];

    // Try keeping array intact
    const chunk = this.createChunk(value, path);
    if (this.isWithinSizeLimit(chunk)) {
      return [chunk];
    }

    const chunks: Record<string, any>[] = [];
    let currentGroup: any[] = [];

    // Emits the pending group only when it reaches minSize.
    // NOTE(review): a group below minSize is neither emitted nor cleared here;
    // the loop below then usually overwrites it (and the final call drops it),
    // so those items never appear in the output. Confirm this is intended.
    const saveCurrentGroup = () => {
      if (currentGroup.length > 0) {
        const groupChunk = this.createChunk(currentGroup, path);
        if (RecursiveJsonTransformer.jsonSize(groupChunk) >= this.minSize) {
          chunks.push(groupChunk);
          currentGroup = [];
        }
      }
    };

    for (const item of value) {
      // Try adding item to current group
      const testGroup = [...currentGroup, item];
      const testChunk = this.createChunk(testGroup, path);

      if (this.isWithinSizeLimit(testChunk)) {
        currentGroup = testGroup;
        continue;
      }

      // Current group is full
      saveCurrentGroup();

      // Handle the new item
      if (typeof item === 'object' && item !== null) {
        const singleItemArray = [item];
        const singleItemChunk = this.createChunk(singleItemArray, path);

        if (this.isWithinSizeLimit(singleItemChunk)) {
          currentGroup = singleItemArray;
        } else {
          // NOTE(review): the path segment is the number of chunks emitted so
          // far, not the item's index in the array — confirm.
          const itemPath = [...path, String(chunks.length)];
          const nestedChunks = this.handleNestedObject(item, itemPath, depth + 1, maxDepth);
          chunks.push(...nestedChunks);
        }
      } else {
        currentGroup = [item];
      }
    }

    saveCurrentGroup();
    return chunks;
  }

  /**
   * Splits objects into chunks based on size limits.
   * Handles nested arrays and objects by recursing into handleArray and handleNestedObject.
   *
   * Beyond `maxDepth` the remaining structure is emitted as a single chunk
   * (with a console warning) regardless of its size.
   */
  private handleNestedObject(
    value: Record<string, any>,
    fullPath: string[],
    depth: number,
    maxDepth: number,
  ): Record<string, any>[] {
    const path = fullPath.length ? fullPath : ['root'];

    // Handle max depth
    if (depth > maxDepth) {
      console.warn(`Maximum depth of ${maxDepth} exceeded, flattening remaining structure`);
      return [this.createChunk(value, path)];
    }

    // Try keeping object intact
    const wholeChunk = this.createChunk(value, path);
    if (this.isWithinSizeLimit(wholeChunk)) {
      return [wholeChunk];
    }

    const chunks: Record<string, any>[] = [];
    let currentChunk: Record<string, any> = {};

    // Emits the pending entries only when they reach minSize.
    // NOTE(review): as in handleArray, pending entries below minSize are kept
    // and may later be overwritten or dropped at the end — confirm intended.
    const saveCurrentChunk = () => {
      if (Object.keys(currentChunk).length > 0) {
        const objChunk = this.createChunk(currentChunk, path);
        if (RecursiveJsonTransformer.jsonSize(objChunk) >= this.minSize) {
          chunks.push(objChunk);
          currentChunk = {};
        }
      }
    };

    for (const [key, val] of Object.entries(value)) {
      if (val === undefined) continue;

      // Handle arrays separately
      if (Array.isArray(val)) {
        saveCurrentChunk();
        const arrayChunks = this.handleArray(val, key, path, depth, maxDepth);
        chunks.push(...arrayChunks);
        continue;
      }

      // Try adding to current chunk
      const testChunk = this.createChunk({ ...currentChunk, [key]: val }, path);
      if (this.isWithinSizeLimit(testChunk)) {
        // Own-property write so that a `__proto__` key is kept as data.
        RecursiveJsonTransformer.setOwn(currentChunk, key, val);
        continue;
      }

      // Current chunk is full
      saveCurrentChunk();

      // Handle value that didn't fit
      if (typeof val === 'object' && val !== null) {
        const nestedChunks = this.handleNestedObject(val, [...path, key], depth + 1, maxDepth);
        chunks.push(...nestedChunks);
      } else {
        currentChunk = { [key]: val };
      }
    }

    saveCurrentChunk();
    return chunks;
  }

  /**
   * Splits long strings into smaller pieces, preferring to break after the
   * last space inside each window of `maxSize - 20` characters (the 20 is a
   * fixed allowance for the JSON wrapping added later).
   */
  private splitLongString(value: string): string[] {
    const chunks: string[] = [];
    let remaining = value;

    while (remaining.length > 0) {
      const overhead = 20;
      const chunkSize = Math.floor(this.maxSize - overhead);

      if (remaining.length <= chunkSize) {
        chunks.push(remaining);
        break;
      }

      const lastSpace = remaining.slice(0, chunkSize).lastIndexOf(' ');
      const splitAt = lastSpace > 0 ? lastSpace + 1 : chunkSize;

      chunks.push(remaining.slice(0, splitAt));
      remaining = remaining.slice(splitAt);
    }

    return chunks;
  }

  /**
   * Core chunking logic that processes the top-level entries of `data`.
   * Arrays and objects are delegated to `handleArray` / `handleNestedObject`;
   * primitives are accumulated through `handlePrimitiveValue`.
   *
   * @returns The accumulated chunks with empty objects filtered out. Non-object
   *   input returns `chunks` unchanged.
   */
  private jsonSplit({
    data,
    currentPath = [],
    chunks = [{}],
    depth = 0,
    maxDepth = 100,
  }: {
    data: Record<string, any>;
    currentPath?: string[];
    chunks?: Record<string, any>[];
    depth?: number;
    maxDepth?: number;
  }): Record<string, any>[] {
    if (!data || typeof data !== 'object') {
      return chunks;
    }

    if (depth > maxDepth) {
      console.warn(`Maximum depth of ${maxDepth} exceeded, flattening remaining structure`);
      RecursiveJsonTransformer.setNestedDict(chunks[chunks.length - 1] || {}, currentPath, data);
      return chunks;
    }

    let currentChunk: Record<string, any> = {};
    let accumulatedChunks = chunks;

    for (const [key, value] of Object.entries(data)) {
      const fullPath = [...currentPath, key];

      if (Array.isArray(value)) {
        const arrayChunks = this.handleArray(value, key, currentPath, depth, maxDepth);
        accumulatedChunks = [...accumulatedChunks, ...arrayChunks];
      } else if (typeof value === 'object' && value !== null) {
        const objectChunks = this.handleNestedObject(value, fullPath, depth, maxDepth);
        accumulatedChunks = [...accumulatedChunks, ...objectChunks];
      } else {
        const { currentChunk: newCurrentChunk, chunks: newChunks } = this.handlePrimitiveValue(
          value,
          key,
          currentChunk,
          accumulatedChunks,
          fullPath,
        );
        currentChunk = newCurrentChunk;
        accumulatedChunks = newChunks;
      }
    }

    if (Object.keys(currentChunk).length > 0) {
      accumulatedChunks = [...accumulatedChunks, currentChunk];
    }

    return accumulatedChunks.filter(chunk => Object.keys(chunk).length > 0);
  }

  /**
   * Splits JSON into a list of JSON chunks.
   *
   * @param jsonData - The parsed JSON value to split.
   * @param convertLists - When true, arrays are first rewritten as objects
   *   keyed by index (default false).
   */
  splitJson({
    jsonData,
    convertLists = false,
  }: {
    jsonData: Record<string, any>;
    convertLists?: boolean;
  }): Record<string, any>[] {
    const processedData = convertLists ? this.listToDictPreprocessing(jsonData) : jsonData;

    const chunks = this.jsonSplit({ data: processedData });

    if (Object.keys(chunks[chunks.length - 1] || {}).length === 0) {
      chunks.pop();
    }

    return chunks;
  }

  /**
   * Replaces every UTF-16 code unit in U+0080–U+FFFF inside string values with
   * a literal backslash-u escape, e.g. 'café' becomes 'caf\u00e9'.
   * Arrays and objects are processed recursively; object keys are left as-is.
   */
  private escapeNonAscii(obj: any): any {
    if (typeof obj === 'string') {
      return obj.replace(/[\u0080-\uffff]/g, char => {
        return `\\u${char.charCodeAt(0).toString(16).padStart(4, '0')}`;
      });
    }

    if (Array.isArray(obj)) {
      return obj.map(item => this.escapeNonAscii(item));
    }

    if (typeof obj === 'object' && obj !== null) {
      return Object.fromEntries(Object.entries(obj).map(([key, value]) => [key, this.escapeNonAscii(value)]));
    }

    return obj;
  }

  /**
   * Splits JSON into a list of JSON formatted strings.
   *
   * @param ensureAscii - When true (default), non-ASCII characters in string
   *   values are escaped before serialization. When false, literal
   *   backslash-u sequences found in string values are converted to the
   *   characters they denote.
   */
  splitText({
    jsonData,
    convertLists = false,
    ensureAscii = true,
  }: {
    jsonData: Record<string, any>;
    convertLists?: boolean;
    ensureAscii?: boolean;
  }): string[] {
    const chunks = this.splitJson({ jsonData, convertLists });

    if (ensureAscii) {
      // NOTE(review): the escape's backslash is itself escaped by
      // JSON.stringify, so the emitted text contains a doubled backslash and
      // does not parse back to the original character — confirm intended.
      return chunks.map(chunk => JSON.stringify(this.escapeNonAscii(chunk)));
    }

    return chunks.map(chunk =>
      JSON.stringify(chunk, (key, value) => {
        // Convert escaped Unicode sequences back to actual characters
        // e.g., '\u00e9' -> 'é'
        if (typeof value === 'string') {
          return value.replace(/\\u[\da-f]{4}/gi, match => String.fromCharCode(parseInt(match.slice(2), 16)));
        }
        return value;
      }),
    );
  }

  /**
   * Create documents from a list of JSON strings: each text is parsed, split
   * with `splitText`, and every resulting chunk becomes one `Document` carrying
   * a shallow copy of the metadata at the same index.
   *
   * @throws SyntaxError if a text is not valid JSON (from `JSON.parse`).
   */
  createDocuments({
    texts,
    convertLists = false,
    ensureAscii = true,
    metadatas,
  }: {
    texts: string[];
    convertLists?: boolean;
    ensureAscii?: boolean;
    metadatas?: Record<string, any>[];
  }): Document[] {
    const _metadatas = metadatas || Array(texts.length).fill({});
    const documents: Document[] = [];

    texts.forEach((text, i) => {
      const chunks = this.splitText({ jsonData: JSON.parse(text), convertLists, ensureAscii });
      chunks.forEach(chunk => {
        const metadata = { ...(_metadatas[i] || {}) };
        documents.push(
          new Document({
            text: chunk,
            metadata,
          }),
        );
      });
    });

    return documents;
  }

  /**
   * Re-chunks existing documents: collects each document's `text` and
   * `metadata` and forwards them to `createDocuments`.
   */
  transformDocuments({
    ensureAscii,
    documents,
    convertLists,
  }: {
    ensureAscii?: boolean;
    convertLists?: boolean;
    documents: Document[];
  }): Document[] {
    const texts: string[] = [];
    const metadatas: Record<string, any>[] = [];

    for (const doc of documents) {
      texts.push(doc.text);
      metadatas.push(doc.metadata);
    }

    return this.createDocuments({
      texts,
      metadatas,

      ensureAscii,
      convertLists,
    });
  }
}
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
import { Language } from '../types';
|
|
2
|
-
import type { BaseChunkOptions } from '../types';
|
|
3
|
-
|
|
4
|
-
import { RecursiveCharacterTransformer } from './character';
|
|
5
|
-
|
|
6
|
-
/**
 * A `RecursiveCharacterTransformer` preconfigured for LaTeX input.
 *
 * The separator list is taken from
 * `RecursiveCharacterTransformer.getSeparatorsForLanguage(Language.LATEX)` and
 * `isSeparatorRegex` is forced to `true`; both override any values supplied in
 * `options`.
 */
export class LatexTransformer extends RecursiveCharacterTransformer {
  constructor(options: BaseChunkOptions = {}) {
    super({
      ...options,
      separators: RecursiveCharacterTransformer.getSeparatorsForLanguage(Language.LATEX),
      isSeparatorRegex: true,
    });
  }
}
|