@rws-framework/ai-tools 3.8.0 → 3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/services/LangChainEmbeddingService.ts +56 -6
- package/src/services/LangChainRAGService.ts +78 -34
- package/src/services/OpenAIRateLimitingService.ts +6 -0
- package/src/services/OptimizedVectorSearchService.ts +1 -1
- package/src/services/TextChunker.ts +1 -1
- package/tsconfig.json +4 -1
package/package.json
CHANGED
|
@@ -7,7 +7,7 @@ import { IEmbeddingConfig, IChunkConfig } from '../types';
|
|
|
7
7
|
import { TextChunker } from './TextChunker';
|
|
8
8
|
import RWSVectorStore, { VectorDocType, IVectorStoreConfig } from '../models/convo/VectorStore';
|
|
9
9
|
import { OpenAIRateLimitingService } from './OpenAIRateLimitingService';
|
|
10
|
-
|
|
10
|
+
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
|
|
11
11
|
@Injectable()
|
|
12
12
|
export class LangChainEmbeddingService {
|
|
13
13
|
private embeddings: Embeddings;
|
|
@@ -83,17 +83,41 @@ export class LangChainEmbeddingService {
|
|
|
83
83
|
// This method is kept for compatibility but doesn't initialize anything
|
|
84
84
|
}
|
|
85
85
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
86
|
+
/**
 * Embed the `pageContent` of each document.
 *
 * When `config.rateLimiting` is set, the documents are handed to
 * `rateLimitingService.executeWithRateLimit`, which decides the batching;
 * otherwise all documents are embedded in a single call.
 *
 * @param docs Documents whose `pageContent` is embedded.
 * @param batchCallback Optional hook awaited after each embedded batch with
 *   that batch's texts and their embeddings. It is invoked on BOTH paths:
 *   callers such as `indexKnowledge` skip their own persistence when a
 *   callback is supplied, so never firing it without rate limiting would
 *   silently drop the embeddings.
 * @returns The embeddings produced for `docs`.
 */
async embedDocs(docs: Document[], batchCallback?: (fragments:string[], batch: number[][]) => Promise<void>): Promise<number[][]> {
    this.ensureInitialized();

    // Single batch routine shared by the rate-limited and the direct path so
    // the callback contract is identical in both.
    const embedBatch = async (batch: Document[]): Promise<number[][]> => {
        // Extract the texts once; they feed both the embedding call and the callback.
        const fragments = batch.map(d => d.pageContent);
        const embeddings = await this.embeddings.embedDocuments(fragments);

        if (batchCallback) {
            await batchCallback(fragments, embeddings);
        }

        return embeddings;
    };

    if (this.config.rateLimiting) {
        return await this.rateLimitingService.executeWithRateLimit(
            docs,
            embedBatch,
            (doc: Document) => doc.pageContent // Token extractor
        );
    }

    // No rate limiting: the whole input is treated as one batch.
    return await embedBatch(docs);
}
|
|
108
|
+
|
|
109
|
+
async embedTexts(texts: string[], batchCallback?: (fragments:string[], batch: number[][]) => Promise<void>): Promise<number[][]> {
|
|
90
110
|
this.ensureInitialized();
|
|
91
111
|
|
|
92
112
|
if (this.config.rateLimiting) {
|
|
93
113
|
return await this.rateLimitingService.executeWithRateLimit(
|
|
94
114
|
texts,
|
|
95
115
|
async (batch: string[]) => {
|
|
96
|
-
|
|
116
|
+
const embeddings = await this.embeddings.embedDocuments(batch);
|
|
117
|
+
if (batchCallback) {
|
|
118
|
+
await batchCallback(batch, embeddings);
|
|
119
|
+
}
|
|
120
|
+
return embeddings;
|
|
97
121
|
},
|
|
98
122
|
(text: string) => text // Token extractor
|
|
99
123
|
);
|
|
@@ -135,9 +159,34 @@ export class LangChainEmbeddingService {
|
|
|
135
159
|
const maxTokens = ragOverride ? ragOverride.chunkSize : (this.chunkConfig?.chunkSize || 450); // Safe token limit for embedding models
|
|
136
160
|
const overlap = ragOverride ? ragOverride.chunkOverlap : (this.chunkConfig?.chunkOverlap || 50); // Character overlap, not token
|
|
137
161
|
const separators = ragOverride?.separators || this.chunkConfig?.separators || TextChunker.DEFAULT_SEPARATORS; // Default separators
|
|
162
|
+
|
|
138
163
|
return TextChunker.chunkText(text, maxTokens, overlap, separators);
|
|
139
164
|
}
|
|
140
165
|
|
|
166
|
+
/**
 * Turn tabular rows into LangChain documents and split them into chunks.
 *
 * Each row becomes one `Document` whose text lists the row's fields as
 * `key: value` lines and whose metadata stores the row index under `row`.
 * The documents are then split with `RecursiveCharacterTextSplitter`.
 *
 * @param rows Parsed sheet rows, one object per row.
 * @param ragOverride Optional chunk settings; when given, its `chunkSize`
 *   and `chunkOverlap` replace the service-level chunk config. Separators
 *   are not passed to the splitter by this method.
 * @returns The documents produced by the splitter.
 */
async chunkCSV(rows: Record<string, any>[], ragOverride?: IChunkConfig): Promise<Document[]> {
    // NOTE(review): these values go straight to RecursiveCharacterTextSplitter,
    // which by default measures chunk length in characters, not tokens —
    // confirm the intended unit against the splitter documentation.
    const maxTokens = ragOverride ? ragOverride.chunkSize : (this.chunkConfig?.chunkSize || 450);
    const overlap = ragOverride ? ragOverride.chunkOverlap : (this.chunkConfig?.chunkOverlap || 50);

    const splitter = new RecursiveCharacterTextSplitter({
        chunkSize: maxTokens,
        chunkOverlap: overlap
    });

    // Render a cell as text without leaking "null", "undefined" or
    // "[object Object]" into the content that gets embedded.
    const formatCell = (value: unknown): string => {
        if (value === null || value === undefined) {
            return '';
        }
        if (value instanceof Date) {
            // toISOString() throws on an invalid Date, so guard it.
            return Number.isNaN(value.getTime()) ? '' : value.toISOString();
        }
        if (typeof value === 'object') {
            return JSON.stringify(value) ?? '';
        }
        return String(value);
    };

    const docs = rows.map((row, i) => {
        const text = Object.entries(row)
            .map(([k, v]) => `${k}: ${formatCell(v)}`)
            .join("\n");

        return new Document({
            pageContent: text,
            metadata: { row: i }
        });
    });

    return await splitter.splitDocuments(docs);
}
|
|
189
|
+
|
|
141
190
|
/**
|
|
142
191
|
* Split text and generate embeddings for chunks
|
|
143
192
|
*/
|
|
@@ -205,6 +254,7 @@ export class LangChainEmbeddingService {
|
|
|
205
254
|
|
|
206
255
|
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
207
256
|
}
|
|
257
|
+
|
|
208
258
|
|
|
209
259
|
/**
|
|
210
260
|
* Ensure the service is initialized
|
|
@@ -43,6 +43,18 @@ export class LangChainRAGService {
|
|
|
43
43
|
private isInitialized = false;
|
|
44
44
|
private logger?: any; // Optional logger interface
|
|
45
45
|
|
|
46
|
+
static SheetMimeType: string[] = [
|
|
47
|
+
'text/csv',
|
|
48
|
+
'text/tab-separated-values',
|
|
49
|
+
'application/vnd.ms-excel',
|
|
50
|
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
51
|
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
|
|
52
|
+
'application/vnd.ms-excel.sheet.macroEnabled.12',
|
|
53
|
+
'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
|
|
54
|
+
'application/vnd.oasis.opendocument.spreadsheet',
|
|
55
|
+
'application/vnd.google-apps.spreadsheet',
|
|
56
|
+
];
|
|
57
|
+
|
|
46
58
|
constructor(
|
|
47
59
|
private embeddingService: LangChainEmbeddingService,
|
|
48
60
|
private vectorSearchService: OptimizedVectorSearchService
|
|
@@ -84,45 +96,57 @@ export class LangChainRAGService {
|
|
|
84
96
|
*/
|
|
85
97
|
async indexKnowledge(
|
|
86
98
|
fileId: string | number,
|
|
87
|
-
content: string,
|
|
99
|
+
content: string | Record<string, any>[],
|
|
88
100
|
metadata: Record<string, any> = {},
|
|
101
|
+
batchCallback?: (fragments:string[], batch: number[][]) => Promise<void>,
|
|
89
102
|
ragOverride?: IChunkConfig
|
|
90
|
-
): Promise<IRAGResponse<{
|
|
103
|
+
): Promise<IRAGResponse<{ chunkCount: number }>> {
|
|
91
104
|
this.log('log', `[INDEXING] Starting indexKnowledge for fileId: ${fileId}`);
|
|
92
|
-
this.log('debug', `[INDEXING] Content length: ${content.length} characters`);
|
|
105
|
+
this.log('debug', `[INDEXING] Content length: ${Array.isArray(content) ? content.map(r => Object.values(r).join(' ')).join('\n').length : content.length} characters`);
|
|
93
106
|
|
|
94
107
|
try {
|
|
95
108
|
await this.ensureInitialized();
|
|
96
109
|
|
|
97
|
-
|
|
98
|
-
const chunks = await this.embeddingService.chunkText(content, ragOverride);
|
|
99
|
-
this.log('debug', `[INDEXING] Split content into ${chunks.length} chunks for file ${fileId}`);
|
|
100
|
-
|
|
101
|
-
// Generate embeddings for all chunks at once (batch processing for speed)
|
|
102
|
-
const embeddings = await this.embeddingService.embedTexts(chunks);
|
|
103
|
-
this.log('debug', `[INDEXING] Generated embeddings for ${chunks.length} chunks`);
|
|
104
|
-
|
|
105
|
-
// Create chunk objects with embeddings
|
|
106
|
-
const chunksWithEmbeddings = chunks.map((chunkContent, index) => ({
|
|
107
|
-
content: chunkContent,
|
|
108
|
-
embedding: embeddings[index],
|
|
109
|
-
metadata: {
|
|
110
|
-
...metadata,
|
|
111
|
-
fileId,
|
|
112
|
-
chunkIndex: index,
|
|
113
|
-
id: `knowledge_${fileId}_chunk_${index}`
|
|
114
|
-
}
|
|
115
|
-
}));
|
|
110
|
+
const mime = metadata.mime || null;
|
|
116
111
|
|
|
117
|
-
|
|
118
|
-
|
|
112
|
+
let chunkTexts: string[] = undefined;
|
|
113
|
+
let embeddings: number[][] = undefined;
|
|
119
114
|
|
|
120
|
-
|
|
121
|
-
|
|
115
|
+
if(mime && LangChainRAGService.isSheetDocument(mime)) {
|
|
116
|
+
this.log('debug', `[INDEXING] SHEET extraction mode detected.`);
|
|
117
|
+
|
|
118
|
+
const docs = await this.embeddingService.chunkCSV(content as Record<string, any>[], ragOverride);
|
|
119
|
+
embeddings = await this.embeddingService.embedDocs(docs, batchCallback);
|
|
120
|
+
chunkTexts = docs.map(d => d.pageContent);
|
|
121
|
+
}else{
|
|
122
|
+
chunkTexts = await this.embeddingService.chunkText(content as string, ragOverride);
|
|
123
|
+
embeddings = await this.embeddingService.embedTexts(chunkTexts, batchCallback);
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
this.log('debug', `[INDEXING] Generated embeddings for ${chunkTexts.length} chunks`);
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
if(!batchCallback){
|
|
131
|
+
// Create chunk objects with embeddings
|
|
132
|
+
const chunksWithEmbeddings = chunkTexts.map((chunkContent, index) => ({
|
|
133
|
+
content: chunkContent,
|
|
134
|
+
embedding: embeddings[index],
|
|
135
|
+
metadata: {
|
|
136
|
+
...metadata,
|
|
137
|
+
fileId,
|
|
138
|
+
chunkIndex: index,
|
|
139
|
+
id: `knowledge_${fileId}_chunk_${index}`
|
|
140
|
+
}
|
|
141
|
+
}));
|
|
142
|
+
await this.saveKnowledgeVector(fileId, chunksWithEmbeddings);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
this.log('log', `[INDEXING] Successfully indexed file ${fileId} with ${chunkTexts.length} chunks using optimized approach`);
|
|
122
146
|
|
|
123
147
|
return {
|
|
124
148
|
success: true,
|
|
125
|
-
data: {
|
|
149
|
+
data: { chunkCount: chunkTexts.length }
|
|
126
150
|
};
|
|
127
151
|
|
|
128
152
|
} catch (error: any) {
|
|
@@ -135,6 +159,10 @@ export class LangChainRAGService {
|
|
|
135
159
|
}
|
|
136
160
|
}
|
|
137
161
|
|
|
162
|
+
/**
 * Report whether a MIME type is listed in `LangChainRAGService.SheetMimeType`.
 *
 * @param mime MIME type string to look up; compared by exact match.
 * @returns `true` when `mime` is in the list, otherwise `false`.
 */
static isSheetDocument(mime: string): boolean {
    const knownSheetTypes = LangChainRAGService.SheetMimeType;
    return knownSheetTypes.indexOf(mime) !== -1;
}
|
|
165
|
+
|
|
138
166
|
/**
|
|
139
167
|
* Search for relevant knowledge chunks using optimized vector search
|
|
140
168
|
*/
|
|
@@ -240,6 +268,10 @@ export class LangChainRAGService {
|
|
|
240
268
|
}
|
|
241
269
|
}
|
|
242
270
|
|
|
271
|
+
/**
 * Produce the embedding vector for a query string by delegating to
 * `vectorSearchService.getQueryEmbedding`.
 *
 * @param query Query text to embed.
 * @returns The promise returned by the vector search service.
 */
embedQuery(query: string): Promise<number[]> {
    const pendingEmbedding = this.vectorSearchService.getQueryEmbedding(query);
    return pendingEmbedding;
}
|
|
274
|
+
|
|
243
275
|
/**
|
|
244
276
|
* Get statistics about the RAG system
|
|
245
277
|
*/
|
|
@@ -318,6 +350,7 @@ export class LangChainRAGService {
|
|
|
318
350
|
|
|
319
351
|
/**
|
|
320
352
|
* Save chunks to knowledge-specific vector file with embeddings
|
|
353
|
+
* Uses streaming JSON write to handle large embedding datasets
|
|
321
354
|
*/
|
|
322
355
|
private async saveKnowledgeVector(fileId: string | number, chunks: Array<{ content: string; embedding: number[]; metadata: any }>): Promise<void> {
|
|
323
356
|
const vectorFilePath = this.getKnowledgeVectorPath(fileId);
|
|
@@ -329,13 +362,24 @@ export class LangChainRAGService {
|
|
|
329
362
|
}
|
|
330
363
|
|
|
331
364
|
try {
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
365
|
+
// Stream JSON to avoid "Invalid string length" on large datasets
|
|
366
|
+
const writeStream = fs.createWriteStream(vectorFilePath);
|
|
367
|
+
|
|
368
|
+
await new Promise<void>((resolve, reject) => {
|
|
369
|
+
writeStream.on('error', reject);
|
|
370
|
+
writeStream.on('finish', resolve);
|
|
371
|
+
|
|
372
|
+
writeStream.write(`{"fileId":${JSON.stringify(fileId)},"timestamp":${JSON.stringify(new Date().toISOString())},"chunks":[`);
|
|
373
|
+
|
|
374
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
375
|
+
if (i > 0) writeStream.write(',');
|
|
376
|
+
writeStream.write(JSON.stringify(chunks[i]));
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
writeStream.write(']}');
|
|
380
|
+
writeStream.end();
|
|
381
|
+
});
|
|
382
|
+
|
|
339
383
|
this.log('debug', `[SAVE] Successfully saved ${chunks.length} chunks with embeddings for file ${fileId} to: "${vectorFilePath}"`);
|
|
340
384
|
|
|
341
385
|
} catch (error) {
|
|
@@ -95,6 +95,7 @@ export class OpenAIRateLimitingService {
|
|
|
95
95
|
}
|
|
96
96
|
|
|
97
97
|
const results = new Array(items.length);
|
|
98
|
+
let doneItems = 0;
|
|
98
99
|
|
|
99
100
|
// Process all batches with queue concurrency control
|
|
100
101
|
await Promise.all(batchStarts.map(meta =>
|
|
@@ -104,9 +105,14 @@ export class OpenAIRateLimitingService {
|
|
|
104
105
|
for (let attempt = 0; attempt < 6; attempt++) {
|
|
105
106
|
try {
|
|
106
107
|
const batchResults = await this.callWithRetry(() => executor(attemptBatch));
|
|
108
|
+
|
|
107
109
|
for (let i = 0; i < batchResults.length; i++) {
|
|
108
110
|
results[meta.start + i] = batchResults[i];
|
|
111
|
+
doneItems++;
|
|
109
112
|
}
|
|
113
|
+
|
|
114
|
+
this.logger.debug(`Embedding chunks done [${doneItems}/${items.length}]`);
|
|
115
|
+
|
|
110
116
|
break;
|
|
111
117
|
} catch (err: any) {
|
|
112
118
|
const status = err?.status || err?.response?.status;
|
|
@@ -108,7 +108,7 @@ export class OptimizedVectorSearchService {
|
|
|
108
108
|
/**
|
|
109
109
|
* Get query embedding with caching
|
|
110
110
|
*/
|
|
111
|
-
|
|
111
|
+
async getQueryEmbedding(query: string): Promise<number[]> {
|
|
112
112
|
// Check cache first
|
|
113
113
|
if (this.queryEmbeddingCache.has(query)) {
|
|
114
114
|
return this.queryEmbeddingCache.get(query)!;
|
|
@@ -16,7 +16,7 @@ export class TextChunker {
|
|
|
16
16
|
* Default separators following LangChain RecursiveCharacterTextSplitter approach
|
|
17
17
|
* Ordered by preference for breaking text
|
|
18
18
|
*/
|
|
19
|
-
|
|
19
|
+
static readonly DEFAULT_SEPARATORS = [
|
|
20
20
|
'\n\n', // Double newlines (paragraphs)
|
|
21
21
|
'\n', // Single newlines
|
|
22
22
|
'. ', // Sentence endings
|
package/tsconfig.json
CHANGED
|
@@ -14,11 +14,14 @@
|
|
|
14
14
|
"allowSyntheticDefaultImports": true,
|
|
15
15
|
"sourceMap": true,
|
|
16
16
|
"declaration": true,
|
|
17
|
+
"types": []
|
|
17
18
|
},
|
|
18
19
|
"include": [
|
|
19
20
|
"src"
|
|
20
21
|
],
|
|
21
22
|
"exclude": [
|
|
22
|
-
"node_modules"
|
|
23
|
+
"node_modules",
|
|
24
|
+
"**/*.d.ts",
|
|
25
|
+
"**/node_modules/**",
|
|
23
26
|
]
|
|
24
27
|
}
|