@rws-framework/ai-tools 3.8.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@rws-framework/ai-tools",
3
3
  "private": false,
4
- "version": "3.8.0",
4
+ "version": "3.9.0",
5
5
  "description": "",
6
6
  "main": "src/index.ts",
7
7
  "scripts": {},
@@ -7,7 +7,7 @@ import { IEmbeddingConfig, IChunkConfig } from '../types';
7
7
  import { TextChunker } from './TextChunker';
8
8
  import RWSVectorStore, { VectorDocType, IVectorStoreConfig } from '../models/convo/VectorStore';
9
9
  import { OpenAIRateLimitingService } from './OpenAIRateLimitingService';
10
-
10
+ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
11
11
  @Injectable()
12
12
  export class LangChainEmbeddingService {
13
13
  private embeddings: Embeddings;
@@ -83,17 +83,41 @@ export class LangChainEmbeddingService {
83
83
  // This method is kept for compatibility but doesn't initialize anything
84
84
  }
85
85
 
86
- /**
87
- * Generate embeddings for multiple texts with sophisticated rate limiting
88
- */
89
- async embedTexts(texts: string[]): Promise<number[][]> {
86
+ async embedDocs(docs: Document[], batchCallback?: (fragments:string[], batch: number[][]) => Promise<void>): Promise<number[][]> {
87
+ this.ensureInitialized();
88
+
89
+ if (this.config.rateLimiting) {
90
+ return await this.rateLimitingService.executeWithRateLimit(
91
+ docs,
92
+ async (batch: Document[]) => {
93
+ const embeddings = await this.embeddings.embedDocuments(batch.map(d => d.pageContent));
94
+
95
+ if(batchCallback){
96
+ const fragments = batch.map(d => d.pageContent);
97
+ await batchCallback(fragments, embeddings);
98
+ }
99
+
100
+ return embeddings;
101
+ },
102
+ (doc: Document) => doc.pageContent
103
+ );
104
+ }
105
+
106
+ return await this.embeddings.embedDocuments(docs.map(d => d.pageContent));
107
+ }
108
+
109
+ async embedTexts(texts: string[], batchCallback?: (fragments:string[], batch: number[][]) => Promise<void>): Promise<number[][]> {
90
110
  this.ensureInitialized();
91
111
 
92
112
  if (this.config.rateLimiting) {
93
113
  return await this.rateLimitingService.executeWithRateLimit(
94
114
  texts,
95
115
  async (batch: string[]) => {
96
- return await this.embeddings.embedDocuments(batch);
116
+ const embeddings = await this.embeddings.embedDocuments(batch);
117
+ if (batchCallback) {
118
+ await batchCallback(batch, embeddings);
119
+ }
120
+ return embeddings;
97
121
  },
98
122
  (text: string) => text // Token extractor
99
123
  );
@@ -135,9 +159,34 @@ export class LangChainEmbeddingService {
135
159
  const maxTokens = ragOverride ? ragOverride.chunkSize : (this.chunkConfig?.chunkSize || 450); // Safe token limit for embedding models
136
160
  const overlap = ragOverride ? ragOverride.chunkOverlap : (this.chunkConfig?.chunkOverlap || 50); // Character overlap, not token
137
161
  const separators = ragOverride?.separators || this.chunkConfig?.separators || TextChunker.DEFAULT_SEPARATORS; // Default separators
162
+
138
163
  return TextChunker.chunkText(text, maxTokens, overlap, separators);
139
164
  }
140
165
 
166
+ async chunkCSV(rows: Record<string, any>[], ragOverride?: IChunkConfig): Promise<Document[]> {
167
+ // Use safe token limits - the TextChunker handles token estimation internally
168
+ const maxTokens = ragOverride ? ragOverride.chunkSize : (this.chunkConfig?.chunkSize || 450); // Safe token limit for embedding models
169
+ const overlap = ragOverride ? ragOverride.chunkOverlap : (this.chunkConfig?.chunkOverlap || 50); // Character overlap, not token
170
+
171
+ const splitter = new RecursiveCharacterTextSplitter({
172
+ chunkSize: maxTokens,
173
+ chunkOverlap: overlap
174
+ });
175
+
176
+ const docs = rows.map((row, i) => {
177
+ const text = Object.entries(row)
178
+ .map(([k, v]) => `${k}: ${v}`)
179
+ .join("\n");
180
+
181
+ return new Document({
182
+ pageContent: text,
183
+ metadata: { row: i }
184
+ });
185
+ });
186
+
187
+ return await splitter.splitDocuments(docs);
188
+ }
189
+
141
190
  /**
142
191
  * Split text and generate embeddings for chunks
143
192
  */
@@ -205,6 +254,7 @@ export class LangChainEmbeddingService {
205
254
 
206
255
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
207
256
  }
257
+
208
258
 
209
259
  /**
210
260
  * Ensure the service is initialized
@@ -43,6 +43,18 @@ export class LangChainRAGService {
43
43
  private isInitialized = false;
44
44
  private logger?: any; // Optional logger interface
45
45
 
46
+ static SheetMimeType: string[] = [
47
+ 'text/csv',
48
+ 'text/tab-separated-values',
49
+ 'application/vnd.ms-excel',
50
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
51
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
52
+ 'application/vnd.ms-excel.sheet.macroEnabled.12',
53
+ 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
54
+ 'application/vnd.oasis.opendocument.spreadsheet',
55
+ 'application/vnd.google-apps.spreadsheet',
56
+ ];
57
+
46
58
  constructor(
47
59
  private embeddingService: LangChainEmbeddingService,
48
60
  private vectorSearchService: OptimizedVectorSearchService
@@ -84,45 +96,57 @@ export class LangChainRAGService {
84
96
  */
85
97
  async indexKnowledge(
86
98
  fileId: string | number,
87
- content: string,
99
+ content: string | Record<string, any>[],
88
100
  metadata: Record<string, any> = {},
101
+ batchCallback?: (fragments:string[], batch: number[][]) => Promise<void>,
89
102
  ragOverride?: IChunkConfig
90
- ): Promise<IRAGResponse<{ chunkIds: string[] }>> {
103
+ ): Promise<IRAGResponse<{ chunkCount: number }>> {
91
104
  this.log('log', `[INDEXING] Starting indexKnowledge for fileId: ${fileId}`);
92
- this.log('debug', `[INDEXING] Content length: ${content.length} characters`);
105
+ this.log('debug', `[INDEXING] Content length: ${Array.isArray(content) ? content.map(r => Object.values(r).join(' ')).join('\n').length : content.length} characters`);
93
106
 
94
107
  try {
95
108
  await this.ensureInitialized();
96
109
 
97
- // Chunk the content using the embedding service
98
- const chunks = await this.embeddingService.chunkText(content, ragOverride);
99
- this.log('debug', `[INDEXING] Split content into ${chunks.length} chunks for file ${fileId}`);
100
-
101
- // Generate embeddings for all chunks at once (batch processing for speed)
102
- const embeddings = await this.embeddingService.embedTexts(chunks);
103
- this.log('debug', `[INDEXING] Generated embeddings for ${chunks.length} chunks`);
104
-
105
- // Create chunk objects with embeddings
106
- const chunksWithEmbeddings = chunks.map((chunkContent, index) => ({
107
- content: chunkContent,
108
- embedding: embeddings[index],
109
- metadata: {
110
- ...metadata,
111
- fileId,
112
- chunkIndex: index,
113
- id: `knowledge_${fileId}_chunk_${index}`
114
- }
115
- }));
110
+ const mime = metadata.mime || null;
116
111
 
117
- // Save to per-knowledge vector file
118
- await this.saveKnowledgeVector(fileId, chunksWithEmbeddings);
112
+ let chunkTexts: string[] = undefined;
113
+ let embeddings: number[][] = undefined;
119
114
 
120
- const chunkIds = chunksWithEmbeddings.map(chunk => chunk.metadata.id);
121
- this.log('log', `[INDEXING] Successfully indexed file ${fileId} with ${chunkIds.length} chunks using optimized approach`);
115
+ if(mime && LangChainRAGService.isSheetDocument(mime)) {
116
+ this.log('debug', `[INDEXING] SHEET extraction mode detected.`);
117
+
118
+ const docs = await this.embeddingService.chunkCSV(content as Record<string, any>[], ragOverride);
119
+ embeddings = await this.embeddingService.embedDocs(docs, batchCallback);
120
+ chunkTexts = docs.map(d => d.pageContent);
121
+ }else{
122
+ chunkTexts = await this.embeddingService.chunkText(content as string, ragOverride);
123
+ embeddings = await this.embeddingService.embedTexts(chunkTexts, batchCallback);
124
+ }
125
+
126
+ this.log('debug', `[INDEXING] Generated embeddings for ${chunkTexts.length} chunks`);
127
+
128
+
129
+
130
+ if(!batchCallback){
131
+ // Create chunk objects with embeddings
132
+ const chunksWithEmbeddings = chunkTexts.map((chunkContent, index) => ({
133
+ content: chunkContent,
134
+ embedding: embeddings[index],
135
+ metadata: {
136
+ ...metadata,
137
+ fileId,
138
+ chunkIndex: index,
139
+ id: `knowledge_${fileId}_chunk_${index}`
140
+ }
141
+ }));
142
+ await this.saveKnowledgeVector(fileId, chunksWithEmbeddings);
143
+ }
144
+
145
+ this.log('log', `[INDEXING] Successfully indexed file ${fileId} with ${chunkTexts.length} chunks using optimized approach`);
122
146
 
123
147
  return {
124
148
  success: true,
125
- data: { chunkIds }
149
+ data: { chunkCount: chunkTexts.length }
126
150
  };
127
151
 
128
152
  } catch (error: any) {
@@ -135,6 +159,10 @@ export class LangChainRAGService {
135
159
  }
136
160
  }
137
161
 
162
+ static isSheetDocument(mime: string): boolean {
163
+ return LangChainRAGService.SheetMimeType.includes(mime);
164
+ }
165
+
138
166
  /**
139
167
  * Search for relevant knowledge chunks using optimized vector search
140
168
  */
@@ -240,6 +268,10 @@ export class LangChainRAGService {
240
268
  }
241
269
  }
242
270
 
271
+ embedQuery(query: string): Promise<number[]> {
272
+ return this.vectorSearchService.getQueryEmbedding(query);
273
+ }
274
+
243
275
  /**
244
276
  * Get statistics about the RAG system
245
277
  */
@@ -318,6 +350,7 @@ export class LangChainRAGService {
318
350
 
319
351
  /**
320
352
  * Save chunks to knowledge-specific vector file with embeddings
353
+ * Uses streaming JSON write to handle large embedding datasets
321
354
  */
322
355
  private async saveKnowledgeVector(fileId: string | number, chunks: Array<{ content: string; embedding: number[]; metadata: any }>): Promise<void> {
323
356
  const vectorFilePath = this.getKnowledgeVectorPath(fileId);
@@ -329,13 +362,24 @@ export class LangChainRAGService {
329
362
  }
330
363
 
331
364
  try {
332
- const vectorData = {
333
- fileId,
334
- chunks,
335
- timestamp: new Date().toISOString()
336
- };
337
-
338
- fs.writeFileSync(vectorFilePath, JSON.stringify(vectorData, null, 2));
365
+ // Stream JSON to avoid "Invalid string length" on large datasets
366
+ const writeStream = fs.createWriteStream(vectorFilePath);
367
+
368
+ await new Promise<void>((resolve, reject) => {
369
+ writeStream.on('error', reject);
370
+ writeStream.on('finish', resolve);
371
+
372
+ writeStream.write(`{"fileId":${JSON.stringify(fileId)},"timestamp":${JSON.stringify(new Date().toISOString())},"chunks":[`);
373
+
374
+ for (let i = 0; i < chunks.length; i++) {
375
+ if (i > 0) writeStream.write(',');
376
+ writeStream.write(JSON.stringify(chunks[i]));
377
+ }
378
+
379
+ writeStream.write(']}');
380
+ writeStream.end();
381
+ });
382
+
339
383
  this.log('debug', `[SAVE] Successfully saved ${chunks.length} chunks with embeddings for file ${fileId} to: "${vectorFilePath}"`);
340
384
 
341
385
  } catch (error) {
@@ -95,6 +95,7 @@ export class OpenAIRateLimitingService {
95
95
  }
96
96
 
97
97
  const results = new Array(items.length);
98
+ let doneItems = 0;
98
99
 
99
100
  // Process all batches with queue concurrency control
100
101
  await Promise.all(batchStarts.map(meta =>
@@ -104,9 +105,14 @@ export class OpenAIRateLimitingService {
104
105
  for (let attempt = 0; attempt < 6; attempt++) {
105
106
  try {
106
107
  const batchResults = await this.callWithRetry(() => executor(attemptBatch));
108
+
107
109
  for (let i = 0; i < batchResults.length; i++) {
108
110
  results[meta.start + i] = batchResults[i];
111
+ doneItems++;
109
112
  }
113
+
114
+ this.logger.debug(`Embedding chunks done [${doneItems}/${items.length}]`);
115
+
110
116
  break;
111
117
  } catch (err: any) {
112
118
  const status = err?.status || err?.response?.status;
@@ -108,7 +108,7 @@ export class OptimizedVectorSearchService {
108
108
  /**
109
109
  * Get query embedding with caching
110
110
  */
111
- private async getQueryEmbedding(query: string): Promise<number[]> {
111
+ async getQueryEmbedding(query: string): Promise<number[]> {
112
112
  // Check cache first
113
113
  if (this.queryEmbeddingCache.has(query)) {
114
114
  return this.queryEmbeddingCache.get(query)!;
@@ -16,7 +16,7 @@ export class TextChunker {
16
16
  * Default separators following LangChain RecursiveCharacterTextSplitter approach
17
17
  * Ordered by preference for breaking text
18
18
  */
19
- private static readonly DEFAULT_SEPARATORS = [
19
+ static readonly DEFAULT_SEPARATORS = [
20
20
  '\n\n', // Double newlines (paragraphs)
21
21
  '\n', // Single newlines
22
22
  '. ', // Sentence endings
package/tsconfig.json CHANGED
@@ -14,11 +14,14 @@
14
14
  "allowSyntheticDefaultImports": true,
15
15
  "sourceMap": true,
16
16
  "declaration": true,
17
+ "types": []
17
18
  },
18
19
  "include": [
19
20
  "src"
20
21
  ],
21
22
  "exclude": [
22
- "node_modules"
23
+ "node_modules",
24
+ "**/*.d.ts",
25
+ "**/node_modules/**",
23
26
  ]
24
27
  }