@rws-framework/ai-tools 3.7.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.bin/add-v.sh CHANGED
File without changes
File without changes
File without changes
File without changes
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@rws-framework/ai-tools",
3
3
  "private": false,
4
- "version": "3.7.0",
4
+ "version": "3.9.0",
5
5
  "description": "",
6
6
  "main": "src/index.ts",
7
7
  "scripts": {},
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -7,7 +7,7 @@ import { IEmbeddingConfig, IChunkConfig } from '../types';
7
7
  import { TextChunker } from './TextChunker';
8
8
  import RWSVectorStore, { VectorDocType, IVectorStoreConfig } from '../models/convo/VectorStore';
9
9
  import { OpenAIRateLimitingService } from './OpenAIRateLimitingService';
10
-
10
+ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
11
11
  @Injectable()
12
12
  export class LangChainEmbeddingService {
13
13
  private embeddings: Embeddings;
@@ -83,17 +83,41 @@ export class LangChainEmbeddingService {
83
83
  // This method is kept for compatibility but doesn't initialize anything
84
84
  }
85
85
 
86
- /**
87
- * Generate embeddings for multiple texts with sophisticated rate limiting
88
- */
89
- async embedTexts(texts: string[]): Promise<number[][]> {
86
+ async embedDocs(docs: Document[], batchCallback?: (fragments:string[], batch: number[][]) => Promise<void>): Promise<number[][]> {
87
+ this.ensureInitialized();
88
+
89
+ if (this.config.rateLimiting) {
90
+ return await this.rateLimitingService.executeWithRateLimit(
91
+ docs,
92
+ async (batch: Document[]) => {
93
+ const embeddings = await this.embeddings.embedDocuments(batch.map(d => d.pageContent));
94
+
95
+ if(batchCallback){
96
+ const fragments = batch.map(d => d.pageContent);
97
+ await batchCallback(fragments, embeddings);
98
+ }
99
+
100
+ return embeddings;
101
+ },
102
+ (doc: Document) => doc.pageContent
103
+ );
104
+ }
105
+
106
+ return await this.embeddings.embedDocuments(docs.map(d => d.pageContent));
107
+ }
108
+
109
+ async embedTexts(texts: string[], batchCallback?: (fragments:string[], batch: number[][]) => Promise<void>): Promise<number[][]> {
90
110
  this.ensureInitialized();
91
111
 
92
112
  if (this.config.rateLimiting) {
93
113
  return await this.rateLimitingService.executeWithRateLimit(
94
114
  texts,
95
115
  async (batch: string[]) => {
96
- return await this.embeddings.embedDocuments(batch);
116
+ const embeddings = await this.embeddings.embedDocuments(batch);
117
+ if (batchCallback) {
118
+ await batchCallback(batch, embeddings);
119
+ }
120
+ return embeddings;
97
121
  },
98
122
  (text: string) => text // Token extractor
99
123
  );
@@ -127,15 +151,40 @@ export class LangChainEmbeddingService {
127
151
  /**
128
152
  * Split text into chunks
129
153
  */
130
- async chunkText(text: string): Promise<string[]> {
154
+ async chunkText(text: string, ragOverride?: IChunkConfig): Promise<string[]> {
131
155
  this.ensureInitialized();
132
156
 
133
157
  // Use our custom TextChunker instead of LangChain's splitter
134
158
  // Use safe token limits - the TextChunker handles token estimation internally
135
- const maxTokens = this.chunkConfig?.chunkSize || 450; // Safe token limit for embedding models
136
- const overlap = this.chunkConfig?.chunkOverlap || 50; // Character overlap, not token
159
+ const maxTokens = ragOverride ? ragOverride.chunkSize : (this.chunkConfig?.chunkSize || 450); // Safe token limit for embedding models
160
+ const overlap = ragOverride ? ragOverride.chunkOverlap : (this.chunkConfig?.chunkOverlap || 50); // Character overlap, not token
161
+ const separators = ragOverride?.separators || this.chunkConfig?.separators || TextChunker.DEFAULT_SEPARATORS; // Default separators
137
162
 
138
- return TextChunker.chunkText(text, maxTokens, overlap);
163
+ return TextChunker.chunkText(text, maxTokens, overlap, separators);
164
+ }
165
+
166
+ async chunkCSV(rows: Record<string, any>[], ragOverride?: IChunkConfig): Promise<Document[]> {
167
+ // Use safe token limits - the TextChunker handles token estimation internally
168
+ const maxTokens = ragOverride ? ragOverride.chunkSize : (this.chunkConfig?.chunkSize || 450); // Safe token limit for embedding models
169
+ const overlap = ragOverride ? ragOverride.chunkOverlap : (this.chunkConfig?.chunkOverlap || 50); // Character overlap, not token
170
+
171
+ const splitter = new RecursiveCharacterTextSplitter({
172
+ chunkSize: maxTokens,
173
+ chunkOverlap: overlap
174
+ });
175
+
176
+ const docs = rows.map((row, i) => {
177
+ const text = Object.entries(row)
178
+ .map(([k, v]) => `${k}: ${v}`)
179
+ .join("\n");
180
+
181
+ return new Document({
182
+ pageContent: text,
183
+ metadata: { row: i }
184
+ });
185
+ });
186
+
187
+ return await splitter.splitDocuments(docs);
139
188
  }
140
189
 
141
190
  /**
@@ -205,6 +254,7 @@ export class LangChainEmbeddingService {
205
254
 
206
255
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
207
256
  }
257
+
208
258
 
209
259
  /**
210
260
  * Ensure the service is initialized
@@ -43,6 +43,18 @@ export class LangChainRAGService {
43
43
  private isInitialized = false;
44
44
  private logger?: any; // Optional logger interface
45
45
 
46
+ static SheetMimeType: string[] = [
47
+ 'text/csv',
48
+ 'text/tab-separated-values',
49
+ 'application/vnd.ms-excel',
50
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
51
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
52
+ 'application/vnd.ms-excel.sheet.macroEnabled.12',
53
+ 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
54
+ 'application/vnd.oasis.opendocument.spreadsheet',
55
+ 'application/vnd.google-apps.spreadsheet',
56
+ ];
57
+
46
58
  constructor(
47
59
  private embeddingService: LangChainEmbeddingService,
48
60
  private vectorSearchService: OptimizedVectorSearchService
@@ -84,44 +96,57 @@ export class LangChainRAGService {
84
96
  */
85
97
  async indexKnowledge(
86
98
  fileId: string | number,
87
- content: string,
88
- metadata: Record<string, any> = {}
89
- ): Promise<IRAGResponse<{ chunkIds: string[] }>> {
99
+ content: string | Record<string, any>[],
100
+ metadata: Record<string, any> = {},
101
+ batchCallback?: (fragments:string[], batch: number[][]) => Promise<void>,
102
+ ragOverride?: IChunkConfig
103
+ ): Promise<IRAGResponse<{ chunkCount: number }>> {
90
104
  this.log('log', `[INDEXING] Starting indexKnowledge for fileId: ${fileId}`);
91
- this.log('debug', `[INDEXING] Content length: ${content.length} characters`);
105
+ this.log('debug', `[INDEXING] Content length: ${Array.isArray(content) ? content.map(r => Object.values(r).join(' ')).join('\n').length : content.length} characters`);
92
106
 
93
107
  try {
94
108
  await this.ensureInitialized();
95
109
 
96
- // Chunk the content using the embedding service
97
- const chunks = await this.embeddingService.chunkText(content);
98
- this.log('debug', `[INDEXING] Split content into ${chunks.length} chunks for file ${fileId}`);
99
-
100
- // Generate embeddings for all chunks at once (batch processing for speed)
101
- const embeddings = await this.embeddingService.embedTexts(chunks);
102
- this.log('debug', `[INDEXING] Generated embeddings for ${chunks.length} chunks`);
103
-
104
- // Create chunk objects with embeddings
105
- const chunksWithEmbeddings = chunks.map((chunkContent, index) => ({
106
- content: chunkContent,
107
- embedding: embeddings[index],
108
- metadata: {
109
- ...metadata,
110
- fileId,
111
- chunkIndex: index,
112
- id: `knowledge_${fileId}_chunk_${index}`
113
- }
114
- }));
110
+ const mime = metadata.mime || null;
115
111
 
116
- // Save to per-knowledge vector file
117
- await this.saveKnowledgeVector(fileId, chunksWithEmbeddings);
112
+ let chunkTexts: string[] = undefined;
113
+ let embeddings: number[][] = undefined;
118
114
 
119
- const chunkIds = chunksWithEmbeddings.map(chunk => chunk.metadata.id);
120
- this.log('log', `[INDEXING] Successfully indexed file ${fileId} with ${chunkIds.length} chunks using optimized approach`);
115
+ if(mime && LangChainRAGService.isSheetDocument(mime)) {
116
+ this.log('debug', `[INDEXING] SHEET extraction mode detected.`);
117
+
118
+ const docs = await this.embeddingService.chunkCSV(content as Record<string, any>[], ragOverride);
119
+ embeddings = await this.embeddingService.embedDocs(docs, batchCallback);
120
+ chunkTexts = docs.map(d => d.pageContent);
121
+ }else{
122
+ chunkTexts = await this.embeddingService.chunkText(content as string, ragOverride);
123
+ embeddings = await this.embeddingService.embedTexts(chunkTexts, batchCallback);
124
+ }
125
+
126
+ this.log('debug', `[INDEXING] Generated embeddings for ${chunkTexts.length} chunks`);
127
+
128
+
129
+
130
+ if(!batchCallback){
131
+ // Create chunk objects with embeddings
132
+ const chunksWithEmbeddings = chunkTexts.map((chunkContent, index) => ({
133
+ content: chunkContent,
134
+ embedding: embeddings[index],
135
+ metadata: {
136
+ ...metadata,
137
+ fileId,
138
+ chunkIndex: index,
139
+ id: `knowledge_${fileId}_chunk_${index}`
140
+ }
141
+ }));
142
+ await this.saveKnowledgeVector(fileId, chunksWithEmbeddings);
143
+ }
144
+
145
+ this.log('log', `[INDEXING] Successfully indexed file ${fileId} with ${chunkTexts.length} chunks using optimized approach`);
121
146
 
122
147
  return {
123
148
  success: true,
124
- data: { chunkIds }
149
+ data: { chunkCount: chunkTexts.length }
125
150
  };
126
151
 
127
152
  } catch (error: any) {
@@ -134,6 +159,10 @@ export class LangChainRAGService {
134
159
  }
135
160
  }
136
161
 
162
+ static isSheetDocument(mime: string): boolean {
163
+ return LangChainRAGService.SheetMimeType.includes(mime);
164
+ }
165
+
137
166
  /**
138
167
  * Search for relevant knowledge chunks using optimized vector search
139
168
  */
@@ -239,6 +268,10 @@ export class LangChainRAGService {
239
268
  }
240
269
  }
241
270
 
271
+ embedQuery(query: string): Promise<number[]> {
272
+ return this.vectorSearchService.getQueryEmbedding(query);
273
+ }
274
+
242
275
  /**
243
276
  * Get statistics about the RAG system
244
277
  */
@@ -317,6 +350,7 @@ export class LangChainRAGService {
317
350
 
318
351
  /**
319
352
  * Save chunks to knowledge-specific vector file with embeddings
353
+ * Uses streaming JSON write to handle large embedding datasets
320
354
  */
321
355
  private async saveKnowledgeVector(fileId: string | number, chunks: Array<{ content: string; embedding: number[]; metadata: any }>): Promise<void> {
322
356
  const vectorFilePath = this.getKnowledgeVectorPath(fileId);
@@ -328,13 +362,24 @@ export class LangChainRAGService {
328
362
  }
329
363
 
330
364
  try {
331
- const vectorData = {
332
- fileId,
333
- chunks,
334
- timestamp: new Date().toISOString()
335
- };
336
-
337
- fs.writeFileSync(vectorFilePath, JSON.stringify(vectorData, null, 2));
365
+ // Stream JSON to avoid "Invalid string length" on large datasets
366
+ const writeStream = fs.createWriteStream(vectorFilePath);
367
+
368
+ await new Promise<void>((resolve, reject) => {
369
+ writeStream.on('error', reject);
370
+ writeStream.on('finish', resolve);
371
+
372
+ writeStream.write(`{"fileId":${JSON.stringify(fileId)},"timestamp":${JSON.stringify(new Date().toISOString())},"chunks":[`);
373
+
374
+ for (let i = 0; i < chunks.length; i++) {
375
+ if (i > 0) writeStream.write(',');
376
+ writeStream.write(JSON.stringify(chunks[i]));
377
+ }
378
+
379
+ writeStream.write(']}');
380
+ writeStream.end();
381
+ });
382
+
338
383
  this.log('debug', `[SAVE] Successfully saved ${chunks.length} chunks with embeddings for file ${fileId} to: "${vectorFilePath}"`);
339
384
 
340
385
  } catch (error) {
File without changes
@@ -95,6 +95,7 @@ export class OpenAIRateLimitingService {
95
95
  }
96
96
 
97
97
  const results = new Array(items.length);
98
+ let doneItems = 0;
98
99
 
99
100
  // Process all batches with queue concurrency control
100
101
  await Promise.all(batchStarts.map(meta =>
@@ -104,9 +105,14 @@ export class OpenAIRateLimitingService {
104
105
  for (let attempt = 0; attempt < 6; attempt++) {
105
106
  try {
106
107
  const batchResults = await this.callWithRetry(() => executor(attemptBatch));
108
+
107
109
  for (let i = 0; i < batchResults.length; i++) {
108
110
  results[meta.start + i] = batchResults[i];
111
+ doneItems++;
109
112
  }
113
+
114
+ this.logger.debug(`Embedding chunks done [${doneItems}/${items.length}]`);
115
+
110
116
  break;
111
117
  } catch (err: any) {
112
118
  const status = err?.status || err?.response?.status;
@@ -108,7 +108,7 @@ export class OptimizedVectorSearchService {
108
108
  /**
109
109
  * Get query embedding with caching
110
110
  */
111
- private async getQueryEmbedding(query: string): Promise<number[]> {
111
+ async getQueryEmbedding(query: string): Promise<number[]> {
112
112
  // Check cache first
113
113
  if (this.queryEmbeddingCache.has(query)) {
114
114
  return this.queryEmbeddingCache.get(query)!;
@@ -16,7 +16,7 @@ export class TextChunker {
16
16
  * Default separators following LangChain RecursiveCharacterTextSplitter approach
17
17
  * Ordered by preference for breaking text
18
18
  */
19
- private static readonly DEFAULT_SEPARATORS = [
19
+ static readonly DEFAULT_SEPARATORS = [
20
20
  '\n\n', // Double newlines (paragraphs)
21
21
  '\n', // Single newlines
22
22
  '. ', // Sentence endings
File without changes
File without changes
File without changes
File without changes
File without changes
package/tsconfig.json CHANGED
@@ -14,11 +14,14 @@
14
14
  "allowSyntheticDefaultImports": true,
15
15
  "sourceMap": true,
16
16
  "declaration": true,
17
+ "types": []
17
18
  },
18
19
  "include": [
19
20
  "src"
20
21
  ],
21
22
  "exclude": [
22
- "node_modules"
23
+ "node_modules",
24
+ "**/*.d.ts",
25
+ "**/node_modules/**",
23
26
  ]
24
27
  }