@rws-framework/ai-tools 3.1.2 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@rws-framework/ai-tools",
  "private": false,
- "version": "3.1.2",
+ "version": "3.2.0",
  "description": "",
  "main": "src/index.ts",
  "scripts": {},
package/src/index.ts CHANGED
@@ -4,6 +4,7 @@ import { ILLMChunk, IRWSPromptRequestExecutor, IRWSSinglePromptRequestExecutor,
  import { EmbedLoader as RWSEmbed, IConvoDebugXMLData, IEmbeddingsHandler, ISplitterParams } from './models/convo/EmbedLoader';
  import RWSVectorStore from './models/convo/VectorStore';
  import { LangChainEmbeddingService } from './services/LangChainEmbeddingService';
+ import { OpenAIRateLimitingService, IRateLimitConfig } from './services/OpenAIRateLimitingService';
  import { LangChainVectorStoreService, IVectorStoreConfig, IDocumentChunk, IVectorSearchRequest, IVectorSearchResponse, ISearchResult } from './services/LangChainVectorStoreService';
  import { LangChainRAGService, ILangChainRAGConfig, IRAGIndexRequest, IRAGSearchRequest, IRAGResponse, IRAGStats } from './services/LangChainRAGService';
  import { IContextToken } from './types/IContextToken';
@@ -36,11 +37,13 @@ export {
  ToolHandler,
  // New LangChain-based services
  LangChainEmbeddingService,
+ OpenAIRateLimitingService,
  LangChainVectorStoreService,
  LangChainRAGService,
  // Types
  IEmbeddingConfig,
  IChunkConfig,
+ IRateLimitConfig,
  IVectorStoreConfig,
  IDocumentChunk,
  IVectorSearchRequest,
package/src/services/LangChainEmbeddingService.ts CHANGED
@@ -1,10 +1,12 @@
  import { Injectable } from '@nestjs/common';
  import { Embeddings } from '@langchain/core/embeddings';
  import { CohereEmbeddings } from '@langchain/cohere';
+ import { OpenAIEmbeddings } from '@langchain/openai';
  import { Document } from '@langchain/core/documents';
  import { IEmbeddingConfig, IChunkConfig } from '../types';
  import { TextChunker } from './TextChunker';
  import RWSVectorStore, { VectorDocType, IVectorStoreConfig } from '../models/convo/VectorStore';
+ import { OpenAIRateLimitingService } from './OpenAIRateLimitingService';

  @Injectable()
  export class LangChainEmbeddingService {
@@ -14,9 +16,7 @@ export class LangChainEmbeddingService {
  private isInitialized = false;
  private vectorStore: RWSVectorStore | null = null;

- constructor() {
- // Empty constructor for NestJS dependency injection
- }
+ constructor(private rateLimitingService: OpenAIRateLimitingService) {}

  /**
  * Initialize the service with configuration
@@ -35,20 +35,6 @@ export class LangChainEmbeddingService {
  this.isInitialized = true;
  }

- /**
- * Alternative constructor-like method for backward compatibility
- */
- static create(config: IEmbeddingConfig, chunkConfig?: IChunkConfig): LangChainEmbeddingService {
- const service = new LangChainEmbeddingService();
- service.config = config;
- service.chunkConfig = chunkConfig || {
- chunkSize: 1000,
- chunkOverlap: 200
- };
- service.initializeEmbeddings();
- service.isInitialized = true;
- return service;
- }

  private initializeEmbeddings(): void {
  switch (this.config.provider) {
@@ -59,10 +45,30 @@ export class LangChainEmbeddingService {
  batchSize: this.config.batchSize || 96
  });
  break;
+
+ case 'openai':
+ this.embeddings = new OpenAIEmbeddings({
+ apiKey: this.config.apiKey,
+ model: this.config.model || 'text-embedding-3-large',
+ batchSize: 1 // We'll handle batching ourselves
+ });
+
+
+ this.rateLimitingService.initialize(this.config.model || 'text-embedding-3-large', {
+ rpm: 500,
+ tpm: 300_000,
+ concurrency: 4,
+ maxRetries: 6,
+ baseBackoffMs: 500,
+ safetyFactor: 0.75
+ });
+ break;

  default:
  throw new Error(`Unsupported embedding provider: ${this.config.provider}`);
  }
+
+ console.log(`Initialized ${this.config.provider} embeddings with model ${this.config.model}`, this.config.apiKey);
  }

  private initializeTextSplitter(chunkConfig?: IChunkConfig): void {
@@ -70,19 +76,44 @@ export class LangChainEmbeddingService {
  // This method is kept for compatibility but doesn't initialize anything
  }

- /**
- * Generate embeddings for multiple texts
+ /**
+ * Generate embeddings for multiple texts with sophisticated rate limiting
  */
  async embedTexts(texts: string[]): Promise<number[][]> {
  this.ensureInitialized();
+
+ if (this.config.provider === 'openai' && this.rateLimitingService) {
+ return await this.rateLimitingService.executeWithRateLimit(
+ texts,
+ async (batch: string[]) => {
+ return await this.embeddings.embedDocuments(batch);
+ },
+ (text: string) => text // Token extractor
+ );
+ }
+
+ // For other providers (like Cohere), use the standard approach
  return await this.embeddings.embedDocuments(texts);
  }

  /**
- * Generate embedding for a single text
+ * Generate embedding for a single text with rate limiting
  */
  async embedText(text: string): Promise<number[]> {
  this.ensureInitialized();
+
+ if (this.config.provider === 'openai' && this.rateLimitingService) {
+ // For single texts with OpenAI, use the rate-controlled batch method
+ const results = await this.rateLimitingService.executeWithRateLimit(
+ [text],
+ async (batch: string[]) => {
+ return await this.embeddings.embedDocuments(batch);
+ },
+ (text: string) => text
+ );
+ return results[0];
+ }
+
  return await this.embeddings.embedQuery(text);
  }

package/src/services/OpenAIRateLimitingService.ts ADDED
@@ -0,0 +1,232 @@
+ import { Injectable } from '@nestjs/common';
+ import PQueue from 'p-queue';
+
+ // Optional tiktoken import
+ let encoding_for_model: any = null;
+ try {
+ const tiktoken = require('tiktoken');
+ encoding_for_model = tiktoken.encoding_for_model;
+ } catch (e) {
+ console.warn('tiktoken not available, using character-based token estimation');
+ }
+
+ export interface IRateLimitConfig {
+ rpm?: number; // Requests per minute
+ tpm?: number; // Tokens per minute
+ concurrency?: number; // Parallel requests
+ maxRetries?: number; // Maximum retry attempts
+ baseBackoffMs?: number; // Base backoff delay
+ safetyFactor?: number; // Safety factor for limits
+ }
+
+ export interface IBatchMetadata<T = any> {
+ start: number;
+ batch: T[];
+ }
+
+ @Injectable()
+ export class OpenAIRateLimitingService {
+ private static readonly DEFAULT_CONFIG: Required<IRateLimitConfig> = {
+ rpm: 500,
+ tpm: 300_000,
+ concurrency: 4,
+ maxRetries: 6,
+ baseBackoffMs: 500,
+ safetyFactor: 0.75
+ };
+
+ private tokenizer: any = null;
+ private queue: PQueue;
+ private config: Required<IRateLimitConfig>;
+
+ constructor() {
+ this.config = { ...OpenAIRateLimitingService.DEFAULT_CONFIG };
+ this.queue = new PQueue({ concurrency: this.config.concurrency });
+ }
+
+ /**
+ * Initialize the service with a specific model and configuration
+ */
+ initialize(model: string, config?: Partial<IRateLimitConfig>): void {
+ if (config) {
+ this.config = { ...this.config, ...config };
+ }
+
+ // Initialize tokenizer for precise token counting
+ try {
+ if (encoding_for_model) {
+ this.tokenizer = encoding_for_model(model);
+ } else {
+ this.tokenizer = null;
+ }
+ } catch (e) {
+ console.warn(`Could not load tokenizer for model ${model}, using character-based estimation`);
+ this.tokenizer = null;
+ }
+
+ // Reinitialize queue with new concurrency
+ this.queue = new PQueue({ concurrency: this.config.concurrency });
+ }
+
+ /**
+ * Execute a batch of operations with sophisticated rate limiting
+ */
+ async executeWithRateLimit<T, R>(
+ items: T[],
+ executor: (batch: T[]) => Promise<R[]>,
+ tokenExtractor?: (item: T) => string
+ ): Promise<R[]> {
+ const tokensPerMinutePerWorker = Math.floor(
+ this.config.tpm * this.config.safetyFactor / this.config.concurrency
+ );
+ const maxTokensPerCall = Math.max(1_000, Math.floor(tokensPerMinutePerWorker / 60));
+
+ // Build batches based on token limits
+ const batches = this.tokenizer && tokenExtractor
+ ? Array.from(this.chunkByToken(items, maxTokensPerCall, tokenExtractor))
+ : this.fallbackChunking(items, 128);
+
+ // Map batch -> start index for placing results back
+ const batchStarts: Array<IBatchMetadata<T>> = [];
+ let idx = 0;
+ for (const batch of batches) {
+ batchStarts.push({ start: idx, batch });
+ idx += batch.length;
+ }
+
+ const results = new Array(items.length);
+
+ // Process all batches with queue concurrency control
+ await Promise.all(batchStarts.map(meta =>
+ this.queue.add(async () => {
+ // Adaptive shrink loop on repeated 429s
+ let attemptBatch = meta.batch;
+ for (let attempt = 0; attempt < 6; attempt++) {
+ try {
+ const batchResults = await this.callWithRetry(() => executor(attemptBatch));
+ for (let i = 0; i < batchResults.length; i++) {
+ results[meta.start + i] = batchResults[i];
+ }
+ break;
+ } catch (err: any) {
+ const status = err?.status || err?.response?.status;
+ if (status === 429) {
+ // Shrink batch if >1 and retry quickly (binary shrink)
+ if (attemptBatch.length <= 1) throw err;
+ attemptBatch = attemptBatch.slice(0, Math.ceil(attemptBatch.length / 2));
+ console.log(`Rate limit hit, shrinking batch to ${attemptBatch.length} items`);
+ // Small sleep to avoid immediate retry stampede
+ await this.sleep(200 + Math.random() * 200);
+ continue;
+ }
+ throw err;
+ }
+ }
+ })
+ ));
+
+ await this.queue.onIdle();
+ return results;
+ }
+
+ /**
+ * Count tokens in text
+ */
+ private countTokens(text: string): number {
+ if (!this.tokenizer) {
+ return Math.ceil(text.length / 4); // Conservative fallback
+ }
+ return this.tokenizer.encode(text).length;
+ }
+
+ /**
+ * Chunk items by token budget
+ */
+ private *chunkByToken<T>(
+ items: T[],
+ maxTokensPerCall: number,
+ tokenExtractor: (item: T) => string
+ ): Generator<T[]> {
+ let batch: T[] = [];
+ let tokens = 0;
+
+ for (const item of items) {
+ const text = tokenExtractor(item);
+ const itemTokens = this.countTokens(text);
+
+ if (batch.length && tokens + itemTokens > maxTokensPerCall) {
+ yield batch;
+ batch = [];
+ tokens = 0;
+ }
+
+ batch.push(item);
+ tokens += itemTokens;
+ }
+
+ if (batch.length) {
+ yield batch;
+ }
+ }
+
+ /**
+ * Fallback chunking when tokenizer is not available
+ */
+ private fallbackChunking<T>(items: T[], itemsPerBatch: number): T[][] {
+ const result: T[][] = [];
+ for (let i = 0; i < items.length; i += itemsPerBatch) {
+ result.push(items.slice(i, i + itemsPerBatch));
+ }
+ return result;
+ }
+
+ /**
+ * Call function with retry logic and exponential backoff
+ */
+ private async callWithRetry<T>(fn: () => Promise<T>, attempt: number = 0): Promise<T> {
+ try {
+ return await fn();
+ } catch (err: any) {
+ const status = err?.status || err?.response?.status;
+ const isRateLimit = status === 429 || (status >= 500 && status < 600);
+
+ if (!isRateLimit || attempt >= this.config.maxRetries) {
+ throw err;
+ }
+
+ const delay = Math.min(60_000, this.config.baseBackoffMs * (2 ** attempt));
+ const jitter = Math.random() * 300;
+
+ console.log(`Retrying request in ${delay + jitter}ms (attempt ${attempt + 1}/${this.config.maxRetries})`);
+ await this.sleep(delay + jitter);
+
+ return this.callWithRetry(fn, attempt + 1);
+ }
+ }
+
+ /**
+ * Sleep utility for delays
+ */
+ private sleep(ms: number): Promise<void> {
+ return new Promise(resolve => setTimeout(resolve, ms));
+ }
+
+ /**
+ * Get current configuration
+ */
+ getConfig(): Required<IRateLimitConfig> {
+ return { ...this.config };
+ }
+
+ /**
+ * Update configuration
+ */
+ updateConfig(newConfig: Partial<IRateLimitConfig>): void {
+ this.config = { ...this.config, ...newConfig };
+
+ // Update queue concurrency if changed
+ if (newConfig.concurrency && newConfig.concurrency !== this.queue.concurrency) {
+ this.queue = new PQueue({ concurrency: this.config.concurrency });
+ }
+ }
+ }
@@ -0,0 +1,110 @@
+ /**
+ * Example usage of OpenAIRateLimitingService for other AI operations
+ *
+ * This demonstrates how to use the rate limiting service for:
+ * - OpenAI completions
+ * - Image generation
+ * - Any other OpenAI API calls that need rate limiting
+ */
+
+ import { OpenAIRateLimitingService, IRateLimitConfig } from '../OpenAIRateLimitingService';
+ import { OpenAI } from 'openai';
+
+ export class OpenAICompletionService {
+ private rateLimitingService: OpenAIRateLimitingService;
+ private openai: OpenAI;
+
+ constructor(apiKey: string, config?: Partial<IRateLimitConfig>) {
+ this.openai = new OpenAI({ apiKey });
+ this.rateLimitingService = new OpenAIRateLimitingService();
+
+ // Initialize with model-specific limits
+ this.rateLimitingService.initialize('gpt-4', {
+ rpm: 500, // Adjust based on your OpenAI plan
+ tpm: 30_000, // Tokens per minute for GPT-4
+ concurrency: 3, // Lower concurrency for completion models
+ maxRetries: 5,
+ ...config
+ });
+ }
+
+ /**
+ * Generate completions with rate limiting
+ */
+ async generateCompletions(
+ prompts: string[],
+ model: string = 'gpt-4-turbo'
+ ): Promise<string[]> {
+ return await this.rateLimitingService.executeWithRateLimit(
+ prompts,
+ async (batch: string[]) => {
+ // Execute batch of completion requests
+ const promises = batch.map(prompt =>
+ this.openai.chat.completions.create({
+ model,
+ messages: [{ role: 'user', content: prompt }],
+ max_tokens: 500
+ })
+ );
+
+ const results = await Promise.all(promises);
+ return results.map(result =>
+ result.choices[0]?.message?.content || ''
+ );
+ },
+ (prompt: string) => prompt // Token extractor for accurate batching
+ );
+ }
+
+ /**
+ * Generate images with rate limiting
+ */
+ async generateImages(prompts: string[]): Promise<string[]> {
+ return await this.rateLimitingService.executeWithRateLimit(
+ prompts,
+ async (batch: string[]) => {
+ const promises = batch.map(prompt =>
+ this.openai.images.generate({
+ model: 'dall-e-3',
+ prompt,
+ size: '1024x1024',
+ quality: 'standard',
+ n: 1
+ })
+ );
+
+ const results = await Promise.all(promises);
+ return results.map(result =>
+ result.data[0]?.url || ''
+ );
+ },
+ (prompt: string) => prompt
+ );
+ }
+
+ /**
+ * Update rate limiting configuration
+ */
+ updateRateLimits(config: Partial<IRateLimitConfig>): void {
+ this.rateLimitingService.updateConfig(config);
+ }
+ }
+
+ /**
+ * Usage example:
+ *
+ * const completionService = new OpenAICompletionService(process.env.OPENAI_API_KEY, {
+ * rpm: 100, // Lower RPM for your plan
+ * tpm: 10_000, // Lower TPM
+ * concurrency: 2
+ * });
+ *
+ * const prompts = [
+ * "Explain quantum computing",
+ * "Write a haiku about AI",
+ * "Summarize the history of computing"
+ * ];
+ *
+ * const completions = await completionService.generateCompletions(prompts);
+ * console.log(completions);
+ */
@@ -2,7 +2,7 @@
  * Embedding service configuration interfaces
  */
  export interface IEmbeddingConfig {
- provider: 'cohere';
+ provider: 'cohere' | 'openai';
  apiKey: string;
  model?: string;
  batchSize?: number;
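
Taken together, this release adds 'openai' as an accepted IEmbeddingConfig provider and routes OpenAI embedding calls through the new OpenAIRateLimitingService. Below is a minimal usage sketch based only on the signatures visible in this diff (OpenAIRateLimitingService.initialize, executeWithRateLimit, and OpenAIEmbeddings.embedDocuments); the model name, rate limits, and sample texts are illustrative, not values prescribed by the package.

import { OpenAIEmbeddings } from '@langchain/openai';
import { OpenAIRateLimitingService } from '@rws-framework/ai-tools';

async function embedWithLimits(texts: string[]): Promise<number[][]> {
  // Illustrative limits; real values depend on your OpenAI tier.
  const limiter = new OpenAIRateLimitingService();
  limiter.initialize('text-embedding-3-large', { rpm: 500, tpm: 300_000, concurrency: 4 });

  const embeddings = new OpenAIEmbeddings({
    apiKey: process.env.OPENAI_API_KEY,
    model: 'text-embedding-3-large',
    batchSize: 1 // batching is delegated to the rate limiter, as LangChainEmbeddingService does
  });

  // executeWithRateLimit chunks the inputs by token budget, runs batches through a
  // concurrency-limited queue, and retries on 429/5xx with exponential backoff.
  return limiter.executeWithRateLimit(
    texts,
    (batch) => embeddings.embedDocuments(batch),
    (text) => text // token extractor used for token-budget batching
  );
}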