@rws-framework/ai-tools 3.1.2 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@rws-framework/ai-tools",
  "private": false,
- "version": "3.1.2",
+ "version": "3.2.0",
  "description": "",
  "main": "src/index.ts",
  "scripts": {},
package/src/index.ts CHANGED
@@ -4,6 +4,7 @@ import { ILLMChunk, IRWSPromptRequestExecutor, IRWSSinglePromptRequestExecutor,
  import { EmbedLoader as RWSEmbed, IConvoDebugXMLData, IEmbeddingsHandler, ISplitterParams } from './models/convo/EmbedLoader';
  import RWSVectorStore from './models/convo/VectorStore';
  import { LangChainEmbeddingService } from './services/LangChainEmbeddingService';
+ import { OpenAIRateLimitingService, IRateLimitConfig } from './services/OpenAIRateLimitingService';
  import { LangChainVectorStoreService, IVectorStoreConfig, IDocumentChunk, IVectorSearchRequest, IVectorSearchResponse, ISearchResult } from './services/LangChainVectorStoreService';
  import { LangChainRAGService, ILangChainRAGConfig, IRAGIndexRequest, IRAGSearchRequest, IRAGResponse, IRAGStats } from './services/LangChainRAGService';
  import { IContextToken } from './types/IContextToken';
@@ -36,11 +37,13 @@ export {
  ToolHandler,
  // New LangChain-based services
  LangChainEmbeddingService,
+ OpenAIRateLimitingService,
  LangChainVectorStoreService,
  LangChainRAGService,
  // Types
  IEmbeddingConfig,
  IChunkConfig,
+ IRateLimitConfig,
  IVectorStoreConfig,
  IDocumentChunk,
  IVectorSearchRequest,
package/src/services/LangChainEmbeddingService.ts CHANGED
@@ -1,10 +1,12 @@
  import { Injectable } from '@nestjs/common';
  import { Embeddings } from '@langchain/core/embeddings';
  import { CohereEmbeddings } from '@langchain/cohere';
+ import { OpenAIEmbeddings } from '@langchain/openai';
  import { Document } from '@langchain/core/documents';
  import { IEmbeddingConfig, IChunkConfig } from '../types';
  import { TextChunker } from './TextChunker';
  import RWSVectorStore, { VectorDocType, IVectorStoreConfig } from '../models/convo/VectorStore';
+ import { OpenAIRateLimitingService } from './OpenAIRateLimitingService';

  @Injectable()
  export class LangChainEmbeddingService {
@@ -14,9 +16,7 @@ export class LangChainEmbeddingService {
  private isInitialized = false;
  private vectorStore: RWSVectorStore | null = null;

- constructor() {
- // Empty constructor for NestJS dependency injection
- }
+ constructor(private rateLimitingService: OpenAIRateLimitingService) {}

  /**
  * Initialize the service with configuration
@@ -35,20 +35,6 @@ export class LangChainEmbeddingService {
  this.isInitialized = true;
  }

- /**
- * Alternative constructor-like method for backward compatibility
- */
- static create(config: IEmbeddingConfig, chunkConfig?: IChunkConfig): LangChainEmbeddingService {
- const service = new LangChainEmbeddingService();
- service.config = config;
- service.chunkConfig = chunkConfig || {
- chunkSize: 1000,
- chunkOverlap: 200
- };
- service.initializeEmbeddings();
- service.isInitialized = true;
- return service;
- }

  private initializeEmbeddings(): void {
  switch (this.config.provider) {
@@ -59,10 +45,30 @@ export class LangChainEmbeddingService {
  batchSize: this.config.batchSize || 96
  });
  break;
+
+ case 'openai':
+ this.embeddings = new OpenAIEmbeddings({
+ apiKey: this.config.apiKey,
+ model: this.config.model || 'text-embedding-3-large',
+ batchSize: 1 // We'll handle batching ourselves
+ });
+
+
+ this.rateLimitingService.initialize(this.config.model || 'text-embedding-3-large', {
+ rpm: 500,
+ tpm: 300_000,
+ concurrency: 4,
+ maxRetries: 6,
+ baseBackoffMs: 500,
+ safetyFactor: 0.75
+ });
+ break;

  default:
  throw new Error(`Unsupported embedding provider: ${this.config.provider}`);
  }
+
+ console.log(`Initialized ${this.config.provider} embeddings with model ${this.config.model}`, this.config.apiKey);
  }

  private initializeTextSplitter(chunkConfig?: IChunkConfig): void {
@@ -70,19 +76,44 @@ export class LangChainEmbeddingService {
  // This method is kept for compatibility but doesn't initialize anything
  }

- /**
- * Generate embeddings for multiple texts
+ /**
+ * Generate embeddings for multiple texts with sophisticated rate limiting
  */
  async embedTexts(texts: string[]): Promise<number[][]> {
  this.ensureInitialized();
+
+ if (this.config.provider === 'openai' && this.rateLimitingService) {
+ return await this.rateLimitingService.executeWithRateLimit(
+ texts,
+ async (batch: string[]) => {
+ return await this.embeddings.embedDocuments(batch);
+ },
+ (text: string) => text // Token extractor
+ );
+ }
+
+ // For other providers (like Cohere), use the standard approach
  return await this.embeddings.embedDocuments(texts);
  }

  /**
- * Generate embedding for a single text
+ * Generate embedding for a single text with rate limiting
  */
  async embedText(text: string): Promise<number[]> {
  this.ensureInitialized();
+
+ if (this.config.provider === 'openai' && this.rateLimitingService) {
+ // For single texts with OpenAI, use the rate-controlled batch method
+ const results = await this.rateLimitingService.executeWithRateLimit(
+ [text],
+ async (batch: string[]) => {
+ return await this.embeddings.embedDocuments(batch);
+ },
+ (text: string) => text
+ );
+ return results[0];
+ }
+
  return await this.embeddings.embedQuery(text);
  }

package/src/services/OpenAIRateLimitingService.ts ADDED
@@ -0,0 +1,232 @@
+ import { Injectable } from '@nestjs/common';
+ import PQueue from 'p-queue';
+
+ // Optional tiktoken import
+ let encoding_for_model: any = null;
+ try {
+ const tiktoken = require('tiktoken');
+ encoding_for_model = tiktoken.encoding_for_model;
+ } catch (e) {
+ console.warn('tiktoken not available, using character-based token estimation');
+ }
+
+ export interface IRateLimitConfig {
+ rpm?: number; // Requests per minute
+ tpm?: number; // Tokens per minute
+ concurrency?: number; // Parallel requests
+ maxRetries?: number; // Maximum retry attempts
+ baseBackoffMs?: number; // Base backoff delay
+ safetyFactor?: number; // Safety factor for limits
+ }
+
+ export interface IBatchMetadata<T = any> {
+ start: number;
+ batch: T[];
+ }
+
+ @Injectable()
+ export class OpenAIRateLimitingService {
+ private static readonly DEFAULT_CONFIG: Required<IRateLimitConfig> = {
+ rpm: 500,
+ tpm: 300_000,
+ concurrency: 4,
+ maxRetries: 6,
+ baseBackoffMs: 500,
+ safetyFactor: 0.75
+ };
+
+ private tokenizer: any = null;
+ private queue: PQueue;
+ private config: Required<IRateLimitConfig>;
+
+ constructor() {
+ this.config = { ...OpenAIRateLimitingService.DEFAULT_CONFIG };
+ this.queue = new PQueue({ concurrency: this.config.concurrency });
+ }
+
+ /**
+ * Initialize the service with a specific model and configuration
+ */
+ initialize(model: string, config?: Partial<IRateLimitConfig>): void {
+ if (config) {
+ this.config = { ...this.config, ...config };
+ }
+
+ // Initialize tokenizer for precise token counting
+ try {
+ if (encoding_for_model) {
+ this.tokenizer = encoding_for_model(model);
+ } else {
+ this.tokenizer = null;
+ }
+ } catch (e) {
+ console.warn(`Could not load tokenizer for model ${model}, using character-based estimation`);
+ this.tokenizer = null;
+ }
+
+ // Reinitialize queue with new concurrency
+ this.queue = new PQueue({ concurrency: this.config.concurrency });
+ }
+
+ /**
+ * Execute a batch of operations with sophisticated rate limiting
+ */
+ async executeWithRateLimit<T, R>(
+ items: T[],
+ executor: (batch: T[]) => Promise<R[]>,
+ tokenExtractor?: (item: T) => string
+ ): Promise<R[]> {
+ const tokensPerMinutePerWorker = Math.floor(
+ this.config.tpm * this.config.safetyFactor / this.config.concurrency
+ );
+ const maxTokensPerCall = Math.max(1_000, Math.floor(tokensPerMinutePerWorker / 60));
+
+ // Build batches based on token limits
+ const batches = this.tokenizer && tokenExtractor
+ ? Array.from(this.chunkByToken(items, maxTokensPerCall, tokenExtractor))
+ : this.fallbackChunking(items, 128);
+
+ // Map batch -> start index for placing results back
+ const batchStarts: Array<IBatchMetadata<T>> = [];
+ let idx = 0;
+ for (const batch of batches) {
+ batchStarts.push({ start: idx, batch });
+ idx += batch.length;
+ }
+
+ const results = new Array(items.length);
+
+ // Process all batches with queue concurrency control
+ await Promise.all(batchStarts.map(meta =>
+ this.queue.add(async () => {
+ // Adaptive shrink loop on repeated 429s
+ let attemptBatch = meta.batch;
+ for (let attempt = 0; attempt < 6; attempt++) {
+ try {
+ const batchResults = await this.callWithRetry(() => executor(attemptBatch));
+ for (let i = 0; i < batchResults.length; i++) {
+ results[meta.start + i] = batchResults[i];
+ }
+ break;
+ } catch (err: any) {
+ const status = err?.status || err?.response?.status;
+ if (status === 429) {
+ // Shrink batch if >1 and retry quickly (binary shrink)
+ if (attemptBatch.length <= 1) throw err;
+ attemptBatch = attemptBatch.slice(0, Math.ceil(attemptBatch.length / 2));
+ console.log(`Rate limit hit, shrinking batch to ${attemptBatch.length} items`);
+ // Small sleep to avoid immediate retry stampede
+ await this.sleep(200 + Math.random() * 200);
+ continue;
+ }
+ throw err;
+ }
+ }
+ })
+ ));
+
+ await this.queue.onIdle();
+ return results;
+ }
+
+ /**
+ * Count tokens in text
+ */
+ private countTokens(text: string): number {
+ if (!this.tokenizer) {
+ return Math.ceil(text.length / 4); // Conservative fallback
+ }
+ return this.tokenizer.encode(text).length;
+ }
+
+ /**
+ * Chunk items by token budget
+ */
+ private *chunkByToken<T>(
+ items: T[],
+ maxTokensPerCall: number,
+ tokenExtractor: (item: T) => string
+ ): Generator<T[]> {
+ let batch: T[] = [];
+ let tokens = 0;
+
+ for (const item of items) {
+ const text = tokenExtractor(item);
+ const itemTokens = this.countTokens(text);
+
+ if (batch.length && tokens + itemTokens > maxTokensPerCall) {
+ yield batch;
+ batch = [];
+ tokens = 0;
+ }
+
+ batch.push(item);
+ tokens += itemTokens;
+ }
+
+ if (batch.length) {
+ yield batch;
+ }
+ }
+
+ /**
+ * Fallback chunking when tokenizer is not available
+ */
+ private fallbackChunking<T>(items: T[], itemsPerBatch: number): T[][] {
+ const result: T[][] = [];
+ for (let i = 0; i < items.length; i += itemsPerBatch) {
+ result.push(items.slice(i, i + itemsPerBatch));
+ }
+ return result;
+ }
+
+ /**
+ * Call function with retry logic and exponential backoff
+ */
+ private async callWithRetry<T>(fn: () => Promise<T>, attempt: number = 0): Promise<T> {
+ try {
+ return await fn();
+ } catch (err: any) {
+ const status = err?.status || err?.response?.status;
+ const isRateLimit = status === 429 || (status >= 500 && status < 600);
+
+ if (!isRateLimit || attempt >= this.config.maxRetries) {
+ throw err;
+ }
+
+ const delay = Math.min(60_000, this.config.baseBackoffMs * (2 ** attempt));
+ const jitter = Math.random() * 300;
+
+ console.log(`Retrying request in ${delay + jitter}ms (attempt ${attempt + 1}/${this.config.maxRetries})`);
+ await this.sleep(delay + jitter);
+
+ return this.callWithRetry(fn, attempt + 1);
+ }
+ }
+
+ /**
+ * Sleep utility for delays
+ */
+ private sleep(ms: number): Promise<void> {
+ return new Promise(resolve => setTimeout(resolve, ms));
+ }
+
+ /**
+ * Get current configuration
+ */
+ getConfig(): Required<IRateLimitConfig> {
+ return { ...this.config };
+ }
+
+ /**
+ * Update configuration
+ */
+ updateConfig(newConfig: Partial<IRateLimitConfig>): void {
+ this.config = { ...this.config, ...newConfig };
+
+ // Update queue concurrency if changed
+ if (newConfig.concurrency && newConfig.concurrency !== this.queue.concurrency) {
+ this.queue = new PQueue({ concurrency: this.config.concurrency });
+ }
+ }
+ }
@@ -0,0 +1,110 @@
+ /**
+ * Example usage of OpenAIRateLimitingService for other AI operations
+ *
+ * This demonstrates how to use the rate limiting service for:
+ * - OpenAI completions
+ * - Image generation
+ * - Any other OpenAI API calls that need rate limiting
+ */
+
+ import { OpenAIRateLimitingService, IRateLimitConfig } from '../OpenAIRateLimitingService';
+ import { OpenAI } from 'openai';
+
+ export class OpenAICompletionService {
+ private rateLimitingService: OpenAIRateLimitingService;
+ private openai: OpenAI;
+
+ constructor(apiKey: string, config?: Partial<IRateLimitConfig>) {
+ this.openai = new OpenAI({ apiKey });
+ this.rateLimitingService = new OpenAIRateLimitingService();
+
+ // Initialize with model-specific limits
+ this.rateLimitingService.initialize('gpt-4', {
+ rpm: 500, // Adjust based on your OpenAI plan
+ tpm: 30_000, // Tokens per minute for GPT-4
+ concurrency: 3, // Lower concurrency for completion models
+ maxRetries: 5,
+ ...config
+ });
+ }
+
+ /**
+ * Generate completions with rate limiting
+ */
+ async generateCompletions(
+ prompts: string[],
+ model: string = 'gpt-4-turbo'
+ ): Promise<string[]> {
+ return await this.rateLimitingService.executeWithRateLimit(
+ prompts,
+ async (batch: string[]) => {
+ // Execute batch of completion requests
+ const promises = batch.map(prompt =>
+ this.openai.chat.completions.create({
+ model,
+ messages: [{ role: 'user', content: prompt }],
+ max_tokens: 500
+ })
+ );
+
+ const results = await Promise.all(promises);
+ return results.map(result =>
+ result.choices[0]?.message?.content || ''
+ );
+ },
+ (prompt: string) => prompt // Token extractor for accurate batching
+ );
+ }
+
+ /**
+ * Generate images with rate limiting
+ */
+ async generateImages(prompts: string[]): Promise<string[]> {
+ return await this.rateLimitingService.executeWithRateLimit(
+ prompts,
+ async (batch: string[]) => {
+ const promises = batch.map(prompt =>
+ this.openai.images.generate({
+ model: 'dall-e-3',
+ prompt,
+ size: '1024x1024',
+ quality: 'standard',
+ n: 1
+ })
+ );
+
+ const results = await Promise.all(promises);
+ return results.map(result =>
+ result.data[0]?.url || ''
+ );
+ },
+ (prompt: string) => prompt
+ );
+ }
+
+ /**
+ * Update rate limiting configuration
+ */
+ updateRateLimits(config: Partial<IRateLimitConfig>): void {
+ this.rateLimitingService.updateConfig(config);
+ }
+ }
+
+ /**
+ * Usage example:
+ *
+ * const completionService = new OpenAICompletionService(process.env.OPENAI_API_KEY, {
+ * rpm: 100, // Lower RPM for your plan
+ * tpm: 10_000, // Lower TPM
+ * concurrency: 2
+ * });
+ *
+ * const prompts = [
+ * "Explain quantum computing",
+ * "Write a haiku about AI",
+ * "Summarize the history of computing"
+ * ];
+ *
+ * const completions = await completionService.generateCompletions(prompts);
+ * console.log(completions);
+ */
@@ -2,7 +2,7 @@
  * Embedding service configuration interfaces
  */
  export interface IEmbeddingConfig {
- provider: 'cohere';
+ provider: 'cohere' | 'openai';
  apiKey: string;
  model?: string;
  batchSize?: number;
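
Taken together, this release adds 'openai' as an accepted IEmbeddingConfig provider and routes OpenAI embedding calls through the new OpenAIRateLimitingService. Below is a minimal usage sketch based only on the signatures visible in this diff (OpenAIRateLimitingService.initialize, executeWithRateLimit, and OpenAIEmbeddings.embedDocuments); the model name, rate limits, and sample texts are illustrative, not values prescribed by the package.

import { OpenAIEmbeddings } from '@langchain/openai';
import { OpenAIRateLimitingService } from '@rws-framework/ai-tools';

async function embedWithLimits(texts: string[]): Promise<number[][]> {
  // Illustrative limits; real values depend on your OpenAI tier.
  const limiter = new OpenAIRateLimitingService();
  limiter.initialize('text-embedding-3-large', { rpm: 500, tpm: 300_000, concurrency: 4 });

  const embeddings = new OpenAIEmbeddings({
    apiKey: process.env.OPENAI_API_KEY,
    model: 'text-embedding-3-large',
    batchSize: 1 // batching is delegated to the rate limiter, as LangChainEmbeddingService does
  });

  // executeWithRateLimit chunks the inputs by token budget, runs batches through a
  // concurrency-limited queue, and retries on 429/5xx with exponential backoff.
  return limiter.executeWithRateLimit(
    texts,
    (batch) => embeddings.embedDocuments(batch),
    (text) => text // token extractor used for token-budget batching
  );
}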