viberag 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/handlers.js +9 -0
- package/dist/cli/components/InitWizard.js +55 -4
- package/dist/common/types.d.ts +6 -0
- package/dist/rag/config/index.d.ts +2 -0
- package/dist/rag/embeddings/api-utils.d.ts +81 -0
- package/dist/rag/embeddings/api-utils.js +150 -0
- package/dist/rag/embeddings/gemini.d.ts +0 -8
- package/dist/rag/embeddings/gemini.js +10 -70
- package/dist/rag/embeddings/index.d.ts +3 -1
- package/dist/rag/embeddings/index.js +4 -1
- package/dist/rag/embeddings/local.d.ts +6 -1
- package/dist/rag/embeddings/local.js +45 -12
- package/dist/rag/embeddings/mistral.d.ts +0 -8
- package/dist/rag/embeddings/mistral.js +10 -70
- package/dist/rag/embeddings/mock.d.ts +35 -0
- package/dist/rag/embeddings/mock.js +69 -0
- package/dist/rag/embeddings/openai.d.ts +7 -9
- package/dist/rag/embeddings/openai.js +25 -73
- package/dist/rag/indexer/indexer.js +3 -2
- package/dist/rag/search/index.js +1 -1
- package/package.json +4 -1
package/dist/cli/commands/handlers.js
CHANGED
@@ -41,12 +41,21 @@ export async function runInit(projectRoot, isReinit = false, wizardConfig) {
     // Build config from wizard choices
     const provider = wizardConfig?.provider ?? 'gemini';
     const { model, dimensions } = PROVIDER_CONFIGS[provider];
+    // Map OpenAI region to base URL
+    const openaiBaseUrl = wizardConfig?.openaiRegion
+        ? {
+            default: undefined,
+            us: 'https://us.api.openai.com/v1',
+            eu: 'https://eu.api.openai.com/v1',
+        }[wizardConfig.openaiRegion]
+        : undefined;
     const config = {
         ...DEFAULT_CONFIG,
         embeddingProvider: provider,
         embeddingModel: model,
         embeddingDimensions: dimensions,
         ...(wizardConfig?.apiKey && { apiKey: wizardConfig.apiKey }),
+        ...(openaiBaseUrl && { openaiBaseUrl }),
     };
     // Save config
     await saveConfig(projectRoot, config);
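Note that the region is only persisted when it maps to a non-default endpoint. A sketch of the config this produces when the wizard choices are { provider: 'openai', openaiRegion: 'eu' } (DEFAULT_CONFIG fields elided; the API key is a placeholder):

```ts
// Sketch only - field values follow the diff above and openai.js defaults.
const config = {
    embeddingProvider: 'openai',
    embeddingModel: 'text-embedding-3-small', // from PROVIDER_CONFIGS for openai
    embeddingDimensions: 1536,
    apiKey: 'sk-...', // placeholder
    openaiBaseUrl: 'https://eu.api.openai.com/v1', // omitted entirely for 'default'
};
```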
package/dist/cli/components/InitWizard.js
CHANGED
@@ -176,6 +176,21 @@ const API_KEY_ACTION_ITEMS = [
     { label: 'Keep existing API key', value: 'keep' },
     { label: 'Enter new API key', value: 'new' },
 ];
+// OpenAI region options for data residency
+const OPENAI_REGION_ITEMS = [
+    {
+        label: 'Default (api.openai.com) - Recommended',
+        value: 'default',
+    },
+    {
+        label: 'US (us.api.openai.com) - US Data Residency',
+        value: 'us',
+    },
+    {
+        label: 'EU (eu.api.openai.com) - EU Data Residency',
+        value: 'eu',
+    },
+];
 /**
  * Simple text input component for API key entry.
  * Uses a ref to accumulate input, which handles paste better than
@@ -227,6 +242,8 @@ export function InitWizard({ step, config, isReinit, existingApiKey, existingPro
     // State for API key input
     const [apiKeyInput, setApiKeyInput] = useState('');
     const [apiKeyAction, setApiKeyAction] = useState(null);
+    // State for OpenAI region selection (shown after API key for OpenAI)
+    const [showRegionSelect, setShowRegionSelect] = useState(false);
     // Handle Escape to cancel
     useInput((input, key) => {
         if (key.escape || (key.ctrl && input === 'c')) {
@@ -277,9 +294,10 @@ export function InitWizard({ step, config, isReinit, existingApiKey, existingPro
         React.createElement(Text, { bold: true }, "Choose Embedding Provider"),
         React.createElement(Box, { marginTop: 1 },
             React.createElement(SelectInput, { items: PROVIDER_ITEMS, onSelect: item => {
-                    // Reset API key state when provider changes
+                    // Reset API key and region state when provider changes
                     setApiKeyInput('');
                     setApiKeyAction(null);
+                    setShowRegionSelect(false);
                     // Use relative increment: step + 1
                     onStepChange(normalizedStep + 1, { provider: item.value });
                 } })),
@@ -298,6 +316,21 @@ export function InitWizard({ step, config, isReinit, existingApiKey, existingPro
     const provider = currentProvider;
     const info = PROVIDER_CONFIG[provider];
     const apiKeyUrl = API_KEY_URLS[provider];
+    const isOpenAI = provider === 'openai';
+    // Show OpenAI region selection after API key is entered
+    if (isOpenAI && showRegionSelect) {
+        return (React.createElement(Box, { flexDirection: "column", borderStyle: "round", paddingX: 2, paddingY: 1 },
+            React.createElement(Text, { bold: true }, "Select OpenAI API Region"),
+            React.createElement(Box, { marginTop: 1, flexDirection: "column" },
+                React.createElement(Text, { dimColor: true }, "Corporate accounts with data residency require regional endpoints."),
+                React.createElement(Text, { dimColor: true }, "Most users should select Default.")),
+            React.createElement(Box, { marginTop: 1 },
+                React.createElement(SelectInput, { items: OPENAI_REGION_ITEMS, onSelect: item => {
+                        onStepChange(normalizedStep + 1, { openaiRegion: item.value });
+                    } })),
+            React.createElement(Box, { marginTop: 1 },
+                React.createElement(Text, { dimColor: true }, "\u2191/\u2193 navigate, Enter select, Esc cancel"))));
+    }
     return (React.createElement(Box, { flexDirection: "column", borderStyle: "round", paddingX: 2, paddingY: 1 },
         React.createElement(Text, { bold: true },
             "Configure ",
@@ -316,8 +349,18 @@ export function InitWizard({ step, config, isReinit, existingApiKey, existingPro
     React.createElement(Box, { marginTop: 1 },
         React.createElement(SelectInput, { items: API_KEY_ACTION_ITEMS, onSelect: item => {
                 if (item.value === 'keep') {
-                    // Keep existing key
-                    onStepChange(normalizedStep
+                    // Keep existing key
+                    onStepChange(normalizedStep, { apiKey: existingApiKey });
+                    if (isOpenAI) {
+                        // Show region selection for OpenAI
+                        setShowRegionSelect(true);
+                    }
+                    else {
+                        // Advance to confirmation for other providers
+                        onStepChange(normalizedStep + 1, {
+                            apiKey: existingApiKey,
+                        });
+                    }
                 }
                 else {
                     // Show text input for new key
@@ -325,7 +368,15 @@ export function InitWizard({ step, config, isReinit, existingApiKey, existingPro
                 }
             } })))) : (React.createElement(ApiKeyInputStep, { providerName: info.name, apiKeyInput: apiKeyInput, setApiKeyInput: setApiKeyInput, onSubmit: key => {
             if (key.trim()) {
-                onStepChange(normalizedStep
+                onStepChange(normalizedStep, { apiKey: key.trim() });
+                if (isOpenAI) {
+                    // Show region selection for OpenAI
+                    setShowRegionSelect(true);
+                }
+                else {
+                    // Advance to confirmation for other providers
+                    onStepChange(normalizedStep + 1, { apiKey: key.trim() });
+                }
             }
         } })),
     React.createElement(Box, { marginTop: 1 },
package/dist/common/types.d.ts
CHANGED
@@ -91,6 +91,10 @@ export type IndexDisplayStats = {
  * - openai: text-embedding-3-small (1536d) - Fast API
  */
 export type EmbeddingProviderType = 'local' | 'local-4b' | 'gemini' | 'mistral' | 'openai';
+/**
+ * OpenAI API regional endpoints for data residency.
+ */
+export type OpenAIRegion = 'default' | 'us' | 'eu';
 /**
  * Configuration collected from the init wizard.
  */
@@ -98,6 +102,8 @@ export type InitWizardConfig = {
     provider: EmbeddingProviderType;
     /** API key for cloud providers (gemini, mistral, openai) */
     apiKey?: string;
+    /** OpenAI regional endpoint (for corporate accounts with data residency) */
+    openaiRegion?: OpenAIRegion;
 };
 /**
  * MCP editor identifiers.
package/dist/rag/config/index.d.ts
CHANGED
@@ -20,6 +20,8 @@ export interface ViberagConfig {
     embeddingDimensions: number;
     /** API key for cloud providers (gemini, mistral, openai) */
     apiKey?: string;
+    /** OpenAI API base URL (for corporate accounts with data residency) */
+    openaiBaseUrl?: string;
     extensions: string[];
     excludePatterns: string[];
     chunkMaxSize: number;
package/dist/rag/embeddings/api-utils.d.ts
ADDED
@@ -0,0 +1,81 @@
+/**
+ * Shared utilities for API-based embedding providers.
+ * Provides common retry logic, rate limiting, and concurrency patterns.
+ */
+/** Max concurrent API requests */
+export declare const CONCURRENCY = 5;
+/** Delay (ms) between batch completion and next batch start (per slot) */
+export declare const BATCH_DELAY_MS = 200;
+/** Max retry attempts on rate limit */
+export declare const MAX_RETRIES = 12;
+/** Initial backoff (ms) */
+export declare const INITIAL_BACKOFF_MS = 1000;
+/** Maximum backoff (ms) */
+export declare const MAX_BACKOFF_MS = 60000;
+/**
+ * Sleep for a specified duration.
+ */
+export declare function sleep(ms: number): Promise<void>;
+/**
+ * Check if an error is a rate limit error (429 or quota exceeded).
+ */
+export declare function isRateLimitError(error: unknown): boolean;
+/**
+ * Check if an error is a known transient API error that should be retried.
+ *
+ * GEMINI TRANSIENT BUG:
+ * The Gemini API has a known server-side bug where it intermittently returns
+ * a 400 "API key expired" error even when the key is valid. This is NOT an
+ * actual authentication failure - it's a transient error that resolves on retry.
+ *
+ * Evidence:
+ * - Users report: "if I try the same request again a few times, it usually works fine"
+ * - New API keys don't fix it
+ * - Same key works in curl but fails randomly via API clients
+ * - Google has acknowledged this as a P1/P2 bug
+ *
+ * GitHub issues documenting this bug:
+ * - https://github.com/google-gemini/gemini-cli/issues/4430
+ * - https://github.com/google-gemini/gemini-cli/issues/1712
+ * - https://github.com/google-gemini/gemini-cli/issues/8675
+ *
+ * We detect this specific error and retry it rather than failing immediately.
+ */
+export declare function isTransientApiError(error: unknown): boolean;
+/**
+ * Check if an error should trigger a retry (rate limit OR transient error).
+ */
+export declare function isRetriableError(error: unknown): boolean;
+/**
+ * Callbacks for rate limiting and progress reporting.
+ */
+export interface ApiProviderCallbacks {
+    onThrottle?: (message: string | null) => void;
+    onBatchProgress?: (processed: number, total: number) => void;
+}
+/**
+ * Execute an async function with exponential backoff retry on retriable errors.
+ *
+ * Retries on:
+ * - Rate limit errors (429, quota exceeded)
+ * - Transient API errors (e.g., Gemini's spurious "API key expired" bug)
+ *
+ * @param fn - The async function to execute
+ * @param callbacks - Optional callbacks for throttle notifications
+ * @returns The result of the function
+ */
+export declare function withRetry<T>(fn: () => Promise<T>, callbacks?: ApiProviderCallbacks): Promise<T>;
+/**
+ * Process batches with p-limit sliding window concurrency and inter-batch delay.
+ * Reports progress per-batch (more granular than group-based).
+ *
+ * @param batches - Array of batches to process
+ * @param processBatch - Function to process a single batch
+ * @param callbacks - Optional callbacks for progress reporting
+ * @returns Flattened array of results
+ */
+export declare function processBatchesWithLimit<T>(batches: T[][], processBatch: (batch: T[]) => Promise<number[][]>, callbacks?: ApiProviderCallbacks): Promise<number[][]>;
+/**
+ * Split an array into batches of a specified size.
+ */
+export declare function chunk<T>(array: T[], size: number): T[][];
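A quick worked check of the backoff schedule these constants imply: waits double from 1 s and cap at 60 s, so the 12 retries wait 1, 2, 4, 8, 16, 32 s and then six times 60 s, roughly 423 s of waiting in the worst case. A sketch:

```ts
// Sketch: worst-case total wait with MAX_RETRIES = 12,
// INITIAL_BACKOFF_MS = 1000, MAX_BACKOFF_MS = 60000 (values from api-utils).
let backoffMs = 1000;
let totalMs = 0;
for (let attempt = 1; attempt <= 12; attempt++) {
    totalMs += backoffMs;
    backoffMs = Math.min(backoffMs * 2, 60000); // same doubling and cap as withRetry
}
console.log(totalMs / 1000); // 1+2+4+8+16+32 + 6*60 = 423 seconds ≈ 7 minutes
```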
package/dist/rag/embeddings/api-utils.js
ADDED
@@ -0,0 +1,150 @@
+/**
+ * Shared utilities for API-based embedding providers.
+ * Provides common retry logic, rate limiting, and concurrency patterns.
+ */
+import pLimit from 'p-limit';
+// ============================================================================
+// Constants
+// ============================================================================
+/** Max concurrent API requests */
+export const CONCURRENCY = 5;
+/** Delay (ms) between batch completion and next batch start (per slot) */
+export const BATCH_DELAY_MS = 200;
+/** Max retry attempts on rate limit */
+export const MAX_RETRIES = 12;
+/** Initial backoff (ms) */
+export const INITIAL_BACKOFF_MS = 1000;
+/** Maximum backoff (ms) */
+export const MAX_BACKOFF_MS = 60000;
+// ============================================================================
+// Utility Functions
+// ============================================================================
+/**
+ * Sleep for a specified duration.
+ */
+export function sleep(ms) {
+    return new Promise(resolve => setTimeout(resolve, ms));
+}
+/**
+ * Check if an error is a rate limit error (429 or quota exceeded).
+ */
+export function isRateLimitError(error) {
+    if (error instanceof Error) {
+        const msg = error.message.toLowerCase();
+        return msg.includes('429') || msg.includes('rate') || msg.includes('quota');
+    }
+    return false;
+}
+/**
+ * Check if an error is a known transient API error that should be retried.
+ *
+ * GEMINI TRANSIENT BUG:
+ * The Gemini API has a known server-side bug where it intermittently returns
+ * a 400 "API key expired" error even when the key is valid. This is NOT an
+ * actual authentication failure - it's a transient error that resolves on retry.
+ *
+ * Evidence:
+ * - Users report: "if I try the same request again a few times, it usually works fine"
+ * - New API keys don't fix it
+ * - Same key works in curl but fails randomly via API clients
+ * - Google has acknowledged this as a P1/P2 bug
+ *
+ * GitHub issues documenting this bug:
+ * - https://github.com/google-gemini/gemini-cli/issues/4430
+ * - https://github.com/google-gemini/gemini-cli/issues/1712
+ * - https://github.com/google-gemini/gemini-cli/issues/8675
+ *
+ * We detect this specific error and retry it rather than failing immediately.
+ */
+export function isTransientApiError(error) {
+    if (error instanceof Error) {
+        const msg = error.message.toLowerCase();
+        // Gemini transient "API key expired" bug (400 status)
+        // The specific message is: "API key expired. Please renew the API key."
+        // We check for this specific pattern to avoid retrying actual auth failures
+        if (msg.includes('api key expired') &&
+            (msg.includes('400') || msg.includes('invalid_argument'))) {
+            return true;
+        }
+    }
+    return false;
+}
+/**
+ * Check if an error should trigger a retry (rate limit OR transient error).
+ */
+export function isRetriableError(error) {
+    return isRateLimitError(error) || isTransientApiError(error);
+}
+/**
+ * Execute an async function with exponential backoff retry on retriable errors.
+ *
+ * Retries on:
+ * - Rate limit errors (429, quota exceeded)
+ * - Transient API errors (e.g., Gemini's spurious "API key expired" bug)
+ *
+ * @param fn - The async function to execute
+ * @param callbacks - Optional callbacks for throttle notifications
+ * @returns The result of the function
+ */
+export async function withRetry(fn, callbacks) {
+    let attempt = 0;
+    let backoffMs = INITIAL_BACKOFF_MS;
+    while (true) {
+        try {
+            const result = await fn();
+            // Clear throttle message on success (if was throttling)
+            if (attempt > 0)
+                callbacks?.onThrottle?.(null);
+            return result;
+        }
+        catch (error) {
+            if (isRetriableError(error) && attempt < MAX_RETRIES) {
+                attempt++;
+                const secs = Math.round(backoffMs / 1000);
+                // Provide context-appropriate message
+                const isTransient = isTransientApiError(error);
+                const reason = isTransient ? 'Transient API error' : 'Rate limited';
+                callbacks?.onThrottle?.(`${reason} - retry ${attempt}/${MAX_RETRIES} in ${secs}s`);
+                await sleep(backoffMs);
+                backoffMs = Math.min(backoffMs * 2, MAX_BACKOFF_MS);
+            }
+            else {
+                throw error;
+            }
+        }
+    }
+}
+/**
+ * Process batches with p-limit sliding window concurrency and inter-batch delay.
+ * Reports progress per-batch (more granular than group-based).
+ *
+ * @param batches - Array of batches to process
+ * @param processBatch - Function to process a single batch
+ * @param callbacks - Optional callbacks for progress reporting
+ * @returns Flattened array of results
+ */
+export async function processBatchesWithLimit(batches, processBatch, callbacks) {
+    const limit = pLimit(CONCURRENCY);
+    let processedItems = 0;
+    const totalItems = batches.reduce((sum, batch) => sum + batch.length, 0);
+    const batchResults = await Promise.all(batches.map(batch => limit(async () => {
+        const result = await processBatch(batch);
+        // Delay before releasing the slot (rate limit protection)
+        await sleep(BATCH_DELAY_MS);
+        // Report progress per-batch
+        processedItems += batch.length;
+        callbacks?.onBatchProgress?.(processedItems, totalItems);
+        return result;
+    })));
+    return batchResults.flat();
+}
+/**
+ * Split an array into batches of a specified size.
+ */
+export function chunk(array, size) {
+    const batches = [];
+    for (let i = 0; i < array.length; i += size) {
+        batches.push(array.slice(i, i + size));
+    }
+    return batches;
+}
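With these utilities, each API provider's embed() reduces to one pattern: chunk texts into size-capped batches, run them through five concurrent slots, and wrap each batch call in the shared retry. A minimal usage sketch; embedBatch here is a hypothetical stand-in for a provider's real API call:

```ts
import { chunk, processBatchesWithLimit, withRetry } from './api-utils.js';

// Hypothetical per-batch API call (a real provider POSTs to its embeddings endpoint).
async function embedBatch(batch: string[]): Promise<number[][]> {
    return batch.map(() => [0.1, 0.2, 0.3]); // placeholder vectors
}

const texts = ['function add(a, b) { return a + b; }', 'const x = 1;'];
const callbacks = {
    onThrottle: (msg: string | null) => console.log(msg ?? 'recovered'),
    onBatchProgress: (done: number, total: number) => console.log(`${done}/${total}`),
};
const batches = chunk(texts, 16); // e.g., Gemini's BATCH_SIZE
const vectors = await processBatchesWithLimit(
    batches,
    batch => withRetry(() => embedBatch(batch), callbacks),
    callbacks,
);
```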
package/dist/rag/embeddings/gemini.d.ts
CHANGED
@@ -22,14 +22,6 @@ export declare class GeminiEmbeddingProvider implements EmbeddingProvider {
     constructor(apiKey?: string);
     initialize(_onProgress?: ModelProgressCallback): Promise<void>;
     embed(texts: string[]): Promise<number[][]>;
-    /**
-     * Embed a batch with exponential backoff retry on rate limit errors.
-     */
-    private embedBatchWithRetry;
-    /**
-     * Check if an error is a rate limit error (429 or quota exceeded).
-     */
-    private isRateLimitError;
     private embedBatch;
     embedSingle(text: string): Promise<number[]>;
     close(): void;
package/dist/rag/embeddings/gemini.js
CHANGED
@@ -8,19 +8,13 @@
  *
  * Free tier available with generous limits.
  */
+import { chunk, processBatchesWithLimit, withRetry, } from './api-utils.js';
 const GEMINI_API_BASE = 'https://generativelanguage.googleapis.com/v1beta/models';
 const MODEL = 'gemini-embedding-001';
 // Gemini limits: 2,048 tokens/text, 20,000 tokens/batch, 100-250 texts/batch
-//
-
-
-const CONCURRENCY = 5; // Max concurrent API requests
-const MAX_RETRIES = 12; // Max retry attempts on rate limit
-const INITIAL_BACKOFF_MS = 1000; // Start at 1s
-const MAX_BACKOFF_MS = 60000; // Cap at 60s (1 min)
-function sleep(ms) {
-    return new Promise(resolve => setTimeout(resolve, ms));
-}
+// Chunks are ~2000 chars + context header ≈ 800-1000 tokens each
+// 16 chunks × 1000 tokens = 16,000 tokens (safe margin under 20k limit)
+const BATCH_SIZE = 16;
 /**
  * Gemini embedding provider.
  * Uses gemini-embedding-001 model via Google's Generative AI API.
@@ -75,66 +69,12 @@ export class GeminiEmbeddingProvider {
         if (texts.length === 0) {
             return [];
         }
-
-        const
-
-
-        }
-
-        const results = [];
-        let completed = 0;
-        for (let i = 0; i < batches.length; i += CONCURRENCY) {
-            const concurrentBatches = batches.slice(i, i + CONCURRENCY);
-            // Fire concurrent requests
-            const batchResults = await Promise.all(concurrentBatches.map(batch => this.embedBatchWithRetry(batch)));
-            // Flatten and collect results (Promise.all preserves order)
-            for (const result of batchResults) {
-                results.push(...result);
-            }
-            // Report progress after concurrent group completes
-            completed += concurrentBatches.length;
-            const processed = Math.min(completed * BATCH_SIZE, texts.length);
-            this.onBatchProgress?.(processed, texts.length);
-        }
-        return results;
-    }
-    /**
-     * Embed a batch with exponential backoff retry on rate limit errors.
-     */
-    async embedBatchWithRetry(batch) {
-        let attempt = 0;
-        let backoffMs = INITIAL_BACKOFF_MS;
-        while (true) {
-            try {
-                const result = await this.embedBatch(batch);
-                // Clear throttle message on success (if was throttling)
-                if (attempt > 0)
-                    this.onThrottle?.(null);
-                return result;
-            }
-            catch (error) {
-                if (this.isRateLimitError(error) && attempt < MAX_RETRIES) {
-                    attempt++;
-                    const secs = Math.round(backoffMs / 1000);
-                    this.onThrottle?.(`Rate limited - retry ${attempt}/${MAX_RETRIES} in ${secs}s`);
-                    await sleep(backoffMs);
-                    backoffMs = Math.min(backoffMs * 2, MAX_BACKOFF_MS);
-                }
-                else {
-                    throw error;
-                }
-            }
-        }
-    }
-    /**
-     * Check if an error is a rate limit error (429 or quota exceeded).
-     */
-    isRateLimitError(error) {
-        if (error instanceof Error) {
-            const msg = error.message.toLowerCase();
-            return (msg.includes('429') || msg.includes('rate') || msg.includes('quota'));
-        }
-        return false;
+        const batches = chunk(texts, BATCH_SIZE);
+        const callbacks = {
+            onThrottle: this.onThrottle,
+            onBatchProgress: this.onBatchProgress,
+        };
+        return processBatchesWithLimit(batches, batch => withRetry(() => this.embedBatch(batch), callbacks), callbacks);
     }
     async embedBatch(texts) {
         const url = `${GEMINI_API_BASE}/${MODEL}:batchEmbedContents`;
package/dist/rag/embeddings/index.d.ts
CHANGED
@@ -4,8 +4,10 @@
  */
 export { GeminiEmbeddingProvider } from './gemini.js';
 export { Local4BEmbeddingProvider } from './local-4b.js';
-export { LocalEmbeddingProvider } from './local.js';
+export { LocalEmbeddingProvider, clearCachedPipeline } from './local.js';
 export { MistralEmbeddingProvider } from './mistral.js';
+export { MockEmbeddingProvider } from './mock.js';
 export { OpenAIEmbeddingProvider } from './openai.js';
 export { validateApiKey, type ValidationResult } from './validate.js';
 export type { EmbeddingProvider, ModelProgressCallback } from './types.js';
+export { CONCURRENCY, BATCH_DELAY_MS, MAX_RETRIES, INITIAL_BACKOFF_MS, MAX_BACKOFF_MS, sleep, isRateLimitError, isTransientApiError, isRetriableError, withRetry, processBatchesWithLimit, chunk, type ApiProviderCallbacks, } from './api-utils.js';
package/dist/rag/embeddings/index.js
CHANGED
@@ -4,7 +4,10 @@
  */
 export { GeminiEmbeddingProvider } from './gemini.js';
 export { Local4BEmbeddingProvider } from './local-4b.js';
-export { LocalEmbeddingProvider } from './local.js';
+export { LocalEmbeddingProvider, clearCachedPipeline } from './local.js';
 export { MistralEmbeddingProvider } from './mistral.js';
+export { MockEmbeddingProvider } from './mock.js';
 export { OpenAIEmbeddingProvider } from './openai.js';
 export { validateApiKey } from './validate.js';
+// Shared utilities for API-based providers
+export { CONCURRENCY, BATCH_DELAY_MS, MAX_RETRIES, INITIAL_BACKOFF_MS, MAX_BACKOFF_MS, sleep, isRateLimitError, isTransientApiError, isRetriableError, withRetry, processBatchesWithLimit, chunk, } from './api-utils.js';
package/dist/rag/embeddings/local.d.ts
CHANGED
@@ -14,14 +14,19 @@
  * - Data never leaves your machine
  */
 import type { EmbeddingProvider, ModelProgressCallback } from './types.js';
+/**
+ * Clear the cached pipeline.
+ * Useful for tests that need to reset state between runs.
+ */
+export declare function clearCachedPipeline(): void;
 /**
  * Local embedding provider using Qwen3-Embedding-0.6B Q8.
  */
 export declare class LocalEmbeddingProvider implements EmbeddingProvider {
     readonly dimensions = 1024;
-    private extractor;
     private initialized;
     initialize(onProgress?: ModelProgressCallback): Promise<void>;
+    private loadModel;
     embed(texts: string[]): Promise<number[][]>;
     private embedBatch;
     embedSingle(text: string): Promise<number[]>;
package/dist/rag/embeddings/local.js
CHANGED
@@ -17,6 +17,19 @@ import { pipeline } from '@huggingface/transformers';
 const MODEL_NAME = 'onnx-community/Qwen3-Embedding-0.6B-ONNX';
 const DIMENSIONS = 1024;
 const BATCH_SIZE = 8;
+// Module-level cache for the ONNX pipeline
+// Shared across all LocalEmbeddingProvider instances to avoid reloading the model
+// eslint-disable-next-line @typescript-eslint/no-explicit-any -- HuggingFace pipeline type is too complex
+let cachedExtractor = null;
+let initPromise = null;
+/**
+ * Clear the cached pipeline.
+ * Useful for tests that need to reset state between runs.
+ */
+export function clearCachedPipeline() {
+    cachedExtractor = null;
+    initPromise = null;
+}
 /**
  * Local embedding provider using Qwen3-Embedding-0.6B Q8.
  */
@@ -28,13 +41,6 @@ export class LocalEmbeddingProvider {
         writable: true,
         value: DIMENSIONS
     });
-    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- HuggingFace pipeline type is too complex
-    Object.defineProperty(this, "extractor", {
-        enumerable: true,
-        configurable: true,
-        writable: true,
-        value: null
-    });
     Object.defineProperty(this, "initialized", {
         enumerable: true,
         configurable: true,
@@ -45,6 +51,33 @@ export class LocalEmbeddingProvider {
     async initialize(onProgress) {
         if (this.initialized)
             return;
+        // Reuse cached model if available
+        if (cachedExtractor) {
+            this.initialized = true;
+            onProgress?.('ready');
+            return;
+        }
+        // If another instance is already loading, wait for it
+        if (initPromise) {
+            await initPromise;
+            this.initialized = true;
+            onProgress?.('ready');
+            return;
+        }
+        // First load - this instance will load the model and cache it
+        initPromise = this.loadModel(onProgress);
+        try {
+            await initPromise;
+            this.initialized = true;
+        }
+        catch (error) {
+            // Clear the cached promise so future calls can retry
+            // (e.g., after network recovery or freeing memory)
+            initPromise = null;
+            throw error;
+        }
+    }
+    async loadModel(onProgress) {
         // Track download progress for the model files
         let lastProgress = 0;
         const progressCallback = onProgress
@@ -67,12 +100,11 @@ export class LocalEmbeddingProvider {
         onProgress?.('loading');
         // Load the model with q8 (int8) quantization for smaller size and faster inference
         // First load will download the model (~700MB)
-
+        cachedExtractor = await pipeline('feature-extraction', MODEL_NAME, {
             dtype: 'q8', // int8 quantization
             progress_callback: progressCallback,
         });
         onProgress?.('ready');
-        this.initialized = true;
     }
     async embed(texts) {
         if (!this.initialized) {
@@ -93,7 +125,7 @@ export class LocalEmbeddingProvider {
     async embedBatch(texts) {
         const results = [];
         for (const text of texts) {
-            const output = await
+            const output = await cachedExtractor(text, {
                 pooling: 'mean',
                 normalize: true,
             });
@@ -107,14 +139,15 @@ export class LocalEmbeddingProvider {
         if (!this.initialized) {
             await this.initialize();
         }
-        const output = await
+        const output = await cachedExtractor(text, {
             pooling: 'mean',
             normalize: true,
         });
         return Array.from(output.data);
     }
     close() {
-        this
+        // Mark this instance as uninitialized, but don't clear the cached model
+        // Other instances may still be using it
         this.initialized = false;
     }
 }
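The local.js changes above are the usual promise-cached singleton: the first initialize() stores a loading promise, concurrent callers await that same promise, and a failed load clears it so a later call can retry. A generic sketch of the pattern (names here are illustrative, not from the package):

```ts
// Generic promise-cached singleton, in the shape local.js uses for the ONNX pipeline.
let cached: unknown = null;
let loading: Promise<unknown> | null = null;

async function getOnce(load: () => Promise<unknown>): Promise<unknown> {
    if (cached) return cached;               // already loaded: reuse
    if (!loading) {
        loading = load()
            .then(model => (cached = model)) // publish on success
            .catch(err => {
                loading = null;              // allow a later retry
                throw err;
            });
    }
    return loading;                          // concurrent callers share one load
}
```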
package/dist/rag/embeddings/mistral.d.ts
CHANGED
@@ -18,14 +18,6 @@ export declare class MistralEmbeddingProvider implements EmbeddingProvider {
     constructor(apiKey?: string);
     initialize(_onProgress?: ModelProgressCallback): Promise<void>;
     embed(texts: string[]): Promise<number[][]>;
-    /**
-     * Embed a batch with exponential backoff retry on rate limit errors.
-     */
-    private embedBatchWithRetry;
-    /**
-     * Check if an error is a rate limit error (429 or quota exceeded).
-     */
-    private isRateLimitError;
     private embedBatch;
     embedSingle(text: string): Promise<number[]>;
     close(): void;
package/dist/rag/embeddings/mistral.js
CHANGED
@@ -4,19 +4,13 @@
  * Uses codestral-embed model (1536 dimensions).
  * Optimized for code and technical content.
 */
+import { chunk, processBatchesWithLimit, withRetry, } from './api-utils.js';
 const MISTRAL_API_BASE = 'https://api.mistral.ai/v1';
 const MODEL = 'codestral-embed';
 // Mistral limits: 8,192 tokens/text, 16,000 tokens/batch TOTAL
-//
-
-
-const CONCURRENCY = 5; // Max concurrent API requests
-const MAX_RETRIES = 12; // Max retry attempts on rate limit
-const INITIAL_BACKOFF_MS = 1000; // Start at 1s
-const MAX_BACKOFF_MS = 60000; // Cap at 60s (1 min)
-function sleep(ms) {
-    return new Promise(resolve => setTimeout(resolve, ms));
-}
+// Chunks are ~2000 chars + context header ≈ 800-1000 tokens each
+// 12 chunks × 1000 tokens = 12,000 tokens (safe margin under 16k limit)
+const BATCH_SIZE = 12;
 /**
  * Mistral embedding provider.
  * Uses codestral-embed model via Mistral AI API.
@@ -71,66 +65,12 @@ export class MistralEmbeddingProvider {
         if (texts.length === 0) {
             return [];
         }
-
-        const
-
-
-        }
-
-        const results = [];
-        let completed = 0;
-        for (let i = 0; i < batches.length; i += CONCURRENCY) {
-            const concurrentBatches = batches.slice(i, i + CONCURRENCY);
-            // Fire concurrent requests
-            const batchResults = await Promise.all(concurrentBatches.map(batch => this.embedBatchWithRetry(batch)));
-            // Flatten and collect results (Promise.all preserves order)
-            for (const result of batchResults) {
-                results.push(...result);
-            }
-            // Report progress after concurrent group completes
-            completed += concurrentBatches.length;
-            const processed = Math.min(completed * BATCH_SIZE, texts.length);
-            this.onBatchProgress?.(processed, texts.length);
-        }
-        return results;
-    }
-    /**
-     * Embed a batch with exponential backoff retry on rate limit errors.
-     */
-    async embedBatchWithRetry(batch) {
-        let attempt = 0;
-        let backoffMs = INITIAL_BACKOFF_MS;
-        while (true) {
-            try {
-                const result = await this.embedBatch(batch);
-                // Clear throttle message on success (if was throttling)
-                if (attempt > 0)
-                    this.onThrottle?.(null);
-                return result;
-            }
-            catch (error) {
-                if (this.isRateLimitError(error) && attempt < MAX_RETRIES) {
-                    attempt++;
-                    const secs = Math.round(backoffMs / 1000);
-                    this.onThrottle?.(`Rate limited - retry ${attempt}/${MAX_RETRIES} in ${secs}s`);
-                    await sleep(backoffMs);
-                    backoffMs = Math.min(backoffMs * 2, MAX_BACKOFF_MS);
-                }
-                else {
-                    throw error;
-                }
-            }
-        }
-    }
-    /**
-     * Check if an error is a rate limit error (429 or quota exceeded).
-     */
-    isRateLimitError(error) {
-        if (error instanceof Error) {
-            const msg = error.message.toLowerCase();
-            return (msg.includes('429') || msg.includes('rate') || msg.includes('quota'));
-        }
-        return false;
+        const batches = chunk(texts, BATCH_SIZE);
+        const callbacks = {
+            onThrottle: this.onThrottle,
+            onBatchProgress: this.onBatchProgress,
+        };
+        return processBatchesWithLimit(batches, batch => withRetry(() => this.embedBatch(batch), callbacks), callbacks);
     }
     async embedBatch(texts) {
         const response = await fetch(`${MISTRAL_API_BASE}/embeddings`, {
package/dist/rag/embeddings/mock.d.ts
ADDED
@@ -0,0 +1,35 @@
+/**
+ * Mock embedding provider for testing.
+ *
+ * Generates deterministic hash-based embeddings that:
+ * - Run instantly (no model loading)
+ * - Are deterministic (same input = same output)
+ * - Normalized to unit length
+ * - Support any dimension count
+ *
+ * Usage:
+ * - Unit tests that need embeddings but don't need semantic quality
+ * - Testing search infrastructure without ONNX overhead
+ * - CI pipeline fast checks
+ */
+import type { EmbeddingProvider, ModelProgressCallback } from './types.js';
+/**
+ * Mock embedding provider using deterministic hash-based vectors.
+ */
+export declare class MockEmbeddingProvider implements EmbeddingProvider {
+    readonly dimensions: number;
+    constructor(dimensions?: number);
+    initialize(_onProgress?: ModelProgressCallback): Promise<void>;
+    embed(texts: string[]): Promise<number[][]>;
+    embedSingle(text: string): Promise<number[]>;
+    /**
+     * Convert text to a deterministic unit vector.
+     * Uses a simple hash-based approach to generate pseudo-random but repeatable values.
+     */
+    private hashToVector;
+    /**
+     * Simple string hash function (djb2).
+     */
+    private hash;
+    close(): void;
+}
package/dist/rag/embeddings/mock.js
ADDED
@@ -0,0 +1,69 @@
+/**
+ * Mock embedding provider for testing.
+ *
+ * Generates deterministic hash-based embeddings that:
+ * - Run instantly (no model loading)
+ * - Are deterministic (same input = same output)
+ * - Normalized to unit length
+ * - Support any dimension count
+ *
+ * Usage:
+ * - Unit tests that need embeddings but don't need semantic quality
+ * - Testing search infrastructure without ONNX overhead
+ * - CI pipeline fast checks
+ */
+const DEFAULT_DIMENSIONS = 1024;
+/**
+ * Mock embedding provider using deterministic hash-based vectors.
+ */
+export class MockEmbeddingProvider {
+    constructor(dimensions = DEFAULT_DIMENSIONS) {
+        Object.defineProperty(this, "dimensions", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.dimensions = dimensions;
+    }
+    async initialize(_onProgress) {
+        // No initialization needed - instant startup
+    }
+    async embed(texts) {
+        return texts.map(t => this.hashToVector(t));
+    }
+    async embedSingle(text) {
+        return this.hashToVector(text);
+    }
+    /**
+     * Convert text to a deterministic unit vector.
+     * Uses a simple hash-based approach to generate pseudo-random but repeatable values.
+     */
+    hashToVector(text) {
+        const seed = this.hash(text);
+        // Generate deterministic pseudo-random values
+        const vec = new Array(this.dimensions).fill(0).map((_, i) => {
+            // LCG-like pseudo-random based on seed and index
+            const state = (((seed * (i + 1) * 1103515245 + 12345) >>> 0) % 0x7fffffff) /
+                0x7fffffff;
+            return state * 2 - 1; // Range [-1, 1]
+        });
+        // Normalize to unit length
+        const magnitude = Math.sqrt(vec.reduce((sum, v) => sum + v * v, 0));
+        return vec.map(v => (magnitude > 0 ? v / magnitude : 0));
+    }
+    /**
+     * Simple string hash function (djb2).
+     */
+    hash(str) {
+        let h = 5381;
+        for (let i = 0; i < str.length; i++) {
+            h = (h * 33) ^ str.charCodeAt(i);
+            h = h >>> 0; // Convert to unsigned 32-bit
+        }
+        return h;
+    }
+    close() {
+        // Nothing to close
+    }
+}
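A sketch of how the mock might be exercised in a test; vitest matches the package's test runner (see package.json below), but the assertions themselves are illustrative:

```ts
import { describe, expect, it } from 'vitest';
import { MockEmbeddingProvider } from './mock.js';

describe('MockEmbeddingProvider', () => {
    it('returns deterministic unit vectors', async () => {
        const provider = new MockEmbeddingProvider(64); // any dimension count
        const [a, b] = await provider.embed(['hello', 'hello']);
        expect(a).toEqual(b); // same input, same output
        const norm = Math.sqrt(a.reduce((s, v) => s + v * v, 0));
        expect(norm).toBeCloseTo(1); // normalized to unit length
    });
});
```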
package/dist/rag/embeddings/openai.d.ts
CHANGED
@@ -8,24 +8,22 @@ import type { EmbeddingProvider, ModelProgressCallback } from './types.js';
 /**
  * OpenAI embedding provider.
  * Uses text-embedding-3-small model via OpenAI API.
+ *
+ * Supports regional endpoints for corporate accounts with data residency:
+ * - Default: https://api.openai.com/v1
+ * - US: https://us.api.openai.com/v1
+ * - EU: https://eu.api.openai.com/v1
 */
 export declare class OpenAIEmbeddingProvider implements EmbeddingProvider {
     readonly dimensions = 1536;
     private apiKey;
+    private apiBase;
     private initialized;
     onThrottle?: (message: string | null) => void;
     onBatchProgress?: (processed: number, total: number) => void;
-    constructor(apiKey?: string);
+    constructor(apiKey?: string, baseUrl?: string);
     initialize(_onProgress?: ModelProgressCallback): Promise<void>;
     embed(texts: string[]): Promise<number[][]>;
-    /**
-     * Embed a batch with exponential backoff retry on rate limit errors.
-     */
-    private embedBatchWithRetry;
-    /**
-     * Check if an error is a rate limit error (429 or quota exceeded).
-     */
-    private isRateLimitError;
     private embedBatch;
     embedSingle(text: string): Promise<number[]>;
     close(): void;
package/dist/rag/embeddings/openai.js
CHANGED
@@ -4,25 +4,24 @@
  * Uses text-embedding-3-small model (1536 dimensions).
  * Good quality with fast API responses and low cost ($0.02/1M tokens).
 */
-
+import { chunk, processBatchesWithLimit, withRetry, } from './api-utils.js';
+const DEFAULT_API_BASE = 'https://api.openai.com/v1';
 const MODEL = 'text-embedding-3-small';
 // OpenAI limits: 8,191 tokens/text, 300,000 tokens/batch, 2,048 texts/batch
-//
-
-
-const CONCURRENCY = 5; // Max concurrent API requests
-const MAX_RETRIES = 12; // Max retry attempts on rate limit
-const INITIAL_BACKOFF_MS = 1000; // Start at 1s
-const MAX_BACKOFF_MS = 60000; // Cap at 60s (1 min)
-function sleep(ms) {
-    return new Promise(resolve => setTimeout(resolve, ms));
-}
+// Chunks are ~2000 chars + context header ≈ 800-1000 tokens each
+// 200 chunks × 1000 tokens = 200,000 tokens (safe margin under 300k limit)
+const BATCH_SIZE = 200;
 /**
  * OpenAI embedding provider.
  * Uses text-embedding-3-small model via OpenAI API.
+ *
+ * Supports regional endpoints for corporate accounts with data residency:
+ * - Default: https://api.openai.com/v1
+ * - US: https://us.api.openai.com/v1
+ * - EU: https://eu.api.openai.com/v1
 */
 export class OpenAIEmbeddingProvider {
-    constructor(apiKey) {
+    constructor(apiKey, baseUrl) {
         Object.defineProperty(this, "dimensions", {
             enumerable: true,
             configurable: true,
@@ -35,6 +34,12 @@ export class OpenAIEmbeddingProvider {
             writable: true,
             value: void 0
         });
+        Object.defineProperty(this, "apiBase", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
         Object.defineProperty(this, "initialized", {
             enumerable: true,
             configurable: true,
@@ -57,6 +62,7 @@ export class OpenAIEmbeddingProvider {
         });
         // Trim the key to remove any accidental whitespace
        this.apiKey = (apiKey ?? '').trim();
+        this.apiBase = baseUrl ?? DEFAULT_API_BASE;
     }
     async initialize(_onProgress) {
         if (!this.apiKey) {
@@ -75,69 +81,15 @@ export class OpenAIEmbeddingProvider {
         if (texts.length === 0) {
             return [];
         }
-
-        const
-
-
-        }
-
-        const results = [];
-        let completed = 0;
-        for (let i = 0; i < batches.length; i += CONCURRENCY) {
-            const concurrentBatches = batches.slice(i, i + CONCURRENCY);
-            // Fire concurrent requests
-            const batchResults = await Promise.all(concurrentBatches.map(batch => this.embedBatchWithRetry(batch)));
-            // Flatten and collect results (Promise.all preserves order)
-            for (const result of batchResults) {
-                results.push(...result);
-            }
-            // Report progress after concurrent group completes
-            completed += concurrentBatches.length;
-            const processed = Math.min(completed * BATCH_SIZE, texts.length);
-            this.onBatchProgress?.(processed, texts.length);
-        }
-        return results;
-    }
-    /**
-     * Embed a batch with exponential backoff retry on rate limit errors.
-     */
-    async embedBatchWithRetry(batch) {
-        let attempt = 0;
-        let backoffMs = INITIAL_BACKOFF_MS;
-        while (true) {
-            try {
-                const result = await this.embedBatch(batch);
-                // Clear throttle message on success (if was throttling)
-                if (attempt > 0)
-                    this.onThrottle?.(null);
-                return result;
-            }
-            catch (error) {
-                if (this.isRateLimitError(error) && attempt < MAX_RETRIES) {
-                    attempt++;
-                    const secs = Math.round(backoffMs / 1000);
-                    this.onThrottle?.(`Rate limited - retry ${attempt}/${MAX_RETRIES} in ${secs}s`);
-                    await sleep(backoffMs);
-                    backoffMs = Math.min(backoffMs * 2, MAX_BACKOFF_MS);
-                }
-                else {
-                    throw error;
-                }
-            }
-        }
-    }
-    /**
-     * Check if an error is a rate limit error (429 or quota exceeded).
-     */
-    isRateLimitError(error) {
-        if (error instanceof Error) {
-            const msg = error.message.toLowerCase();
-            return (msg.includes('429') || msg.includes('rate') || msg.includes('quota'));
-        }
-        return false;
+        const batches = chunk(texts, BATCH_SIZE);
+        const callbacks = {
+            onThrottle: this.onThrottle,
+            onBatchProgress: this.onBatchProgress,
+        };
+        return processBatchesWithLimit(batches, batch => withRetry(() => this.embedBatch(batch), callbacks), callbacks);
     }
     async embedBatch(texts) {
-        const response = await fetch(`${
+        const response = await fetch(`${this.apiBase}/embeddings`, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
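For reference, the new constructor parameter in use; the environment variable name here is a common convention, not taken from the package:

```ts
import { OpenAIEmbeddingProvider } from './openai.js';

// Default endpoint (api.openai.com) - behavior unchanged from 0.3.2:
const standard = new OpenAIEmbeddingProvider(process.env.OPENAI_API_KEY);

// EU data residency endpoint, as the wizard stores it in config.openaiBaseUrl:
const eu = new OpenAIEmbeddingProvider(process.env.OPENAI_API_KEY, 'https://eu.api.openai.com/v1');
```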
package/dist/rag/indexer/indexer.js
CHANGED
@@ -285,7 +285,8 @@ export class Indexer {
         // Track chunks processed for progress updates
         let lastReportedChunks = 0;
         // Wire batch progress callback to report incremental chunks
-        if (progressContext?.onChunksProcessed &&
+        if (progressContext?.onChunksProcessed &&
+            'onBatchProgress' in embeddings) {
             embeddings.onBatchProgress = (processed, _total) => {
                 // Report only the delta since last update
                 const delta = processed - lastReportedChunks;
@@ -414,7 +415,7 @@ export class Indexer {
             case 'mistral':
                 return new MistralEmbeddingProvider(apiKey);
             case 'openai':
-                return new OpenAIEmbeddingProvider(apiKey);
+                return new OpenAIEmbeddingProvider(apiKey, config.openaiBaseUrl);
             default:
                 throw new Error(`Unknown embedding provider: ${config.embeddingProvider}`);
         }
package/dist/rag/search/index.js
CHANGED
@@ -362,7 +362,7 @@ export class SearchEngine {
         case 'mistral':
             return new MistralEmbeddingProvider(apiKey);
         case 'openai':
-            return new OpenAIEmbeddingProvider(apiKey);
+            return new OpenAIEmbeddingProvider(apiKey, config.openaiBaseUrl);
         default:
             throw new Error(`Unknown embedding provider: ${config.embeddingProvider}`);
     }
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "viberag",
-    "version": "0.3.
+    "version": "0.3.3",
     "description": "Local code RAG for AI coding assistants - semantic search via MCP server",
     "license": "AGPL-3.0",
     "keywords": [
@@ -38,6 +38,8 @@
     "build": "tsc",
     "dev": "tsc --watch",
     "test": "prettier --check . && eslint . && vitest run",
+    "test:fast": "vitest run --project=fast",
+    "test:rag": "vitest run --project=rag",
     "test:smoke": "vitest run --testNamePattern='Grammar Smoke'",
     "lint": "eslint .",
     "lint:fix": "eslint . --fix",
@@ -62,6 +64,7 @@
     ],
     "dependencies": {
         "@huggingface/transformers": "^3.8.1",
+        "p-limit": "^6.2.0",
         "@lancedb/lancedb": "^0.23.0",
         "apache-arrow": "^18.1.0",
         "chalk": "^5.6.2",