npm - @juspay/neurolink - Versions diffs - 9.67.1 → 9.67.3 - Mend

@juspay/neurolink 9.67.1 → 9.67.3

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (25)

package/CHANGELOG.md +4 -0
package/dist/browser/neurolink.min.js +346 -346
package/dist/lib/providers/googleVertex.js +8 -7
package/dist/lib/providers/litellm.d.ts +32 -32
package/dist/lib/providers/litellm.js +188 -458
package/dist/lib/providers/openaiChatCompletionsBase.d.ts +93 -0
package/dist/lib/providers/openaiChatCompletionsBase.js +644 -0
package/dist/lib/providers/openaiChatCompletionsClient.d.ts +67 -0
package/dist/lib/providers/openaiChatCompletionsClient.js +526 -0
package/dist/lib/providers/openaiCompatible.d.ts +7 -63
package/dist/lib/providers/openaiCompatible.js +27 -1168
package/dist/lib/types/openaiCompatible.d.ts +20 -0
package/dist/lib/types/providers.d.ts +2 -0
package/dist/providers/googleVertex.js +8 -7
package/dist/providers/litellm.d.ts +32 -32
package/dist/providers/litellm.js +188 -458
package/dist/providers/openaiChatCompletionsBase.d.ts +93 -0
package/dist/providers/openaiChatCompletionsBase.js +643 -0
package/dist/providers/openaiChatCompletionsClient.d.ts +67 -0
package/dist/providers/openaiChatCompletionsClient.js +525 -0
package/dist/providers/openaiCompatible.d.ts +7 -63
package/dist/providers/openaiCompatible.js +27 -1168
package/dist/types/openaiCompatible.d.ts +20 -0
package/dist/types/providers.d.ts +2 -0
package/package.json +1 -1

package/dist/providers/litellm.js (changed)

@@ -1,77 +1,59 @@
-import { createOpenAI } from "@ai-sdk/openai";
 import { SpanKind, SpanStatusCode, trace } from "@opentelemetry/api";
-import { BaseProvider } from "../core/baseProvider.js";
-import { DEFAULT_MAX_STEPS } from "../core/constants.js";
-import { streamAnalyticsCollector } from "../core/streamAnalytics.js";
 import { createProxyFetch } from "../proxy/proxyFetch.js";
 import { AuthenticationError, InvalidModelError, ModelAccessDeniedError, NetworkError, ProviderError, RateLimitError, isModelAccessDeniedMessage, parseAllowedModels, } from "../types/index.js";
 import { isAbortError } from "../utils/errorHandling.js";
-import { emitToolEndFromStepFinish } from "../utils/toolEndEmitter.js";
 import { logger } from "../utils/logger.js";
-import { buildNoOutputSentinel, detectPostStreamNoOutput, stampNoOutputSpan, } from "../utils/noOutputSentinel.js";
+import { isGemini25Model as isCanonicalGemini25Model } from "../utils/modelDetection.js";
 import { calculateCost } from "../utils/pricing.js";
 import { getProviderModel } from "../utils/providerConfig.js";
-import { composeAbortSignals, createTimeoutController, TimeoutError, withTimeout, } from "../utils/timeout.js";
-import { resolveToolChoice } from "../utils/toolChoice.js";
-import { getModelId } from "./providerTypeUtils.js";
-import { NoOutputGeneratedError } from "../utils/generationErrors.js";
-import { Output, stepCountIs } from "../utils/tool.js";
-import { streamText } from "../utils/generation.js";
+import { createTimeoutController, TimeoutError } from "../utils/timeout.js";
+import { stripTrailingSlash } from "./openaiChatCompletionsClient.js";
+import { OpenAIChatCompletionsProvider } from "./openaiChatCompletionsBase.js";
 const streamTracer = trace.getTracer("neurolink.provider.litellm");
-// Configuration helpers
-const getLiteLLMConfig = () => {
-    return {
-        baseURL: process.env.LITELLM_BASE_URL || "http://localhost:4000",
-        apiKey: process.env.LITELLM_API_KEY || "sk-anything",
-    };
-};
+const FALLBACK_LITELLM_MODEL = "openai/gpt-4o-mini";
+const getLiteLLMConfig = () => ({
+    baseURL: process.env.LITELLM_BASE_URL || "http://localhost:4000",
+    apiKey: process.env.LITELLM_API_KEY || "sk-anything",
+});
 /**
- * Returns the default model name for LiteLLM.
- *
- * LiteLLM uses a 'provider/model' format for model names.
- * For example:
- *   - 'openai/gpt-4o-mini'
- *   - 'openai/gpt-3.5-turbo'
- *   - 'anthropic/claude-3-sonnet-20240229'
- *   - 'google/gemini-pro'
- *
- * You can override the default by setting the LITELLM_MODEL environment variable.
+ * LiteLLM uses a 'provider/model' format. Override via LITELLM_MODEL env var.
  */
-const getDefaultLiteLLMModel = () => {
-    return getProviderModel("LITELLM_MODEL", "openai/gpt-4o-mini");
+const getDefaultLiteLLMModel = () => getProviderModel("LITELLM_MODEL", FALLBACK_LITELLM_MODEL);
+// LiteLLM model ids come in `provider/model` form (e.g. "google/gemini-2.5-flash").
+// Strip the provider prefix and delegate to the canonical anchored-regex
+// check in src/lib/utils/modelDetection.ts so the truth lives in one place.
+const isGemini25Model = (modelName) => {
+    const lastSegment = modelName.includes("/")
+        ? modelName.slice(modelName.lastIndexOf("/") + 1)
+        : modelName;
+    return isCanonicalGemini25Model(lastSegment);
 };
 /**
- * LiteLLM Provider - BaseProvider Implementation
- * Provides access to 100+ models via LiteLLM proxy server
+ * LiteLLM Provider — direct HTTP, no AI SDK. Talks to a LiteLLM proxy
+ * server (or any deployment that speaks OpenAI chat-completions + the
+ * `/v1/models` and `/v1/embeddings` endpoints).
+ *
+ * All request/stream/tool-loop orchestration lives in
+ * `OpenAIChatCompletionsProvider`. This class adds LiteLLM-specific
+ * behaviour: OTel span wrap with cost (`onStreamStart`), Gemini 2.5
+ * maxTokens skip (`adjustBuildBodyOptions`), ModelAccessDeniedError on
+ * 403, 10-minute model cache (`getAvailableModels`), `LITELLM_FALLBACK_MODELS`
+ * env-driven fallback list, and native `/v1/embeddings`.
  */
-export class LiteLLMProvider extends BaseProvider {
-    model;
-    credentials;
-    // Cache for available models to avoid repeated API calls
+export class LiteLLMProvider extends OpenAIChatCompletionsProvider {
     static modelsCache = [];
     static modelsCacheTime = 0;
     static MODELS_CACHE_DURATION = 10 * 60 * 1000; // 10 minutes
     constructor(modelName, sdk, _region, credentials) {
-        super(modelName, "litellm", sdk);
-        // Store per-request credentials for use in embed/embedMany/fetchModelsFromAPI
-        this.credentials = credentials;
-        // Initialize LiteLLM using OpenAI SDK with explicit configuration
-        const config = getLiteLLMConfig();
-        // Create OpenAI SDK instance configured for LiteLLM proxy
-        // LiteLLM acts as a proxy server that implements the OpenAI-compatible API.
-        // To communicate with LiteLLM instead of the default OpenAI endpoint, we use createOpenAI
-        // with a custom baseURL and apiKey. This ensures all requests are routed through the LiteLLM
-        // proxy, allowing access to multiple models and custom authentication.
-        const customOpenAI = createOpenAI({
-            baseURL: credentials?.baseURL ?? config.baseURL,
-            apiKey: credentials?.apiKey ?? config.apiKey,
-            fetch: createProxyFetch(),
+        const envConfig = getLiteLLMConfig();
+        super("litellm", modelName, sdk, {
+            baseURL: credentials?.baseURL ?? envConfig.baseURL,
+            apiKey: credentials?.apiKey ?? envConfig.apiKey,
         });
-        this.model = customOpenAI.chat(this.modelName || getDefaultLiteLLMModel());
         logger.debug("LiteLLM Provider initialized", {
             modelName: this.modelName,
             provider: this.providerName,
-            baseURL: config.baseURL,
+            baseURL: this.config.baseURL,
         });
     }
     getProviderName() {
@@ -80,17 +62,84 @@ export class LiteLLMProvider extends BaseProvider {
     getDefaultModel() {
         return getDefaultLiteLLMModel();
     }
+    getFallbackModelName() {
+        return FALLBACK_LITELLM_MODEL;
+    }
+    getFallbackModels() {
+        return (process.env.LITELLM_FALLBACK_MODELS?.split(",")
+            .map((m) => m.trim())
+            .filter((m) => m.length > 0) || [
+            "openai/gpt-4o",
+            "anthropic/claude-3-haiku",
+            "meta-llama/llama-3.1-8b-instruct",
+            "google/gemini-2.5-flash",
+        ]);
+    }
+    /**
+     * Gemini 2.5 models on LiteLLM have a known compatibility issue with
+     * `max_tokens` — strip it before the wire body is built. Applies to
+     * both streaming and non-streaming paths.
+     */
+    adjustBuildBodyOptions(modelId, opts) {
+        if (isGemini25Model(modelId) && opts.maxTokens !== undefined) {
+            if (logger.shouldLog("debug")) {
+                logger.debug("LiteLLM: Skipping maxTokens for Gemini 2.5 model (known compatibility issue)", { modelId, requestedMaxTokens: opts.maxTokens });
+            }
+            return { ...opts, maxTokens: undefined };
+        }
+        return opts;
+    }
     /**
-     * Returns the Vercel AI SDK model instance for LiteLLM
+     * Wrap the stream in an OTel span to capture provider-level latency,
+     * token usage, finish reason, and cost. Matches the pre-migration
+     * behaviour where streamText was wrapped in `neurolink.provider.streamText`.
      */
-    getAISDKModel() {
-        return this.model;
+    onStreamStart(modelId) {
+        const span = streamTracer.startSpan("neurolink.provider.streamText", {
+            kind: SpanKind.CLIENT,
+            attributes: {
+                "gen_ai.system": "litellm",
+                "gen_ai.request.model": modelId,
+            },
+        });
+        let spanEnded = false;
+        const endSpan = () => {
+            if (!spanEnded) {
+                spanEnded = true;
+                span.end();
+            }
+        };
+        return {
+            onUsage: (usage) => {
+                span.setAttribute("gen_ai.usage.input_tokens", usage.promptTokens);
+                span.setAttribute("gen_ai.usage.output_tokens", usage.completionTokens);
+                const cost = calculateCost(this.providerName, this.modelName, {
+                    input: usage.promptTokens,
+                    output: usage.completionTokens,
+                    total: usage.totalTokens,
+                });
+                if (cost && cost > 0) {
+                    span.setAttribute("neurolink.cost", cost);
+                }
+            },
+            onFinish: (reason, capturedError) => {
+                span.setAttribute("gen_ai.response.finish_reason", reason || "unknown");
+                if (reason === "error") {
+                    span.setStatus({
+                        code: SpanStatusCode.ERROR,
+                        message: capturedError instanceof Error
+                            ? capturedError.message
+                            : String(capturedError ?? "stream error"),
+                    });
+                }
+                endSpan();
+            },
+        };
     }
     formatProviderError(error) {
         if (error instanceof TimeoutError) {
             return new NetworkError(`Request timed out: ${error.message}`, this.providerName);
         }
-        // Check for timeout by error name and message as fallback
         const errorRecord = error;
         if (errorRecord?.name === "TimeoutError" ||
             (typeof errorRecord?.message === "string" &&
@@ -103,10 +152,10 @@ export class LiteLLMProvider extends BaseProvider {
                 return new NetworkError("LiteLLM proxy server not available. Please start the LiteLLM proxy server at " +
                     `${process.env.LITELLM_BASE_URL || "http://localhost:4000"}`, this.providerName);
             }
-            // Curator P1-1: detect "team not allowed to access model" responses
-            // and surface as ModelAccessDeniedError with the allowed_models array
-            // parsed from the body. Must run before the generic "API key" check
-            // because LiteLLM phrases this as a 403 distinct from auth.
+            // Curator P1-1: detect "team not allowed to access model" responses and
+            // surface as ModelAccessDeniedError with the allowed_models array parsed
+            // from the body. Must run before the generic "API key" check because
+            // LiteLLM phrases this as a 403 distinct from auth.
             if (isModelAccessDeniedMessage(errorRecord.message)) {
                 return new ModelAccessDeniedError(errorRecord.message, {
                     provider: this.providerName,
@@ -130,446 +179,127 @@ export class LiteLLMProvider extends BaseProvider {
         return new ProviderError(`LiteLLM error: ${errorRecord?.message || "Unknown error"}`, this.providerName);
     }
     /**
-     * LiteLLM supports tools for compatible models
-     */
-    supportsTools() {
-        return true;
-    }
-    /**
-     * Provider-specific streaming implementation
-     * Note: This is only used when tools are disabled
-     */
-    async executeStream(options, analysisSchema) {
-        this.validateStreamOptions(options);
-        const startTime = Date.now();
-        let chunkCount = 0; // Track chunk count for debugging
-        // Reviewer follow-up: capture upstream provider errors via onError so
-        // the post-stream NoOutput detect can propagate the *real* cause
-        // (content_filter, provider crash, etc.) into the sentinel's
-        // providerError / modelResponseRaw instead of "No output generated".
-        let capturedProviderError;
-        const timeout = this.getTimeout(options);
-        const timeoutController = createTimeoutController(timeout, this.providerName, "stream");
-        try {
-            // Build message array from options with multimodal support
-            // Using protected helper from BaseProvider to eliminate code duplication
-            const messages = await this.buildMessagesForStream(options);
-            const model = await this.getAISDKModelWithMiddleware(options); // This is where network connection happens!
-            // Get tools - options.tools is pre-merged by BaseProvider.stream()
-            const shouldUseTools = !options.disableTools && this.supportsTools();
-            const tools = shouldUseTools
-                ? options.tools || (await this.getAllTools())
-                : {};
-            logger.debug(`LiteLLM: Tools for streaming`, {
-                shouldUseTools,
-                toolCount: Object.keys(tools).length,
-                toolNames: Object.keys(tools),
-            });
-            // Model-specific maxTokens handling - Gemini 2.5 models have issues with maxTokens
-            const modelName = this.modelName || getDefaultLiteLLMModel();
-            const isGemini25Model = modelName.includes("gemini-2.5") || modelName.includes("gemini/2.5");
-            const maxTokens = isGemini25Model ? undefined : options.maxTokens;
-            if (isGemini25Model && options.maxTokens) {
-                logger.debug(`LiteLLM: Skipping maxTokens for Gemini 2.5 model (known compatibility issue)`, {
-                    modelName,
-                    requestedMaxTokens: options.maxTokens,
-                });
-            }
-            // Build complete stream options with proper typing - matching Vertex pattern
-            let streamOptions = {
-                model: model,
-                messages: messages,
-                temperature: options.temperature,
-                ...(maxTokens && { maxTokens }), // Conditionally include maxTokens
-                ...(shouldUseTools &&
-                    Object.keys(tools).length > 0 && {
-                    tools,
-                    toolChoice: resolveToolChoice(options, tools, shouldUseTools),
-                    stopWhen: stepCountIs(options.maxSteps || DEFAULT_MAX_STEPS),
-                }),
-                abortSignal: composeAbortSignals(options.abortSignal, timeoutController?.controller.signal),
-                experimental_telemetry: this.telemetryHandler.getTelemetryConfig(options),
-                experimental_repairToolCall: this.getToolCallRepairFn(options),
-                onError: (event) => {
-                    const error = event.error;
-                    const errorMessage = error instanceof Error ? error.message : String(error);
-                    // Reviewer follow-up: propagate the captured error to the
-                    // post-stream NoOutput sentinel so telemetry sees the real
-                    // provider cause instead of "No output generated".
-                    capturedProviderError = error;
-                    logger.error(`LiteLLM: Stream error`, {
-                        provider: this.providerName,
-                        modelName: this.modelName,
-                        error: errorMessage,
-                        chunkCount,
-                    });
-                },
-                onFinish: (event) => {
-                    logger.debug(`LiteLLM: Stream finished`, {
-                        finishReason: event.finishReason,
-                        totalChunks: chunkCount,
-                    });
-                },
-                onChunk: () => {
-                    chunkCount++;
-                },
-                onStepFinish: ({ toolCalls, toolResults }) => {
-                    emitToolEndFromStepFinish(this.neurolink?.getEventEmitter(), toolResults);
-                    logger.info("Tool execution completed", { toolResults, toolCalls });
-                    for (const toolCall of toolCalls) {
-                        collectedToolCalls.push({
-                            toolCallId: toolCall.toolCallId,
-                            toolName: toolCall.toolName,
-                            args: toolCall.args ??
-                                toolCall.input ??
-                                toolCall
-                                    .parameters ??
-                                {},
-                        });
-                    }
-                    for (const toolResult of toolResults) {
-                        const rawToolResult = toolResult;
-                        collectedToolResults.push({
-                            toolName: toolResult.toolName,
-                            status: rawToolResult.error ? "failure" : "success",
-                            output: (rawToolResult.output ??
-                                rawToolResult.result) ??
-                                undefined,
-                            error: rawToolResult.error,
-                            id: rawToolResult.toolCallId ?? toolResult.toolName,
-                        });
-                    }
-                    this.handleToolExecutionStorage(toolCalls, toolResults, options, new Date()).catch((error) => {
-                        logger.warn("[LiteLLMProvider] Failed to store tool executions", {
-                            provider: this.providerName,
-                            error: error instanceof Error ? error.message : String(error),
-                        });
-                    });
-                },
-            };
-            // Add analysisSchema support if provided
-            if (analysisSchema) {
-                try {
-                    streamOptions = {
-                        ...streamOptions,
-                        experimental_output: Output.object({
-                            schema: analysisSchema,
-                        }),
-                    };
-                }
-                catch (error) {
-                    logger.warn("Schema application failed, continuing without schema", {
-                        error: String(error),
-                    });
-                }
-            }
-            // Wrap streamText in an OTel span to capture provider-level latency, token usage, and cost
-            const streamSpan = streamTracer.startSpan("neurolink.provider.streamText", {
-                kind: SpanKind.CLIENT,
-                attributes: {
-                    "gen_ai.system": "litellm",
-                    "gen_ai.request.model": getModelId(model, this.modelName || "unknown"),
-                },
-            });
-            let result;
-            const collectedToolCalls = [];
-            const collectedToolResults = [];
-            try {
-                result = streamText(streamOptions);
-            }
-            catch (streamError) {
-                streamSpan.setStatus({
-                    code: SpanStatusCode.ERROR,
-                    message: streamError instanceof Error
-                        ? streamError.message
-                        : String(streamError),
-                });
-                streamSpan.end();
-                throw streamError;
-            }
-            // Collect token usage, cost, and finish reason asynchronously when the stream completes,
-            // then end the span. This avoids blocking the stream consumer.
-            Promise.resolve(result.usage)
-                .then((usage) => {
-                streamSpan.setAttribute("gen_ai.usage.input_tokens", usage.inputTokens || 0);
-                streamSpan.setAttribute("gen_ai.usage.output_tokens", usage.outputTokens || 0);
-                const cost = calculateCost(this.providerName, this.modelName, {
-                    input: usage.inputTokens || 0,
-                    output: usage.outputTokens || 0,
-                    total: (usage.inputTokens || 0) + (usage.outputTokens || 0),
-                });
-                if (cost && cost > 0) {
-                    streamSpan.setAttribute("neurolink.cost", cost);
-                }
-            })
-                .catch(() => {
-                // Usage may not be available if the stream is aborted
-            });
-            Promise.resolve(result.finishReason)
-                .then((reason) => {
-                streamSpan.setAttribute("gen_ai.response.finish_reason", reason || "unknown");
-            })
-                .catch(() => {
-                // Finish reason may not be available if the stream is aborted
-            });
-            Promise.resolve(result.text)
-                .then(() => {
-                streamSpan.end();
-            })
-                .catch((err) => {
-                streamSpan.setStatus({
-                    code: SpanStatusCode.ERROR,
-                    message: err instanceof Error ? err.message : String(err),
-                });
-                streamSpan.end();
-            });
-            timeoutController?.cleanup();
-            const transformedStream = this.createLiteLLMTransformedStream(result, () => capturedProviderError);
-            // Create analytics promise that resolves after stream completion
-            const analyticsPromise = streamAnalyticsCollector.createAnalytics(this.providerName, this.modelName, result, Date.now() - startTime, {
-                requestId: options.requestId ??
-                    `litellm-stream-${Date.now()}`,
-                streamingMode: true,
-            });
-            return {
-                stream: transformedStream,
-                provider: this.providerName,
-                model: this.modelName,
-                ...(shouldUseTools && {
-                    toolCalls: collectedToolCalls,
-                    toolResults: collectedToolResults,
-                }),
-                analytics: analyticsPromise,
-                metadata: {
-                    startTime,
-                    streamId: `litellm-${Date.now()}`,
-                },
-            };
-        }
-        catch (error) {
-            timeoutController?.cleanup();
-            throw this.handleProviderError(error);
-        }
-    }
-    async *createLiteLLMTransformedStream(result, getCapturedProviderError) {
-        // Reviewer follow-up: gate the post-stream NoOutput detect on
-        // *content yielded*, not raw chunk count. AI SDK fullStream emits
-        // control events ({ type: "start" }, "step-start", etc.) before any
-        // text-delta — those incremented chunkCount and made the post-stream
-        // detect dead even when zero text was produced.
-        let contentYielded = 0;
-        try {
-            const streamToUse = result.fullStream || result.textStream;
-            for await (const chunk of streamToUse) {
-                if (chunk && typeof chunk === "object") {
-                    if ("type" in chunk && chunk.type === "error") {
-                        const errorChunk = chunk;
-                        logger.error(`LiteLLM: Error chunk received:`, {
-                            errorType: errorChunk.type,
-                            errorDetails: errorChunk.error,
-                        });
-                        throw this.formatProviderError(new Error(`LiteLLM streaming error: ${errorChunk.error?.message || "Unknown error"}`));
-                    }
-                    if ("textDelta" in chunk) {
-                        const textDelta = chunk.textDelta;
-                        if (textDelta) {
-                            contentYielded++;
-                            yield { content: textDelta };
-                        }
-                    }
-                    else if ("type" in chunk &&
-                        chunk.type === "tool-call" &&
-                        "toolCallId" in chunk) {
-                        logger.debug("LiteLLM: Tool call", {
-                            toolCallId: String(chunk.toolCallId),
-                            toolName: "toolName" in chunk ? String(chunk.toolName) : "unknown",
-                        });
-                    }
-                }
-                else if (typeof chunk === "string") {
-                    contentYielded++;
-                    yield { content: chunk };
-                }
-            }
-        }
-        catch (streamError) {
-            if (NoOutputGeneratedError.isInstance(streamError)) {
-                logger.warn("LiteLLM: Stream produced no output (NoOutputGeneratedError) — caught from textStream");
-                // Yield the enriched sentinel so downstream telemetry has
-                // finishReason / usage / providerError. Match the other
-                // providers' pattern: yield + return (no throw). NeuroLink's
-                // iteration fallback at neurolink.ts only fires for
-                // looksLikeModelAccessDenied errors, so a NoOutput throw here
-                // would NOT trigger any fallback — and it would mask the
-                // already-yielded sentinel from consumers expecting a clean
-                // stream. The sentinel itself signals the no-output condition.
-                const sentinel = await buildNoOutputSentinel(streamError, result, getCapturedProviderError?.());
-                stampNoOutputSpan(sentinel);
-                yield sentinel;
-                return;
-            }
-            throw streamError;
-        }
-        // Curator P3-6 (round-2 fix): production trigger sets the error on
-        // result.finishReason rejection (NOT thrown from textStream).
-        // Surface that path here, matching the catch above (yield + return).
-        if (contentYielded === 0) {
-            const detected = await detectPostStreamNoOutput(result, getCapturedProviderError?.());
-            if (detected) {
-                logger.warn("LiteLLM: Stream produced no output (NoOutputGeneratedError) — caught from finishReason rejection");
-                stampNoOutputSpan(detected.sentinel);
-                yield detected.sentinel;
-            }
-        }
-    }
-    /**
-     * Generate an embedding for a single text input
-     * Uses the LiteLLM proxy with OpenAI-compatible embedding API
-     */
-    async embed(text, modelName) {
-        const { embed: aiEmbed } = await import("../utils/generation.js");
-        const { createOpenAI } = await import("@ai-sdk/openai");
-        const config = getLiteLLMConfig();
-        const embeddingModelName = modelName ||
-            process.env.LITELLM_EMBEDDING_MODEL ||
-            "gemini-embedding-001";
-        const customOpenAI = createOpenAI({
-            baseURL: this.credentials?.baseURL ?? config.baseURL,
-            apiKey: this.credentials?.apiKey ?? config.apiKey,
-            fetch: createProxyFetch(),
-        });
-        const embeddingModel = customOpenAI.textEmbeddingModel(embeddingModelName);
-        // Wrap in withTimeout so stalled upstream embedding requests abort instead
-        // of hanging forever. 30s matches the default for embedding endpoints
-        // across the OpenAI-compatible cluster.
-        const result = await withTimeout(aiEmbed({ model: embeddingModel, value: text }), 30_000, "litellm", "generate");
-        return result.embedding;
-    }
-    /**
-     * Generate embeddings for multiple text inputs
-     * Uses the LiteLLM proxy with OpenAI-compatible embedding API
-     */
-    async embedMany(texts, modelName) {
-        const { embedMany: aiEmbedMany } = await import("../utils/generation.js");
-        const { createOpenAI } = await import("@ai-sdk/openai");
-        const config = getLiteLLMConfig();
-        const embeddingModelName = modelName ||
-            process.env.LITELLM_EMBEDDING_MODEL ||
-            "gemini-embedding-001";
-        const customOpenAI = createOpenAI({
-            baseURL: this.credentials?.baseURL ?? config.baseURL,
-            apiKey: this.credentials?.apiKey ?? config.apiKey,
-            fetch: createProxyFetch(),
-        });
-        const embeddingModel = customOpenAI.textEmbeddingModel(embeddingModelName);
-        // Wrap in withTimeout so a single slow batch doesn't hang indefinitely.
-        const result = await withTimeout(aiEmbedMany({ model: embeddingModel, values: texts }), 30_000, "litellm", "generate");
-        return result.embeddings;
-    }
-    /**
-     * Get available models from LiteLLM proxy server
-     * Dynamically fetches from /v1/models endpoint with caching and fallback
+     * Get available models from LiteLLM proxy `/v1/models` endpoint.
+     * Caches results for 10 minutes; falls back to env-driven list or a
+     * minimal safe default if the API fetch fails.
      */
     async getAvailableModels() {
-        const functionTag = "LiteLLMProvider.getAvailableModels";
         const now = Date.now();
-        // Check if cached models are still valid
+        // Serve from the class-level cache while it is non-empty and younger
+        // than MODELS_CACHE_DURATION. The cached array itself is returned,
+        // not a copy.
         if (LiteLLMProvider.modelsCache.length > 0 &&
             now - LiteLLMProvider.modelsCacheTime <
                 LiteLLMProvider.MODELS_CACHE_DURATION) {
-            logger.debug(`[${functionTag}] Using cached models`, {
+            logger.debug("[LiteLLMProvider.getAvailableModels] Using cached models", {
+                // Date.now() is in milliseconds, so cacheAge is logged in whole seconds.
                 cacheAge: Math.round((now - LiteLLMProvider.modelsCacheTime) / 1000),
                 modelCount: LiteLLMProvider.modelsCache.length,
             });
             return LiteLLMProvider.modelsCache;
         }
-        // Try to fetch models dynamically
         try {
             const dynamicModels = await this.fetchModelsFromAPI();
+            // Only a non-empty list is cached; an empty list falls through to
+            // the fallback below and leaves the cache untouched.
             if (dynamicModels.length > 0) {
-                // Cache successful result
                 LiteLLMProvider.modelsCache = dynamicModels;
+                // `now` was captured before the fetch started, so the recorded
+                // cache time predates the response by the fetch latency.
                 LiteLLMProvider.modelsCacheTime = now;
-                logger.debug(`[${functionTag}] Successfully fetched models from API`, {
-                    modelCount: dynamicModels.length,
-                });
                 return dynamicModels;
             }
         }
         catch (error) {
+            // Deliberate best-effort: a failed fetch is logged and then
+            // replaced by the fallback list instead of being rethrown.
-            logger.warn(`[${functionTag}] Failed to fetch models from API, using fallback`, {
-                error: error instanceof Error ? error.message : String(error),
-            });
+            logger.warn("[LiteLLMProvider.getAvailableModels] Failed to fetch models from API, using fallback", { error: error instanceof Error ? error.message : String(error) });
         }
-        // Fallback to hardcoded list if API fetch fails
-        const fallbackModels = process.env.LITELLM_FALLBACK_MODELS?.split(",")
-            .map((m) => m.trim())
-            .filter((m) => m.length > 0) || [
-            "openai/gpt-4o", // minimal safe baseline
-            "anthropic/claude-3-haiku",
-            "meta-llama/llama-3.1-8b-instruct",
-            "google/gemini-2.5-flash",
-        ];
-        logger.debug(`[${functionTag}] Using fallback model list`, {
-            modelCount: fallbackModels.length,
-        });
-        return fallbackModels;
+        // Reached when the fetch threw or returned an empty list.
+        return this.getFallbackModels();
     }
-    /**
-     * Fetch available models from LiteLLM proxy /v1/models endpoint
-     * @private
-     */
+    /**
+     * Fetch model ids from `<baseURL>/v1/models` using the fetch function
+     * returned by `createProxyFetch()`. The request is aborted by a local
+     * timer after 5 seconds.
+     *
+     * @returns {Promise<string[]>} The non-empty string `id` values found in
+     *   the response's `data` array, sorted with the default string ordering.
+     *   Entries without a usable `id` are skipped.
+     * @throws {NetworkError} When `isAbortError` recognises the failure
+     *   (for example the 5 second timer fired).
+     * @throws {Error} On a non-OK HTTP status, or when the JSON payload does
+     *   not contain a `data` array.
+     */
     async fetchModelsFromAPI() {
-        const functionTag = "LiteLLMProvider.fetchModelsFromAPI";
-        const config = getLiteLLMConfig();
-        const resolvedBaseURL = this.credentials?.baseURL ?? config.baseURL;
-        const resolvedApiKey = this.credentials?.apiKey ?? config.apiKey;
-        const modelsUrl = `${resolvedBaseURL}/v1/models`;
+        const modelsUrl = `${stripTrailingSlash(this.config.baseURL)}/v1/models`;
+        const proxyFetch = createProxyFetch();
         const controller = new AbortController();
-        const timeoutId = setTimeout(() => controller.abort(), 5000); // 5 second timeout
+        const timeoutId = setTimeout(() => controller.abort(), 5000);
         try {
-            logger.debug(`[${functionTag}] Fetching models from ${modelsUrl}`);
-            const proxyFetch = createProxyFetch();
             const response = await proxyFetch(modelsUrl, {
                 method: "GET",
                 headers: {
-                    Authorization: `Bearer ${resolvedApiKey}`,
+                    Authorization: `Bearer ${this.config.apiKey}`,
                     "Content-Type": "application/json",
                 },
                 signal: controller.signal,
             });
-            clearTimeout(timeoutId);
             if (!response.ok) {
                 throw new Error(`HTTP ${response.status}: ${response.statusText}`);
             }
-            const data = await response.json();
-            // Parse OpenAI-compatible models response
-            if (data && Array.isArray(data.data)) {
-                const models = data.data
-                    .map((model) => typeof model === "object" &&
-                    model !== null &&
-                    "id" in model &&
-                    typeof model.id === "string"
-                    ? model.id
-                    : undefined)
-                    .filter((id) => typeof id === "string" && id.length > 0)
-                    .sort();
-                logger.debug(`[${functionTag}] Successfully parsed models`, {
-                    totalModels: models.length,
-                    sampleModels: models.slice(0, 5),
-                });
-                return models;
-            }
-            else {
+            const data = (await response.json());
+            // `data?.data`: a JSON `null` body must produce the format error
+            // below rather than a TypeError on property access.
+            if (!Array.isArray(data?.data)) {
                 throw new Error("Invalid response format: expected data.data array");
             }
+            return data.data
+                // `m?.id`: a null or primitive entry is skipped by the filter
+                // instead of throwing and discarding the whole list.
+                .map((m) => m?.id)
+                .filter((id) => typeof id === "string" && id.length > 0)
+                .sort();
         }
         catch (error) {
-            clearTimeout(timeoutId);
             if (isAbortError(error)) {
                 throw new NetworkError("Request timed out after 5 seconds", this.providerName);
             }
             throw error;
         }
+        finally {
+            // Runs on success and on every failure path, so the timer never
+            // outlives the request.
+            clearTimeout(timeoutId);
+        }
+    }
+    /**
+     * Embed a single text and return its vector.
+     *
+     * The model is the first truthy value among `modelName`, the
+     * `LITELLM_EMBEDDING_MODEL` environment variable and
+     * "gemini-embedding-001". The request itself is made by
+     * `callEmbeddings` with the text wrapped in a one-element array.
+     *
+     * @param text - Text to embed.
+     * @param [modelName] - Optional embedding model override.
+     * @returns The first element of the array resolved by `callEmbeddings`.
+     */
+    async embed(text, modelName) {
+        let resolvedModel = modelName;
+        if (!resolvedModel) {
+            resolvedModel = process.env.LITELLM_EMBEDDING_MODEL;
+        }
+        if (!resolvedModel) {
+            resolvedModel = "gemini-embedding-001";
+        }
+        const [firstVector] = await this.callEmbeddings(resolvedModel, [text], "embed");
+        return firstVector;
+    }
+    /**
+     * Embed several texts in one call.
+     *
+     * The model is the first truthy value among `modelName`, the
+     * `LITELLM_EMBEDDING_MODEL` environment variable and
+     * "gemini-embedding-001". `texts` is handed to `callEmbeddings`
+     * unchanged.
+     *
+     * @param texts - Texts to embed.
+     * @param [modelName] - Optional embedding model override.
+     * @returns The promise returned by `callEmbeddings`.
+     */
+    async embedMany(texts, modelName) {
+        const modelCandidates = [
+            modelName,
+            process.env.LITELLM_EMBEDDING_MODEL,
+            "gemini-embedding-001",
+        ];
+        // The literal default is always truthy, so `find` never yields undefined.
+        const resolvedModel = modelCandidates.find(Boolean);
+        return this.callEmbeddings(resolvedModel, texts, "embedMany");
+    }
+    /**
+     * POST an embeddings request to `<baseURL>/embeddings` and return the
+     * vectors found in the response.
+     *
+     * @param modelName - Value sent as the request's `model` field.
+     * @param input - Array of texts; a one-element array is sent as the bare
+     *   element, anything else is sent as the array.
+     * @param operation - Caller label ("embed" / "embedMany") used only in
+     *   error messages.
+     * @returns Array of the `embedding` values (arrays only) taken from the
+     *   rows of the response's `data` array, in response order.
+     * @throws The value produced by `formatProviderError` when the HTTP
+     *   status is not OK; the message is the body's `error.message` when the
+     *   body is JSON carrying one, otherwise a status-based message.
+     * @throws {ProviderError} When the response carries no array embeddings.
+     */
+    async callEmbeddings(modelName, input, operation) {
+        const url = `${stripTrailingSlash(this.config.baseURL)}/embeddings`;
+        const fetchImpl = createProxyFetch();
+        // NOTE(review): the timeout label is "generate" although this is an
+        // embeddings call - confirm that is intended.
+        const timeoutController = createTimeoutController(30_000, this.providerName, "generate");
+        try {
+            const res = await fetchImpl(url, {
+                method: "POST",
+                headers: {
+                    "Content-Type": "application/json",
+                    Authorization: `Bearer ${this.config.apiKey}`,
+                },
+                body: JSON.stringify({
+                    model: modelName,
+                    input: input.length === 1 ? input[0] : input,
+                }),
+                ...(timeoutController?.controller.signal
+                    ? { signal: timeoutController.controller.signal }
+                    : {}),
+            });
+            if (!res.ok) {
+                const bodyText = await res.text().catch(() => "");
+                // Error bodies are not guaranteed to be JSON (e.g. an HTML or
+                // plain-text page from an intermediary). A parse failure must
+                // not replace the HTTP failure with a SyntaxError, so fall
+                // back to the status-based message instead.
+                let parsed;
+                try {
+                    parsed = bodyText ? JSON.parse(bodyText) : undefined;
+                }
+                catch {
+                    parsed = undefined;
+                }
+                throw this.formatProviderError(new Error(parsed?.error?.message ||
+                    `LiteLLM ${operation} failed with status ${res.status}`));
+            }
+            const json = (await res.json());
+            // A null body, a missing or non-array `data`, and null rows all
+            // end in the ProviderError below rather than a raw TypeError.
+            const rows = Array.isArray(json?.data) ? json.data : [];
+            const embeddings = rows
+                .map((row) => row?.embedding)
+                .filter((e) => Array.isArray(e));
+            if (embeddings.length === 0) {
+                throw new ProviderError(`LiteLLM ${operation} returned no embeddings`, this.providerName);
+            }
+            return embeddings;
+        }
+        finally {
+            // Optional call: createTimeoutController may return no controller.
+            timeoutController?.cleanup();
+        }
     }
 }