@juspay/neurolink 9.67.1 → 9.67.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +4 -0
- package/dist/browser/neurolink.min.js +346 -346
- package/dist/lib/providers/googleVertex.js +8 -7
- package/dist/lib/providers/litellm.d.ts +32 -32
- package/dist/lib/providers/litellm.js +188 -458
- package/dist/lib/providers/openaiChatCompletionsBase.d.ts +93 -0
- package/dist/lib/providers/openaiChatCompletionsBase.js +644 -0
- package/dist/lib/providers/openaiChatCompletionsClient.d.ts +67 -0
- package/dist/lib/providers/openaiChatCompletionsClient.js +526 -0
- package/dist/lib/providers/openaiCompatible.d.ts +7 -63
- package/dist/lib/providers/openaiCompatible.js +27 -1168
- package/dist/lib/types/openaiCompatible.d.ts +20 -0
- package/dist/lib/types/providers.d.ts +2 -0
- package/dist/providers/googleVertex.js +8 -7
- package/dist/providers/litellm.d.ts +32 -32
- package/dist/providers/litellm.js +188 -458
- package/dist/providers/openaiChatCompletionsBase.d.ts +93 -0
- package/dist/providers/openaiChatCompletionsBase.js +643 -0
- package/dist/providers/openaiChatCompletionsClient.d.ts +67 -0
- package/dist/providers/openaiChatCompletionsClient.js +525 -0
- package/dist/providers/openaiCompatible.d.ts +7 -63
- package/dist/providers/openaiCompatible.js +27 -1168
- package/dist/types/openaiCompatible.d.ts +20 -0
- package/dist/types/providers.d.ts +2 -0
- package/package.json +1 -1
|
@@ -1,77 +1,59 @@
|
|
|
1
|
-
import { createOpenAI } from "@ai-sdk/openai";
|
|
2
1
|
import { SpanKind, SpanStatusCode, trace } from "@opentelemetry/api";
|
|
3
|
-
import { BaseProvider } from "../core/baseProvider.js";
|
|
4
|
-
import { DEFAULT_MAX_STEPS } from "../core/constants.js";
|
|
5
|
-
import { streamAnalyticsCollector } from "../core/streamAnalytics.js";
|
|
6
2
|
import { createProxyFetch } from "../proxy/proxyFetch.js";
|
|
7
3
|
import { AuthenticationError, InvalidModelError, ModelAccessDeniedError, NetworkError, ProviderError, RateLimitError, isModelAccessDeniedMessage, parseAllowedModels, } from "../types/index.js";
|
|
8
4
|
import { isAbortError } from "../utils/errorHandling.js";
|
|
9
|
-
import { emitToolEndFromStepFinish } from "../utils/toolEndEmitter.js";
|
|
10
5
|
import { logger } from "../utils/logger.js";
|
|
11
|
-
import {
|
|
6
|
+
import { isGemini25Model as isCanonicalGemini25Model } from "../utils/modelDetection.js";
|
|
12
7
|
import { calculateCost } from "../utils/pricing.js";
|
|
13
8
|
import { getProviderModel } from "../utils/providerConfig.js";
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import {
|
|
17
|
-
import { NoOutputGeneratedError } from "../utils/generationErrors.js";
|
|
18
|
-
import { Output, stepCountIs } from "../utils/tool.js";
|
|
19
|
-
import { streamText } from "../utils/generation.js";
|
|
9
|
+
import { createTimeoutController, TimeoutError } from "../utils/timeout.js";
|
|
10
|
+
import { stripTrailingSlash } from "./openaiChatCompletionsClient.js";
|
|
11
|
+
import { OpenAIChatCompletionsProvider } from "./openaiChatCompletionsBase.js";
|
|
20
12
|
const streamTracer = trace.getTracer("neurolink.provider.litellm");
|
|
21
|
-
|
|
22
|
-
const getLiteLLMConfig = () => {
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
};
|
|
27
|
-
};
|
|
13
|
+
const FALLBACK_LITELLM_MODEL = "openai/gpt-4o-mini";
|
|
14
|
+
const getLiteLLMConfig = () => ({
|
|
15
|
+
baseURL: process.env.LITELLM_BASE_URL || "http://localhost:4000",
|
|
16
|
+
apiKey: process.env.LITELLM_API_KEY || "sk-anything",
|
|
17
|
+
});
|
|
28
18
|
/**
|
|
29
|
-
*
|
|
30
|
-
*
|
|
31
|
-
* LiteLLM uses a 'provider/model' format for model names.
|
|
32
|
-
* For example:
|
|
33
|
-
* - 'openai/gpt-4o-mini'
|
|
34
|
-
* - 'openai/gpt-3.5-turbo'
|
|
35
|
-
* - 'anthropic/claude-3-sonnet-20240229'
|
|
36
|
-
* - 'google/gemini-pro'
|
|
37
|
-
*
|
|
38
|
-
* You can override the default by setting the LITELLM_MODEL environment variable.
|
|
19
|
+
* LiteLLM uses a 'provider/model' format. Override via LITELLM_MODEL env var.
|
|
39
20
|
*/
|
|
40
|
-
const getDefaultLiteLLMModel = () =>
|
|
41
|
-
|
|
21
|
+
const getDefaultLiteLLMModel = () => getProviderModel("LITELLM_MODEL", FALLBACK_LITELLM_MODEL);
|
|
22
|
+
// LiteLLM model ids come in `provider/model` form (e.g. "google/gemini-2.5-flash").
|
|
23
|
+
// Strip the provider prefix and delegate to the canonical anchored-regex
|
|
24
|
+
// check in src/lib/utils/modelDetection.ts so the truth lives in one place.
|
|
25
|
+
const isGemini25Model = (modelName) => {
|
|
26
|
+
const lastSegment = modelName.includes("/")
|
|
27
|
+
? modelName.slice(modelName.lastIndexOf("/") + 1)
|
|
28
|
+
: modelName;
|
|
29
|
+
return isCanonicalGemini25Model(lastSegment);
|
|
42
30
|
};
|
|
43
31
|
/**
|
|
44
|
-
* LiteLLM Provider
|
|
45
|
-
*
|
|
32
|
+
* LiteLLM Provider — direct HTTP, no AI SDK. Talks to a LiteLLM proxy
|
|
33
|
+
* server (or any deployment that speaks OpenAI chat-completions + the
|
|
34
|
+
* `/v1/models` and `/v1/embeddings` endpoints).
|
|
35
|
+
*
|
|
36
|
+
* All request/stream/tool-loop orchestration lives in
|
|
37
|
+
* `OpenAIChatCompletionsProvider`. This class adds LiteLLM-specific
|
|
38
|
+
* behaviour: OTel span wrap with cost (`onStreamStart`), Gemini 2.5
|
|
39
|
+
* maxTokens skip (`adjustBuildBodyOptions`), ModelAccessDeniedError on
|
|
40
|
+
* 403, 10-minute model cache (`getAvailableModels`), `LITELLM_FALLBACK_MODELS`
|
|
41
|
+
* env-driven fallback list, and native `/v1/embeddings`.
|
|
46
42
|
*/
|
|
47
|
-
export class LiteLLMProvider extends
|
|
48
|
-
model;
|
|
49
|
-
credentials;
|
|
50
|
-
// Cache for available models to avoid repeated API calls
|
|
43
|
+
export class LiteLLMProvider extends OpenAIChatCompletionsProvider {
|
|
51
44
|
static modelsCache = [];
|
|
52
45
|
static modelsCacheTime = 0;
|
|
53
46
|
static MODELS_CACHE_DURATION = 10 * 60 * 1000; // 10 minutes
|
|
54
47
|
constructor(modelName, sdk, _region, credentials) {
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
const config = getLiteLLMConfig();
|
|
60
|
-
// Create OpenAI SDK instance configured for LiteLLM proxy
|
|
61
|
-
// LiteLLM acts as a proxy server that implements the OpenAI-compatible API.
|
|
62
|
-
// To communicate with LiteLLM instead of the default OpenAI endpoint, we use createOpenAI
|
|
63
|
-
// with a custom baseURL and apiKey. This ensures all requests are routed through the LiteLLM
|
|
64
|
-
// proxy, allowing access to multiple models and custom authentication.
|
|
65
|
-
const customOpenAI = createOpenAI({
|
|
66
|
-
baseURL: credentials?.baseURL ?? config.baseURL,
|
|
67
|
-
apiKey: credentials?.apiKey ?? config.apiKey,
|
|
68
|
-
fetch: createProxyFetch(),
|
|
48
|
+
const envConfig = getLiteLLMConfig();
|
|
49
|
+
super("litellm", modelName, sdk, {
|
|
50
|
+
baseURL: credentials?.baseURL ?? envConfig.baseURL,
|
|
51
|
+
apiKey: credentials?.apiKey ?? envConfig.apiKey,
|
|
69
52
|
});
|
|
70
|
-
this.model = customOpenAI.chat(this.modelName || getDefaultLiteLLMModel());
|
|
71
53
|
logger.debug("LiteLLM Provider initialized", {
|
|
72
54
|
modelName: this.modelName,
|
|
73
55
|
provider: this.providerName,
|
|
74
|
-
baseURL: config.baseURL,
|
|
56
|
+
baseURL: this.config.baseURL,
|
|
75
57
|
});
|
|
76
58
|
}
|
|
77
59
|
getProviderName() {
|
|
@@ -80,17 +62,84 @@ export class LiteLLMProvider extends BaseProvider {
|
|
|
80
62
|
getDefaultModel() {
|
|
81
63
|
return getDefaultLiteLLMModel();
|
|
82
64
|
}
|
|
65
|
+
getFallbackModelName() {
|
|
66
|
+
return FALLBACK_LITELLM_MODEL;
|
|
67
|
+
}
|
|
68
|
+
getFallbackModels() {
|
|
69
|
+
return (process.env.LITELLM_FALLBACK_MODELS?.split(",")
|
|
70
|
+
.map((m) => m.trim())
|
|
71
|
+
.filter((m) => m.length > 0) || [
|
|
72
|
+
"openai/gpt-4o",
|
|
73
|
+
"anthropic/claude-3-haiku",
|
|
74
|
+
"meta-llama/llama-3.1-8b-instruct",
|
|
75
|
+
"google/gemini-2.5-flash",
|
|
76
|
+
]);
|
|
77
|
+
}
|
|
78
|
+
/**
|
|
79
|
+
* Gemini 2.5 models on LiteLLM have a known compatibility issue with
|
|
80
|
+
* `max_tokens` — strip it before the wire body is built. Applies to
|
|
81
|
+
* both streaming and non-streaming paths.
|
|
82
|
+
*/
|
|
83
|
+
adjustBuildBodyOptions(modelId, opts) {
|
|
84
|
+
if (isGemini25Model(modelId) && opts.maxTokens !== undefined) {
|
|
85
|
+
if (logger.shouldLog("debug")) {
|
|
86
|
+
logger.debug("LiteLLM: Skipping maxTokens for Gemini 2.5 model (known compatibility issue)", { modelId, requestedMaxTokens: opts.maxTokens });
|
|
87
|
+
}
|
|
88
|
+
return { ...opts, maxTokens: undefined };
|
|
89
|
+
}
|
|
90
|
+
return opts;
|
|
91
|
+
}
|
|
83
92
|
/**
|
|
84
|
-
*
|
|
93
|
+
* Wrap the stream in an OTel span to capture provider-level latency,
|
|
94
|
+
* token usage, finish reason, and cost. Matches the pre-migration
|
|
95
|
+
* behaviour where streamText was wrapped in `neurolink.provider.streamText`.
|
|
85
96
|
*/
|
|
86
|
-
|
|
87
|
-
|
|
97
|
+
onStreamStart(modelId) {
|
|
98
|
+
const span = streamTracer.startSpan("neurolink.provider.streamText", {
|
|
99
|
+
kind: SpanKind.CLIENT,
|
|
100
|
+
attributes: {
|
|
101
|
+
"gen_ai.system": "litellm",
|
|
102
|
+
"gen_ai.request.model": modelId,
|
|
103
|
+
},
|
|
104
|
+
});
|
|
105
|
+
let spanEnded = false;
|
|
106
|
+
const endSpan = () => {
|
|
107
|
+
if (!spanEnded) {
|
|
108
|
+
spanEnded = true;
|
|
109
|
+
span.end();
|
|
110
|
+
}
|
|
111
|
+
};
|
|
112
|
+
return {
|
|
113
|
+
onUsage: (usage) => {
|
|
114
|
+
span.setAttribute("gen_ai.usage.input_tokens", usage.promptTokens);
|
|
115
|
+
span.setAttribute("gen_ai.usage.output_tokens", usage.completionTokens);
|
|
116
|
+
const cost = calculateCost(this.providerName, this.modelName, {
|
|
117
|
+
input: usage.promptTokens,
|
|
118
|
+
output: usage.completionTokens,
|
|
119
|
+
total: usage.totalTokens,
|
|
120
|
+
});
|
|
121
|
+
if (cost && cost > 0) {
|
|
122
|
+
span.setAttribute("neurolink.cost", cost);
|
|
123
|
+
}
|
|
124
|
+
},
|
|
125
|
+
onFinish: (reason, capturedError) => {
|
|
126
|
+
span.setAttribute("gen_ai.response.finish_reason", reason || "unknown");
|
|
127
|
+
if (reason === "error") {
|
|
128
|
+
span.setStatus({
|
|
129
|
+
code: SpanStatusCode.ERROR,
|
|
130
|
+
message: capturedError instanceof Error
|
|
131
|
+
? capturedError.message
|
|
132
|
+
: String(capturedError ?? "stream error"),
|
|
133
|
+
});
|
|
134
|
+
}
|
|
135
|
+
endSpan();
|
|
136
|
+
},
|
|
137
|
+
};
|
|
88
138
|
}
|
|
89
139
|
formatProviderError(error) {
|
|
90
140
|
if (error instanceof TimeoutError) {
|
|
91
141
|
return new NetworkError(`Request timed out: ${error.message}`, this.providerName);
|
|
92
142
|
}
|
|
93
|
-
// Check for timeout by error name and message as fallback
|
|
94
143
|
const errorRecord = error;
|
|
95
144
|
if (errorRecord?.name === "TimeoutError" ||
|
|
96
145
|
(typeof errorRecord?.message === "string" &&
|
|
@@ -103,10 +152,10 @@ export class LiteLLMProvider extends BaseProvider {
|
|
|
103
152
|
return new NetworkError("LiteLLM proxy server not available. Please start the LiteLLM proxy server at " +
|
|
104
153
|
`${process.env.LITELLM_BASE_URL || "http://localhost:4000"}`, this.providerName);
|
|
105
154
|
}
|
|
106
|
-
// Curator P1-1: detect "team not allowed to access model" responses
|
|
107
|
-
//
|
|
108
|
-
//
|
|
109
|
-
//
|
|
155
|
+
// Curator P1-1: detect "team not allowed to access model" responses and
|
|
156
|
+
// surface as ModelAccessDeniedError with the allowed_models array parsed
|
|
157
|
+
// from the body. Must run before the generic "API key" check because
|
|
158
|
+
// LiteLLM phrases this as a 403 distinct from auth.
|
|
110
159
|
if (isModelAccessDeniedMessage(errorRecord.message)) {
|
|
111
160
|
return new ModelAccessDeniedError(errorRecord.message, {
|
|
112
161
|
provider: this.providerName,
|
|
@@ -130,447 +179,128 @@ export class LiteLLMProvider extends BaseProvider {
|
|
|
130
179
|
return new ProviderError(`LiteLLM error: ${errorRecord?.message || "Unknown error"}`, this.providerName);
|
|
131
180
|
}
|
|
132
181
|
/**
|
|
133
|
-
*
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
return true;
|
|
137
|
-
}
|
|
138
|
-
/**
|
|
139
|
-
* Provider-specific streaming implementation
|
|
140
|
-
* Note: This is only used when tools are disabled
|
|
141
|
-
*/
|
|
142
|
-
async executeStream(options, analysisSchema) {
|
|
143
|
-
this.validateStreamOptions(options);
|
|
144
|
-
const startTime = Date.now();
|
|
145
|
-
let chunkCount = 0; // Track chunk count for debugging
|
|
146
|
-
// Reviewer follow-up: capture upstream provider errors via onError so
|
|
147
|
-
// the post-stream NoOutput detect can propagate the *real* cause
|
|
148
|
-
// (content_filter, provider crash, etc.) into the sentinel's
|
|
149
|
-
// providerError / modelResponseRaw instead of "No output generated".
|
|
150
|
-
let capturedProviderError;
|
|
151
|
-
const timeout = this.getTimeout(options);
|
|
152
|
-
const timeoutController = createTimeoutController(timeout, this.providerName, "stream");
|
|
153
|
-
try {
|
|
154
|
-
// Build message array from options with multimodal support
|
|
155
|
-
// Using protected helper from BaseProvider to eliminate code duplication
|
|
156
|
-
const messages = await this.buildMessagesForStream(options);
|
|
157
|
-
const model = await this.getAISDKModelWithMiddleware(options); // This is where network connection happens!
|
|
158
|
-
// Get tools - options.tools is pre-merged by BaseProvider.stream()
|
|
159
|
-
const shouldUseTools = !options.disableTools && this.supportsTools();
|
|
160
|
-
const tools = shouldUseTools
|
|
161
|
-
? options.tools || (await this.getAllTools())
|
|
162
|
-
: {};
|
|
163
|
-
logger.debug(`LiteLLM: Tools for streaming`, {
|
|
164
|
-
shouldUseTools,
|
|
165
|
-
toolCount: Object.keys(tools).length,
|
|
166
|
-
toolNames: Object.keys(tools),
|
|
167
|
-
});
|
|
168
|
-
// Model-specific maxTokens handling - Gemini 2.5 models have issues with maxTokens
|
|
169
|
-
const modelName = this.modelName || getDefaultLiteLLMModel();
|
|
170
|
-
const isGemini25Model = modelName.includes("gemini-2.5") || modelName.includes("gemini/2.5");
|
|
171
|
-
const maxTokens = isGemini25Model ? undefined : options.maxTokens;
|
|
172
|
-
if (isGemini25Model && options.maxTokens) {
|
|
173
|
-
logger.debug(`LiteLLM: Skipping maxTokens for Gemini 2.5 model (known compatibility issue)`, {
|
|
174
|
-
modelName,
|
|
175
|
-
requestedMaxTokens: options.maxTokens,
|
|
176
|
-
});
|
|
177
|
-
}
|
|
178
|
-
// Build complete stream options with proper typing - matching Vertex pattern
|
|
179
|
-
let streamOptions = {
|
|
180
|
-
model: model,
|
|
181
|
-
messages: messages,
|
|
182
|
-
temperature: options.temperature,
|
|
183
|
-
...(maxTokens && { maxTokens }), // Conditionally include maxTokens
|
|
184
|
-
...(shouldUseTools &&
|
|
185
|
-
Object.keys(tools).length > 0 && {
|
|
186
|
-
tools,
|
|
187
|
-
toolChoice: resolveToolChoice(options, tools, shouldUseTools),
|
|
188
|
-
stopWhen: stepCountIs(options.maxSteps || DEFAULT_MAX_STEPS),
|
|
189
|
-
}),
|
|
190
|
-
abortSignal: composeAbortSignals(options.abortSignal, timeoutController?.controller.signal),
|
|
191
|
-
experimental_telemetry: this.telemetryHandler.getTelemetryConfig(options),
|
|
192
|
-
experimental_repairToolCall: this.getToolCallRepairFn(options),
|
|
193
|
-
onError: (event) => {
|
|
194
|
-
const error = event.error;
|
|
195
|
-
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
196
|
-
// Reviewer follow-up: propagate the captured error to the
|
|
197
|
-
// post-stream NoOutput sentinel so telemetry sees the real
|
|
198
|
-
// provider cause instead of "No output generated".
|
|
199
|
-
capturedProviderError = error;
|
|
200
|
-
logger.error(`LiteLLM: Stream error`, {
|
|
201
|
-
provider: this.providerName,
|
|
202
|
-
modelName: this.modelName,
|
|
203
|
-
error: errorMessage,
|
|
204
|
-
chunkCount,
|
|
205
|
-
});
|
|
206
|
-
},
|
|
207
|
-
onFinish: (event) => {
|
|
208
|
-
logger.debug(`LiteLLM: Stream finished`, {
|
|
209
|
-
finishReason: event.finishReason,
|
|
210
|
-
totalChunks: chunkCount,
|
|
211
|
-
});
|
|
212
|
-
},
|
|
213
|
-
onChunk: () => {
|
|
214
|
-
chunkCount++;
|
|
215
|
-
},
|
|
216
|
-
onStepFinish: ({ toolCalls, toolResults }) => {
|
|
217
|
-
emitToolEndFromStepFinish(this.neurolink?.getEventEmitter(), toolResults);
|
|
218
|
-
logger.info("Tool execution completed", { toolResults, toolCalls });
|
|
219
|
-
for (const toolCall of toolCalls) {
|
|
220
|
-
collectedToolCalls.push({
|
|
221
|
-
toolCallId: toolCall.toolCallId,
|
|
222
|
-
toolName: toolCall.toolName,
|
|
223
|
-
args: toolCall.args ??
|
|
224
|
-
toolCall.input ??
|
|
225
|
-
toolCall
|
|
226
|
-
.parameters ??
|
|
227
|
-
{},
|
|
228
|
-
});
|
|
229
|
-
}
|
|
230
|
-
for (const toolResult of toolResults) {
|
|
231
|
-
const rawToolResult = toolResult;
|
|
232
|
-
collectedToolResults.push({
|
|
233
|
-
toolName: toolResult.toolName,
|
|
234
|
-
status: rawToolResult.error ? "failure" : "success",
|
|
235
|
-
output: (rawToolResult.output ??
|
|
236
|
-
rawToolResult.result) ??
|
|
237
|
-
undefined,
|
|
238
|
-
error: rawToolResult.error,
|
|
239
|
-
id: rawToolResult.toolCallId ?? toolResult.toolName,
|
|
240
|
-
});
|
|
241
|
-
}
|
|
242
|
-
this.handleToolExecutionStorage(toolCalls, toolResults, options, new Date()).catch((error) => {
|
|
243
|
-
logger.warn("[LiteLLMProvider] Failed to store tool executions", {
|
|
244
|
-
provider: this.providerName,
|
|
245
|
-
error: error instanceof Error ? error.message : String(error),
|
|
246
|
-
});
|
|
247
|
-
});
|
|
248
|
-
},
|
|
249
|
-
};
|
|
250
|
-
// Add analysisSchema support if provided
|
|
251
|
-
if (analysisSchema) {
|
|
252
|
-
try {
|
|
253
|
-
streamOptions = {
|
|
254
|
-
...streamOptions,
|
|
255
|
-
experimental_output: Output.object({
|
|
256
|
-
schema: analysisSchema,
|
|
257
|
-
}),
|
|
258
|
-
};
|
|
259
|
-
}
|
|
260
|
-
catch (error) {
|
|
261
|
-
logger.warn("Schema application failed, continuing without schema", {
|
|
262
|
-
error: String(error),
|
|
263
|
-
});
|
|
264
|
-
}
|
|
265
|
-
}
|
|
266
|
-
// Wrap streamText in an OTel span to capture provider-level latency, token usage, and cost
|
|
267
|
-
const streamSpan = streamTracer.startSpan("neurolink.provider.streamText", {
|
|
268
|
-
kind: SpanKind.CLIENT,
|
|
269
|
-
attributes: {
|
|
270
|
-
"gen_ai.system": "litellm",
|
|
271
|
-
"gen_ai.request.model": getModelId(model, this.modelName || "unknown"),
|
|
272
|
-
},
|
|
273
|
-
});
|
|
274
|
-
let result;
|
|
275
|
-
const collectedToolCalls = [];
|
|
276
|
-
const collectedToolResults = [];
|
|
277
|
-
try {
|
|
278
|
-
result = streamText(streamOptions);
|
|
279
|
-
}
|
|
280
|
-
catch (streamError) {
|
|
281
|
-
streamSpan.setStatus({
|
|
282
|
-
code: SpanStatusCode.ERROR,
|
|
283
|
-
message: streamError instanceof Error
|
|
284
|
-
? streamError.message
|
|
285
|
-
: String(streamError),
|
|
286
|
-
});
|
|
287
|
-
streamSpan.end();
|
|
288
|
-
throw streamError;
|
|
289
|
-
}
|
|
290
|
-
// Collect token usage, cost, and finish reason asynchronously when the stream completes,
|
|
291
|
-
// then end the span. This avoids blocking the stream consumer.
|
|
292
|
-
Promise.resolve(result.usage)
|
|
293
|
-
.then((usage) => {
|
|
294
|
-
streamSpan.setAttribute("gen_ai.usage.input_tokens", usage.inputTokens || 0);
|
|
295
|
-
streamSpan.setAttribute("gen_ai.usage.output_tokens", usage.outputTokens || 0);
|
|
296
|
-
const cost = calculateCost(this.providerName, this.modelName, {
|
|
297
|
-
input: usage.inputTokens || 0,
|
|
298
|
-
output: usage.outputTokens || 0,
|
|
299
|
-
total: (usage.inputTokens || 0) + (usage.outputTokens || 0),
|
|
300
|
-
});
|
|
301
|
-
if (cost && cost > 0) {
|
|
302
|
-
streamSpan.setAttribute("neurolink.cost", cost);
|
|
303
|
-
}
|
|
304
|
-
})
|
|
305
|
-
.catch(() => {
|
|
306
|
-
// Usage may not be available if the stream is aborted
|
|
307
|
-
});
|
|
308
|
-
Promise.resolve(result.finishReason)
|
|
309
|
-
.then((reason) => {
|
|
310
|
-
streamSpan.setAttribute("gen_ai.response.finish_reason", reason || "unknown");
|
|
311
|
-
})
|
|
312
|
-
.catch(() => {
|
|
313
|
-
// Finish reason may not be available if the stream is aborted
|
|
314
|
-
});
|
|
315
|
-
Promise.resolve(result.text)
|
|
316
|
-
.then(() => {
|
|
317
|
-
streamSpan.end();
|
|
318
|
-
})
|
|
319
|
-
.catch((err) => {
|
|
320
|
-
streamSpan.setStatus({
|
|
321
|
-
code: SpanStatusCode.ERROR,
|
|
322
|
-
message: err instanceof Error ? err.message : String(err),
|
|
323
|
-
});
|
|
324
|
-
streamSpan.end();
|
|
325
|
-
});
|
|
326
|
-
timeoutController?.cleanup();
|
|
327
|
-
const transformedStream = this.createLiteLLMTransformedStream(result, () => capturedProviderError);
|
|
328
|
-
// Create analytics promise that resolves after stream completion
|
|
329
|
-
const analyticsPromise = streamAnalyticsCollector.createAnalytics(this.providerName, this.modelName, result, Date.now() - startTime, {
|
|
330
|
-
requestId: options.requestId ??
|
|
331
|
-
`litellm-stream-${Date.now()}`,
|
|
332
|
-
streamingMode: true,
|
|
333
|
-
});
|
|
334
|
-
return {
|
|
335
|
-
stream: transformedStream,
|
|
336
|
-
provider: this.providerName,
|
|
337
|
-
model: this.modelName,
|
|
338
|
-
...(shouldUseTools && {
|
|
339
|
-
toolCalls: collectedToolCalls,
|
|
340
|
-
toolResults: collectedToolResults,
|
|
341
|
-
}),
|
|
342
|
-
analytics: analyticsPromise,
|
|
343
|
-
metadata: {
|
|
344
|
-
startTime,
|
|
345
|
-
streamId: `litellm-${Date.now()}`,
|
|
346
|
-
},
|
|
347
|
-
};
|
|
348
|
-
}
|
|
349
|
-
catch (error) {
|
|
350
|
-
timeoutController?.cleanup();
|
|
351
|
-
throw this.handleProviderError(error);
|
|
352
|
-
}
|
|
353
|
-
}
|
|
354
|
-
async *createLiteLLMTransformedStream(result, getCapturedProviderError) {
|
|
355
|
-
// Reviewer follow-up: gate the post-stream NoOutput detect on
|
|
356
|
-
// *content yielded*, not raw chunk count. AI SDK fullStream emits
|
|
357
|
-
// control events ({ type: "start" }, "step-start", etc.) before any
|
|
358
|
-
// text-delta — those incremented chunkCount and made the post-stream
|
|
359
|
-
// detect dead even when zero text was produced.
|
|
360
|
-
let contentYielded = 0;
|
|
361
|
-
try {
|
|
362
|
-
const streamToUse = result.fullStream || result.textStream;
|
|
363
|
-
for await (const chunk of streamToUse) {
|
|
364
|
-
if (chunk && typeof chunk === "object") {
|
|
365
|
-
if ("type" in chunk && chunk.type === "error") {
|
|
366
|
-
const errorChunk = chunk;
|
|
367
|
-
logger.error(`LiteLLM: Error chunk received:`, {
|
|
368
|
-
errorType: errorChunk.type,
|
|
369
|
-
errorDetails: errorChunk.error,
|
|
370
|
-
});
|
|
371
|
-
throw this.formatProviderError(new Error(`LiteLLM streaming error: ${errorChunk.error?.message || "Unknown error"}`));
|
|
372
|
-
}
|
|
373
|
-
if ("textDelta" in chunk) {
|
|
374
|
-
const textDelta = chunk.textDelta;
|
|
375
|
-
if (textDelta) {
|
|
376
|
-
contentYielded++;
|
|
377
|
-
yield { content: textDelta };
|
|
378
|
-
}
|
|
379
|
-
}
|
|
380
|
-
else if ("type" in chunk &&
|
|
381
|
-
chunk.type === "tool-call" &&
|
|
382
|
-
"toolCallId" in chunk) {
|
|
383
|
-
logger.debug("LiteLLM: Tool call", {
|
|
384
|
-
toolCallId: String(chunk.toolCallId),
|
|
385
|
-
toolName: "toolName" in chunk ? String(chunk.toolName) : "unknown",
|
|
386
|
-
});
|
|
387
|
-
}
|
|
388
|
-
}
|
|
389
|
-
else if (typeof chunk === "string") {
|
|
390
|
-
contentYielded++;
|
|
391
|
-
yield { content: chunk };
|
|
392
|
-
}
|
|
393
|
-
}
|
|
394
|
-
}
|
|
395
|
-
catch (streamError) {
|
|
396
|
-
if (NoOutputGeneratedError.isInstance(streamError)) {
|
|
397
|
-
logger.warn("LiteLLM: Stream produced no output (NoOutputGeneratedError) — caught from textStream");
|
|
398
|
-
// Yield the enriched sentinel so downstream telemetry has
|
|
399
|
-
// finishReason / usage / providerError. Match the other
|
|
400
|
-
// providers' pattern: yield + return (no throw). NeuroLink's
|
|
401
|
-
// iteration fallback at neurolink.ts only fires for
|
|
402
|
-
// looksLikeModelAccessDenied errors, so a NoOutput throw here
|
|
403
|
-
// would NOT trigger any fallback — and it would mask the
|
|
404
|
-
// already-yielded sentinel from consumers expecting a clean
|
|
405
|
-
// stream. The sentinel itself signals the no-output condition.
|
|
406
|
-
const sentinel = await buildNoOutputSentinel(streamError, result, getCapturedProviderError?.());
|
|
407
|
-
stampNoOutputSpan(sentinel);
|
|
408
|
-
yield sentinel;
|
|
409
|
-
return;
|
|
410
|
-
}
|
|
411
|
-
throw streamError;
|
|
412
|
-
}
|
|
413
|
-
// Curator P3-6 (round-2 fix): production trigger sets the error on
|
|
414
|
-
// result.finishReason rejection (NOT thrown from textStream).
|
|
415
|
-
// Surface that path here, matching the catch above (yield + return).
|
|
416
|
-
if (contentYielded === 0) {
|
|
417
|
-
const detected = await detectPostStreamNoOutput(result, getCapturedProviderError?.());
|
|
418
|
-
if (detected) {
|
|
419
|
-
logger.warn("LiteLLM: Stream produced no output (NoOutputGeneratedError) — caught from finishReason rejection");
|
|
420
|
-
stampNoOutputSpan(detected.sentinel);
|
|
421
|
-
yield detected.sentinel;
|
|
422
|
-
}
|
|
423
|
-
}
|
|
424
|
-
}
|
|
425
|
-
/**
|
|
426
|
-
* Generate an embedding for a single text input
|
|
427
|
-
* Uses the LiteLLM proxy with OpenAI-compatible embedding API
|
|
428
|
-
*/
|
|
429
|
-
async embed(text, modelName) {
|
|
430
|
-
const { embed: aiEmbed } = await import("../utils/generation.js");
|
|
431
|
-
const { createOpenAI } = await import("@ai-sdk/openai");
|
|
432
|
-
const config = getLiteLLMConfig();
|
|
433
|
-
const embeddingModelName = modelName ||
|
|
434
|
-
process.env.LITELLM_EMBEDDING_MODEL ||
|
|
435
|
-
"gemini-embedding-001";
|
|
436
|
-
const customOpenAI = createOpenAI({
|
|
437
|
-
baseURL: this.credentials?.baseURL ?? config.baseURL,
|
|
438
|
-
apiKey: this.credentials?.apiKey ?? config.apiKey,
|
|
439
|
-
fetch: createProxyFetch(),
|
|
440
|
-
});
|
|
441
|
-
const embeddingModel = customOpenAI.textEmbeddingModel(embeddingModelName);
|
|
442
|
-
// Wrap in withTimeout so stalled upstream embedding requests abort instead
|
|
443
|
-
// of hanging forever. 30s matches the default for embedding endpoints
|
|
444
|
-
// across the OpenAI-compatible cluster.
|
|
445
|
-
const result = await withTimeout(aiEmbed({ model: embeddingModel, value: text }), 30_000, "litellm", "generate");
|
|
446
|
-
return result.embedding;
|
|
447
|
-
}
|
|
448
|
-
/**
|
|
449
|
-
* Generate embeddings for multiple text inputs
|
|
450
|
-
* Uses the LiteLLM proxy with OpenAI-compatible embedding API
|
|
451
|
-
*/
|
|
452
|
-
async embedMany(texts, modelName) {
|
|
453
|
-
const { embedMany: aiEmbedMany } = await import("../utils/generation.js");
|
|
454
|
-
const { createOpenAI } = await import("@ai-sdk/openai");
|
|
455
|
-
const config = getLiteLLMConfig();
|
|
456
|
-
const embeddingModelName = modelName ||
|
|
457
|
-
process.env.LITELLM_EMBEDDING_MODEL ||
|
|
458
|
-
"gemini-embedding-001";
|
|
459
|
-
const customOpenAI = createOpenAI({
|
|
460
|
-
baseURL: this.credentials?.baseURL ?? config.baseURL,
|
|
461
|
-
apiKey: this.credentials?.apiKey ?? config.apiKey,
|
|
462
|
-
fetch: createProxyFetch(),
|
|
463
|
-
});
|
|
464
|
-
const embeddingModel = customOpenAI.textEmbeddingModel(embeddingModelName);
|
|
465
|
-
// Wrap in withTimeout so a single slow batch doesn't hang indefinitely.
|
|
466
|
-
const result = await withTimeout(aiEmbedMany({ model: embeddingModel, values: texts }), 30_000, "litellm", "generate");
|
|
467
|
-
return result.embeddings;
|
|
468
|
-
}
|
|
469
|
-
/**
|
|
470
|
-
* Get available models from LiteLLM proxy server
|
|
471
|
-
* Dynamically fetches from /v1/models endpoint with caching and fallback
|
|
182
|
+
* Get available models from LiteLLM proxy `/v1/models` endpoint.
|
|
183
|
+
* Caches results for 10 minutes; falls back to env-driven list or a
|
|
184
|
+
* minimal safe default if the API fetch fails.
|
|
472
185
|
*/
|
|
473
186
|
async getAvailableModels() {
|
|
474
|
-
const functionTag = "LiteLLMProvider.getAvailableModels";
|
|
475
187
|
const now = Date.now();
|
|
476
|
-
// Check if cached models are still valid
|
|
477
188
|
if (LiteLLMProvider.modelsCache.length > 0 &&
|
|
478
189
|
now - LiteLLMProvider.modelsCacheTime <
|
|
479
190
|
LiteLLMProvider.MODELS_CACHE_DURATION) {
|
|
480
|
-
logger.debug(
|
|
191
|
+
logger.debug("[LiteLLMProvider.getAvailableModels] Using cached models", {
|
|
481
192
|
cacheAge: Math.round((now - LiteLLMProvider.modelsCacheTime) / 1000),
|
|
482
193
|
modelCount: LiteLLMProvider.modelsCache.length,
|
|
483
194
|
});
|
|
484
195
|
return LiteLLMProvider.modelsCache;
|
|
485
196
|
}
|
|
486
|
-
// Try to fetch models dynamically
|
|
487
197
|
try {
|
|
488
198
|
const dynamicModels = await this.fetchModelsFromAPI();
|
|
489
199
|
if (dynamicModels.length > 0) {
|
|
490
|
-
// Cache successful result
|
|
491
200
|
LiteLLMProvider.modelsCache = dynamicModels;
|
|
492
201
|
LiteLLMProvider.modelsCacheTime = now;
|
|
493
|
-
logger.debug(`[${functionTag}] Successfully fetched models from API`, {
|
|
494
|
-
modelCount: dynamicModels.length,
|
|
495
|
-
});
|
|
496
202
|
return dynamicModels;
|
|
497
203
|
}
|
|
498
204
|
}
|
|
499
205
|
catch (error) {
|
|
500
|
-
logger.warn(
|
|
501
|
-
error: error instanceof Error ? error.message : String(error),
|
|
502
|
-
});
|
|
206
|
+
logger.warn("[LiteLLMProvider.getAvailableModels] Failed to fetch models from API, using fallback", { error: error instanceof Error ? error.message : String(error) });
|
|
503
207
|
}
|
|
504
|
-
|
|
505
|
-
const fallbackModels = process.env.LITELLM_FALLBACK_MODELS?.split(",")
|
|
506
|
-
.map((m) => m.trim())
|
|
507
|
-
.filter((m) => m.length > 0) || [
|
|
508
|
-
"openai/gpt-4o", // minimal safe baseline
|
|
509
|
-
"anthropic/claude-3-haiku",
|
|
510
|
-
"meta-llama/llama-3.1-8b-instruct",
|
|
511
|
-
"google/gemini-2.5-flash",
|
|
512
|
-
];
|
|
513
|
-
logger.debug(`[${functionTag}] Using fallback model list`, {
|
|
514
|
-
modelCount: fallbackModels.length,
|
|
515
|
-
});
|
|
516
|
-
return fallbackModels;
|
|
208
|
+
return this.getFallbackModels();
|
|
517
209
|
}
|
|
518
|
-
/**
|
|
519
|
-
* Fetch available models from LiteLLM proxy /v1/models endpoint
|
|
520
|
-
* @private
|
|
521
|
-
*/
|
|
522
210
|
async fetchModelsFromAPI() {
|
|
523
|
-
const
|
|
524
|
-
const
|
|
525
|
-
const resolvedBaseURL = this.credentials?.baseURL ?? config.baseURL;
|
|
526
|
-
const resolvedApiKey = this.credentials?.apiKey ?? config.apiKey;
|
|
527
|
-
const modelsUrl = `${resolvedBaseURL}/v1/models`;
|
|
211
|
+
const modelsUrl = `${stripTrailingSlash(this.config.baseURL)}/v1/models`;
|
|
212
|
+
const proxyFetch = createProxyFetch();
|
|
528
213
|
const controller = new AbortController();
|
|
529
|
-
const timeoutId = setTimeout(() => controller.abort(), 5000);
|
|
214
|
+
const timeoutId = setTimeout(() => controller.abort(), 5000);
|
|
530
215
|
try {
|
|
531
|
-
logger.debug(`[${functionTag}] Fetching models from ${modelsUrl}`);
|
|
532
|
-
const proxyFetch = createProxyFetch();
|
|
533
216
|
const response = await proxyFetch(modelsUrl, {
|
|
534
217
|
method: "GET",
|
|
535
218
|
headers: {
|
|
536
|
-
Authorization: `Bearer ${
|
|
219
|
+
Authorization: `Bearer ${this.config.apiKey}`,
|
|
537
220
|
"Content-Type": "application/json",
|
|
538
221
|
},
|
|
539
222
|
signal: controller.signal,
|
|
540
223
|
});
|
|
541
|
-
clearTimeout(timeoutId);
|
|
542
224
|
if (!response.ok) {
|
|
543
225
|
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
544
226
|
}
|
|
545
|
-
const data = await response.json();
|
|
546
|
-
|
|
547
|
-
if (data && Array.isArray(data.data)) {
|
|
548
|
-
const models = data.data
|
|
549
|
-
.map((model) => typeof model === "object" &&
|
|
550
|
-
model !== null &&
|
|
551
|
-
"id" in model &&
|
|
552
|
-
typeof model.id === "string"
|
|
553
|
-
? model.id
|
|
554
|
-
: undefined)
|
|
555
|
-
.filter((id) => typeof id === "string" && id.length > 0)
|
|
556
|
-
.sort();
|
|
557
|
-
logger.debug(`[${functionTag}] Successfully parsed models`, {
|
|
558
|
-
totalModels: models.length,
|
|
559
|
-
sampleModels: models.slice(0, 5),
|
|
560
|
-
});
|
|
561
|
-
return models;
|
|
562
|
-
}
|
|
563
|
-
else {
|
|
227
|
+
const data = (await response.json());
|
|
228
|
+
if (!Array.isArray(data.data)) {
|
|
564
229
|
throw new Error("Invalid response format: expected data.data array");
|
|
565
230
|
}
|
|
231
|
+
return data.data
|
|
232
|
+
.map((m) => m.id)
|
|
233
|
+
.filter((id) => typeof id === "string" && id.length > 0)
|
|
234
|
+
.sort();
|
|
566
235
|
}
|
|
567
236
|
catch (error) {
|
|
568
|
-
clearTimeout(timeoutId);
|
|
569
237
|
if (isAbortError(error)) {
|
|
570
238
|
throw new NetworkError("Request timed out after 5 seconds", this.providerName);
|
|
571
239
|
}
|
|
572
240
|
throw error;
|
|
573
241
|
}
|
|
242
|
+
finally {
|
|
243
|
+
clearTimeout(timeoutId);
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
/**
|
|
247
|
+
* Generate an embedding for a single text input via native /v1/embeddings.
|
|
248
|
+
*/
|
|
249
|
+
async embed(text, modelName) {
|
|
250
|
+
const embeddingModelName = modelName ||
|
|
251
|
+
process.env.LITELLM_EMBEDDING_MODEL ||
|
|
252
|
+
"gemini-embedding-001";
|
|
253
|
+
const [embedding] = await this.callEmbeddings(embeddingModelName, [text], "embed");
|
|
254
|
+
return embedding;
|
|
255
|
+
}
|
|
256
|
+
/**
|
|
257
|
+
* Generate embeddings for multiple text inputs via native /v1/embeddings.
|
|
258
|
+
*/
|
|
259
|
+
async embedMany(texts, modelName) {
|
|
260
|
+
const embeddingModelName = modelName ||
|
|
261
|
+
process.env.LITELLM_EMBEDDING_MODEL ||
|
|
262
|
+
"gemini-embedding-001";
|
|
263
|
+
return this.callEmbeddings(embeddingModelName, texts, "embedMany");
|
|
264
|
+
}
|
|
265
|
+
async callEmbeddings(modelName, input, operation) {
|
|
266
|
+
const url = `${stripTrailingSlash(this.config.baseURL)}/embeddings`;
|
|
267
|
+
const fetchImpl = createProxyFetch();
|
|
268
|
+
const timeoutController = createTimeoutController(30_000, this.providerName, "generate");
|
|
269
|
+
try {
|
|
270
|
+
const res = await fetchImpl(url, {
|
|
271
|
+
method: "POST",
|
|
272
|
+
headers: {
|
|
273
|
+
"Content-Type": "application/json",
|
|
274
|
+
Authorization: `Bearer ${this.config.apiKey}`,
|
|
275
|
+
},
|
|
276
|
+
body: JSON.stringify({
|
|
277
|
+
model: modelName,
|
|
278
|
+
input: input.length === 1 ? input[0] : input,
|
|
279
|
+
}),
|
|
280
|
+
...(timeoutController?.controller.signal
|
|
281
|
+
? { signal: timeoutController.controller.signal }
|
|
282
|
+
: {}),
|
|
283
|
+
});
|
|
284
|
+
if (!res.ok) {
|
|
285
|
+
const bodyText = await res.text().catch(() => "");
|
|
286
|
+
const parsed = bodyText
|
|
287
|
+
? JSON.parse(bodyText)
|
|
288
|
+
: undefined;
|
|
289
|
+
throw this.formatProviderError(new Error(parsed?.error?.message ||
|
|
290
|
+
`LiteLLM ${operation} failed with status ${res.status}`));
|
|
291
|
+
}
|
|
292
|
+
const json = (await res.json());
|
|
293
|
+
const embeddings = (json.data ?? [])
|
|
294
|
+
.map((row) => row.embedding)
|
|
295
|
+
.filter((e) => Array.isArray(e));
|
|
296
|
+
if (embeddings.length === 0) {
|
|
297
|
+
throw new ProviderError(`LiteLLM ${operation} returned no embeddings`, this.providerName);
|
|
298
|
+
}
|
|
299
|
+
return embeddings;
|
|
300
|
+
}
|
|
301
|
+
finally {
|
|
302
|
+
timeoutController?.cleanup();
|
|
303
|
+
}
|
|
574
304
|
}
|
|
575
305
|
}
|
|
576
306
|
//# sourceMappingURL=litellm.js.map
|