@relayplane/proxy 1.8.5 → 1.8.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -75,6 +75,8 @@ const index_js_1 = require("./mesh/index.js");
75
75
  const response_cache_js_1 = require("./response-cache.js");
76
76
  const stats_js_1 = require("./stats.js");
77
77
  const rate_limiter_js_1 = require("./rate-limiter.js");
78
+ const ollama_js_1 = require("./ollama.js");
79
+ const cross_provider_cascade_js_1 = require("./cross-provider-cascade.js");
78
80
  const budget_js_1 = require("./budget.js");
79
81
  const anomaly_js_1 = require("./anomaly.js");
80
82
  const alerts_js_1 = require("./alerts.js");
@@ -186,6 +188,10 @@ exports.DEFAULT_ENDPOINTS = {
186
188
  baseUrl: 'https://api.perplexity.ai',
187
189
  apiKeyEnv: 'PERPLEXITY_API_KEY',
188
190
  },
191
+ ollama: {
192
+ baseUrl: 'http://localhost:11434',
193
+ apiKeyEnv: 'OLLAMA_API_KEY', // Not actually required, placeholder for consistency
194
+ },
189
195
  };
190
196
  /**
191
197
  * Model to provider/model mapping
@@ -726,6 +732,8 @@ const DEFAULT_PROXY_CONFIG = {
726
732
  };
727
733
  /** Module-level ref to active proxy config (set during startProxy) */
728
734
  let _activeProxyConfig = {};
735
+ /** Module-level ref to active Ollama config (set during startProxy) */
736
+ let _activeOllamaConfig;
729
737
  function isContentLoggingEnabled() {
730
738
  return _activeProxyConfig.dashboard?.showRequestContent !== false;
731
739
  }
@@ -1901,10 +1909,14 @@ function resolveExplicitModel(modelName) {
1901
1909
  if (modelName.startsWith('deepseek-') || modelName.startsWith('groq-')) {
1902
1910
  return { provider: 'openrouter', model: modelName };
1903
1911
  }
1912
+ // Ollama models: "ollama/llama3.2" or direct model names when Ollama config exists
1913
+ if (modelName.startsWith('ollama/')) {
1914
+ return { provider: 'ollama', model: modelName.slice('ollama/'.length) };
1915
+ }
1904
1916
  // Provider-prefixed format: "anthropic/claude-3-5-sonnet-latest"
1905
1917
  if (modelName.includes('/')) {
1906
1918
  const [provider, model] = modelName.split('/');
1907
- const validProviders = ['openai', 'anthropic', 'google', 'xai', 'openrouter', 'deepseek', 'groq', 'local'];
1919
+ const validProviders = ['openai', 'anthropic', 'google', 'xai', 'openrouter', 'deepseek', 'groq', 'local', 'ollama'];
1908
1920
  if (provider && model && validProviders.includes(provider)) {
1909
1921
  return { provider: provider, model };
1910
1922
  }
@@ -1962,6 +1974,57 @@ function checkResponseModelMismatch(responseData, requestedModel, provider, log)
1962
1974
  * Extract a human-readable error message from a provider error payload.
1963
1975
  * Handles Anthropic ({ error: { type, message } }) and OpenAI ({ error: { message } }) formats.
1964
1976
  */
1977
/**
 * Convert a native Anthropic messages request body into the OpenAI-compatible
 * ChatRequest shape (`{ model, messages, max_tokens, temperature, stream }`).
 *
 * This allows cross-provider cascade from Anthropic to OpenAI-compatible
 * providers without losing the original request text. (GH #38)
 *
 * Conversion rules:
 *  - `system` given as a non-empty string becomes a leading system message.
 *  - `system` given as an array of blocks is flattened: the `text` of every
 *    `type: 'text'` block is joined with newlines; an empty result is omitted.
 *  - Each message keeps its `role`. String content is passed through, block
 *    arrays are reduced to the concatenated text of their `text` blocks
 *    (non-text blocks are dropped), anything else is stringified.
 *  - `max_tokens` defaults to 4096 when absent; `stream` is always `false`.
 *
 * Malformed input is tolerated rather than thrown on: a null/undefined body is
 * treated as empty, and null/undefined entries inside `system`, `messages`, or
 * a content-block array are skipped.
 *
 * @param {object} body - Parsed Anthropic request body.
 * @param {string} mappedModel - Model identifier to place in the converted request.
 * @returns {{model: string, messages: Array<{role: *, content: string}>, max_tokens: *, temperature: *, stream: boolean}}
 */
function convertNativeAnthropicBodyToChatRequest(body, mappedModel) {
    const source = body ?? {};
    // Join the text of every well-formed text block; `?.` keeps null or
    // undefined array entries from throwing on the `type` lookup.
    const joinTextBlocks = (blocks, separator) => blocks
        .filter((block) => block?.type === 'text')
        .map((block) => block.text ?? '')
        .join(separator);
    const messages = [];
    // Prepend system message if present
    const system = source['system'];
    if (system && typeof system === 'string') {
        messages.push({ role: 'system', content: system });
    }
    else if (Array.isArray(system)) {
        // Anthropic structured system (array of {type, text}) — flatten to text
        const systemText = joinTextBlocks(system, '\n');
        if (systemText) {
            messages.push({ role: 'system', content: systemText });
        }
    }
    const rawMessages = Array.isArray(source['messages']) ? source['messages'] : [];
    for (const msg of rawMessages) {
        // A null/undefined entry carries no role or content; skip it instead of throwing.
        if (msg == null) {
            continue;
        }
        const role = msg['role'];
        const content = msg['content'];
        if (typeof content === 'string') {
            messages.push({ role, content });
        }
        else if (Array.isArray(content)) {
            // Anthropic content blocks — extract text parts
            messages.push({ role, content: joinTextBlocks(content, '') });
        }
        else {
            messages.push({ role, content: String(content ?? '') });
        }
    }
    return {
        model: mappedModel,
        messages,
        max_tokens: source['max_tokens'] ?? 4096,
        temperature: source['temperature'],
        stream: false,
    };
}
1965
2028
  function extractProviderErrorMessage(payload, statusCode) {
1966
2029
  const err = payload['error'];
1967
2030
  if (typeof err === 'string')
@@ -2047,6 +2110,10 @@ function resolveProviderApiKey(provider, ctx, envApiKey) {
2047
2110
  }
2048
2111
  return { apiKey: envApiKey };
2049
2112
  }
2113
+ // Ollama doesn't need an API key — it's local
2114
+ if (provider === 'ollama') {
2115
+ return { apiKey: 'ollama-local' };
2116
+ }
2050
2117
  const apiKeyEnv = exports.DEFAULT_ENDPOINTS[provider]?.apiKeyEnv ?? `${provider.toUpperCase()}_API_KEY`;
2051
2118
  const apiKey = process.env[apiKeyEnv];
2052
2119
  if (!apiKey) {
@@ -2437,6 +2504,17 @@ async function startProxy(config = {}) {
2437
2504
  }
2438
2505
  catch { /* file missing or parse error = treat as first run */ }
2439
2506
  const userConfig = (0, config_js_1.loadConfig)();
2507
+ (0, rate_limiter_js_1.configureRateLimiter)();
2508
+ // ── Cross-provider cascade: configure from proxy config (GH #38) ──
2509
+ if (proxyConfig.crossProviderCascade?.enabled && (proxyConfig.crossProviderCascade.providers?.length ?? 0) > 1) {
2510
+ cross_provider_cascade_js_1.crossProviderCascade.configure({
2511
+ enabled: true,
2512
+ providers: proxyConfig.crossProviderCascade.providers,
2513
+ triggerStatuses: proxyConfig.crossProviderCascade.triggerStatuses,
2514
+ modelMapping: proxyConfig.crossProviderCascade.modelMapping,
2515
+ });
2516
+ log(`[CROSS-CASCADE] Enabled. Provider order: ${proxyConfig.crossProviderCascade.providers.join(' → ')}`);
2517
+ }
2440
2518
  const isFirstRun = !rawFileHasRouting || !userConfig.first_run_complete;
2441
2519
  if (isFirstRun || proxyConfig.routing?.mode === 'auto') {
2442
2520
  const envAnthropicKey = process.env['ANTHROPIC_API_KEY'];
@@ -2497,7 +2575,37 @@ async function startProxy(config = {}) {
2497
2575
  }
2498
2576
  }
2499
2577
  _activeProxyConfig = proxyConfig;
2578
+ _activeOllamaConfig = proxyConfig.ollama;
2500
2579
  const cooldownManager = new CooldownManager(getCooldownConfig(proxyConfig));
2580
+ // === Ollama provider initialization ===
2581
+ if (_activeOllamaConfig?.enabled !== false && _activeOllamaConfig?.models?.length) {
2582
+ const ollamaUrl = _activeOllamaConfig.baseUrl ?? ollama_js_1.OLLAMA_DEFAULTS.baseUrl;
2583
+ console.log(`[RelayPlane] Ollama provider configured: ${ollamaUrl}`);
2584
+ console.log(`[RelayPlane] Ollama models: ${_activeOllamaConfig.models.join(', ')}`);
2585
+ if (_activeOllamaConfig.routeWhen) {
2586
+ const routeInfo = [];
2587
+ if (_activeOllamaConfig.routeWhen.complexity?.length) {
2588
+ routeInfo.push(`complexity: ${_activeOllamaConfig.routeWhen.complexity.join(', ')}`);
2589
+ }
2590
+ if (_activeOllamaConfig.routeWhen.taskTypes?.length) {
2591
+ routeInfo.push(`taskTypes: ${_activeOllamaConfig.routeWhen.taskTypes.join(', ')}`);
2592
+ }
2593
+ if (routeInfo.length) {
2594
+ console.log(`[RelayPlane] Ollama routing rules: ${routeInfo.join('; ')}`);
2595
+ }
2596
+ }
2597
+ // Async health check (non-blocking)
2598
+ (0, ollama_js_1.checkOllamaHealthCached)(ollamaUrl).then((health) => {
2599
+ if (health.available) {
2600
+ console.log(`[RelayPlane] ✓ Ollama is online (${health.models.length} models available, ${health.responseTimeMs}ms)`);
2601
+ }
2602
+ else {
2603
+ console.warn(`[RelayPlane] ⚠️ Ollama not available: ${health.error} — will fall back to cloud providers`);
2604
+ }
2605
+ }).catch(() => {
2606
+ console.warn('[RelayPlane] ⚠️ Ollama health check failed — will fall back to cloud providers');
2607
+ });
2608
+ }
2501
2609
  // === Startup config validation (Task 4) ===
2502
2610
  try {
2503
2611
  const userConfig = (0, config_js_1.loadConfig)();
@@ -2651,6 +2759,8 @@ async function startProxy(config = {}) {
2651
2759
  anomalyDetector.updateConfig({ ...anomalyDetector.getConfig(), ...(proxyConfig.anomaly ?? {}) });
2652
2760
  alertManager.updateConfig({ ...alertManager.getConfig(), ...(proxyConfig.alerts ?? {}) });
2653
2761
  downgradeConfig = { ...downgrade_js_1.DEFAULT_DOWNGRADE_CONFIG, ...(proxyConfig.downgrade ?? {}) };
2762
+ _activeOllamaConfig = proxyConfig.ollama;
2763
+ (0, ollama_js_1.clearOllamaHealthCache)(); // Invalidate cached health on config change
2654
2764
  log(`Reloaded config from ${configPath}`);
2655
2765
  };
2656
2766
  const scheduleConfigReload = () => {
@@ -2999,6 +3109,9 @@ async function startProxy(config = {}) {
2999
3109
  console.log('[RelayPlane Health] Provider stats:', JSON.stringify(providerStats));
3000
3110
  const providers = [];
3001
3111
  for (const [name, ep] of Object.entries(exports.DEFAULT_ENDPOINTS)) {
3112
+ // Skip Ollama from normal key-based health check — it's handled separately
3113
+ if (name === 'ollama')
3114
+ continue;
3002
3115
  const hasKey = !!process.env[ep.apiKeyEnv];
3003
3116
  const stats = providerStats[name.toLowerCase()];
3004
3117
  const successRate = stats && stats.total > 0 ? stats.success / stats.total : (hasKey ? 1 : 0);
@@ -3018,6 +3131,19 @@ async function startProxy(config = {}) {
3018
3131
  lastChecked: new Date().toISOString(),
3019
3132
  });
3020
3133
  }
3134
+ // Add Ollama status if configured
3135
+ if (_activeOllamaConfig && _activeOllamaConfig.enabled !== false) {
3136
+ const ollamaStats = providerStats['ollama'];
3137
+ const ollamaSuccessRate = ollamaStats && ollamaStats.total > 0 ? ollamaStats.success / ollamaStats.total : 0;
3138
+ const ollamaHealth = await (0, ollama_js_1.checkOllamaHealthCached)(_activeOllamaConfig.baseUrl);
3139
+ providers.push({
3140
+ provider: 'ollama',
3141
+ status: ollamaHealth.available ? 'healthy' : 'down',
3142
+ latency: ollamaHealth.responseTimeMs ?? 0,
3143
+ successRate: ollamaHealth.available ? (ollamaSuccessRate || 1) : 0,
3144
+ lastChecked: new Date().toISOString(),
3145
+ });
3146
+ }
3021
3147
  res.writeHead(200, { 'Content-Type': 'application/json' });
3022
3148
  res.end(JSON.stringify({ providers }));
3023
3149
  return;
@@ -3095,6 +3221,21 @@ async function startProxy(config = {}) {
3095
3221
  return;
3096
3222
  }
3097
3223
  // === Mesh stats endpoint ===
3224
+ // === Ollama status endpoint ===
3225
+ if (req.method === 'GET' && pathname === '/v1/ollama/status') {
3226
+ const ollamaBaseUrl = _activeOllamaConfig?.baseUrl ?? ollama_js_1.OLLAMA_DEFAULTS.baseUrl;
3227
+ const health = await (0, ollama_js_1.checkOllamaHealthCached)(ollamaBaseUrl);
3228
+ res.writeHead(200, { 'Content-Type': 'application/json' });
3229
+ res.end(JSON.stringify({
3230
+ configured: !!_activeOllamaConfig,
3231
+ enabled: _activeOllamaConfig?.enabled !== false,
3232
+ baseUrl: ollamaBaseUrl,
3233
+ health,
3234
+ routeWhen: _activeOllamaConfig?.routeWhen ?? null,
3235
+ configuredModels: _activeOllamaConfig?.models ?? [],
3236
+ }));
3237
+ return;
3238
+ }
3098
3239
  if (req.method === 'GET' && pathname === '/v1/mesh/stats') {
3099
3240
  res.writeHead(200, { 'Content-Type': 'application/json' });
3100
3241
  res.end(JSON.stringify(meshHandle.getStats()));
@@ -3432,20 +3573,25 @@ async function startProxy(config = {}) {
3432
3573
  // ── End budget check ──
3433
3574
  // ── Rate limit check ──
3434
3575
  const workspaceId = 'local'; // Local proxy uses single workspace
3435
- const rateLimit = (0, rate_limiter_js_1.checkLimit)(workspaceId, targetModel);
3436
- if (!rateLimit.allowed) {
3437
- console.error(`[RATE LIMIT] ${targetModel} limit reached for workspace: ${workspaceId}`);
3576
+ try {
3577
+ // Pass targetProvider so per-provider limits are applied and limits don't
3578
+ // cascade across providers (e.g. Anthropic hitting its cap won't block OpenAI).
3579
+ await (0, rate_limiter_js_1.acquireSlot)(workspaceId, targetModel, targetProvider);
3580
+ }
3581
+ catch (err) {
3582
+ const rlErr = err;
3583
+ console.error(`[RATE LIMIT] ${targetModel}: ${rlErr.message}`);
3438
3584
  res.writeHead(429, {
3439
3585
  'Content-Type': 'application/json',
3440
- 'Retry-After': String(rateLimit.retryAfter || 60),
3441
- 'X-RelayPlane-RateLimit-Limit': String(rateLimit.limit),
3586
+ 'Retry-After': String(rlErr.retryAfter ?? 60),
3587
+ 'X-RelayPlane-RateLimit-Limit': String(rlErr.limit),
3442
3588
  'X-RelayPlane-RateLimit-Remaining': '0',
3443
- 'X-RelayPlane-RateLimit-Reset': String(Math.ceil(rateLimit.resetAt / 1000))
3589
+ 'X-RelayPlane-RateLimit-Reset': String(Math.ceil(rlErr.resetAt / 1000)),
3444
3590
  });
3445
3591
  res.end(JSON.stringify({
3446
- error: `Rate limit exceeded for ${targetModel}. Max ${rateLimit.limit} requests per minute.`,
3592
+ error: rlErr.message,
3447
3593
  type: 'rate_limit_exceeded',
3448
- retry_after: rateLimit.retryAfter || 60
3594
+ retry_after: rlErr.retryAfter ?? 60,
3449
3595
  }));
3450
3596
  return;
3451
3597
  }
@@ -3510,6 +3656,46 @@ async function startProxy(config = {}) {
3510
3656
  if (proxyConfig.reliability?.cooldowns?.enabled) {
3511
3657
  cooldownManager.recordFailure(targetProvider, JSON.stringify(errorPayload));
3512
3658
  }
3659
+ // ── Cross-provider cascade for /v1/messages path (GH #38) ──
3660
+ if (!isStreaming &&
3661
+ cross_provider_cascade_js_1.crossProviderCascade.enabled &&
3662
+ cross_provider_cascade_js_1.crossProviderCascade.shouldCascade(providerResponse.status)) {
3663
+ const { result: cascResult, data: cascData } = await cross_provider_cascade_js_1.crossProviderCascade.execute(targetProvider, targetModel || requestedModel, providerResponse.status, async (hop) => {
3664
+ const apiKeyResult = resolveProviderApiKey(hop.provider, ctx, useAnthropicEnvKey);
3665
+ if (apiKeyResult.error) {
3666
+ return { status: apiKeyResult.error.status, data: apiKeyResult.error.payload };
3667
+ }
3668
+ // Respect per-provider rate limits before attempting the hop
3669
+ try {
3670
+ await (0, rate_limiter_js_1.acquireSlot)('local', hop.model, hop.provider);
3671
+ }
3672
+ catch {
3673
+ return { status: 429, data: { error: `Local rate limit for ${hop.provider}` } };
3674
+ }
3675
+ // Convert native Anthropic body to ChatRequest for OpenAI-compatible providers
3676
+ const chatReq = convertNativeAnthropicBodyToChatRequest(requestBody, hop.model);
3677
+ const hopResult = await executeNonStreamingProviderRequest(chatReq, hop.provider, hop.model, apiKeyResult.apiKey, ctx);
3678
+ return { status: hopResult.status, data: hopResult.responseData };
3679
+ }, log);
3680
+ if (cascResult.success && cascData) {
3681
+ // Cascade succeeded — update provider/model and respond
3682
+ const cascDurationMs = Date.now() - startTime;
3683
+ const cascProvider = cascResult.provider;
3684
+ const cascModel = cascResult.model;
3685
+ logRequest(originalModel ?? 'unknown', cascModel, cascProvider, cascDurationMs, true, `${routingMode}+cross-cascade`, undefined, taskType, complexity);
3686
+ const cascRpHeaders = buildRelayPlaneResponseHeaders(cascModel, originalModel ?? 'unknown', complexity, cascProvider, `${routingMode}+cross-cascade`);
3687
+ res.writeHead(200, {
3688
+ 'Content-Type': 'application/json',
3689
+ 'X-RelayPlane-Cascade-Provider': cascProvider,
3690
+ 'X-RelayPlane-Cascade-Model': cascModel,
3691
+ ...cascRpHeaders,
3692
+ });
3693
+ res.end(JSON.stringify(cascData));
3694
+ return;
3695
+ }
3696
+ // All fallbacks exhausted — fall through to original error response
3697
+ }
3698
+ // ── End cross-provider cascade ──
3513
3699
  const durationMs = Date.now() - startTime;
3514
3700
  const errMsg = extractProviderErrorMessage(errorPayload, providerResponse.status);
3515
3701
  logRequest(originalModel ?? 'unknown', targetModel || requestedModel, targetProvider, durationMs, false, routingMode, undefined, taskType, complexity, undefined, undefined, errMsg, providerResponse.status);
@@ -4011,6 +4197,21 @@ async function startProxy(config = {}) {
4011
4197
  targetModel = defaultRoute.model;
4012
4198
  }
4013
4199
  }
4200
+ // ── Ollama routing: intercept before cloud dispatch ──
4201
+ if (!useCascade && _activeOllamaConfig && _activeOllamaConfig.enabled !== false) {
4202
+ if (targetProvider === 'ollama' || (0, ollama_js_1.shouldRouteToOllama)(_activeOllamaConfig, complexity, taskType, request.model)) {
4203
+ // Check Ollama availability before routing
4204
+ const ollamaHealth = await (0, ollama_js_1.checkOllamaHealthCached)(_activeOllamaConfig.baseUrl);
4205
+ if (ollamaHealth.available) {
4206
+ targetProvider = 'ollama';
4207
+ targetModel = (0, ollama_js_1.resolveOllamaModel)(targetModel, _activeOllamaConfig);
4208
+ log(`Ollama routing: ${complexity}/${taskType} → ollama/${targetModel}`);
4209
+ }
4210
+ else {
4211
+ log(`Ollama unavailable (${ollamaHealth.error}), falling back to cloud provider`);
4212
+ }
4213
+ }
4214
+ }
4014
4215
  if (!useCascade) {
4015
4216
  log(`Routing to: ${targetProvider}/${targetModel}`);
4016
4217
  }
@@ -4053,20 +4254,24 @@ async function startProxy(config = {}) {
4053
4254
  // ── End budget check ──
4054
4255
  // ── Rate limit check ──
4055
4256
  const chatWorkspaceId = 'local'; // Local proxy uses single workspace
4056
- const chatRateLimit = (0, rate_limiter_js_1.checkLimit)(chatWorkspaceId, targetModel);
4057
- if (!chatRateLimit.allowed) {
4058
- console.error(`[RATE LIMIT] ${targetModel} limit reached for workspace: ${chatWorkspaceId}`);
4257
+ try {
4258
+ // Pass targetProvider so per-provider limits apply and don't cascade across providers.
4259
+ await (0, rate_limiter_js_1.acquireSlot)(chatWorkspaceId, targetModel, targetProvider);
4260
+ }
4261
+ catch (err) {
4262
+ const chatRlErr = err;
4263
+ console.error(`[RATE LIMIT] ${targetModel}: ${chatRlErr.message}`);
4059
4264
  res.writeHead(429, {
4060
4265
  'Content-Type': 'application/json',
4061
- 'Retry-After': String(chatRateLimit.retryAfter || 60),
4062
- 'X-RelayPlane-RateLimit-Limit': String(chatRateLimit.limit),
4266
+ 'Retry-After': String(chatRlErr.retryAfter ?? 60),
4267
+ 'X-RelayPlane-RateLimit-Limit': String(chatRlErr.limit),
4063
4268
  'X-RelayPlane-RateLimit-Remaining': '0',
4064
- 'X-RelayPlane-RateLimit-Reset': String(Math.ceil(chatRateLimit.resetAt / 1000))
4269
+ 'X-RelayPlane-RateLimit-Reset': String(Math.ceil(chatRlErr.resetAt / 1000)),
4065
4270
  });
4066
4271
  res.end(JSON.stringify({
4067
- error: `Rate limit exceeded for ${targetModel}. Max ${chatRateLimit.limit} requests per minute.`,
4272
+ error: chatRlErr.message,
4068
4273
  type: 'rate_limit_exceeded',
4069
- retry_after: chatRateLimit.retryAfter || 60
4274
+ retry_after: chatRlErr.retryAfter ?? 60,
4070
4275
  }));
4071
4276
  return;
4072
4277
  }
@@ -4175,7 +4380,7 @@ async function startProxy(config = {}) {
4175
4380
  }
4176
4381
  }
4177
4382
  else {
4178
- await handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatAgentFingerprint, chatExplicitAgentId);
4383
+ await handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatAgentFingerprint, chatExplicitAgentId, useAnthropicEnvKey);
4179
4384
  }
4180
4385
  }
4181
4386
  });
@@ -4308,6 +4513,24 @@ async function executeNonStreamingProviderRequest(request, targetProvider, targe
4308
4513
  }
4309
4514
  break;
4310
4515
  }
4516
+ case 'ollama': {
4517
+ const ollamaResult = await (0, ollama_js_1.forwardToOllama)(targetModel, request.messages, {
4518
+ temperature: request.temperature,
4519
+ max_tokens: request.max_tokens,
4520
+ tools: request.tools,
4521
+ baseUrl: _activeOllamaConfig?.baseUrl,
4522
+ timeoutMs: _activeOllamaConfig?.timeoutMs,
4523
+ });
4524
+ if (!ollamaResult.success) {
4525
+ return {
4526
+ responseData: { error: ollamaResult.error },
4527
+ ok: false,
4528
+ status: ollamaResult.error?.status ?? 502,
4529
+ };
4530
+ }
4531
+ responseData = ollamaResult.data;
4532
+ break;
4533
+ }
4311
4534
  default: {
4312
4535
  providerResponse = await forwardToOpenAI(request, targetModel, apiKey);
4313
4536
  responseData = (await providerResponse.json());
@@ -4337,6 +4560,44 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
4337
4560
  case 'groq':
4338
4561
  providerResponse = await forwardToOpenAICompatibleStream(request, targetModel, apiKey);
4339
4562
  break;
4563
+ case 'ollama': {
4564
+ // Ollama streaming uses its own handler that converts NDJSON → SSE
4565
+ const ollamaStream = await (0, ollama_js_1.forwardToOllamaStream)(targetModel, request.messages, {
4566
+ temperature: request.temperature,
4567
+ max_tokens: request.max_tokens,
4568
+ tools: request.tools,
4569
+ baseUrl: _activeOllamaConfig?.baseUrl,
4570
+ timeoutMs: _activeOllamaConfig?.timeoutMs,
4571
+ });
4572
+ if (!ollamaStream.success || !ollamaStream.stream) {
4573
+ const durationMs = Date.now() - startTime;
4574
+ const errMsg = ollamaStream.error?.message ?? 'Ollama stream failed';
4575
+ logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, false, routingMode, undefined, taskType, complexity, undefined, undefined, errMsg, ollamaStream.error?.status);
4576
+ res.writeHead(ollamaStream.error?.status ?? 502, { 'Content-Type': 'application/json' });
4577
+ res.end(JSON.stringify({ error: ollamaStream.error }));
4578
+ return;
4579
+ }
4580
+ // Write SSE headers and pipe converted stream
4581
+ const relayHeaders = buildRelayPlaneResponseHeaders(targetModel, request.model, complexity, 'ollama', routingMode);
4582
+ res.writeHead(200, {
4583
+ 'Content-Type': 'text/event-stream',
4584
+ 'Cache-Control': 'no-cache',
4585
+ 'Connection': 'keep-alive',
4586
+ ...relayHeaders,
4587
+ });
4588
+ for await (const chunk of ollamaStream.stream) {
4589
+ res.write(chunk);
4590
+ }
4591
+ const durationMs = Date.now() - startTime;
4592
+ logRequest(request.model ?? 'unknown', targetModel, 'ollama', durationMs, true, routingMode, false, taskType, complexity, agentFingerprint, agentId);
4593
+ updateLastHistoryEntry(0, 0, 0, targetModel, undefined, undefined, agentFingerprint, agentId);
4594
+ if (recordTelemetry) {
4595
+ sendCloudTelemetry(taskType, targetModel, 0, 0, durationMs, true, 0, request.model ?? undefined);
4596
+ meshCapture(targetModel, 'ollama', taskType, 0, 0, 0, durationMs, true);
4597
+ }
4598
+ res.end();
4599
+ return;
4600
+ }
4340
4601
  default:
4341
4602
  providerResponse = await forwardToOpenAIStream(request, targetModel, apiKey);
4342
4603
  }
@@ -4524,7 +4785,9 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
4524
4785
  /**
4525
4786
  * Handle non-streaming request
4526
4787
  */
4527
- async function handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', agentFingerprint, agentId) {
4788
+ async function handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', agentFingerprint, agentId,
4789
+ /** Anthropic env API key — required for cross-provider cascade API key resolution (GH #38) */
4790
+ anthropicEnvKeyForCascade) {
4528
4791
  let responseData;
4529
4792
  try {
4530
4793
  const result = await executeNonStreamingProviderRequest(request, targetProvider, targetModel, apiKey, ctx);
@@ -4533,16 +4796,59 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
4533
4796
  if (cooldownsEnabled) {
4534
4797
  cooldownManager.recordFailure(targetProvider, JSON.stringify(responseData));
4535
4798
  }
4536
- const durationMs = Date.now() - startTime;
4537
- const nsErrMsg = extractProviderErrorMessage(responseData, result.status);
4538
- logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, false, routingMode, undefined, taskType, complexity, undefined, undefined, nsErrMsg, result.status);
4539
- if (recordTelemetry) {
4540
- sendCloudTelemetry(taskType, targetModel, 0, 0, durationMs, false, 0, request.model ?? undefined);
4541
- meshCapture(targetModel, targetProvider, taskType, 0, 0, 0, durationMs, false, nsErrMsg);
4799
+ // ── Cross-provider cascade (GH #38) ──
4800
+ if (cross_provider_cascade_js_1.crossProviderCascade.enabled && cross_provider_cascade_js_1.crossProviderCascade.shouldCascade(result.status)) {
4801
+ const { result: cascResult, data: cascData } = await cross_provider_cascade_js_1.crossProviderCascade.execute(targetProvider, targetModel, result.status, async (hop) => {
4802
+ const apiKeyResult = resolveProviderApiKey(hop.provider, ctx, anthropicEnvKeyForCascade);
4803
+ if (apiKeyResult.error) {
4804
+ return { status: apiKeyResult.error.status, data: apiKeyResult.error.payload };
4805
+ }
4806
+ // Respect per-provider rate limits before attempting the hop
4807
+ try {
4808
+ await (0, rate_limiter_js_1.acquireSlot)('local', hop.model, hop.provider);
4809
+ }
4810
+ catch {
4811
+ // Rate-limited locally — treat as 429 so cascade continues
4812
+ return { status: 429, data: { error: `Local rate limit for ${hop.provider}` } };
4813
+ }
4814
+ const hopResult = await executeNonStreamingProviderRequest({ ...request, model: hop.model }, hop.provider, hop.model, apiKeyResult.apiKey, ctx);
4815
+ return { status: hopResult.status, data: hopResult.responseData };
4816
+ }, log);
4817
+ if (cascResult.success && cascData) {
4818
+ // Update tracking variables to reflect the actual provider/model used
4819
+ targetProvider = cascResult.provider;
4820
+ targetModel = cascResult.model;
4821
+ responseData = cascData;
4822
+ // Fall through to success handling below (don't return early)
4823
+ }
4824
+ else {
4825
+ // All fallbacks exhausted — return the primary error
4826
+ const durationMs = Date.now() - startTime;
4827
+ const nsErrMsg = extractProviderErrorMessage(responseData, result.status);
4828
+ logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, false, `${routingMode}+cascade`, undefined, taskType, complexity, undefined, undefined, nsErrMsg, result.status);
4829
+ if (recordTelemetry) {
4830
+ sendCloudTelemetry(taskType, targetModel, 0, 0, durationMs, false, 0, request.model ?? undefined);
4831
+ meshCapture(targetModel, targetProvider, taskType, 0, 0, 0, durationMs, false, nsErrMsg);
4832
+ }
4833
+ res.writeHead(result.status, { 'Content-Type': 'application/json' });
4834
+ res.end(JSON.stringify(responseData));
4835
+ return;
4836
+ }
4542
4837
  }
4543
- res.writeHead(result.status, { 'Content-Type': 'application/json' });
4544
- res.end(JSON.stringify(responseData));
4545
- return;
4838
+ else {
4839
+ // No cascade — return error as-is
4840
+ const durationMs = Date.now() - startTime;
4841
+ const nsErrMsg = extractProviderErrorMessage(responseData, result.status);
4842
+ logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, false, routingMode, undefined, taskType, complexity, undefined, undefined, nsErrMsg, result.status);
4843
+ if (recordTelemetry) {
4844
+ sendCloudTelemetry(taskType, targetModel, 0, 0, durationMs, false, 0, request.model ?? undefined);
4845
+ meshCapture(targetModel, targetProvider, taskType, 0, 0, 0, durationMs, false, nsErrMsg);
4846
+ }
4847
+ res.writeHead(result.status, { 'Content-Type': 'application/json' });
4848
+ res.end(JSON.stringify(responseData));
4849
+ return;
4850
+ }
4851
+ // ── End cross-provider cascade ──
4546
4852
  }
4547
4853
  }
4548
4854
  catch (err) {