@relayplane/proxy 1.8.5 → 1.8.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/config.d.ts +92 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +26 -0
- package/dist/config.js.map +1 -1
- package/dist/cross-provider-cascade.d.ts +137 -0
- package/dist/cross-provider-cascade.d.ts.map +1 -0
- package/dist/cross-provider-cascade.js +258 -0
- package/dist/cross-provider-cascade.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +18 -1
- package/dist/index.js.map +1 -1
- package/dist/ollama.d.ts +264 -0
- package/dist/ollama.d.ts.map +1 -0
- package/dist/ollama.js +510 -0
- package/dist/ollama.js.map +1 -0
- package/dist/rate-limiter.d.ts +87 -9
- package/dist/rate-limiter.d.ts.map +1 -1
- package/dist/rate-limiter.js +262 -28
- package/dist/rate-limiter.js.map +1 -1
- package/dist/standalone-proxy.d.ts +1 -1
- package/dist/standalone-proxy.d.ts.map +1 -1
- package/dist/standalone-proxy.js +334 -28
- package/dist/standalone-proxy.js.map +1 -1
- package/package.json +1 -1
package/dist/standalone-proxy.js
CHANGED
|
@@ -75,6 +75,8 @@ const index_js_1 = require("./mesh/index.js");
|
|
|
75
75
|
const response_cache_js_1 = require("./response-cache.js");
|
|
76
76
|
const stats_js_1 = require("./stats.js");
|
|
77
77
|
const rate_limiter_js_1 = require("./rate-limiter.js");
|
|
78
|
+
const ollama_js_1 = require("./ollama.js");
|
|
79
|
+
const cross_provider_cascade_js_1 = require("./cross-provider-cascade.js");
|
|
78
80
|
const budget_js_1 = require("./budget.js");
|
|
79
81
|
const anomaly_js_1 = require("./anomaly.js");
|
|
80
82
|
const alerts_js_1 = require("./alerts.js");
|
|
@@ -186,6 +188,10 @@ exports.DEFAULT_ENDPOINTS = {
|
|
|
186
188
|
baseUrl: 'https://api.perplexity.ai',
|
|
187
189
|
apiKeyEnv: 'PERPLEXITY_API_KEY',
|
|
188
190
|
},
|
|
191
|
+
ollama: {
|
|
192
|
+
baseUrl: 'http://localhost:11434',
|
|
193
|
+
apiKeyEnv: 'OLLAMA_API_KEY', // Not actually required, placeholder for consistency
|
|
194
|
+
},
|
|
189
195
|
};
|
|
190
196
|
/**
|
|
191
197
|
* Model to provider/model mapping
|
|
@@ -726,6 +732,8 @@ const DEFAULT_PROXY_CONFIG = {
|
|
|
726
732
|
};
|
|
727
733
|
/** Module-level ref to active proxy config (set during startProxy) */
|
|
728
734
|
let _activeProxyConfig = {};
|
|
735
|
+
/** Module-level ref to active Ollama config (set during startProxy) */
|
|
736
|
+
let _activeOllamaConfig;
|
|
729
737
|
function isContentLoggingEnabled() {
|
|
730
738
|
return _activeProxyConfig.dashboard?.showRequestContent !== false;
|
|
731
739
|
}
|
|
@@ -1901,10 +1909,14 @@ function resolveExplicitModel(modelName) {
|
|
|
1901
1909
|
if (modelName.startsWith('deepseek-') || modelName.startsWith('groq-')) {
|
|
1902
1910
|
return { provider: 'openrouter', model: modelName };
|
|
1903
1911
|
}
|
|
1912
|
+
// Ollama models: "ollama/llama3.2" or direct model names when Ollama config exists
|
|
1913
|
+
if (modelName.startsWith('ollama/')) {
|
|
1914
|
+
return { provider: 'ollama', model: modelName.slice('ollama/'.length) };
|
|
1915
|
+
}
|
|
1904
1916
|
// Provider-prefixed format: "anthropic/claude-3-5-sonnet-latest"
|
|
1905
1917
|
if (modelName.includes('/')) {
|
|
1906
1918
|
const [provider, model] = modelName.split('/');
|
|
1907
|
-
const validProviders = ['openai', 'anthropic', 'google', 'xai', 'openrouter', 'deepseek', 'groq', 'local'];
|
|
1919
|
+
const validProviders = ['openai', 'anthropic', 'google', 'xai', 'openrouter', 'deepseek', 'groq', 'local', 'ollama'];
|
|
1908
1920
|
if (provider && model && validProviders.includes(provider)) {
|
|
1909
1921
|
return { provider: provider, model };
|
|
1910
1922
|
}
|
|
@@ -1962,6 +1974,57 @@ function checkResponseModelMismatch(responseData, requestedModel, provider, log)
|
|
|
1962
1974
|
* Extract a human-readable error message from a provider error payload.
|
|
1963
1975
|
* Handles Anthropic ({ error: { type, message } }) and OpenAI ({ error: { message } }) formats.
|
|
1964
1976
|
*/
|
|
1977
|
+
/**
 * Convert a native Anthropic messages request body into the OpenAI-compatible
 * ChatRequest format used by forwardToOpenAICompatible and related helpers.
 *
 * This allows cross-provider cascade from Anthropic → OpenRouter (and others)
 * while carrying over the textual content of the original request. (GH #38)
 *
 * Conversion rules:
 *  - `body.system` (string, or array of `{ type: 'text', text }` blocks) becomes
 *    a leading `system` message; array blocks are joined with newlines.
 *  - Each message with string content is copied as-is.
 *  - Each message with an array of content blocks is flattened to the
 *    concatenation of its `text` blocks. Non-text blocks (tool_use,
 *    tool_result, image, ...) are NOT carried over.
 *  - Any other content value is coerced with `String(content ?? '')`.
 *  - Malformed entries (null / non-object messages or blocks) are skipped
 *    instead of throwing, so a bad payload cannot abort the cascade hop.
 *
 * @param {Record<string, unknown>} body - Parsed Anthropic `/v1/messages` request body.
 * @param {string} mappedModel - Model name to use for the target provider.
 * @returns {{ model: string, messages: Array<{ role: unknown, content: string }>, max_tokens: unknown, temperature: unknown, stream: boolean }}
 *   Non-streaming chat request; `max_tokens` defaults to 4096 when absent.
 */
function convertNativeAnthropicBodyToChatRequest(body, mappedModel) {
    const rawMessages = Array.isArray(body['messages'])
        ? body['messages']
        : [];
    const messages = [];
    // Prepend system message if present
    const system = body['system'];
    if (system && typeof system === 'string') {
        messages.push({ role: 'system', content: system });
    }
    else if (Array.isArray(system)) {
        // Anthropic structured system (array of {type, text}) — flatten to text
        const systemText = flattenAnthropicTextBlocks(system, '\n');
        if (systemText) {
            messages.push({ role: 'system', content: systemText });
        }
    }
    for (const msg of rawMessages) {
        // Skip malformed entries rather than throwing on property access
        if (msg === null || typeof msg !== 'object') {
            continue;
        }
        const role = msg['role'];
        const content = msg['content'];
        if (typeof content === 'string') {
            messages.push({ role: role, content });
        }
        else if (Array.isArray(content)) {
            // Anthropic content blocks — extract text parts
            messages.push({ role: role, content: flattenAnthropicTextBlocks(content, '') });
        }
        else {
            messages.push({ role: role, content: String(content ?? '') });
        }
    }
    return {
        model: mappedModel,
        messages,
        max_tokens: body['max_tokens'] ?? 4096,
        temperature: body['temperature'],
        stream: false,
    };
}
/**
 * Join the text of all `type: 'text'` blocks in an Anthropic content-block array.
 *
 * Entries that are null, not objects, or of another block type are ignored.
 * A text block whose `text` is null/undefined contributes an empty string.
 *
 * @param {unknown[]} blocks - Anthropic content blocks.
 * @param {string} separator - String placed between consecutive text blocks.
 * @returns {string} The flattened text (empty string when no text blocks exist).
 */
function flattenAnthropicTextBlocks(blocks, separator) {
    return blocks
        .filter((block) => block !== null && typeof block === 'object' && block.type === 'text')
        .map((block) => block.text ?? '')
        .join(separator);
}
|
|
1965
2028
|
function extractProviderErrorMessage(payload, statusCode) {
|
|
1966
2029
|
const err = payload['error'];
|
|
1967
2030
|
if (typeof err === 'string')
|
|
@@ -2047,6 +2110,10 @@ function resolveProviderApiKey(provider, ctx, envApiKey) {
|
|
|
2047
2110
|
}
|
|
2048
2111
|
return { apiKey: envApiKey };
|
|
2049
2112
|
}
|
|
2113
|
+
// Ollama doesn't need an API key — it's local
|
|
2114
|
+
if (provider === 'ollama') {
|
|
2115
|
+
return { apiKey: 'ollama-local' };
|
|
2116
|
+
}
|
|
2050
2117
|
const apiKeyEnv = exports.DEFAULT_ENDPOINTS[provider]?.apiKeyEnv ?? `${provider.toUpperCase()}_API_KEY`;
|
|
2051
2118
|
const apiKey = process.env[apiKeyEnv];
|
|
2052
2119
|
if (!apiKey) {
|
|
@@ -2437,6 +2504,17 @@ async function startProxy(config = {}) {
|
|
|
2437
2504
|
}
|
|
2438
2505
|
catch { /* file missing or parse error = treat as first run */ }
|
|
2439
2506
|
const userConfig = (0, config_js_1.loadConfig)();
|
|
2507
|
+
(0, rate_limiter_js_1.configureRateLimiter)();
|
|
2508
|
+
// ── Cross-provider cascade: configure from proxy config (GH #38) ──
|
|
2509
|
+
if (proxyConfig.crossProviderCascade?.enabled && (proxyConfig.crossProviderCascade.providers?.length ?? 0) > 1) {
|
|
2510
|
+
cross_provider_cascade_js_1.crossProviderCascade.configure({
|
|
2511
|
+
enabled: true,
|
|
2512
|
+
providers: proxyConfig.crossProviderCascade.providers,
|
|
2513
|
+
triggerStatuses: proxyConfig.crossProviderCascade.triggerStatuses,
|
|
2514
|
+
modelMapping: proxyConfig.crossProviderCascade.modelMapping,
|
|
2515
|
+
});
|
|
2516
|
+
log(`[CROSS-CASCADE] Enabled. Provider order: ${proxyConfig.crossProviderCascade.providers.join(' → ')}`);
|
|
2517
|
+
}
|
|
2440
2518
|
const isFirstRun = !rawFileHasRouting || !userConfig.first_run_complete;
|
|
2441
2519
|
if (isFirstRun || proxyConfig.routing?.mode === 'auto') {
|
|
2442
2520
|
const envAnthropicKey = process.env['ANTHROPIC_API_KEY'];
|
|
@@ -2497,7 +2575,37 @@ async function startProxy(config = {}) {
|
|
|
2497
2575
|
}
|
|
2498
2576
|
}
|
|
2499
2577
|
_activeProxyConfig = proxyConfig;
|
|
2578
|
+
_activeOllamaConfig = proxyConfig.ollama;
|
|
2500
2579
|
const cooldownManager = new CooldownManager(getCooldownConfig(proxyConfig));
|
|
2580
|
+
// === Ollama provider initialization ===
|
|
2581
|
+
if (_activeOllamaConfig?.enabled !== false && _activeOllamaConfig?.models?.length) {
|
|
2582
|
+
const ollamaUrl = _activeOllamaConfig.baseUrl ?? ollama_js_1.OLLAMA_DEFAULTS.baseUrl;
|
|
2583
|
+
console.log(`[RelayPlane] Ollama provider configured: ${ollamaUrl}`);
|
|
2584
|
+
console.log(`[RelayPlane] Ollama models: ${_activeOllamaConfig.models.join(', ')}`);
|
|
2585
|
+
if (_activeOllamaConfig.routeWhen) {
|
|
2586
|
+
const routeInfo = [];
|
|
2587
|
+
if (_activeOllamaConfig.routeWhen.complexity?.length) {
|
|
2588
|
+
routeInfo.push(`complexity: ${_activeOllamaConfig.routeWhen.complexity.join(', ')}`);
|
|
2589
|
+
}
|
|
2590
|
+
if (_activeOllamaConfig.routeWhen.taskTypes?.length) {
|
|
2591
|
+
routeInfo.push(`taskTypes: ${_activeOllamaConfig.routeWhen.taskTypes.join(', ')}`);
|
|
2592
|
+
}
|
|
2593
|
+
if (routeInfo.length) {
|
|
2594
|
+
console.log(`[RelayPlane] Ollama routing rules: ${routeInfo.join('; ')}`);
|
|
2595
|
+
}
|
|
2596
|
+
}
|
|
2597
|
+
// Async health check (non-blocking)
|
|
2598
|
+
(0, ollama_js_1.checkOllamaHealthCached)(ollamaUrl).then((health) => {
|
|
2599
|
+
if (health.available) {
|
|
2600
|
+
console.log(`[RelayPlane] ✓ Ollama is online (${health.models.length} models available, ${health.responseTimeMs}ms)`);
|
|
2601
|
+
}
|
|
2602
|
+
else {
|
|
2603
|
+
console.warn(`[RelayPlane] ⚠️ Ollama not available: ${health.error} — will fall back to cloud providers`);
|
|
2604
|
+
}
|
|
2605
|
+
}).catch(() => {
|
|
2606
|
+
console.warn('[RelayPlane] ⚠️ Ollama health check failed — will fall back to cloud providers');
|
|
2607
|
+
});
|
|
2608
|
+
}
|
|
2501
2609
|
// === Startup config validation (Task 4) ===
|
|
2502
2610
|
try {
|
|
2503
2611
|
const userConfig = (0, config_js_1.loadConfig)();
|
|
@@ -2651,6 +2759,8 @@ async function startProxy(config = {}) {
|
|
|
2651
2759
|
anomalyDetector.updateConfig({ ...anomalyDetector.getConfig(), ...(proxyConfig.anomaly ?? {}) });
|
|
2652
2760
|
alertManager.updateConfig({ ...alertManager.getConfig(), ...(proxyConfig.alerts ?? {}) });
|
|
2653
2761
|
downgradeConfig = { ...downgrade_js_1.DEFAULT_DOWNGRADE_CONFIG, ...(proxyConfig.downgrade ?? {}) };
|
|
2762
|
+
_activeOllamaConfig = proxyConfig.ollama;
|
|
2763
|
+
(0, ollama_js_1.clearOllamaHealthCache)(); // Invalidate cached health on config change
|
|
2654
2764
|
log(`Reloaded config from ${configPath}`);
|
|
2655
2765
|
};
|
|
2656
2766
|
const scheduleConfigReload = () => {
|
|
@@ -2999,6 +3109,9 @@ async function startProxy(config = {}) {
|
|
|
2999
3109
|
console.log('[RelayPlane Health] Provider stats:', JSON.stringify(providerStats));
|
|
3000
3110
|
const providers = [];
|
|
3001
3111
|
for (const [name, ep] of Object.entries(exports.DEFAULT_ENDPOINTS)) {
|
|
3112
|
+
// Skip Ollama from normal key-based health check — it's handled separately
|
|
3113
|
+
if (name === 'ollama')
|
|
3114
|
+
continue;
|
|
3002
3115
|
const hasKey = !!process.env[ep.apiKeyEnv];
|
|
3003
3116
|
const stats = providerStats[name.toLowerCase()];
|
|
3004
3117
|
const successRate = stats && stats.total > 0 ? stats.success / stats.total : (hasKey ? 1 : 0);
|
|
@@ -3018,6 +3131,19 @@ async function startProxy(config = {}) {
|
|
|
3018
3131
|
lastChecked: new Date().toISOString(),
|
|
3019
3132
|
});
|
|
3020
3133
|
}
|
|
3134
|
+
// Add Ollama status if configured
|
|
3135
|
+
if (_activeOllamaConfig && _activeOllamaConfig.enabled !== false) {
|
|
3136
|
+
const ollamaStats = providerStats['ollama'];
|
|
3137
|
+
const ollamaSuccessRate = ollamaStats && ollamaStats.total > 0 ? ollamaStats.success / ollamaStats.total : 0;
|
|
3138
|
+
const ollamaHealth = await (0, ollama_js_1.checkOllamaHealthCached)(_activeOllamaConfig.baseUrl);
|
|
3139
|
+
providers.push({
|
|
3140
|
+
provider: 'ollama',
|
|
3141
|
+
status: ollamaHealth.available ? 'healthy' : 'down',
|
|
3142
|
+
latency: ollamaHealth.responseTimeMs ?? 0,
|
|
3143
|
+
successRate: ollamaHealth.available ? (ollamaSuccessRate || 1) : 0,
|
|
3144
|
+
lastChecked: new Date().toISOString(),
|
|
3145
|
+
});
|
|
3146
|
+
}
|
|
3021
3147
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
3022
3148
|
res.end(JSON.stringify({ providers }));
|
|
3023
3149
|
return;
|
|
@@ -3095,6 +3221,21 @@ async function startProxy(config = {}) {
|
|
|
3095
3221
|
return;
|
|
3096
3222
|
}
|
|
3097
3223
|
// === Mesh stats endpoint ===
|
|
3224
|
+
// === Ollama status endpoint ===
|
|
3225
|
+
if (req.method === 'GET' && pathname === '/v1/ollama/status') {
|
|
3226
|
+
const ollamaBaseUrl = _activeOllamaConfig?.baseUrl ?? ollama_js_1.OLLAMA_DEFAULTS.baseUrl;
|
|
3227
|
+
const health = await (0, ollama_js_1.checkOllamaHealthCached)(ollamaBaseUrl);
|
|
3228
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
3229
|
+
res.end(JSON.stringify({
|
|
3230
|
+
configured: !!_activeOllamaConfig,
|
|
3231
|
+
enabled: _activeOllamaConfig?.enabled !== false,
|
|
3232
|
+
baseUrl: ollamaBaseUrl,
|
|
3233
|
+
health,
|
|
3234
|
+
routeWhen: _activeOllamaConfig?.routeWhen ?? null,
|
|
3235
|
+
configuredModels: _activeOllamaConfig?.models ?? [],
|
|
3236
|
+
}));
|
|
3237
|
+
return;
|
|
3238
|
+
}
|
|
3098
3239
|
if (req.method === 'GET' && pathname === '/v1/mesh/stats') {
|
|
3099
3240
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
3100
3241
|
res.end(JSON.stringify(meshHandle.getStats()));
|
|
@@ -3432,20 +3573,25 @@ async function startProxy(config = {}) {
|
|
|
3432
3573
|
// ── End budget check ──
|
|
3433
3574
|
// ── Rate limit check ──
|
|
3434
3575
|
const workspaceId = 'local'; // Local proxy uses single workspace
|
|
3435
|
-
|
|
3436
|
-
|
|
3437
|
-
|
|
3576
|
+
try {
|
|
3577
|
+
// Pass targetProvider so per-provider limits are applied and limits don't
|
|
3578
|
+
// cascade across providers (e.g. Anthropic hitting its cap won't block OpenAI).
|
|
3579
|
+
await (0, rate_limiter_js_1.acquireSlot)(workspaceId, targetModel, targetProvider);
|
|
3580
|
+
}
|
|
3581
|
+
catch (err) {
|
|
3582
|
+
const rlErr = err;
|
|
3583
|
+
console.error(`[RATE LIMIT] ${targetModel}: ${rlErr.message}`);
|
|
3438
3584
|
res.writeHead(429, {
|
|
3439
3585
|
'Content-Type': 'application/json',
|
|
3440
|
-
'Retry-After': String(
|
|
3441
|
-
'X-RelayPlane-RateLimit-Limit': String(
|
|
3586
|
+
'Retry-After': String(rlErr.retryAfter ?? 60),
|
|
3587
|
+
'X-RelayPlane-RateLimit-Limit': String(rlErr.limit),
|
|
3442
3588
|
'X-RelayPlane-RateLimit-Remaining': '0',
|
|
3443
|
-
'X-RelayPlane-RateLimit-Reset': String(Math.ceil(
|
|
3589
|
+
'X-RelayPlane-RateLimit-Reset': String(Math.ceil(rlErr.resetAt / 1000)),
|
|
3444
3590
|
});
|
|
3445
3591
|
res.end(JSON.stringify({
|
|
3446
|
-
error:
|
|
3592
|
+
error: rlErr.message,
|
|
3447
3593
|
type: 'rate_limit_exceeded',
|
|
3448
|
-
retry_after:
|
|
3594
|
+
retry_after: rlErr.retryAfter ?? 60,
|
|
3449
3595
|
}));
|
|
3450
3596
|
return;
|
|
3451
3597
|
}
|
|
@@ -3510,6 +3656,46 @@ async function startProxy(config = {}) {
|
|
|
3510
3656
|
if (proxyConfig.reliability?.cooldowns?.enabled) {
|
|
3511
3657
|
cooldownManager.recordFailure(targetProvider, JSON.stringify(errorPayload));
|
|
3512
3658
|
}
|
|
3659
|
+
// ── Cross-provider cascade for /v1/messages path (GH #38) ──
|
|
3660
|
+
if (!isStreaming &&
|
|
3661
|
+
cross_provider_cascade_js_1.crossProviderCascade.enabled &&
|
|
3662
|
+
cross_provider_cascade_js_1.crossProviderCascade.shouldCascade(providerResponse.status)) {
|
|
3663
|
+
const { result: cascResult, data: cascData } = await cross_provider_cascade_js_1.crossProviderCascade.execute(targetProvider, targetModel || requestedModel, providerResponse.status, async (hop) => {
|
|
3664
|
+
const apiKeyResult = resolveProviderApiKey(hop.provider, ctx, useAnthropicEnvKey);
|
|
3665
|
+
if (apiKeyResult.error) {
|
|
3666
|
+
return { status: apiKeyResult.error.status, data: apiKeyResult.error.payload };
|
|
3667
|
+
}
|
|
3668
|
+
// Respect per-provider rate limits before attempting the hop
|
|
3669
|
+
try {
|
|
3670
|
+
await (0, rate_limiter_js_1.acquireSlot)('local', hop.model, hop.provider);
|
|
3671
|
+
}
|
|
3672
|
+
catch {
|
|
3673
|
+
return { status: 429, data: { error: `Local rate limit for ${hop.provider}` } };
|
|
3674
|
+
}
|
|
3675
|
+
// Convert native Anthropic body to ChatRequest for OpenAI-compatible providers
|
|
3676
|
+
const chatReq = convertNativeAnthropicBodyToChatRequest(requestBody, hop.model);
|
|
3677
|
+
const hopResult = await executeNonStreamingProviderRequest(chatReq, hop.provider, hop.model, apiKeyResult.apiKey, ctx);
|
|
3678
|
+
return { status: hopResult.status, data: hopResult.responseData };
|
|
3679
|
+
}, log);
|
|
3680
|
+
if (cascResult.success && cascData) {
|
|
3681
|
+
// Cascade succeeded — update provider/model and respond
|
|
3682
|
+
const cascDurationMs = Date.now() - startTime;
|
|
3683
|
+
const cascProvider = cascResult.provider;
|
|
3684
|
+
const cascModel = cascResult.model;
|
|
3685
|
+
logRequest(originalModel ?? 'unknown', cascModel, cascProvider, cascDurationMs, true, `${routingMode}+cross-cascade`, undefined, taskType, complexity);
|
|
3686
|
+
const cascRpHeaders = buildRelayPlaneResponseHeaders(cascModel, originalModel ?? 'unknown', complexity, cascProvider, `${routingMode}+cross-cascade`);
|
|
3687
|
+
res.writeHead(200, {
|
|
3688
|
+
'Content-Type': 'application/json',
|
|
3689
|
+
'X-RelayPlane-Cascade-Provider': cascProvider,
|
|
3690
|
+
'X-RelayPlane-Cascade-Model': cascModel,
|
|
3691
|
+
...cascRpHeaders,
|
|
3692
|
+
});
|
|
3693
|
+
res.end(JSON.stringify(cascData));
|
|
3694
|
+
return;
|
|
3695
|
+
}
|
|
3696
|
+
// All fallbacks exhausted — fall through to original error response
|
|
3697
|
+
}
|
|
3698
|
+
// ── End cross-provider cascade ──
|
|
3513
3699
|
const durationMs = Date.now() - startTime;
|
|
3514
3700
|
const errMsg = extractProviderErrorMessage(errorPayload, providerResponse.status);
|
|
3515
3701
|
logRequest(originalModel ?? 'unknown', targetModel || requestedModel, targetProvider, durationMs, false, routingMode, undefined, taskType, complexity, undefined, undefined, errMsg, providerResponse.status);
|
|
@@ -4011,6 +4197,21 @@ async function startProxy(config = {}) {
|
|
|
4011
4197
|
targetModel = defaultRoute.model;
|
|
4012
4198
|
}
|
|
4013
4199
|
}
|
|
4200
|
+
// ── Ollama routing: intercept before cloud dispatch ──
|
|
4201
|
+
if (!useCascade && _activeOllamaConfig && _activeOllamaConfig.enabled !== false) {
|
|
4202
|
+
if (targetProvider === 'ollama' || (0, ollama_js_1.shouldRouteToOllama)(_activeOllamaConfig, complexity, taskType, request.model)) {
|
|
4203
|
+
// Check Ollama availability before routing
|
|
4204
|
+
const ollamaHealth = await (0, ollama_js_1.checkOllamaHealthCached)(_activeOllamaConfig.baseUrl);
|
|
4205
|
+
if (ollamaHealth.available) {
|
|
4206
|
+
targetProvider = 'ollama';
|
|
4207
|
+
targetModel = (0, ollama_js_1.resolveOllamaModel)(targetModel, _activeOllamaConfig);
|
|
4208
|
+
log(`Ollama routing: ${complexity}/${taskType} → ollama/${targetModel}`);
|
|
4209
|
+
}
|
|
4210
|
+
else {
|
|
4211
|
+
log(`Ollama unavailable (${ollamaHealth.error}), falling back to cloud provider`);
|
|
4212
|
+
}
|
|
4213
|
+
}
|
|
4214
|
+
}
|
|
4014
4215
|
if (!useCascade) {
|
|
4015
4216
|
log(`Routing to: ${targetProvider}/${targetModel}`);
|
|
4016
4217
|
}
|
|
@@ -4053,20 +4254,24 @@ async function startProxy(config = {}) {
|
|
|
4053
4254
|
// ── End budget check ──
|
|
4054
4255
|
// ── Rate limit check ──
|
|
4055
4256
|
const chatWorkspaceId = 'local'; // Local proxy uses single workspace
|
|
4056
|
-
|
|
4057
|
-
|
|
4058
|
-
|
|
4257
|
+
try {
|
|
4258
|
+
// Pass targetProvider so per-provider limits apply and don't cascade across providers.
|
|
4259
|
+
await (0, rate_limiter_js_1.acquireSlot)(chatWorkspaceId, targetModel, targetProvider);
|
|
4260
|
+
}
|
|
4261
|
+
catch (err) {
|
|
4262
|
+
const chatRlErr = err;
|
|
4263
|
+
console.error(`[RATE LIMIT] ${targetModel}: ${chatRlErr.message}`);
|
|
4059
4264
|
res.writeHead(429, {
|
|
4060
4265
|
'Content-Type': 'application/json',
|
|
4061
|
-
'Retry-After': String(
|
|
4062
|
-
'X-RelayPlane-RateLimit-Limit': String(
|
|
4266
|
+
'Retry-After': String(chatRlErr.retryAfter ?? 60),
|
|
4267
|
+
'X-RelayPlane-RateLimit-Limit': String(chatRlErr.limit),
|
|
4063
4268
|
'X-RelayPlane-RateLimit-Remaining': '0',
|
|
4064
|
-
'X-RelayPlane-RateLimit-Reset': String(Math.ceil(
|
|
4269
|
+
'X-RelayPlane-RateLimit-Reset': String(Math.ceil(chatRlErr.resetAt / 1000)),
|
|
4065
4270
|
});
|
|
4066
4271
|
res.end(JSON.stringify({
|
|
4067
|
-
error:
|
|
4272
|
+
error: chatRlErr.message,
|
|
4068
4273
|
type: 'rate_limit_exceeded',
|
|
4069
|
-
retry_after:
|
|
4274
|
+
retry_after: chatRlErr.retryAfter ?? 60,
|
|
4070
4275
|
}));
|
|
4071
4276
|
return;
|
|
4072
4277
|
}
|
|
@@ -4175,7 +4380,7 @@ async function startProxy(config = {}) {
|
|
|
4175
4380
|
}
|
|
4176
4381
|
}
|
|
4177
4382
|
else {
|
|
4178
|
-
await handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatAgentFingerprint, chatExplicitAgentId);
|
|
4383
|
+
await handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatAgentFingerprint, chatExplicitAgentId, useAnthropicEnvKey);
|
|
4179
4384
|
}
|
|
4180
4385
|
}
|
|
4181
4386
|
});
|
|
@@ -4308,6 +4513,24 @@ async function executeNonStreamingProviderRequest(request, targetProvider, targe
|
|
|
4308
4513
|
}
|
|
4309
4514
|
break;
|
|
4310
4515
|
}
|
|
4516
|
+
case 'ollama': {
|
|
4517
|
+
const ollamaResult = await (0, ollama_js_1.forwardToOllama)(targetModel, request.messages, {
|
|
4518
|
+
temperature: request.temperature,
|
|
4519
|
+
max_tokens: request.max_tokens,
|
|
4520
|
+
tools: request.tools,
|
|
4521
|
+
baseUrl: _activeOllamaConfig?.baseUrl,
|
|
4522
|
+
timeoutMs: _activeOllamaConfig?.timeoutMs,
|
|
4523
|
+
});
|
|
4524
|
+
if (!ollamaResult.success) {
|
|
4525
|
+
return {
|
|
4526
|
+
responseData: { error: ollamaResult.error },
|
|
4527
|
+
ok: false,
|
|
4528
|
+
status: ollamaResult.error?.status ?? 502,
|
|
4529
|
+
};
|
|
4530
|
+
}
|
|
4531
|
+
responseData = ollamaResult.data;
|
|
4532
|
+
break;
|
|
4533
|
+
}
|
|
4311
4534
|
default: {
|
|
4312
4535
|
providerResponse = await forwardToOpenAI(request, targetModel, apiKey);
|
|
4313
4536
|
responseData = (await providerResponse.json());
|
|
@@ -4337,6 +4560,44 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
|
|
|
4337
4560
|
case 'groq':
|
|
4338
4561
|
providerResponse = await forwardToOpenAICompatibleStream(request, targetModel, apiKey);
|
|
4339
4562
|
break;
|
|
4563
|
+
case 'ollama': {
|
|
4564
|
+
// Ollama streaming uses its own handler that converts NDJSON → SSE
|
|
4565
|
+
const ollamaStream = await (0, ollama_js_1.forwardToOllamaStream)(targetModel, request.messages, {
|
|
4566
|
+
temperature: request.temperature,
|
|
4567
|
+
max_tokens: request.max_tokens,
|
|
4568
|
+
tools: request.tools,
|
|
4569
|
+
baseUrl: _activeOllamaConfig?.baseUrl,
|
|
4570
|
+
timeoutMs: _activeOllamaConfig?.timeoutMs,
|
|
4571
|
+
});
|
|
4572
|
+
if (!ollamaStream.success || !ollamaStream.stream) {
|
|
4573
|
+
const durationMs = Date.now() - startTime;
|
|
4574
|
+
const errMsg = ollamaStream.error?.message ?? 'Ollama stream failed';
|
|
4575
|
+
logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, false, routingMode, undefined, taskType, complexity, undefined, undefined, errMsg, ollamaStream.error?.status);
|
|
4576
|
+
res.writeHead(ollamaStream.error?.status ?? 502, { 'Content-Type': 'application/json' });
|
|
4577
|
+
res.end(JSON.stringify({ error: ollamaStream.error }));
|
|
4578
|
+
return;
|
|
4579
|
+
}
|
|
4580
|
+
// Write SSE headers and pipe converted stream
|
|
4581
|
+
const relayHeaders = buildRelayPlaneResponseHeaders(targetModel, request.model, complexity, 'ollama', routingMode);
|
|
4582
|
+
res.writeHead(200, {
|
|
4583
|
+
'Content-Type': 'text/event-stream',
|
|
4584
|
+
'Cache-Control': 'no-cache',
|
|
4585
|
+
'Connection': 'keep-alive',
|
|
4586
|
+
...relayHeaders,
|
|
4587
|
+
});
|
|
4588
|
+
for await (const chunk of ollamaStream.stream) {
|
|
4589
|
+
res.write(chunk);
|
|
4590
|
+
}
|
|
4591
|
+
const durationMs = Date.now() - startTime;
|
|
4592
|
+
logRequest(request.model ?? 'unknown', targetModel, 'ollama', durationMs, true, routingMode, false, taskType, complexity, agentFingerprint, agentId);
|
|
4593
|
+
updateLastHistoryEntry(0, 0, 0, targetModel, undefined, undefined, agentFingerprint, agentId);
|
|
4594
|
+
if (recordTelemetry) {
|
|
4595
|
+
sendCloudTelemetry(taskType, targetModel, 0, 0, durationMs, true, 0, request.model ?? undefined);
|
|
4596
|
+
meshCapture(targetModel, 'ollama', taskType, 0, 0, 0, durationMs, true);
|
|
4597
|
+
}
|
|
4598
|
+
res.end();
|
|
4599
|
+
return;
|
|
4600
|
+
}
|
|
4340
4601
|
default:
|
|
4341
4602
|
providerResponse = await forwardToOpenAIStream(request, targetModel, apiKey);
|
|
4342
4603
|
}
|
|
@@ -4524,7 +4785,9 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
|
|
|
4524
4785
|
/**
|
|
4525
4786
|
* Handle non-streaming request
|
|
4526
4787
|
*/
|
|
4527
|
-
async function handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', agentFingerprint, agentId
|
|
4788
|
+
async function handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', agentFingerprint, agentId,
|
|
4789
|
+
/** Anthropic env API key — required for cross-provider cascade API key resolution (GH #38) */
|
|
4790
|
+
anthropicEnvKeyForCascade) {
|
|
4528
4791
|
let responseData;
|
|
4529
4792
|
try {
|
|
4530
4793
|
const result = await executeNonStreamingProviderRequest(request, targetProvider, targetModel, apiKey, ctx);
|
|
@@ -4533,16 +4796,59 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
|
|
|
4533
4796
|
if (cooldownsEnabled) {
|
|
4534
4797
|
cooldownManager.recordFailure(targetProvider, JSON.stringify(responseData));
|
|
4535
4798
|
}
|
|
4536
|
-
|
|
4537
|
-
|
|
4538
|
-
|
|
4539
|
-
|
|
4540
|
-
|
|
4541
|
-
|
|
4799
|
+
// ── Cross-provider cascade (GH #38) ──
|
|
4800
|
+
if (cross_provider_cascade_js_1.crossProviderCascade.enabled && cross_provider_cascade_js_1.crossProviderCascade.shouldCascade(result.status)) {
|
|
4801
|
+
const { result: cascResult, data: cascData } = await cross_provider_cascade_js_1.crossProviderCascade.execute(targetProvider, targetModel, result.status, async (hop) => {
|
|
4802
|
+
const apiKeyResult = resolveProviderApiKey(hop.provider, ctx, anthropicEnvKeyForCascade);
|
|
4803
|
+
if (apiKeyResult.error) {
|
|
4804
|
+
return { status: apiKeyResult.error.status, data: apiKeyResult.error.payload };
|
|
4805
|
+
}
|
|
4806
|
+
// Respect per-provider rate limits before attempting the hop
|
|
4807
|
+
try {
|
|
4808
|
+
await (0, rate_limiter_js_1.acquireSlot)('local', hop.model, hop.provider);
|
|
4809
|
+
}
|
|
4810
|
+
catch {
|
|
4811
|
+
// Rate-limited locally — treat as 429 so cascade continues
|
|
4812
|
+
return { status: 429, data: { error: `Local rate limit for ${hop.provider}` } };
|
|
4813
|
+
}
|
|
4814
|
+
const hopResult = await executeNonStreamingProviderRequest({ ...request, model: hop.model }, hop.provider, hop.model, apiKeyResult.apiKey, ctx);
|
|
4815
|
+
return { status: hopResult.status, data: hopResult.responseData };
|
|
4816
|
+
}, log);
|
|
4817
|
+
if (cascResult.success && cascData) {
|
|
4818
|
+
// Update tracking variables to reflect the actual provider/model used
|
|
4819
|
+
targetProvider = cascResult.provider;
|
|
4820
|
+
targetModel = cascResult.model;
|
|
4821
|
+
responseData = cascData;
|
|
4822
|
+
// Fall through to success handling below (don't return early)
|
|
4823
|
+
}
|
|
4824
|
+
else {
|
|
4825
|
+
// All fallbacks exhausted — return the primary error
|
|
4826
|
+
const durationMs = Date.now() - startTime;
|
|
4827
|
+
const nsErrMsg = extractProviderErrorMessage(responseData, result.status);
|
|
4828
|
+
logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, false, `${routingMode}+cascade`, undefined, taskType, complexity, undefined, undefined, nsErrMsg, result.status);
|
|
4829
|
+
if (recordTelemetry) {
|
|
4830
|
+
sendCloudTelemetry(taskType, targetModel, 0, 0, durationMs, false, 0, request.model ?? undefined);
|
|
4831
|
+
meshCapture(targetModel, targetProvider, taskType, 0, 0, 0, durationMs, false, nsErrMsg);
|
|
4832
|
+
}
|
|
4833
|
+
res.writeHead(result.status, { 'Content-Type': 'application/json' });
|
|
4834
|
+
res.end(JSON.stringify(responseData));
|
|
4835
|
+
return;
|
|
4836
|
+
}
|
|
4542
4837
|
}
|
|
4543
|
-
|
|
4544
|
-
|
|
4545
|
-
|
|
4838
|
+
else {
|
|
4839
|
+
// No cascade — return error as-is
|
|
4840
|
+
const durationMs = Date.now() - startTime;
|
|
4841
|
+
const nsErrMsg = extractProviderErrorMessage(responseData, result.status);
|
|
4842
|
+
logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, false, routingMode, undefined, taskType, complexity, undefined, undefined, nsErrMsg, result.status);
|
|
4843
|
+
if (recordTelemetry) {
|
|
4844
|
+
sendCloudTelemetry(taskType, targetModel, 0, 0, durationMs, false, 0, request.model ?? undefined);
|
|
4845
|
+
meshCapture(targetModel, targetProvider, taskType, 0, 0, 0, durationMs, false, nsErrMsg);
|
|
4846
|
+
}
|
|
4847
|
+
res.writeHead(result.status, { 'Content-Type': 'application/json' });
|
|
4848
|
+
res.end(JSON.stringify(responseData));
|
|
4849
|
+
return;
|
|
4850
|
+
}
|
|
4851
|
+
// ── End cross-provider cascade ──
|
|
4546
4852
|
}
|
|
4547
4853
|
}
|
|
4548
4854
|
catch (err) {
|