@relayplane/proxy 1.5.46 → 1.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +297 -20
- package/assets/relayplane-proxy.service +20 -0
- package/dist/alerts.d.ts +72 -0
- package/dist/alerts.d.ts.map +1 -0
- package/dist/alerts.js +290 -0
- package/dist/alerts.js.map +1 -0
- package/dist/anomaly.d.ts +65 -0
- package/dist/anomaly.d.ts.map +1 -0
- package/dist/anomaly.js +193 -0
- package/dist/anomaly.js.map +1 -0
- package/dist/budget.d.ts +98 -0
- package/dist/budget.d.ts.map +1 -0
- package/dist/budget.js +356 -0
- package/dist/budget.js.map +1 -0
- package/dist/cli.js +512 -93
- package/dist/cli.js.map +1 -1
- package/dist/config.d.ts +28 -2
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +122 -24
- package/dist/config.js.map +1 -1
- package/dist/downgrade.d.ts +37 -0
- package/dist/downgrade.d.ts.map +1 -0
- package/dist/downgrade.js +79 -0
- package/dist/downgrade.js.map +1 -0
- package/dist/mesh/capture.d.ts +11 -0
- package/dist/mesh/capture.d.ts.map +1 -0
- package/dist/mesh/capture.js +43 -0
- package/dist/mesh/capture.js.map +1 -0
- package/dist/mesh/fitness.d.ts +14 -0
- package/dist/mesh/fitness.d.ts.map +1 -0
- package/dist/mesh/fitness.js +40 -0
- package/dist/mesh/fitness.js.map +1 -0
- package/dist/mesh/index.d.ts +39 -0
- package/dist/mesh/index.d.ts.map +1 -0
- package/dist/mesh/index.js +118 -0
- package/dist/mesh/index.js.map +1 -0
- package/dist/mesh/store.d.ts +30 -0
- package/dist/mesh/store.d.ts.map +1 -0
- package/dist/mesh/store.js +174 -0
- package/dist/mesh/store.js.map +1 -0
- package/dist/mesh/sync.d.ts +37 -0
- package/dist/mesh/sync.d.ts.map +1 -0
- package/dist/mesh/sync.js +154 -0
- package/dist/mesh/sync.js.map +1 -0
- package/dist/mesh/types.d.ts +57 -0
- package/dist/mesh/types.d.ts.map +1 -0
- package/dist/mesh/types.js +7 -0
- package/dist/mesh/types.js.map +1 -0
- package/dist/rate-limiter.d.ts +64 -0
- package/dist/rate-limiter.d.ts.map +1 -0
- package/dist/rate-limiter.js +159 -0
- package/dist/rate-limiter.js.map +1 -0
- package/dist/relay-config.d.ts +9 -0
- package/dist/relay-config.d.ts.map +1 -1
- package/dist/relay-config.js +2 -0
- package/dist/relay-config.js.map +1 -1
- package/dist/response-cache.d.ts +139 -0
- package/dist/response-cache.d.ts.map +1 -0
- package/dist/response-cache.js +515 -0
- package/dist/response-cache.js.map +1 -0
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +5 -1
- package/dist/server.js.map +1 -1
- package/dist/standalone-proxy.d.ts +2 -1
- package/dist/standalone-proxy.d.ts.map +1 -1
- package/dist/standalone-proxy.js +736 -50
- package/dist/standalone-proxy.js.map +1 -1
- package/dist/telemetry.d.ts.map +1 -1
- package/dist/telemetry.js +21 -5
- package/dist/telemetry.js.map +1 -1
- package/dist/utils/model-suggestions.d.ts.map +1 -1
- package/dist/utils/model-suggestions.js +19 -2
- package/dist/utils/model-suggestions.js.map +1 -1
- package/dist/utils/version-status.d.ts +9 -0
- package/dist/utils/version-status.d.ts.map +1 -0
- package/dist/utils/version-status.js +28 -0
- package/dist/utils/version-status.js.map +1 -0
- package/package.json +7 -3
package/dist/standalone-proxy.js
CHANGED
|
@@ -67,7 +67,16 @@ const path = __importStar(require("node:path"));
|
|
|
67
67
|
const core_1 = require("@relayplane/core");
|
|
68
68
|
const model_suggestions_js_1 = require("./utils/model-suggestions.js");
|
|
69
69
|
const telemetry_js_1 = require("./telemetry.js");
|
|
70
|
+
const config_js_1 = require("./config.js");
|
|
71
|
+
const index_js_1 = require("./mesh/index.js");
|
|
72
|
+
const response_cache_js_1 = require("./response-cache.js");
|
|
70
73
|
const stats_js_1 = require("./stats.js");
|
|
74
|
+
const rate_limiter_js_1 = require("./rate-limiter.js");
|
|
75
|
+
const budget_js_1 = require("./budget.js");
|
|
76
|
+
const anomaly_js_1 = require("./anomaly.js");
|
|
77
|
+
const alerts_js_1 = require("./alerts.js");
|
|
78
|
+
const downgrade_js_1 = require("./downgrade.js");
|
|
79
|
+
const version_status_js_1 = require("./utils/version-status.js");
|
|
71
80
|
const PROXY_VERSION = (() => {
|
|
72
81
|
try {
|
|
73
82
|
const pkgPath = path.join(__dirname, '..', 'package.json');
|
|
@@ -77,8 +86,54 @@ const PROXY_VERSION = (() => {
|
|
|
77
86
|
return '0.0.0';
|
|
78
87
|
}
|
|
79
88
|
})();
|
|
89
|
+
/**
 * Result of the most recent npm registry lookup.
 * `value` is the latest published version string, or null when the lookup failed;
 * `checkedAt` is the Date.now() timestamp of that lookup (0 means "never checked").
 */
let latestProxyVersionCache = { value: null, checkedAt: 0 };
/** How long a lookup result (success or failure) is reused before asking the registry again. */
const LATEST_PROXY_VERSION_TTL_MS = 30 * 60 * 1000;
/**
 * Resolve the latest published version of @relayplane/proxy from the npm registry.
 *
 * The result is cached for LATEST_PROXY_VERSION_TTL_MS. Failures are cached too,
 * so an unreachable registry is not hit again on every call.
 *
 * @returns {Promise<string|null>} The latest version string, or null when the
 *   registry could not be reached, answered with a non-2xx status, timed out,
 *   or returned a payload without a string `version` field.
 */
async function getLatestProxyVersion() {
    const now = Date.now();
    if (now - latestProxyVersionCache.checkedAt < LATEST_PROXY_VERSION_TTL_MS) {
        return latestProxyVersionCache.value;
    }
    // Upper bound for the whole round trip (headers and body), in milliseconds.
    const timeoutMs = 2500;
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), timeoutMs);
    let latest = null;
    try {
        const res = await fetch('https://registry.npmjs.org/@relayplane/proxy/latest', {
            signal: controller.signal,
            headers: { Accept: 'application/json' },
        });
        if (res.ok) {
            const data = await res.json();
            // Only accept a real version string; anything else counts as "unknown".
            latest = typeof data?.version === 'string' ? data.version : null;
        }
    }
    catch {
        // Network error, abort, or malformed JSON: report "unknown" instead of throwing.
        latest = null;
    }
    finally {
        // Always release the timer, including when fetch rejects. Clearing it here
        // (rather than right after the headers arrive) also keeps the abort
        // deadline active while the response body is being read.
        clearTimeout(timeout);
    }
    latestProxyVersionCache = { value: latest, checkedAt: now };
    return latest;
}
|
|
80
118
|
/** Shared stats collector instance for the proxy server */
|
|
81
119
|
exports.proxyStatsCollector = new stats_js_1.StatsCollector();
|
|
120
|
+
/** Mesh handle shared across the module; stays null until startProxy() assigns it. */
let _meshHandle = null;
/**
 * Forward one request's metrics to the mesh layer.
 *
 * Best-effort by design: does nothing while no mesh handle is set, and any
 * error raised by the mesh layer is swallowed so the caller is never affected.
 */
function meshCapture(model, provider, taskType, tokensIn, tokensOut, costUsd, latencyMs, success, errorType) {
    if (_meshHandle) {
        try {
            const record = {
                model: model,
                provider: provider,
                task_type: taskType,
                input_tokens: tokensIn,
                output_tokens: tokensOut,
                cost_usd: costUsd,
                latency_ms: latencyMs,
                success: success,
                error_type: errorType,
                timestamp: new Date().toISOString(),
            };
            _meshHandle.captureRequest(record);
        }
        catch {
            // Intentionally ignored: mesh capture must never break request handling.
        }
    }
}
|
|
82
137
|
/**
|
|
83
138
|
* Default provider endpoints
|
|
84
139
|
*/
|
|
@@ -171,10 +226,10 @@ exports.SMART_ALIASES = {
|
|
|
171
226
|
* Send a telemetry event to the cloud (anonymous or authenticated).
|
|
172
227
|
* Non-blocking — errors are silently swallowed.
|
|
173
228
|
*/
|
|
174
|
-
function sendCloudTelemetry(taskType, model, tokensIn, tokensOut, latencyMs, success, costUsd, requestedModel) {
|
|
229
|
+
function sendCloudTelemetry(taskType, model, tokensIn, tokensOut, latencyMs, success, costUsd, requestedModel, cacheCreationTokens, cacheReadTokens) {
|
|
175
230
|
try {
|
|
176
|
-
const cost = costUsd ?? (0, telemetry_js_1.estimateCost)(model, tokensIn, tokensOut);
|
|
177
|
-
|
|
231
|
+
const cost = costUsd ?? (0, telemetry_js_1.estimateCost)(model, tokensIn, tokensOut, cacheCreationTokens, cacheReadTokens);
|
|
232
|
+
const event = {
|
|
178
233
|
task_type: taskType,
|
|
179
234
|
model,
|
|
180
235
|
tokens_in: tokensIn,
|
|
@@ -183,7 +238,21 @@ function sendCloudTelemetry(taskType, model, tokensIn, tokensOut, latencyMs, suc
|
|
|
183
238
|
success,
|
|
184
239
|
cost_usd: cost,
|
|
185
240
|
requested_model: requestedModel,
|
|
186
|
-
|
|
241
|
+
cache_creation_tokens: cacheCreationTokens,
|
|
242
|
+
cache_read_tokens: cacheReadTokens,
|
|
243
|
+
};
|
|
244
|
+
// Record locally (writes to telemetry.jsonl + queues upload if telemetry_enabled)
|
|
245
|
+
(0, telemetry_js_1.recordTelemetry)(event);
|
|
246
|
+
// Ensure cloud upload even if local telemetry_enabled is false
|
|
247
|
+
// recordCloudTelemetry skips queueForUpload when telemetry is disabled,
|
|
248
|
+
// but cloud dashboard needs these events regardless of local config
|
|
249
|
+
if (!(0, config_js_1.isTelemetryEnabled)()) {
|
|
250
|
+
(0, telemetry_js_1.queueForUpload)({
|
|
251
|
+
...event,
|
|
252
|
+
device_id: (0, config_js_1.getDeviceId)(),
|
|
253
|
+
timestamp: new Date().toISOString(),
|
|
254
|
+
});
|
|
255
|
+
}
|
|
187
256
|
}
|
|
188
257
|
catch {
|
|
189
258
|
// Telemetry should never break the proxy
|
|
@@ -220,15 +289,15 @@ function resolveModelAlias(model) {
|
|
|
220
289
|
* Uses Claude Sonnet 4.6 for every task type by default; upgrades are based on learned rules
|
|
221
290
|
*/
|
|
222
291
|
const DEFAULT_ROUTING = {
|
|
223
|
-
code_generation: { provider: 'anthropic', model: 'claude-
|
|
224
|
-
code_review: { provider: 'anthropic', model: 'claude-
|
|
225
|
-
summarization: { provider: 'anthropic', model: 'claude-
|
|
226
|
-
analysis: { provider: 'anthropic', model: 'claude-
|
|
227
|
-
creative_writing: { provider: 'anthropic', model: 'claude-
|
|
228
|
-
data_extraction: { provider: 'anthropic', model: 'claude-
|
|
229
|
-
translation: { provider: 'anthropic', model: 'claude-
|
|
230
|
-
question_answering: { provider: 'anthropic', model: 'claude-
|
|
231
|
-
general: { provider: 'anthropic', model: 'claude-
|
|
292
|
+
code_generation: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
|
|
293
|
+
code_review: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
|
|
294
|
+
summarization: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
|
|
295
|
+
analysis: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
|
|
296
|
+
creative_writing: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
|
|
297
|
+
data_extraction: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
|
|
298
|
+
translation: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
|
|
299
|
+
question_answering: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
|
|
300
|
+
general: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
|
|
232
301
|
};
|
|
233
302
|
const UNCERTAINTY_PATTERNS = [
|
|
234
303
|
/i'?m not (entirely |completely |really )?sure/i,
|
|
@@ -470,7 +539,7 @@ function logRequest(originalModel, targetModel, provider, latencyMs, success, mo
|
|
|
470
539
|
bufferHistoryEntry(entry);
|
|
471
540
|
}
|
|
472
541
|
/** Update the most recent history entry with token/cost info */
|
|
473
|
-
function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel) {
|
|
542
|
+
function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel, cacheCreationTokens, cacheReadTokens) {
|
|
474
543
|
if (requestHistory.length > 0) {
|
|
475
544
|
const last = requestHistory[requestHistory.length - 1];
|
|
476
545
|
last.tokensIn = tokensIn;
|
|
@@ -479,6 +548,10 @@ function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel) {
|
|
|
479
548
|
if (responseModel) {
|
|
480
549
|
last.responseModel = responseModel;
|
|
481
550
|
}
|
|
551
|
+
if (cacheCreationTokens !== undefined)
|
|
552
|
+
last.cacheCreationTokens = cacheCreationTokens;
|
|
553
|
+
if (cacheReadTokens !== undefined)
|
|
554
|
+
last.cacheReadTokens = cacheReadTokens;
|
|
482
555
|
}
|
|
483
556
|
}
|
|
484
557
|
const DEFAULT_PROXY_CONFIG = {
|
|
@@ -489,7 +562,6 @@ const DEFAULT_PROXY_CONFIG = {
|
|
|
489
562
|
cascade: {
|
|
490
563
|
enabled: true,
|
|
491
564
|
models: [
|
|
492
|
-
'claude-haiku-4-5',
|
|
493
565
|
'claude-sonnet-4-6',
|
|
494
566
|
'claude-opus-4-6',
|
|
495
567
|
],
|
|
@@ -498,7 +570,7 @@ const DEFAULT_PROXY_CONFIG = {
|
|
|
498
570
|
},
|
|
499
571
|
complexity: {
|
|
500
572
|
enabled: true,
|
|
501
|
-
simple: 'claude-
|
|
573
|
+
simple: 'claude-sonnet-4-6',
|
|
502
574
|
moderate: 'claude-sonnet-4-6',
|
|
503
575
|
complex: 'claude-opus-4-6',
|
|
504
576
|
},
|
|
@@ -705,6 +777,23 @@ function classifyComplexity(messages) {
|
|
|
705
777
|
score += 1;
|
|
706
778
|
if (andCount >= 5)
|
|
707
779
|
score += 1;
|
|
780
|
+
// Calculate total tokens across ALL messages, not just last user message.
|
|
781
|
+
// For agent workloads (OpenClaw, aider, Claude Code) the last user message is
|
|
782
|
+
// often tiny while the real complexity lives in the 100K+ token context.
|
|
783
|
+
const allText = extractMessageText(messages);
|
|
784
|
+
const totalTokens = Math.ceil(allText.length / 4);
|
|
785
|
+
// Context size floor — use as a hard signal regardless of last-message score
|
|
786
|
+
if (totalTokens > 100000)
|
|
787
|
+
score += 5; // definitely complex
|
|
788
|
+
else if (totalTokens > 50000)
|
|
789
|
+
score += 3; // likely moderate+
|
|
790
|
+
else if (totalTokens > 20000)
|
|
791
|
+
score += 2;
|
|
792
|
+
// Message count signal — long conversations imply multi-step reasoning
|
|
793
|
+
if (messages.length > 50)
|
|
794
|
+
score += 2;
|
|
795
|
+
else if (messages.length > 20)
|
|
796
|
+
score += 1;
|
|
708
797
|
if (score >= 4)
|
|
709
798
|
return 'complex';
|
|
710
799
|
if (score >= 2)
|
|
@@ -1437,11 +1526,13 @@ function convertAnthropicStreamEvent(eventType, eventData, messageId, model, too
|
|
|
1437
1526
|
const msg = eventData['message'];
|
|
1438
1527
|
baseChunk.id = msg?.['id'] || messageId;
|
|
1439
1528
|
choice.delta = { role: 'assistant', content: '' };
|
|
1440
|
-
// Pass through input token count from message_start
|
|
1529
|
+
// Pass through input token count from message_start (including cache tokens)
|
|
1441
1530
|
const msgUsage = msg?.['usage'];
|
|
1442
1531
|
if (msgUsage) {
|
|
1443
1532
|
baseChunk['usage'] = {
|
|
1444
1533
|
prompt_tokens: msgUsage['input_tokens'] ?? 0,
|
|
1534
|
+
cache_creation_tokens: msgUsage['cache_creation_input_tokens'] ?? 0,
|
|
1535
|
+
cache_read_tokens: msgUsage['cache_read_input_tokens'] ?? 0,
|
|
1445
1536
|
};
|
|
1446
1537
|
}
|
|
1447
1538
|
return `data: ${JSON.stringify(baseChunk)}\n\n`;
|
|
@@ -1895,10 +1986,14 @@ td{padding:8px 12px;border-bottom:1px solid #111318}
|
|
|
1895
1986
|
.badge.ok{background:#052e1633;color:#34d399}.badge.err{background:#2d0a0a;color:#ef4444}
|
|
1896
1987
|
.badge.tt-code{background:#1e3a5f;color:#60a5fa}.badge.tt-analysis{background:#3b1f6e;color:#a78bfa}.badge.tt-summarization{background:#1a3a2a;color:#6ee7b7}.badge.tt-qa{background:#3a2f1e;color:#fbbf24}.badge.tt-general{background:#1e293b;color:#94a3b8}
|
|
1897
1988
|
.badge.cx-simple{background:#052e1633;color:#34d399}.badge.cx-moderate{background:#2d2a0a;color:#fbbf24}.badge.cx-complex{background:#2d0a0a;color:#ef4444}
|
|
1989
|
+
.vstat{display:inline-flex;align-items:center;gap:6px;margin-left:8px;padding:1px 8px;border-radius:999px;border:1px solid #334155;font-size:.72rem}
|
|
1990
|
+
.vstat.current{color:#94a3b8;border-color:#334155;background:#0f172a66}
|
|
1991
|
+
.vstat.outdated{color:#fbbf24;border-color:#f59e0b55;background:#3a2f1e66}
|
|
1992
|
+
.vstat.unavailable{color:#a3a3a3;border-color:#52525b66;background:#18181b66}
|
|
1898
1993
|
@media(max-width:768px){.col-tt,.col-cx{display:none}}
|
|
1899
1994
|
.prov{display:flex;gap:16px;flex-wrap:wrap}.prov-item{display:flex;align-items:center;font-size:.85rem;background:#111318;padding:8px 14px;border-radius:8px;border:1px solid #1e293b}
|
|
1900
1995
|
</style></head><body>
|
|
1901
|
-
<div class="header"><div><h1>⚡ RelayPlane Dashboard</h1></div><div class="meta"><a href="/dashboard/config">Config</a> · <span id="ver"></span> · up <span id="uptime"></span> · refreshes every 5s</div></div>
|
|
1996
|
+
<div class="header"><div><h1>⚡ RelayPlane Dashboard</h1></div><div class="meta"><a href="/dashboard/config">Config</a> · <span id="ver"></span><span id="vstat" class="vstat unavailable">Unable to check</span> · up <span id="uptime"></span> · refreshes every 5s</div></div>
|
|
1902
1997
|
<div class="cards">
|
|
1903
1998
|
<div class="card"><div class="label">Total Requests</div><div class="value" id="totalReq">—</div></div>
|
|
1904
1999
|
<div class="card"><div class="label">Total Cost</div><div class="value" id="totalCost">—</div></div>
|
|
@@ -1909,7 +2004,7 @@ td{padding:8px 12px;border-bottom:1px solid #111318}
|
|
|
1909
2004
|
<table><thead><tr><th>Model</th><th>Requests</th><th>Cost</th><th>% of Total</th></tr></thead><tbody id="models"></tbody></table></div>
|
|
1910
2005
|
<div class="section"><h2>Provider Status</h2><div class="prov" id="providers"></div></div>
|
|
1911
2006
|
<div class="section"><h2>Recent Runs</h2>
|
|
1912
|
-
<table><thead><tr><th>Time</th><th>Model</th><th class="col-tt">Task Type</th><th class="col-cx">Complexity</th><th>Tokens In</th><th>Tokens Out</th><th>Cost</th><th>Latency</th><th>Status</th></tr></thead><tbody id="runs"></tbody></table></div>
|
|
2007
|
+
<table><thead><tr><th>Time</th><th>Model</th><th class="col-tt">Task Type</th><th class="col-cx">Complexity</th><th>Tokens In</th><th>Tokens Out</th><th class="col-cache">Cache Create</th><th class="col-cache">Cache Read</th><th>Cost</th><th>Latency</th><th>Status</th></tr></thead><tbody id="runs"></tbody></table></div>
|
|
1913
2008
|
<script>
|
|
1914
2009
|
const $ = id => document.getElementById(id);
|
|
1915
2010
|
function fmt(n,d=2){return typeof n==='number'?n.toFixed(d):'-'}
|
|
@@ -1926,6 +2021,19 @@ async function load(){
|
|
|
1926
2021
|
]);
|
|
1927
2022
|
$('ver').textContent='v'+health.version;
|
|
1928
2023
|
$('uptime').textContent=dur(health.uptime);
|
|
2024
|
+
|
|
2025
|
+
const versionStatus = await fetch('/v1/version-status').then(r=>r.json()).catch(()=>({state:'unavailable', current: health.version, latest: null}));
|
|
2026
|
+
const vEl = $('vstat');
|
|
2027
|
+
if (vEl) {
|
|
2028
|
+
vEl.className = 'vstat ' + (versionStatus.state === 'outdated' ? 'outdated' : versionStatus.state === 'up-to-date' ? 'current' : 'unavailable');
|
|
2029
|
+
if (versionStatus.state === 'outdated') {
|
|
2030
|
+
vEl.textContent = 'Update available · v' + versionStatus.current + ' → v' + versionStatus.latest;
|
|
2031
|
+
} else if (versionStatus.state === 'up-to-date') {
|
|
2032
|
+
vEl.textContent = 'Up to date · v' + versionStatus.current;
|
|
2033
|
+
} else {
|
|
2034
|
+
vEl.textContent = 'Unable to check · v' + versionStatus.current;
|
|
2035
|
+
}
|
|
2036
|
+
}
|
|
1929
2037
|
const total=stats.summary?.totalEvents||0;
|
|
1930
2038
|
$('totalReq').textContent=total;
|
|
1931
2039
|
$('totalCost').textContent='$'+fmt(stats.summary?.totalCostUsd??0,4);
|
|
@@ -1937,8 +2045,8 @@ async function load(){
|
|
|
1937
2045
|
function ttCls(t){const m={code_generation:'tt-code',analysis:'tt-analysis',summarization:'tt-summarization',question_answering:'tt-qa'};return m[t]||'tt-general'}
|
|
1938
2046
|
function cxCls(c){const m={simple:'cx-simple',moderate:'cx-moderate',complex:'cx-complex'};return m[c]||'cx-simple'}
|
|
1939
2047
|
$('runs').innerHTML=(runsR.runs||[]).map(r=>
|
|
1940
|
-
'<tr><td>'+fmtTime(r.started_at)+'</td><td>'+r.model+'</td><td class="col-tt"><span class="badge '+ttCls(r.taskType)+'">'+(r.taskType||'general').replace(/_/g,' ')+'</span></td><td class="col-cx"><span class="badge '+cxCls(r.complexity)+'">'+(r.complexity||'simple')+'</span></td><td>'+(r.tokensIn||0)+'</td><td>'+(r.tokensOut||0)+'</td><td>$'+fmt(r.costUsd,4)+'</td><td>'+r.latencyMs+'ms</td><td><span class="badge '+(r.status==='success'?'ok':'err')+'">'+r.status+'</span></td></tr>'
|
|
1941
|
-
).join('')||'<tr><td colspan=
|
|
2048
|
+
'<tr><td>'+fmtTime(r.started_at)+'</td><td>'+r.model+'</td><td class="col-tt"><span class="badge '+ttCls(r.taskType)+'">'+(r.taskType||'general').replace(/_/g,' ')+'</span></td><td class="col-cx"><span class="badge '+cxCls(r.complexity)+'">'+(r.complexity||'simple')+'</span></td><td>'+(r.tokensIn||0)+'</td><td>'+(r.tokensOut||0)+'</td><td class="col-cache" style="color:#60a5fa">'+(r.cacheCreationTokens||0)+'</td><td class="col-cache" style="color:#34d399">'+(r.cacheReadTokens||0)+'</td><td>$'+fmt(r.costUsd,4)+'</td><td>'+r.latencyMs+'ms</td><td><span class="badge '+(r.status==='success'?'ok':'err')+'">'+r.status+'</span></td></tr>'
|
|
2049
|
+
).join('')||'<tr><td colspan=11 style="color:#64748b">No runs yet</td></tr>';
|
|
1942
2050
|
$('providers').innerHTML=(provH.providers||[]).map(p=>{
|
|
1943
2051
|
const dotClass = p.status==='healthy'?'up':(p.status==='degraded'?'warn':'down');
|
|
1944
2052
|
const rate = p.successRate!==undefined?(' '+Math.round(p.successRate*100)+'%'):'';
|
|
@@ -2046,6 +2154,7 @@ async function startProxy(config = {}) {
|
|
|
2046
2154
|
loadHistoryFromDisk();
|
|
2047
2155
|
// Stop mesh sync and flush history on shutdown.
const handleShutdown = () => {
    // Go through the module-level handle instead of the local `meshHandle` const:
    // that const is declared further down (after the awaited config load), so if
    // this handler ran before the declaration executed, touching it would throw
    // a ReferenceError and the history flush below would never happen.
    try {
        _meshHandle?.stop();
    }
    catch {
        // Best-effort: a failing mesh stop must not prevent the history flush.
    }
    shutdownHistory();
    process.exit(0);
};
|
|
@@ -2054,11 +2163,159 @@ async function startProxy(config = {}) {
|
|
|
2054
2163
|
const configPath = getProxyConfigPath();
|
|
2055
2164
|
let proxyConfig = await loadProxyConfig(configPath, log);
|
|
2056
2165
|
const cooldownManager = new CooldownManager(getCooldownConfig(proxyConfig));
|
|
2166
|
+
// === Startup config validation (Task 4) ===
|
|
2167
|
+
try {
|
|
2168
|
+
const userConfig = (0, config_js_1.loadConfig)();
|
|
2169
|
+
// Check if config was just created (created_at within 5s of now)
|
|
2170
|
+
const createdAt = new Date(userConfig.created_at).getTime();
|
|
2171
|
+
const now = Date.now();
|
|
2172
|
+
if (Math.abs(now - createdAt) < 5000) {
|
|
2173
|
+
console.warn('[RelayPlane] WARNING: Fresh config detected — previous config may have been deleted');
|
|
2174
|
+
}
|
|
2175
|
+
// Check if credentials exist but config doesn't reference them
|
|
2176
|
+
if ((0, config_js_1.hasValidCredentials)() && !userConfig.api_key) {
|
|
2177
|
+
console.warn('[RelayPlane] WARNING: credentials.json exists but config has no API key reference');
|
|
2178
|
+
}
|
|
2179
|
+
// Auto-enable telemetry for authenticated users
|
|
2180
|
+
if ((0, config_js_1.hasValidCredentials)() && !userConfig.telemetry_enabled) {
|
|
2181
|
+
// Already handled in loadConfig() for fresh configs, but handle existing configs too
|
|
2182
|
+
}
|
|
2183
|
+
// Validate expected fields
|
|
2184
|
+
if (!userConfig.device_id || !userConfig.created_at || userConfig.config_version === undefined) {
|
|
2185
|
+
console.warn('[RelayPlane] WARNING: Config is missing expected fields');
|
|
2186
|
+
}
|
|
2187
|
+
}
|
|
2188
|
+
catch (err) {
|
|
2189
|
+
console.warn(`[RelayPlane] Config validation error: ${err}`);
|
|
2190
|
+
}
|
|
2191
|
+
// Initialize mesh learning layer
|
|
2192
|
+
const meshConfig = (0, config_js_1.getMeshConfig)();
|
|
2193
|
+
const userConfig = (0, config_js_1.loadConfig)();
|
|
2194
|
+
const meshHandle = _meshHandle = (0, index_js_1.initMeshLayer)({
|
|
2195
|
+
enabled: meshConfig.enabled,
|
|
2196
|
+
endpoint: meshConfig.endpoint,
|
|
2197
|
+
sync_interval_ms: meshConfig.sync_interval_ms,
|
|
2198
|
+
contribute: meshConfig.contribute,
|
|
2199
|
+
}, userConfig.api_key);
|
|
2200
|
+
// Initialize budget manager
|
|
2201
|
+
const budgetManager = (0, budget_js_1.getBudgetManager)(proxyConfig.budget);
|
|
2202
|
+
if (proxyConfig.budget?.enabled) {
|
|
2203
|
+
try {
|
|
2204
|
+
budgetManager.init();
|
|
2205
|
+
log('Budget manager initialized');
|
|
2206
|
+
}
|
|
2207
|
+
catch (err) {
|
|
2208
|
+
log(`Budget manager init failed: ${err}`);
|
|
2209
|
+
}
|
|
2210
|
+
}
|
|
2211
|
+
// Initialize anomaly detector
|
|
2212
|
+
const anomalyDetector = (0, anomaly_js_1.getAnomalyDetector)(proxyConfig.anomaly);
|
|
2213
|
+
// Initialize alert manager
|
|
2214
|
+
const alertManager = (0, alerts_js_1.getAlertManager)(proxyConfig.alerts);
|
|
2215
|
+
if (proxyConfig.alerts?.enabled) {
|
|
2216
|
+
try {
|
|
2217
|
+
alertManager.init();
|
|
2218
|
+
log('Alert manager initialized');
|
|
2219
|
+
}
|
|
2220
|
+
catch (err) {
|
|
2221
|
+
log(`Alert manager init failed: ${err}`);
|
|
2222
|
+
}
|
|
2223
|
+
}
|
|
2224
|
+
// Downgrade config
|
|
2225
|
+
let downgradeConfig = {
|
|
2226
|
+
...downgrade_js_1.DEFAULT_DOWNGRADE_CONFIG,
|
|
2227
|
+
...(proxyConfig.downgrade ?? {}),
|
|
2228
|
+
};
|
|
2229
|
+
/**
 * Pre-request budget check + auto-downgrade.
 *
 * Asks the budget manager whether `estimatedCost` fits the budget, fires
 * breach/threshold alerts, and lets the downgrade module swap the model.
 *
 * @param model - Model the request was going to use.
 * @param estimatedCost - Estimated cost of the request, passed to budgetManager.checkBudget().
 * @returns `{ blocked, model, headers, downgraded }`: `blocked` is true only when
 *   the budget is breached with action 'block'; `model` is the (possibly
 *   downgraded) model; `headers` holds whatever applyDowngradeHeaders() added.
 */
function preRequestBudgetCheck(model, estimatedCost) {
    const headers = {};
    let finalModel = model;
    let downgraded = false;
    const budgetResult = budgetManager.checkBudget(estimatedCost);
    const { hourlyUsd, dailyUsd } = budgetManager.getConfig();
    // Share of the daily budget already spent. Guarded against a zero/unset daily
    // limit so neither the threshold alerts nor the downgrade check ever see
    // Infinity or NaN.
    const dailyPct = dailyUsd > 0
        ? (budgetResult.currentDailySpend / dailyUsd) * 100
        : 0;
    // Ask the downgrade module for a cheaper model at the given budget percentage
    // and, if it offers one, adopt it and record the response headers.
    const tryDowngrade = (budgetPct) => {
        const dr = (0, downgrade_js_1.checkDowngrade)(finalModel, budgetPct, downgradeConfig);
        if (dr.downgraded) {
            finalModel = dr.newModel;
            downgraded = true;
            (0, downgrade_js_1.applyDowngradeHeaders)(headers, dr);
        }
    };
    if (budgetResult.breached) {
        // Fire breach alert with the spend/limit pair matching the breached window.
        const isHourly = budgetResult.breachType === 'hourly';
        const limit = isHourly ? hourlyUsd : dailyUsd;
        const spend = isHourly
            ? budgetResult.currentHourlySpend
            : budgetResult.currentDailySpend;
        alertManager.fireBreach(budgetResult.breachType, spend, limit);
        if (budgetResult.action === 'block') {
            return { blocked: true, model: finalModel, headers, downgraded: false };
        }
        if (budgetResult.action === 'downgrade') {
            // A breach counts as 100% of budget for the downgrade decision.
            tryDowngrade(100);
        }
    }
    // Fire threshold alerts and mark each one as fired on the budget manager.
    for (const threshold of budgetResult.thresholdsCrossed ?? []) {
        alertManager.fireThreshold(threshold, dailyPct, budgetResult.currentDailySpend, dailyUsd);
        budgetManager.markThresholdFired(threshold);
    }
    // Auto-downgrade based on budget percentage (even if not breached)
    if (!downgraded && downgradeConfig.enabled) {
        tryDowngrade(dailyPct);
    }
    return { blocked: false, model: finalModel, headers, downgraded };
}
|
|
2280
|
+
/**
 * Post-request bookkeeping.
 *
 * Records the request's cost with the budget manager, feeds the request into the
 * anomaly detector, and raises one alert per anomaly the detector reports.
 */
function postRequestRecord(model, tokensIn, tokensOut, costUsd) {
    budgetManager.recordSpend(costUsd, model);
    const sample = {
        model: model,
        tokensIn: tokensIn,
        tokensOut: tokensOut,
        costUsd: costUsd,
    };
    const analysis = anomalyDetector.recordAndAnalyze(sample);
    if (!analysis.detected) {
        return;
    }
    for (const finding of analysis.anomalies) {
        alertManager.fireAnomaly(finding);
    }
}
|
|
2299
|
+
// Initialize response cache
|
|
2300
|
+
const responseCache = (0, response_cache_js_1.getResponseCache)(proxyConfig.cache);
|
|
2301
|
+
if (proxyConfig.cache?.enabled !== false) {
|
|
2302
|
+
try {
|
|
2303
|
+
responseCache.init();
|
|
2304
|
+
log('Response cache initialized');
|
|
2305
|
+
}
|
|
2306
|
+
catch (err) {
|
|
2307
|
+
log(`Response cache init failed: ${err}`);
|
|
2308
|
+
}
|
|
2309
|
+
}
|
|
2057
2310
|
let configWatcher = null;
|
|
2058
2311
|
let configReloadTimer = null;
|
|
2059
2312
|
/**
 * Re-read the proxy config file and push the new settings into the running
 * components: cooldown manager first, then budget, anomaly and alert managers
 * (each gets its current config overlaid with the matching config section),
 * and finally the downgrade config, rebuilt from its defaults.
 */
const reloadConfig = async () => {
    proxyConfig = await loadProxyConfig(configPath, log);
    cooldownManager.updateConfig(getCooldownConfig(proxyConfig));
    // Managers whose config is "current config overlaid with the file section".
    const sectionTargets = [
        [budgetManager, 'budget'],
        [anomalyDetector, 'anomaly'],
        [alertManager, 'alerts'],
    ];
    for (const [manager, section] of sectionTargets) {
        manager.updateConfig({ ...manager.getConfig(), ...(proxyConfig[section] ?? {}) });
    }
    const downgradeOverrides = proxyConfig.downgrade ?? {};
    downgradeConfig = { ...downgrade_js_1.DEFAULT_DOWNGRADE_CONFIG, ...downgradeOverrides };
    log(`Reloaded config from ${configPath}`);
};
|
|
2064
2321
|
const scheduleConfigReload = () => {
|
|
@@ -2083,7 +2340,8 @@ async function startProxy(config = {}) {
|
|
|
2083
2340
|
// Initialize RelayPlane
|
|
2084
2341
|
const relay = new core_1.RelayPlane({ dbPath: config.dbPath });
|
|
2085
2342
|
// Startup migration: clear default routing rules so complexity config takes priority
|
|
2086
|
-
const
|
|
2343
|
+
const clearDefaultRules = relay.routing.clearDefaultRules;
|
|
2344
|
+
const clearedCount = typeof clearDefaultRules === 'function' ? clearDefaultRules.call(relay.routing) : 0;
|
|
2087
2345
|
if (clearedCount > 0) {
|
|
2088
2346
|
log(`Cleared ${clearedCount} default routing rules (complexity config takes priority)`);
|
|
2089
2347
|
}
|
|
@@ -2130,6 +2388,13 @@ async function startProxy(config = {}) {
|
|
|
2130
2388
|
}));
|
|
2131
2389
|
return;
|
|
2132
2390
|
}
|
|
2391
|
+
if (req.method === 'GET' && pathname === '/v1/version-status') {
|
|
2392
|
+
const latest = await getLatestProxyVersion();
|
|
2393
|
+
const status = (0, version_status_js_1.getVersionStatus)(PROXY_VERSION, latest);
|
|
2394
|
+
res.writeHead(200, { 'Content-Type': 'application/json', 'Cache-Control': 'public, max-age=60' });
|
|
2395
|
+
res.end(JSON.stringify(status));
|
|
2396
|
+
return;
|
|
2397
|
+
}
|
|
2133
2398
|
// === Control endpoints ===
|
|
2134
2399
|
if (pathname.startsWith('/control/')) {
|
|
2135
2400
|
if (req.method === 'POST' && pathname === '/control/enable') {
|
|
@@ -2196,6 +2461,36 @@ async function startProxy(config = {}) {
|
|
|
2196
2461
|
return;
|
|
2197
2462
|
}
|
|
2198
2463
|
}
|
|
2464
|
+
if (req.method === 'POST' && pathname === '/control/kill') {
|
|
2465
|
+
try {
|
|
2466
|
+
const body = await readJsonBody(req);
|
|
2467
|
+
if (body.all) {
|
|
2468
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
2469
|
+
res.end(JSON.stringify({
|
|
2470
|
+
killed: 0,
|
|
2471
|
+
sessions: [],
|
|
2472
|
+
note: 'Local proxy mode: session kill not applicable'
|
|
2473
|
+
}));
|
|
2474
|
+
}
|
|
2475
|
+
else if (body.sessionKey) {
|
|
2476
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
2477
|
+
res.end(JSON.stringify({
|
|
2478
|
+
killed: 1,
|
|
2479
|
+
sessions: [body.sessionKey],
|
|
2480
|
+
note: 'Rate limits reset for session'
|
|
2481
|
+
}));
|
|
2482
|
+
}
|
|
2483
|
+
else {
|
|
2484
|
+
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
2485
|
+
res.end(JSON.stringify({ error: 'Provide sessionKey or all=true' }));
|
|
2486
|
+
}
|
|
2487
|
+
}
|
|
2488
|
+
catch {
|
|
2489
|
+
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
2490
|
+
res.end(JSON.stringify({ error: 'Invalid JSON' }));
|
|
2491
|
+
}
|
|
2492
|
+
return;
|
|
2493
|
+
}
|
|
2199
2494
|
// === Telemetry endpoints for dashboard ===
|
|
2200
2495
|
if (pathname.startsWith('/v1/telemetry/')) {
|
|
2201
2496
|
const telemetryPath = pathname.replace('/v1/telemetry/', '');
|
|
@@ -2244,7 +2539,9 @@ async function startProxy(config = {}) {
|
|
|
2244
2539
|
const offset = parseInt(params.get('offset') || '0', 10);
|
|
2245
2540
|
const sorted = [...requestHistory].reverse();
|
|
2246
2541
|
const runs = sorted.slice(offset, offset + limit).map(r => {
|
|
2247
|
-
|
|
2542
|
+
// Savings should reflect routing decisions only — pass same cache tokens to baseline
|
|
2543
|
+
// so the cache discount doesn't get counted as "savings from routing"
|
|
2544
|
+
const origCost = (0, telemetry_js_1.estimateCost)('claude-opus-4-6', r.tokensIn, r.tokensOut, r.cacheCreationTokens || undefined, r.cacheReadTokens || undefined);
|
|
2248
2545
|
const perRunSavings = Math.max(0, origCost - r.costUsd);
|
|
2249
2546
|
return {
|
|
2250
2547
|
id: r.id,
|
|
@@ -2264,6 +2561,8 @@ async function startProxy(config = {}) {
|
|
|
2264
2561
|
latencyMs: r.latencyMs,
|
|
2265
2562
|
tokensIn: r.tokensIn,
|
|
2266
2563
|
tokensOut: r.tokensOut,
|
|
2564
|
+
cacheCreationTokens: r.cacheCreationTokens ?? 0,
|
|
2565
|
+
cacheReadTokens: r.cacheReadTokens ?? 0,
|
|
2267
2566
|
savings: Math.round(perRunSavings * 10000) / 10000,
|
|
2268
2567
|
escalated: r.escalated,
|
|
2269
2568
|
};
|
|
@@ -2281,7 +2580,9 @@ async function startProxy(config = {}) {
|
|
|
2281
2580
|
let totalSavedAmount = 0;
|
|
2282
2581
|
const byDayMap = new Map();
|
|
2283
2582
|
for (const r of requestHistory) {
|
|
2284
|
-
|
|
2583
|
+
// Pass same cache tokens to baseline so savings only reflect routing decisions,
|
|
2584
|
+
// not prompt-cache discounts (those happen regardless of which model is chosen).
|
|
2585
|
+
const origCost = (0, telemetry_js_1.estimateCost)(OPUS_BASELINE, r.tokensIn, r.tokensOut, r.cacheCreationTokens || undefined, r.cacheReadTokens || undefined);
|
|
2285
2586
|
const actualCost = r.costUsd;
|
|
2286
2587
|
const saved = Math.max(0, origCost - actualCost);
|
|
2287
2588
|
totalOriginalCost += origCost;
|
|
@@ -2372,6 +2673,24 @@ async function startProxy(config = {}) {
|
|
|
2372
2673
|
res.end(getConfigDashboardHTML());
|
|
2373
2674
|
return;
|
|
2374
2675
|
}
|
|
2676
|
+
// === Mesh stats endpoint ===
|
|
2677
|
+
if (req.method === 'GET' && pathname === '/v1/mesh/stats') {
|
|
2678
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
2679
|
+
res.end(JSON.stringify(meshHandle.getStats()));
|
|
2680
|
+
return;
|
|
2681
|
+
}
|
|
2682
|
+
if (req.method === 'POST' && pathname === '/v1/mesh/sync') {
|
|
2683
|
+
try {
|
|
2684
|
+
const result = await meshHandle.forceSync();
|
|
2685
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
2686
|
+
res.end(JSON.stringify({ sync: result }));
|
|
2687
|
+
}
|
|
2688
|
+
catch (err) {
|
|
2689
|
+
res.writeHead(500, { 'Content-Type': 'application/json' });
|
|
2690
|
+
res.end(JSON.stringify({ sync: { error: err.message } }));
|
|
2691
|
+
}
|
|
2692
|
+
return;
|
|
2693
|
+
}
|
|
2375
2694
|
if (req.method === 'GET' && pathname === '/v1/config') {
|
|
2376
2695
|
try {
|
|
2377
2696
|
const raw = await fs.promises.readFile(getProxyConfigPath(), 'utf8');
|
|
@@ -2511,6 +2830,48 @@ async function startProxy(config = {}) {
|
|
|
2511
2830
|
log(`Config routing.mode=auto: overriding passthrough → auto for model ${requestedModel}`);
|
|
2512
2831
|
}
|
|
2513
2832
|
const isStreaming = requestBody['stream'] === true;
|
|
2833
|
+
// ── Response Cache: check for cached response ──
|
|
2834
|
+
const cacheBypass = responseCache.shouldBypass(requestBody);
|
|
2835
|
+
let cacheHash;
|
|
2836
|
+
if (!cacheBypass) {
|
|
2837
|
+
cacheHash = responseCache.computeKey(requestBody);
|
|
2838
|
+
const cached = responseCache.get(cacheHash);
|
|
2839
|
+
if (cached) {
|
|
2840
|
+
try {
|
|
2841
|
+
const cachedData = JSON.parse(cached);
|
|
2842
|
+
const cacheUsage = cachedData?.usage;
|
|
2843
|
+
const cacheCost = (0, telemetry_js_1.estimateCost)(requestBody['model'] ?? '', cacheUsage?.input_tokens ?? 0, cacheUsage?.output_tokens ?? 0);
|
|
2844
|
+
responseCache.recordHit(cacheCost, 0);
|
|
2845
|
+
// Replay cached streaming response as SSE
|
|
2846
|
+
if (isStreaming && cachedData._relayplaneStreamCache) {
|
|
2847
|
+
res.writeHead(200, {
|
|
2848
|
+
'Content-Type': 'text/event-stream',
|
|
2849
|
+
'Cache-Control': 'no-cache',
|
|
2850
|
+
'Connection': 'keep-alive',
|
|
2851
|
+
'X-RelayPlane-Cache': 'HIT',
|
|
2852
|
+
});
|
|
2853
|
+
res.end(cachedData.ssePayload);
|
|
2854
|
+
}
|
|
2855
|
+
else {
|
|
2856
|
+
res.writeHead(200, {
|
|
2857
|
+
'Content-Type': 'application/json',
|
|
2858
|
+
'X-RelayPlane-Cache': 'HIT',
|
|
2859
|
+
});
|
|
2860
|
+
res.end(cached);
|
|
2861
|
+
}
|
|
2862
|
+
log(`Cache HIT for ${requestBody['model']} (hash: ${cacheHash.slice(0, 8)})`);
|
|
2863
|
+
return;
|
|
2864
|
+
}
|
|
2865
|
+
catch {
|
|
2866
|
+
// Corrupt cache entry, continue to provider
|
|
2867
|
+
}
|
|
2868
|
+
}
|
|
2869
|
+
responseCache.recordMiss();
|
|
2870
|
+
}
|
|
2871
|
+
else {
|
|
2872
|
+
responseCache.recordBypass();
|
|
2873
|
+
}
|
|
2874
|
+
// ── End cache check ──
|
|
2514
2875
|
const messages = Array.isArray(requestBody['messages'])
|
|
2515
2876
|
? requestBody['messages']
|
|
2516
2877
|
: [];
|
|
@@ -2619,6 +2980,47 @@ async function startProxy(config = {}) {
|
|
|
2619
2980
|
res.end(JSON.stringify({ error: `Provider ${targetProvider} is temporarily cooled down` }));
|
|
2620
2981
|
return;
|
|
2621
2982
|
}
|
|
2983
|
+
// ── Budget check + auto-downgrade ──
|
|
2984
|
+
const budgetExtraHeaders = {};
|
|
2985
|
+
{
|
|
2986
|
+
const budgetCheck = preRequestBudgetCheck(targetModel || requestedModel);
|
|
2987
|
+
if (budgetCheck.blocked) {
|
|
2988
|
+
res.writeHead(429, { 'Content-Type': 'application/json' });
|
|
2989
|
+
res.end(JSON.stringify({
|
|
2990
|
+
error: 'Budget limit exceeded. Request blocked.',
|
|
2991
|
+
type: 'budget_exceeded',
|
|
2992
|
+
}));
|
|
2993
|
+
return;
|
|
2994
|
+
}
|
|
2995
|
+
if (budgetCheck.downgraded) {
|
|
2996
|
+
log(`Budget downgrade: ${targetModel || requestedModel} → ${budgetCheck.model}`);
|
|
2997
|
+
targetModel = budgetCheck.model;
|
|
2998
|
+
if (requestBody)
|
|
2999
|
+
requestBody['model'] = targetModel;
|
|
3000
|
+
}
|
|
3001
|
+
Object.assign(budgetExtraHeaders, budgetCheck.headers);
|
|
3002
|
+
}
|
|
3003
|
+
// ── End budget check ──
|
|
3004
|
+
// ── Rate limit check ──
|
|
3005
|
+
const workspaceId = 'local'; // Local proxy uses single workspace
|
|
3006
|
+
const rateLimit = (0, rate_limiter_js_1.checkLimit)(workspaceId, targetModel);
|
|
3007
|
+
if (!rateLimit.allowed) {
|
|
3008
|
+
console.error(`[RATE LIMIT] ${targetModel} limit reached for workspace: ${workspaceId}`);
|
|
3009
|
+
res.writeHead(429, {
|
|
3010
|
+
'Content-Type': 'application/json',
|
|
3011
|
+
'Retry-After': String(rateLimit.retryAfter || 60),
|
|
3012
|
+
'X-RelayPlane-RateLimit-Limit': String(rateLimit.limit),
|
|
3013
|
+
'X-RelayPlane-RateLimit-Remaining': '0',
|
|
3014
|
+
'X-RelayPlane-RateLimit-Reset': String(Math.ceil(rateLimit.resetAt / 1000))
|
|
3015
|
+
});
|
|
3016
|
+
res.end(JSON.stringify({
|
|
3017
|
+
error: `Rate limit exceeded for ${targetModel}. Max ${rateLimit.limit} requests per minute.`,
|
|
3018
|
+
type: 'rate_limit_exceeded',
|
|
3019
|
+
retry_after: rateLimit.retryAfter || 60
|
|
3020
|
+
}));
|
|
3021
|
+
return;
|
|
3022
|
+
}
|
|
3023
|
+
// ── End rate limit check ──
|
|
2622
3024
|
const startTime = Date.now();
|
|
2623
3025
|
let nativeResponseData;
|
|
2624
3026
|
try {
|
|
@@ -2688,11 +3090,16 @@ async function startProxy(config = {}) {
|
|
|
2688
3090
|
'Content-Type': 'text/event-stream',
|
|
2689
3091
|
'Cache-Control': 'no-cache',
|
|
2690
3092
|
'Connection': 'keep-alive',
|
|
3093
|
+
'X-RelayPlane-Cache': cacheBypass ? 'BYPASS' : 'MISS',
|
|
2691
3094
|
...nativeStreamRpHeaders,
|
|
2692
3095
|
});
|
|
2693
3096
|
const reader = providerResponse.body?.getReader();
|
|
2694
3097
|
let streamTokensIn = 0;
|
|
2695
3098
|
let streamTokensOut = 0;
|
|
3099
|
+
let streamCacheCreation = 0;
|
|
3100
|
+
let streamCacheRead = 0;
|
|
3101
|
+
// Buffer raw SSE chunks for cache storage
|
|
3102
|
+
const rawChunks = [];
|
|
2696
3103
|
if (reader) {
|
|
2697
3104
|
const decoder = new TextDecoder();
|
|
2698
3105
|
let sseBuffer = '';
|
|
@@ -2703,6 +3110,8 @@ async function startProxy(config = {}) {
|
|
|
2703
3110
|
break;
|
|
2704
3111
|
const chunk = decoder.decode(value, { stream: true });
|
|
2705
3112
|
res.write(chunk);
|
|
3113
|
+
if (cacheHash && !cacheBypass)
|
|
3114
|
+
rawChunks.push(chunk);
|
|
2706
3115
|
// Parse SSE events to extract usage from message_delta / message_stop
|
|
2707
3116
|
sseBuffer += chunk;
|
|
2708
3117
|
const lines = sseBuffer.split('\n');
|
|
@@ -2715,9 +3124,11 @@ async function startProxy(config = {}) {
|
|
|
2715
3124
|
if (evt.type === 'message_delta' && evt.usage) {
|
|
2716
3125
|
streamTokensOut = evt.usage.output_tokens ?? streamTokensOut;
|
|
2717
3126
|
}
|
|
2718
|
-
// Anthropic: message_start has usage.input_tokens
|
|
3127
|
+
// Anthropic: message_start has usage.input_tokens + cache tokens
|
|
2719
3128
|
if (evt.type === 'message_start' && evt.message?.usage) {
|
|
2720
3129
|
streamTokensIn = evt.message.usage.input_tokens ?? streamTokensIn;
|
|
3130
|
+
streamCacheCreation = evt.message.usage.cache_creation_input_tokens ?? 0;
|
|
3131
|
+
streamCacheRead = evt.message.usage.cache_read_input_tokens ?? 0;
|
|
2721
3132
|
}
|
|
2722
3133
|
// OpenAI format: choices with usage
|
|
2723
3134
|
if (evt.usage) {
|
|
@@ -2736,15 +3147,45 @@ async function startProxy(config = {}) {
|
|
|
2736
3147
|
reader.releaseLock();
|
|
2737
3148
|
}
|
|
2738
3149
|
}
|
|
3150
|
+
// ── Cache: store streaming response as raw SSE payload ──
|
|
3151
|
+
if (cacheHash && !cacheBypass && rawChunks.length > 0) {
|
|
3152
|
+
const streamPayload = JSON.stringify({
|
|
3153
|
+
_relayplaneStreamCache: true,
|
|
3154
|
+
ssePayload: rawChunks.join(''),
|
|
3155
|
+
usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead },
|
|
3156
|
+
});
|
|
3157
|
+
responseCache.set(cacheHash, streamPayload, {
|
|
3158
|
+
model: targetModel || requestedModel,
|
|
3159
|
+
tokensIn: streamTokensIn,
|
|
3160
|
+
tokensOut: streamTokensOut,
|
|
3161
|
+
costUsd: (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined),
|
|
3162
|
+
taskType,
|
|
3163
|
+
});
|
|
3164
|
+
log(`Cache STORE (stream) for ${targetModel || requestedModel} (hash: ${cacheHash.slice(0, 8)})`);
|
|
3165
|
+
}
|
|
2739
3166
|
// Store streaming token counts so telemetry can use them
|
|
2740
|
-
nativeResponseData = { usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut } };
|
|
3167
|
+
nativeResponseData = { usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead } };
|
|
2741
3168
|
res.end();
|
|
2742
3169
|
}
|
|
2743
3170
|
else {
|
|
2744
3171
|
nativeResponseData = await providerResponse.json();
|
|
2745
3172
|
const nativeRespModel = checkResponseModelMismatch(nativeResponseData, targetModel || requestedModel, targetProvider, log);
|
|
2746
3173
|
const nativeRpHeaders = buildRelayPlaneResponseHeaders(targetModel || requestedModel, originalModel ?? 'unknown', complexity, targetProvider, routingMode);
|
|
2747
|
-
|
|
3174
|
+
// ── Cache: store non-streaming response ──
|
|
3175
|
+
const nativeCacheHeader = cacheBypass ? 'BYPASS' : 'MISS';
|
|
3176
|
+
if (cacheHash && !cacheBypass) {
|
|
3177
|
+
const nativeRespJson = JSON.stringify(nativeResponseData);
|
|
3178
|
+
const nativeUsage = nativeResponseData?.usage;
|
|
3179
|
+
responseCache.set(cacheHash, nativeRespJson, {
|
|
3180
|
+
model: targetModel || requestedModel,
|
|
3181
|
+
tokensIn: nativeUsage?.input_tokens ?? 0,
|
|
3182
|
+
tokensOut: nativeUsage?.output_tokens ?? 0,
|
|
3183
|
+
costUsd: (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeUsage?.input_tokens ?? 0, nativeUsage?.output_tokens ?? 0, nativeUsage?.cache_creation_input_tokens || undefined, nativeUsage?.cache_read_input_tokens || undefined),
|
|
3184
|
+
taskType,
|
|
3185
|
+
});
|
|
3186
|
+
log(`Cache STORE for ${targetModel || requestedModel} (hash: ${cacheHash.slice(0, 8)})`);
|
|
3187
|
+
}
|
|
3188
|
+
res.writeHead(providerResponse.status, { 'Content-Type': 'application/json', 'X-RelayPlane-Cache': nativeCacheHeader, ...nativeRpHeaders });
|
|
2748
3189
|
res.end(JSON.stringify(nativeResponseData));
|
|
2749
3190
|
}
|
|
2750
3191
|
}
|
|
@@ -2754,18 +3195,31 @@ async function startProxy(config = {}) {
|
|
|
2754
3195
|
// nativeResponseData holds response JSON for non-streaming, or { usage: { input_tokens, output_tokens, cache_creation_input_tokens, cache_read_input_tokens } }
|
|
2755
3196
|
// synthesised from SSE events for streaming
|
|
2756
3197
|
const nativeUsageData = nativeResponseData?.usage;
|
|
2757
|
-
const
|
|
3198
|
+
const nativeBaseTokIn = nativeUsageData?.input_tokens ?? nativeUsageData?.prompt_tokens ?? 0;
|
|
2758
3199
|
const nativeTokOut = nativeUsageData?.output_tokens ?? nativeUsageData?.completion_tokens ?? 0;
|
|
2759
|
-
|
|
3200
|
+
const nativeCacheCreation = nativeUsageData?.cache_creation_input_tokens ?? 0;
|
|
3201
|
+
const nativeCacheRead = nativeUsageData?.cache_read_input_tokens ?? 0;
|
|
3202
|
+
// Include cache tokens in displayed/recorded token count
|
|
3203
|
+
const nativeTokIn = nativeBaseTokIn + nativeCacheCreation + nativeCacheRead;
|
|
3204
|
+
// Cost calculation expects inputTokens to include cache tokens when cache params are provided
|
|
3205
|
+
const nativeCostUsd = (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCacheCreation || undefined, nativeCacheRead || undefined);
|
|
3206
|
+
updateLastHistoryEntry(nativeTokIn, nativeTokOut, nativeCostUsd, undefined, nativeCacheCreation || undefined, nativeCacheRead || undefined);
|
|
3207
|
+
// ── Post-request: budget spend + anomaly detection ──
|
|
3208
|
+
postRequestRecord(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCostUsd);
|
|
2760
3209
|
if (recordTelemetry) {
|
|
2761
3210
|
relay
|
|
2762
3211
|
.run({
|
|
2763
3212
|
prompt: promptText.slice(0, 500),
|
|
2764
3213
|
taskType,
|
|
2765
3214
|
model: `${targetProvider}:${targetModel || requestedModel}`,
|
|
3215
|
+
})
|
|
3216
|
+
.then((runResult) => {
|
|
3217
|
+
// Backfill token/cost data — relay.run() has no adapters so records NULLs
|
|
3218
|
+
relay.patchRunTokens(runResult.runId, nativeTokIn, nativeTokOut, nativeCostUsd);
|
|
2766
3219
|
})
|
|
2767
3220
|
.catch(() => { });
|
|
2768
|
-
sendCloudTelemetry(taskType, targetModel || requestedModel, nativeTokIn, nativeTokOut, durationMs, true, undefined, originalModel ?? undefined);
|
|
3221
|
+
sendCloudTelemetry(taskType, targetModel || requestedModel, nativeTokIn, nativeTokOut, durationMs, true, undefined, originalModel ?? undefined, nativeCacheCreation || undefined, nativeCacheRead || undefined);
|
|
3222
|
+
meshCapture(targetModel || requestedModel, targetProvider, taskType, nativeTokIn, nativeTokOut, (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCacheCreation || undefined, nativeCacheRead || undefined), durationMs, true);
|
|
2769
3223
|
}
|
|
2770
3224
|
}
|
|
2771
3225
|
catch (err) {
|
|
@@ -2847,6 +3301,47 @@ async function startProxy(config = {}) {
|
|
|
2847
3301
|
return;
|
|
2848
3302
|
}
|
|
2849
3303
|
const isStreaming = request.stream === true;
|
|
3304
|
+
// ── Response Cache: check for cached response (chat/completions) ──
|
|
3305
|
+
const chatCacheBypass = responseCache.shouldBypass(request);
|
|
3306
|
+
let chatCacheHash;
|
|
3307
|
+
if (!chatCacheBypass) {
|
|
3308
|
+
chatCacheHash = responseCache.computeKey(request);
|
|
3309
|
+
const chatCached = responseCache.get(chatCacheHash);
|
|
3310
|
+
if (chatCached) {
|
|
3311
|
+
try {
|
|
3312
|
+
const chatCachedData = JSON.parse(chatCached);
|
|
3313
|
+
const chatCacheUsage = chatCachedData?.usage;
|
|
3314
|
+
const chatCacheCost = (0, telemetry_js_1.estimateCost)(request.model ?? '', chatCacheUsage?.prompt_tokens ?? chatCacheUsage?.input_tokens ?? 0, chatCacheUsage?.completion_tokens ?? chatCacheUsage?.output_tokens ?? 0);
|
|
3315
|
+
responseCache.recordHit(chatCacheCost, 0);
|
|
3316
|
+
if (isStreaming && chatCachedData._relayplaneStreamCache) {
|
|
3317
|
+
res.writeHead(200, {
|
|
3318
|
+
'Content-Type': 'text/event-stream',
|
|
3319
|
+
'Cache-Control': 'no-cache',
|
|
3320
|
+
'Connection': 'keep-alive',
|
|
3321
|
+
'X-RelayPlane-Cache': 'HIT',
|
|
3322
|
+
});
|
|
3323
|
+
res.end(chatCachedData.ssePayload);
|
|
3324
|
+
}
|
|
3325
|
+
else {
|
|
3326
|
+
res.writeHead(200, {
|
|
3327
|
+
'Content-Type': 'application/json',
|
|
3328
|
+
'X-RelayPlane-Cache': 'HIT',
|
|
3329
|
+
});
|
|
3330
|
+
res.end(chatCached);
|
|
3331
|
+
}
|
|
3332
|
+
log(`Cache HIT for chat/completions ${request.model} (hash: ${chatCacheHash.slice(0, 8)})`);
|
|
3333
|
+
return;
|
|
3334
|
+
}
|
|
3335
|
+
catch {
|
|
3336
|
+
// Corrupt, continue
|
|
3337
|
+
}
|
|
3338
|
+
}
|
|
3339
|
+
responseCache.recordMiss();
|
|
3340
|
+
}
|
|
3341
|
+
else {
|
|
3342
|
+
responseCache.recordBypass();
|
|
3343
|
+
}
|
|
3344
|
+
// ── End cache check ──
|
|
2850
3345
|
const bypassRouting = !relayplaneEnabled || relayplaneBypass;
|
|
2851
3346
|
// Extract routing mode from model name
|
|
2852
3347
|
const originalRequestedModel = request.model;
|
|
@@ -3065,10 +3560,48 @@ async function startProxy(config = {}) {
|
|
|
3065
3560
|
}
|
|
3066
3561
|
apiKey = apiKeyResult.apiKey;
|
|
3067
3562
|
}
|
|
3563
|
+
// ── Budget check + auto-downgrade (chat/completions) ──
|
|
3564
|
+
{
|
|
3565
|
+
const chatBudgetCheck = preRequestBudgetCheck(targetModel);
|
|
3566
|
+
if (chatBudgetCheck.blocked) {
|
|
3567
|
+
res.writeHead(429, { 'Content-Type': 'application/json' });
|
|
3568
|
+
res.end(JSON.stringify({
|
|
3569
|
+
error: 'Budget limit exceeded. Request blocked.',
|
|
3570
|
+
type: 'budget_exceeded',
|
|
3571
|
+
}));
|
|
3572
|
+
return;
|
|
3573
|
+
}
|
|
3574
|
+
if (chatBudgetCheck.downgraded) {
|
|
3575
|
+
log(`Budget downgrade: ${targetModel} → ${chatBudgetCheck.model}`);
|
|
3576
|
+
targetModel = chatBudgetCheck.model;
|
|
3577
|
+
request.model = targetModel;
|
|
3578
|
+
}
|
|
3579
|
+
}
|
|
3580
|
+
// ── End budget check ──
|
|
3581
|
+
// ── Rate limit check ──
|
|
3582
|
+
const chatWorkspaceId = 'local'; // Local proxy uses single workspace
|
|
3583
|
+
const chatRateLimit = (0, rate_limiter_js_1.checkLimit)(chatWorkspaceId, targetModel);
|
|
3584
|
+
if (!chatRateLimit.allowed) {
|
|
3585
|
+
console.error(`[RATE LIMIT] ${targetModel} limit reached for workspace: ${chatWorkspaceId}`);
|
|
3586
|
+
res.writeHead(429, {
|
|
3587
|
+
'Content-Type': 'application/json',
|
|
3588
|
+
'Retry-After': String(chatRateLimit.retryAfter || 60),
|
|
3589
|
+
'X-RelayPlane-RateLimit-Limit': String(chatRateLimit.limit),
|
|
3590
|
+
'X-RelayPlane-RateLimit-Remaining': '0',
|
|
3591
|
+
'X-RelayPlane-RateLimit-Reset': String(Math.ceil(chatRateLimit.resetAt / 1000))
|
|
3592
|
+
});
|
|
3593
|
+
res.end(JSON.stringify({
|
|
3594
|
+
error: `Rate limit exceeded for ${targetModel}. Max ${chatRateLimit.limit} requests per minute.`,
|
|
3595
|
+
type: 'rate_limit_exceeded',
|
|
3596
|
+
retry_after: chatRateLimit.retryAfter || 60
|
|
3597
|
+
}));
|
|
3598
|
+
return;
|
|
3599
|
+
}
|
|
3600
|
+
// ── End rate limit check ──
|
|
3068
3601
|
const startTime = Date.now();
|
|
3069
3602
|
// Handle streaming vs non-streaming
|
|
3070
3603
|
if (isStreaming) {
|
|
3071
|
-
await handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, useCascade ? 'cascade' : routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity);
|
|
3604
|
+
await handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, useCascade ? 'cascade' : routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatCacheHash, chatCacheBypass);
|
|
3072
3605
|
}
|
|
3073
3606
|
else {
|
|
3074
3607
|
if (useCascade && cascadeConfig) {
|
|
@@ -3105,8 +3638,10 @@ async function startProxy(config = {}) {
|
|
|
3105
3638
|
const cascadeUsage = responseData?.usage;
|
|
3106
3639
|
const cascadeTokensIn = cascadeUsage?.input_tokens ?? cascadeUsage?.prompt_tokens ?? 0;
|
|
3107
3640
|
const cascadeTokensOut = cascadeUsage?.output_tokens ?? cascadeUsage?.completion_tokens ?? 0;
|
|
3108
|
-
const
|
|
3109
|
-
|
|
3641
|
+
const cascadeCacheCreation = cascadeUsage?.cache_creation_input_tokens || undefined;
|
|
3642
|
+
const cascadeCacheRead = cascadeUsage?.cache_read_input_tokens || undefined;
|
|
3643
|
+
const cascadeCost = (0, telemetry_js_1.estimateCost)(cascadeResult.model, cascadeTokensIn, cascadeTokensOut, cascadeCacheCreation, cascadeCacheRead);
|
|
3644
|
+
updateLastHistoryEntry(cascadeTokensIn, cascadeTokensOut, cascadeCost, chatCascadeRespModel, cascadeCacheCreation, cascadeCacheRead);
|
|
3110
3645
|
if (recordTelemetry) {
|
|
3111
3646
|
try {
|
|
3112
3647
|
const runResult = await relay.run({
|
|
@@ -3114,6 +3649,8 @@ async function startProxy(config = {}) {
|
|
|
3114
3649
|
taskType,
|
|
3115
3650
|
model: `${cascadeResult.provider}:${cascadeResult.model}`,
|
|
3116
3651
|
});
|
|
3652
|
+
// Backfill token/cost data — relay.run() has no adapters so records NULLs
|
|
3653
|
+
relay.patchRunTokens(runResult.runId, cascadeTokensIn, cascadeTokensOut, cascadeCost);
|
|
3117
3654
|
responseData['_relayplane'] = {
|
|
3118
3655
|
runId: runResult.runId,
|
|
3119
3656
|
routedTo: `${cascadeResult.provider}/${cascadeResult.model}`,
|
|
@@ -3128,7 +3665,8 @@ async function startProxy(config = {}) {
|
|
|
3128
3665
|
catch (err) {
|
|
3129
3666
|
log(`Failed to record run: ${err}`);
|
|
3130
3667
|
}
|
|
3131
|
-
sendCloudTelemetry(taskType, cascadeResult.model, cascadeTokensIn, cascadeTokensOut, durationMs, true, undefined, originalRequestedModel ?? undefined);
|
|
3668
|
+
sendCloudTelemetry(taskType, cascadeResult.model, cascadeTokensIn, cascadeTokensOut, durationMs, true, undefined, originalRequestedModel ?? undefined, cascadeCacheCreation, cascadeCacheRead);
|
|
3669
|
+
meshCapture(cascadeResult.model, cascadeResult.provider, taskType, cascadeTokensIn, cascadeTokensOut, cascadeCost, durationMs, true);
|
|
3132
3670
|
}
|
|
3133
3671
|
const chatCascadeRpHeaders = buildRelayPlaneResponseHeaders(cascadeResult.model, originalRequestedModel ?? 'unknown', complexity, cascadeResult.provider, 'cascade');
|
|
3134
3672
|
res.writeHead(200, { 'Content-Type': 'application/json', ...chatCascadeRpHeaders });
|
|
@@ -3152,6 +3690,74 @@ async function startProxy(config = {}) {
|
|
|
3152
3690
|
}
|
|
3153
3691
|
}
|
|
3154
3692
|
});
|
|
3693
|
+
// ── Health Watchdog ──
|
|
3694
|
+
let watchdogFailures = 0;
|
|
3695
|
+
const WATCHDOG_MAX_FAILURES = 3;
|
|
3696
|
+
const WATCHDOG_INTERVAL_MS = 15_000; // Must be < WatchdogSec (30s) to avoid false kills
|
|
3697
|
+
let watchdogTimer = null;
|
|
3698
|
+
/**
 * sd_notify: deliver a state string (e.g. 'READY=1', 'WATCHDOG=1',
 * 'STOPPING=1') to systemd when running under a unit that sets $NOTIFY_SOCKET.
 *
 * Node's dgram module only supports 'udp4' and 'udp6' sockets, so the
 * notification socket (an AF_UNIX datagram socket) cannot be written to with
 * dgram.createSocket(); attempting to do so throws on every call. The message
 * is therefore handed to the `systemd-notify` helper, which reads
 * $NOTIFY_SOCKET from the inherited environment.
 *
 * NOTE(review): because the message is sent by a child process, the unit
 * presumably needs `NotifyAccess=all` for systemd to accept it; confirm
 * against the shipped service file.
 *
 * Best-effort: failures are logged and never thrown to the caller.
 *
 * @param {string} state - sd_notify assignment such as 'READY=1'.
 * @returns {void}
 */
function sdNotify(state) {
    const notifySocket = process.env['NOTIFY_SOCKET'];
    // Not running under systemd notification supervision: nothing to do.
    if (!notifySocket)
        return;
    try {
        const { execFile } = require('node:child_process');
        execFile('systemd-notify', [state], (err) => {
            // Surface delivery failures instead of silently dropping them.
            if (err)
                log(`sd_notify error: ${err}`);
        });
    }
    catch (err) {
        log(`sd_notify error: ${err}`);
    }
}
|
|
3717
|
+
/**
 * Start the in-process health watchdog.
 *
 * Sends READY=1 through sdNotify, then every WATCHDOG_INTERVAL_MS requests this
 * proxy's own /health endpoint with a 5 second abort timeout. An OK response
 * resets the failure counter and sends WATCHDOG=1; a non-OK status or a thrown
 * fetch increments it. Once WATCHDOG_MAX_FAILURES consecutive failures are
 * reached the server is closed and the process exits with code 1.
 */
function startWatchdog() {
    // Notify systemd we're ready
    sdNotify('READY=1');
    watchdogTimer = setInterval(async () => {
        const controller = new AbortController();
        const timeout = setTimeout(() => controller.abort(), 5000);
        try {
            const res = await fetch(`http://${host}:${port}/health`, { signal: controller.signal });
            if (res.ok) {
                watchdogFailures = 0;
                // Notify systemd watchdog we're alive
                sdNotify('WATCHDOG=1');
            }
            else {
                watchdogFailures++;
                console.error(`[RelayPlane] Watchdog: health check returned ${res.status} (failure ${watchdogFailures}/${WATCHDOG_MAX_FAILURES})`);
            }
        }
        catch (err) {
            watchdogFailures++;
            console.error(`[RelayPlane] Watchdog: health check failed (failure ${watchdogFailures}/${WATCHDOG_MAX_FAILURES}): ${err}`);
        }
        finally {
            // Always release the abort timer, including when fetch itself throws.
            clearTimeout(timeout);
        }
        if (watchdogFailures >= WATCHDOG_MAX_FAILURES) {
            // Stop probing first so later ticks cannot re-enter this shutdown path
            // (repeated server.close() calls and stacked exit timers).
            clearInterval(watchdogTimer);
            watchdogTimer = null;
            console.error(`[RelayPlane] CRITICAL: ${WATCHDOG_MAX_FAILURES} consecutive watchdog failures. Attempting graceful restart...`);
            sdNotify('STOPPING=1');
            // Close server and exit — systemd Restart=always will restart us
            server.close(() => {
                process.exit(1);
            });
            // Force exit after 10s if graceful close hangs
            setTimeout(() => process.exit(1), 10_000).unref();
        }
    }, WATCHDOG_INTERVAL_MS);
    // Do not let the watchdog alone keep the event loop alive.
    watchdogTimer.unref();
}
|
|
3753
|
+
// Clean up watchdog on shutdown
|
|
3754
|
+
const origHandleShutdown = () => {
|
|
3755
|
+
if (watchdogTimer)
|
|
3756
|
+
clearInterval(watchdogTimer);
|
|
3757
|
+
sdNotify('STOPPING=1');
|
|
3758
|
+
};
|
|
3759
|
+
process.on('SIGINT', origHandleShutdown);
|
|
3760
|
+
process.on('SIGTERM', origHandleShutdown);
|
|
3155
3761
|
return new Promise((resolve, reject) => {
|
|
3156
3762
|
server.on('error', reject);
|
|
3157
3763
|
server.listen(port, host, () => {
|
|
@@ -3164,6 +3770,8 @@ async function startProxy(config = {}) {
|
|
|
3164
3770
|
console.log(` Models: relayplane:auto, relayplane:cost, relayplane:fast, relayplane:quality`);
|
|
3165
3771
|
console.log(` Auth: Passthrough for Anthropic, env vars for other providers`);
|
|
3166
3772
|
console.log(` Streaming: ✅ Enabled`);
|
|
3773
|
+
startWatchdog();
|
|
3774
|
+
// Report the configured probe interval rather than a hard-coded figure (the timer runs every WATCHDOG_INTERVAL_MS, i.e. 15s, not 30s).
log(`Health watchdog started (${WATCHDOG_INTERVAL_MS / 1000}s interval, sd_notify enabled)`);
|
|
3167
3775
|
resolve(server);
|
|
3168
3776
|
});
|
|
3169
3777
|
});
|
|
@@ -3221,7 +3829,7 @@ async function executeNonStreamingProviderRequest(request, targetProvider, targe
|
|
|
3221
3829
|
}
|
|
3222
3830
|
return { responseData, ok: true, status: 200 };
|
|
3223
3831
|
}
|
|
3224
|
-
async function handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple') {
|
|
3832
|
+
async function handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', cacheHash, cacheBypass) {
|
|
3225
3833
|
let providerResponse;
|
|
3226
3834
|
try {
|
|
3227
3835
|
switch (targetProvider) {
|
|
@@ -3274,9 +3882,13 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
|
|
|
3274
3882
|
'Connection': 'keep-alive',
|
|
3275
3883
|
...streamRpHeaders,
|
|
3276
3884
|
});
|
|
3277
|
-
// Track token usage from streaming events
|
|
3885
|
+
// Track token usage from streaming events (including Anthropic prompt cache tokens)
|
|
3278
3886
|
let streamTokensIn = 0;
|
|
3279
3887
|
let streamTokensOut = 0;
|
|
3888
|
+
let streamCacheCreation = 0;
|
|
3889
|
+
let streamCacheRead = 0;
|
|
3890
|
+
const shouldCacheStream = !!(cacheHash && !cacheBypass);
|
|
3891
|
+
const rawChunks = [];
|
|
3280
3892
|
try {
|
|
3281
3893
|
// Stream the response based on provider format
|
|
3282
3894
|
switch (targetProvider) {
|
|
@@ -3284,7 +3896,10 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
|
|
|
3284
3896
|
// Convert Anthropic stream to OpenAI format
|
|
3285
3897
|
for await (const chunk of convertAnthropicStream(providerResponse, targetModel)) {
|
|
3286
3898
|
res.write(chunk);
|
|
3287
|
-
|
|
3899
|
+
if (shouldCacheStream)
|
|
3900
|
+
rawChunks.push(chunk);
|
|
3901
|
+
// Parse OpenAI-format chunks for usage — the converter embeds
|
|
3902
|
+
// cache_creation_tokens and cache_read_tokens from message_start.
|
|
3288
3903
|
try {
|
|
3289
3904
|
const lines = chunk.split('\n');
|
|
3290
3905
|
for (const line of lines) {
|
|
@@ -3293,6 +3908,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
|
|
|
3293
3908
|
if (evt.usage) {
|
|
3294
3909
|
streamTokensIn = evt.usage.prompt_tokens ?? streamTokensIn;
|
|
3295
3910
|
streamTokensOut = evt.usage.completion_tokens ?? streamTokensOut;
|
|
3911
|
+
streamCacheCreation = evt.usage.cache_creation_tokens ?? streamCacheCreation;
|
|
3912
|
+
streamCacheRead = evt.usage.cache_read_tokens ?? streamCacheRead;
|
|
3296
3913
|
}
|
|
3297
3914
|
}
|
|
3298
3915
|
}
|
|
@@ -3304,6 +3921,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
|
|
|
3304
3921
|
// Convert Gemini stream to OpenAI format
|
|
3305
3922
|
for await (const chunk of convertGeminiStream(providerResponse, targetModel)) {
|
|
3306
3923
|
res.write(chunk);
|
|
3924
|
+
if (shouldCacheStream)
|
|
3925
|
+
rawChunks.push(chunk);
|
|
3307
3926
|
try {
|
|
3308
3927
|
const lines = chunk.split('\n');
|
|
3309
3928
|
for (const line of lines) {
|
|
@@ -3323,6 +3942,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
|
|
|
3323
3942
|
// xAI, OpenRouter, DeepSeek, Groq, OpenAI all use OpenAI-compatible streaming format
|
|
3324
3943
|
for await (const chunk of pipeOpenAIStream(providerResponse)) {
|
|
3325
3944
|
res.write(chunk);
|
|
3945
|
+
if (shouldCacheStream)
|
|
3946
|
+
rawChunks.push(chunk);
|
|
3326
3947
|
try {
|
|
3327
3948
|
const lines = chunk.split('\n');
|
|
3328
3949
|
for (const line of lines) {
|
|
@@ -3342,15 +3963,43 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
|
|
|
3342
3963
|
catch (err) {
|
|
3343
3964
|
log(`Streaming error: ${err}`);
|
|
3344
3965
|
}
|
|
3966
|
+
// ── Cache: store streaming response ──
|
|
3967
|
+
if (shouldCacheStream && cacheHash && rawChunks.length > 0) {
|
|
3968
|
+
const responseCache = (0, response_cache_js_1.getResponseCache)();
|
|
3969
|
+
const streamPayload = JSON.stringify({
|
|
3970
|
+
_relayplaneStreamCache: true,
|
|
3971
|
+
ssePayload: rawChunks.join(''),
|
|
3972
|
+
usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, prompt_tokens: streamTokensIn, completion_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead },
|
|
3973
|
+
});
|
|
3974
|
+
responseCache.set(cacheHash, streamPayload, {
|
|
3975
|
+
model: targetModel,
|
|
3976
|
+
tokensIn: streamTokensIn,
|
|
3977
|
+
tokensOut: streamTokensOut,
|
|
3978
|
+
costUsd: (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined),
|
|
3979
|
+
taskType,
|
|
3980
|
+
});
|
|
3981
|
+
log(`Cache STORE (stream) for chat/completions ${targetModel} (hash: ${cacheHash.slice(0, 8)})`);
|
|
3982
|
+
}
|
|
3345
3983
|
if (cooldownsEnabled) {
|
|
3346
3984
|
cooldownManager.recordSuccess(targetProvider);
|
|
3347
3985
|
}
|
|
3348
3986
|
const durationMs = Date.now() - startTime;
|
|
3349
3987
|
// Always log the request for stats/telemetry tracking
|
|
3350
3988
|
logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, true, routingMode, undefined, taskType, complexity);
|
|
3351
|
-
// Update token/cost info on the history entry
|
|
3352
|
-
const streamCost = (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut);
|
|
3353
|
-
updateLastHistoryEntry(streamTokensIn, streamTokensOut, streamCost);
|
|
3989
|
+
// Update token/cost info on the history entry (with cache token discount)
|
|
3990
|
+
const streamCost = (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined);
|
|
3991
|
+
updateLastHistoryEntry(streamTokensIn, streamTokensOut, streamCost, undefined, streamCacheCreation || undefined, streamCacheRead || undefined);
|
|
3992
|
+
// ── Post-request: budget spend + anomaly detection ──
|
|
3993
|
+
try {
|
|
3994
|
+
(0, budget_js_1.getBudgetManager)().recordSpend(streamCost, targetModel);
|
|
3995
|
+
const anomalyResult = (0, anomaly_js_1.getAnomalyDetector)().recordAndAnalyze({ model: targetModel, tokensIn: streamTokensIn, tokensOut: streamTokensOut, costUsd: streamCost });
|
|
3996
|
+
if (anomalyResult.detected) {
|
|
3997
|
+
for (const anomaly of anomalyResult.anomalies) {
|
|
3998
|
+
(0, alerts_js_1.getAlertManager)().fireAnomaly(anomaly);
|
|
3999
|
+
}
|
|
4000
|
+
}
|
|
4001
|
+
}
|
|
4002
|
+
catch { /* budget/anomaly should never block */ }
|
|
3354
4003
|
if (recordTelemetry) {
|
|
3355
4004
|
// Record the run (non-blocking)
|
|
3356
4005
|
relay
|
|
@@ -3360,12 +4009,15 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
|
|
|
3360
4009
|
model: `${targetProvider}:${targetModel}`,
|
|
3361
4010
|
})
|
|
3362
4011
|
.then((runResult) => {
|
|
4012
|
+
// Backfill token/cost data — relay.run() has no adapters so records NULLs
|
|
4013
|
+
relay.patchRunTokens(runResult.runId, streamTokensIn, streamTokensOut, streamCost);
|
|
3363
4014
|
log(`Completed streaming in ${durationMs}ms, runId: ${runResult.runId}`);
|
|
3364
4015
|
})
|
|
3365
4016
|
.catch((err) => {
|
|
3366
4017
|
log(`Failed to record run: ${err}`);
|
|
3367
4018
|
});
|
|
3368
|
-
sendCloudTelemetry(taskType, targetModel, streamTokensIn, streamTokensOut, durationMs, true, undefined, request.model ?? undefined);
|
|
4019
|
+
sendCloudTelemetry(taskType, targetModel, streamTokensIn, streamTokensOut, durationMs, true, undefined, request.model ?? undefined, streamCacheCreation || undefined, streamCacheRead || undefined);
|
|
4020
|
+
meshCapture(targetModel, targetProvider, taskType, streamTokensIn, streamTokensOut, streamCost, durationMs, true);
|
|
3369
4021
|
}
|
|
3370
4022
|
res.end();
|
|
3371
4023
|
}
|
|
@@ -3407,12 +4059,25 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
|
|
|
3407
4059
|
const nonStreamRespModel = checkResponseModelMismatch(responseData, targetModel, targetProvider, log);
|
|
3408
4060
|
// Log the successful request
|
|
3409
4061
|
logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, true, routingMode, undefined, taskType, complexity);
|
|
3410
|
-
// Update token/cost info
|
|
4062
|
+
// Update token/cost info (including Anthropic prompt cache tokens)
|
|
3411
4063
|
const usage = responseData?.usage;
|
|
3412
4064
|
const tokensIn = usage?.input_tokens ?? usage?.prompt_tokens ?? 0;
|
|
3413
4065
|
const tokensOut = usage?.output_tokens ?? usage?.completion_tokens ?? 0;
|
|
3414
|
-
const
|
|
3415
|
-
|
|
4066
|
+
const cacheCreationTokens = usage?.cache_creation_input_tokens ?? 0;
|
|
4067
|
+
const cacheReadTokens = usage?.cache_read_input_tokens ?? 0;
|
|
4068
|
+
const cost = (0, telemetry_js_1.estimateCost)(targetModel, tokensIn, tokensOut, cacheCreationTokens || undefined, cacheReadTokens || undefined);
|
|
4069
|
+
updateLastHistoryEntry(tokensIn, tokensOut, cost, nonStreamRespModel, cacheCreationTokens || undefined, cacheReadTokens || undefined);
|
|
4070
|
+
// ── Post-request: budget spend + anomaly detection ──
|
|
4071
|
+
try {
|
|
4072
|
+
(0, budget_js_1.getBudgetManager)().recordSpend(cost, targetModel);
|
|
4073
|
+
const anomalyResult = (0, anomaly_js_1.getAnomalyDetector)().recordAndAnalyze({ model: targetModel, tokensIn, tokensOut, costUsd: cost });
|
|
4074
|
+
if (anomalyResult.detected) {
|
|
4075
|
+
for (const anomaly of anomalyResult.anomalies) {
|
|
4076
|
+
(0, alerts_js_1.getAlertManager)().fireAnomaly(anomaly);
|
|
4077
|
+
}
|
|
4078
|
+
}
|
|
4079
|
+
}
|
|
4080
|
+
catch { /* budget/anomaly should never block */ }
|
|
3416
4081
|
if (recordTelemetry) {
|
|
3417
4082
|
// Record the run in RelayPlane
|
|
3418
4083
|
try {
|
|
@@ -3421,6 +4086,8 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
|
|
|
3421
4086
|
taskType,
|
|
3422
4087
|
model: `${targetProvider}:${targetModel}`,
|
|
3423
4088
|
});
|
|
4089
|
+
// Backfill token/cost data — relay.run() has no adapters so records NULLs
|
|
4090
|
+
relay.patchRunTokens(runResult.runId, tokensIn, tokensOut, cost);
|
|
3424
4091
|
// Add routing metadata to response
|
|
3425
4092
|
responseData['_relayplane'] = {
|
|
3426
4093
|
runId: runResult.runId,
|
|
@@ -3435,15 +4102,34 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
|
|
|
3435
4102
|
catch (err) {
|
|
3436
4103
|
log(`Failed to record run: ${err}`);
|
|
3437
4104
|
}
|
|
3438
|
-
// Extract token counts from response if available (Anthropic/OpenAI format)
|
|
3439
|
-
const
|
|
3440
|
-
const
|
|
3441
|
-
const
|
|
3442
|
-
|
|
4105
|
+
// Extract token counts from response if available (Anthropic/OpenAI format, including cache)
|
|
4106
|
+
const innerUsage = responseData?.usage;
|
|
4107
|
+
const innerTokIn = innerUsage?.input_tokens ?? innerUsage?.prompt_tokens ?? 0;
|
|
4108
|
+
const innerTokOut = innerUsage?.output_tokens ?? innerUsage?.completion_tokens ?? 0;
|
|
4109
|
+
const innerCacheCreation = innerUsage?.cache_creation_input_tokens ?? 0;
|
|
4110
|
+
const innerCacheRead = innerUsage?.cache_read_input_tokens ?? 0;
|
|
4111
|
+
sendCloudTelemetry(taskType, targetModel, innerTokIn, innerTokOut, durationMs, true, undefined, undefined, innerCacheCreation || undefined, innerCacheRead || undefined);
|
|
4112
|
+
meshCapture(targetModel, targetProvider, taskType, innerTokIn, innerTokOut, cost, durationMs, true);
|
|
4113
|
+
}
|
|
4114
|
+
// ── Cache: store non-streaming chat/completions response ──
|
|
4115
|
+
const chatRespCache = (0, response_cache_js_1.getResponseCache)();
|
|
4116
|
+
const chatReqAsRecord = request;
|
|
4117
|
+
const chatCacheBypassLocal = chatRespCache.shouldBypass(chatReqAsRecord);
|
|
4118
|
+
let chatCacheHeaderVal = chatCacheBypassLocal ? 'BYPASS' : 'MISS';
|
|
4119
|
+
if (!chatCacheBypassLocal) {
|
|
4120
|
+
const chatHashLocal = chatRespCache.computeKey(chatReqAsRecord);
|
|
4121
|
+
chatRespCache.set(chatHashLocal, JSON.stringify(responseData), {
|
|
4122
|
+
model: targetModel,
|
|
4123
|
+
tokensIn: tokensIn,
|
|
4124
|
+
tokensOut: tokensOut,
|
|
4125
|
+
costUsd: cost,
|
|
4126
|
+
taskType,
|
|
4127
|
+
});
|
|
4128
|
+
log(`Cache STORE for chat/completions ${targetModel} (hash: ${chatHashLocal.slice(0, 8)})`);
|
|
3443
4129
|
}
|
|
3444
4130
|
// Send response with RelayPlane routing headers
|
|
3445
4131
|
const nonStreamRpHeaders = buildRelayPlaneResponseHeaders(targetModel, request.model ?? 'unknown', complexity, targetProvider, routingMode);
|
|
3446
|
-
res.writeHead(200, { 'Content-Type': 'application/json', ...nonStreamRpHeaders });
|
|
4132
|
+
res.writeHead(200, { 'Content-Type': 'application/json', 'X-RelayPlane-Cache': chatCacheHeaderVal, ...nonStreamRpHeaders });
|
|
3447
4133
|
res.end(JSON.stringify(responseData));
|
|
3448
4134
|
}
|
|
3449
4135
|
// Note: CLI entry point is in cli.ts
|