@relayplane/proxy 1.5.46 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. package/README.md +297 -20
  2. package/assets/relayplane-proxy.service +20 -0
  3. package/dist/alerts.d.ts +72 -0
  4. package/dist/alerts.d.ts.map +1 -0
  5. package/dist/alerts.js +290 -0
  6. package/dist/alerts.js.map +1 -0
  7. package/dist/anomaly.d.ts +65 -0
  8. package/dist/anomaly.d.ts.map +1 -0
  9. package/dist/anomaly.js +193 -0
  10. package/dist/anomaly.js.map +1 -0
  11. package/dist/budget.d.ts +98 -0
  12. package/dist/budget.d.ts.map +1 -0
  13. package/dist/budget.js +356 -0
  14. package/dist/budget.js.map +1 -0
  15. package/dist/cli.js +512 -93
  16. package/dist/cli.js.map +1 -1
  17. package/dist/config.d.ts +28 -2
  18. package/dist/config.d.ts.map +1 -1
  19. package/dist/config.js +122 -24
  20. package/dist/config.js.map +1 -1
  21. package/dist/downgrade.d.ts +37 -0
  22. package/dist/downgrade.d.ts.map +1 -0
  23. package/dist/downgrade.js +79 -0
  24. package/dist/downgrade.js.map +1 -0
  25. package/dist/mesh/capture.d.ts +11 -0
  26. package/dist/mesh/capture.d.ts.map +1 -0
  27. package/dist/mesh/capture.js +43 -0
  28. package/dist/mesh/capture.js.map +1 -0
  29. package/dist/mesh/fitness.d.ts +14 -0
  30. package/dist/mesh/fitness.d.ts.map +1 -0
  31. package/dist/mesh/fitness.js +40 -0
  32. package/dist/mesh/fitness.js.map +1 -0
  33. package/dist/mesh/index.d.ts +39 -0
  34. package/dist/mesh/index.d.ts.map +1 -0
  35. package/dist/mesh/index.js +118 -0
  36. package/dist/mesh/index.js.map +1 -0
  37. package/dist/mesh/store.d.ts +30 -0
  38. package/dist/mesh/store.d.ts.map +1 -0
  39. package/dist/mesh/store.js +174 -0
  40. package/dist/mesh/store.js.map +1 -0
  41. package/dist/mesh/sync.d.ts +37 -0
  42. package/dist/mesh/sync.d.ts.map +1 -0
  43. package/dist/mesh/sync.js +154 -0
  44. package/dist/mesh/sync.js.map +1 -0
  45. package/dist/mesh/types.d.ts +57 -0
  46. package/dist/mesh/types.d.ts.map +1 -0
  47. package/dist/mesh/types.js +7 -0
  48. package/dist/mesh/types.js.map +1 -0
  49. package/dist/rate-limiter.d.ts +64 -0
  50. package/dist/rate-limiter.d.ts.map +1 -0
  51. package/dist/rate-limiter.js +159 -0
  52. package/dist/rate-limiter.js.map +1 -0
  53. package/dist/relay-config.d.ts +9 -0
  54. package/dist/relay-config.d.ts.map +1 -1
  55. package/dist/relay-config.js +2 -0
  56. package/dist/relay-config.js.map +1 -1
  57. package/dist/response-cache.d.ts +139 -0
  58. package/dist/response-cache.d.ts.map +1 -0
  59. package/dist/response-cache.js +515 -0
  60. package/dist/response-cache.js.map +1 -0
  61. package/dist/server.d.ts.map +1 -1
  62. package/dist/server.js +5 -1
  63. package/dist/server.js.map +1 -1
  64. package/dist/standalone-proxy.d.ts +2 -1
  65. package/dist/standalone-proxy.d.ts.map +1 -1
  66. package/dist/standalone-proxy.js +736 -50
  67. package/dist/standalone-proxy.js.map +1 -1
  68. package/dist/telemetry.d.ts.map +1 -1
  69. package/dist/telemetry.js +21 -5
  70. package/dist/telemetry.js.map +1 -1
  71. package/dist/utils/model-suggestions.d.ts.map +1 -1
  72. package/dist/utils/model-suggestions.js +19 -2
  73. package/dist/utils/model-suggestions.js.map +1 -1
  74. package/dist/utils/version-status.d.ts +9 -0
  75. package/dist/utils/version-status.d.ts.map +1 -0
  76. package/dist/utils/version-status.js +28 -0
  77. package/dist/utils/version-status.js.map +1 -0
  78. package/package.json +7 -3
@@ -67,7 +67,16 @@ const path = __importStar(require("node:path"));
67
67
  const core_1 = require("@relayplane/core");
68
68
  const model_suggestions_js_1 = require("./utils/model-suggestions.js");
69
69
  const telemetry_js_1 = require("./telemetry.js");
70
+ const config_js_1 = require("./config.js");
71
+ const index_js_1 = require("./mesh/index.js");
72
+ const response_cache_js_1 = require("./response-cache.js");
70
73
  const stats_js_1 = require("./stats.js");
74
+ const rate_limiter_js_1 = require("./rate-limiter.js");
75
+ const budget_js_1 = require("./budget.js");
76
+ const anomaly_js_1 = require("./anomaly.js");
77
+ const alerts_js_1 = require("./alerts.js");
78
+ const downgrade_js_1 = require("./downgrade.js");
79
+ const version_status_js_1 = require("./utils/version-status.js");
71
80
  const PROXY_VERSION = (() => {
72
81
  try {
73
82
  const pkgPath = path.join(__dirname, '..', 'package.json');
@@ -77,8 +86,54 @@ const PROXY_VERSION = (() => {
77
86
  return '0.0.0';
78
87
  }
79
88
  })();
89
+ let latestProxyVersionCache = { value: null, checkedAt: 0 };
90
+ const LATEST_PROXY_VERSION_TTL_MS = 30 * 60 * 1000;
91
+ async function getLatestProxyVersion() {
92
+ const now = Date.now();
93
+ if (now - latestProxyVersionCache.checkedAt < LATEST_PROXY_VERSION_TTL_MS) {
94
+ return latestProxyVersionCache.value;
95
+ }
96
+ try {
97
+ const controller = new AbortController();
98
+ const timeout = setTimeout(() => controller.abort(), 2500);
99
+ const res = await fetch('https://registry.npmjs.org/@relayplane/proxy/latest', {
100
+ signal: controller.signal,
101
+ headers: { Accept: 'application/json' },
102
+ });
103
+ clearTimeout(timeout);
104
+ if (!res.ok) {
105
+ latestProxyVersionCache = { value: null, checkedAt: now };
106
+ return null;
107
+ }
108
+ const data = await res.json();
109
+ const latest = data.version ?? null;
110
+ latestProxyVersionCache = { value: latest, checkedAt: now };
111
+ return latest;
112
+ }
113
+ catch {
114
+ latestProxyVersionCache = { value: null, checkedAt: now };
115
+ return null;
116
+ }
117
+ }
80
118
  /** Shared stats collector instance for the proxy server */
81
119
  exports.proxyStatsCollector = new stats_js_1.StatsCollector();
120
+ /** Shared mesh handle — set during startProxy() */
121
+ let _meshHandle = null;
122
+ /** Capture a request into the mesh (fire-and-forget, never blocks) */
123
+ function meshCapture(model, provider, taskType, tokensIn, tokensOut, costUsd, latencyMs, success, errorType) {
124
+ if (!_meshHandle)
125
+ return;
126
+ try {
127
+ _meshHandle.captureRequest({
128
+ model, provider, task_type: taskType,
129
+ input_tokens: tokensIn, output_tokens: tokensOut,
130
+ cost_usd: costUsd, latency_ms: latencyMs,
131
+ success, error_type: errorType,
132
+ timestamp: new Date().toISOString(),
133
+ });
134
+ }
135
+ catch { }
136
+ }
82
137
  /**
83
138
  * Default provider endpoints
84
139
  */
@@ -171,10 +226,10 @@ exports.SMART_ALIASES = {
171
226
  * Send a telemetry event to the cloud (anonymous or authenticated).
172
227
  * Non-blocking — errors are silently swallowed.
173
228
  */
174
- function sendCloudTelemetry(taskType, model, tokensIn, tokensOut, latencyMs, success, costUsd, requestedModel) {
229
+ function sendCloudTelemetry(taskType, model, tokensIn, tokensOut, latencyMs, success, costUsd, requestedModel, cacheCreationTokens, cacheReadTokens) {
175
230
  try {
176
- const cost = costUsd ?? (0, telemetry_js_1.estimateCost)(model, tokensIn, tokensOut);
177
- (0, telemetry_js_1.recordTelemetry)({
231
+ const cost = costUsd ?? (0, telemetry_js_1.estimateCost)(model, tokensIn, tokensOut, cacheCreationTokens, cacheReadTokens);
232
+ const event = {
178
233
  task_type: taskType,
179
234
  model,
180
235
  tokens_in: tokensIn,
@@ -183,7 +238,21 @@ function sendCloudTelemetry(taskType, model, tokensIn, tokensOut, latencyMs, suc
183
238
  success,
184
239
  cost_usd: cost,
185
240
  requested_model: requestedModel,
186
- });
241
+ cache_creation_tokens: cacheCreationTokens,
242
+ cache_read_tokens: cacheReadTokens,
243
+ };
244
+ // Record locally (writes to telemetry.jsonl + queues upload if telemetry_enabled)
245
+ (0, telemetry_js_1.recordTelemetry)(event);
246
+ // Ensure cloud upload even if local telemetry_enabled is false
247
+ // recordCloudTelemetry skips queueForUpload when telemetry is disabled,
248
+ // but cloud dashboard needs these events regardless of local config
249
+ if (!(0, config_js_1.isTelemetryEnabled)()) {
250
+ (0, telemetry_js_1.queueForUpload)({
251
+ ...event,
252
+ device_id: (0, config_js_1.getDeviceId)(),
253
+ timestamp: new Date().toISOString(),
254
+ });
255
+ }
187
256
  }
188
257
  catch {
189
258
  // Telemetry should never break the proxy
@@ -220,15 +289,15 @@ function resolveModelAlias(model) {
220
289
  * Uses Haiku 3.5 for cost optimization, upgrades based on learned rules
221
290
  */
222
291
  const DEFAULT_ROUTING = {
223
- code_generation: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
224
- code_review: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
225
- summarization: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
226
- analysis: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
227
- creative_writing: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
228
- data_extraction: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
229
- translation: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
230
- question_answering: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
231
- general: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
292
+ code_generation: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
293
+ code_review: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
294
+ summarization: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
295
+ analysis: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
296
+ creative_writing: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
297
+ data_extraction: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
298
+ translation: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
299
+ question_answering: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
300
+ general: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
232
301
  };
233
302
  const UNCERTAINTY_PATTERNS = [
234
303
  /i'?m not (entirely |completely |really )?sure/i,
@@ -470,7 +539,7 @@ function logRequest(originalModel, targetModel, provider, latencyMs, success, mo
470
539
  bufferHistoryEntry(entry);
471
540
  }
472
541
  /** Update the most recent history entry with token/cost info */
473
- function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel) {
542
+ function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel, cacheCreationTokens, cacheReadTokens) {
474
543
  if (requestHistory.length > 0) {
475
544
  const last = requestHistory[requestHistory.length - 1];
476
545
  last.tokensIn = tokensIn;
@@ -479,6 +548,10 @@ function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel) {
479
548
  if (responseModel) {
480
549
  last.responseModel = responseModel;
481
550
  }
551
+ if (cacheCreationTokens !== undefined)
552
+ last.cacheCreationTokens = cacheCreationTokens;
553
+ if (cacheReadTokens !== undefined)
554
+ last.cacheReadTokens = cacheReadTokens;
482
555
  }
483
556
  }
484
557
  const DEFAULT_PROXY_CONFIG = {
@@ -489,7 +562,6 @@ const DEFAULT_PROXY_CONFIG = {
489
562
  cascade: {
490
563
  enabled: true,
491
564
  models: [
492
- 'claude-haiku-4-5',
493
565
  'claude-sonnet-4-6',
494
566
  'claude-opus-4-6',
495
567
  ],
@@ -498,7 +570,7 @@ const DEFAULT_PROXY_CONFIG = {
498
570
  },
499
571
  complexity: {
500
572
  enabled: true,
501
- simple: 'claude-haiku-4-5',
573
+ simple: 'claude-sonnet-4-6',
502
574
  moderate: 'claude-sonnet-4-6',
503
575
  complex: 'claude-opus-4-6',
504
576
  },
@@ -705,6 +777,23 @@ function classifyComplexity(messages) {
705
777
  score += 1;
706
778
  if (andCount >= 5)
707
779
  score += 1;
780
+ // Calculate total tokens across ALL messages, not just last user message.
781
+ // For agent workloads (OpenClaw, aider, Claude Code) the last user message is
782
+ // often tiny while the real complexity lives in the 100K+ token context.
783
+ const allText = extractMessageText(messages);
784
+ const totalTokens = Math.ceil(allText.length / 4);
785
+ // Context size floor — use as a hard signal regardless of last-message score
786
+ if (totalTokens > 100000)
787
+ score += 5; // definitely complex
788
+ else if (totalTokens > 50000)
789
+ score += 3; // likely moderate+
790
+ else if (totalTokens > 20000)
791
+ score += 2;
792
+ // Message count signal — long conversations imply multi-step reasoning
793
+ if (messages.length > 50)
794
+ score += 2;
795
+ else if (messages.length > 20)
796
+ score += 1;
708
797
  if (score >= 4)
709
798
  return 'complex';
710
799
  if (score >= 2)
@@ -1437,11 +1526,13 @@ function convertAnthropicStreamEvent(eventType, eventData, messageId, model, too
1437
1526
  const msg = eventData['message'];
1438
1527
  baseChunk.id = msg?.['id'] || messageId;
1439
1528
  choice.delta = { role: 'assistant', content: '' };
1440
- // Pass through input token count from message_start
1529
+ // Pass through input token count from message_start (including cache tokens)
1441
1530
  const msgUsage = msg?.['usage'];
1442
1531
  if (msgUsage) {
1443
1532
  baseChunk['usage'] = {
1444
1533
  prompt_tokens: msgUsage['input_tokens'] ?? 0,
1534
+ cache_creation_tokens: msgUsage['cache_creation_input_tokens'] ?? 0,
1535
+ cache_read_tokens: msgUsage['cache_read_input_tokens'] ?? 0,
1445
1536
  };
1446
1537
  }
1447
1538
  return `data: ${JSON.stringify(baseChunk)}\n\n`;
@@ -1895,10 +1986,14 @@ td{padding:8px 12px;border-bottom:1px solid #111318}
1895
1986
  .badge.ok{background:#052e1633;color:#34d399}.badge.err{background:#2d0a0a;color:#ef4444}
1896
1987
  .badge.tt-code{background:#1e3a5f;color:#60a5fa}.badge.tt-analysis{background:#3b1f6e;color:#a78bfa}.badge.tt-summarization{background:#1a3a2a;color:#6ee7b7}.badge.tt-qa{background:#3a2f1e;color:#fbbf24}.badge.tt-general{background:#1e293b;color:#94a3b8}
1897
1988
  .badge.cx-simple{background:#052e1633;color:#34d399}.badge.cx-moderate{background:#2d2a0a;color:#fbbf24}.badge.cx-complex{background:#2d0a0a;color:#ef4444}
1989
+ .vstat{display:inline-flex;align-items:center;gap:6px;margin-left:8px;padding:1px 8px;border-radius:999px;border:1px solid #334155;font-size:.72rem}
1990
+ .vstat.current{color:#94a3b8;border-color:#334155;background:#0f172a66}
1991
+ .vstat.outdated{color:#fbbf24;border-color:#f59e0b55;background:#3a2f1e66}
1992
+ .vstat.unavailable{color:#a3a3a3;border-color:#52525b66;background:#18181b66}
1898
1993
  @media(max-width:768px){.col-tt,.col-cx{display:none}}
1899
1994
  .prov{display:flex;gap:16px;flex-wrap:wrap}.prov-item{display:flex;align-items:center;font-size:.85rem;background:#111318;padding:8px 14px;border-radius:8px;border:1px solid #1e293b}
1900
1995
  </style></head><body>
1901
- <div class="header"><div><h1>⚡ RelayPlane Dashboard</h1></div><div class="meta"><a href="/dashboard/config">Config</a> · <span id="ver"></span> · up <span id="uptime"></span> · refreshes every 5s</div></div>
1996
+ <div class="header"><div><h1>⚡ RelayPlane Dashboard</h1></div><div class="meta"><a href="/dashboard/config">Config</a> · <span id="ver"></span><span id="vstat" class="vstat unavailable">Unable to check</span> · up <span id="uptime"></span> · refreshes every 5s</div></div>
1902
1997
  <div class="cards">
1903
1998
  <div class="card"><div class="label">Total Requests</div><div class="value" id="totalReq">—</div></div>
1904
1999
  <div class="card"><div class="label">Total Cost</div><div class="value" id="totalCost">—</div></div>
@@ -1909,7 +2004,7 @@ td{padding:8px 12px;border-bottom:1px solid #111318}
1909
2004
  <table><thead><tr><th>Model</th><th>Requests</th><th>Cost</th><th>% of Total</th></tr></thead><tbody id="models"></tbody></table></div>
1910
2005
  <div class="section"><h2>Provider Status</h2><div class="prov" id="providers"></div></div>
1911
2006
  <div class="section"><h2>Recent Runs</h2>
1912
- <table><thead><tr><th>Time</th><th>Model</th><th class="col-tt">Task Type</th><th class="col-cx">Complexity</th><th>Tokens In</th><th>Tokens Out</th><th>Cost</th><th>Latency</th><th>Status</th></tr></thead><tbody id="runs"></tbody></table></div>
2007
+ <table><thead><tr><th>Time</th><th>Model</th><th class="col-tt">Task Type</th><th class="col-cx">Complexity</th><th>Tokens In</th><th>Tokens Out</th><th class="col-cache">Cache Create</th><th class="col-cache">Cache Read</th><th>Cost</th><th>Latency</th><th>Status</th></tr></thead><tbody id="runs"></tbody></table></div>
1913
2008
  <script>
1914
2009
  const $ = id => document.getElementById(id);
1915
2010
  function fmt(n,d=2){return typeof n==='number'?n.toFixed(d):'-'}
@@ -1926,6 +2021,19 @@ async function load(){
1926
2021
  ]);
1927
2022
  $('ver').textContent='v'+health.version;
1928
2023
  $('uptime').textContent=dur(health.uptime);
2024
+
2025
+ const versionStatus = await fetch('/v1/version-status').then(r=>r.json()).catch(()=>({state:'unavailable', current: health.version, latest: null}));
2026
+ const vEl = $('vstat');
2027
+ if (vEl) {
2028
+ vEl.className = 'vstat ' + (versionStatus.state === 'outdated' ? 'outdated' : versionStatus.state === 'up-to-date' ? 'current' : 'unavailable');
2029
+ if (versionStatus.state === 'outdated') {
2030
+ vEl.textContent = 'Update available · v' + versionStatus.current + ' → v' + versionStatus.latest;
2031
+ } else if (versionStatus.state === 'up-to-date') {
2032
+ vEl.textContent = 'Up to date · v' + versionStatus.current;
2033
+ } else {
2034
+ vEl.textContent = 'Unable to check · v' + versionStatus.current;
2035
+ }
2036
+ }
1929
2037
  const total=stats.summary?.totalEvents||0;
1930
2038
  $('totalReq').textContent=total;
1931
2039
  $('totalCost').textContent='$'+fmt(stats.summary?.totalCostUsd??0,4);
@@ -1937,8 +2045,8 @@ async function load(){
1937
2045
  function ttCls(t){const m={code_generation:'tt-code',analysis:'tt-analysis',summarization:'tt-summarization',question_answering:'tt-qa'};return m[t]||'tt-general'}
1938
2046
  function cxCls(c){const m={simple:'cx-simple',moderate:'cx-moderate',complex:'cx-complex'};return m[c]||'cx-simple'}
1939
2047
  $('runs').innerHTML=(runsR.runs||[]).map(r=>
1940
- '<tr><td>'+fmtTime(r.started_at)+'</td><td>'+r.model+'</td><td class="col-tt"><span class="badge '+ttCls(r.taskType)+'">'+(r.taskType||'general').replace(/_/g,' ')+'</span></td><td class="col-cx"><span class="badge '+cxCls(r.complexity)+'">'+(r.complexity||'simple')+'</span></td><td>'+(r.tokensIn||0)+'</td><td>'+(r.tokensOut||0)+'</td><td>$'+fmt(r.costUsd,4)+'</td><td>'+r.latencyMs+'ms</td><td><span class="badge '+(r.status==='success'?'ok':'err')+'">'+r.status+'</span></td></tr>'
1941
- ).join('')||'<tr><td colspan=9 style="color:#64748b">No runs yet</td></tr>';
2048
+ '<tr><td>'+fmtTime(r.started_at)+'</td><td>'+r.model+'</td><td class="col-tt"><span class="badge '+ttCls(r.taskType)+'">'+(r.taskType||'general').replace(/_/g,' ')+'</span></td><td class="col-cx"><span class="badge '+cxCls(r.complexity)+'">'+(r.complexity||'simple')+'</span></td><td>'+(r.tokensIn||0)+'</td><td>'+(r.tokensOut||0)+'</td><td class="col-cache" style="color:#60a5fa">'+(r.cacheCreationTokens||0)+'</td><td class="col-cache" style="color:#34d399">'+(r.cacheReadTokens||0)+'</td><td>$'+fmt(r.costUsd,4)+'</td><td>'+r.latencyMs+'ms</td><td><span class="badge '+(r.status==='success'?'ok':'err')+'">'+r.status+'</span></td></tr>'
2049
+ ).join('')||'<tr><td colspan=11 style="color:#64748b">No runs yet</td></tr>';
1942
2050
  $('providers').innerHTML=(provH.providers||[]).map(p=>{
1943
2051
  const dotClass = p.status==='healthy'?'up':(p.status==='degraded'?'warn':'down');
1944
2052
  const rate = p.successRate!==undefined?(' '+Math.round(p.successRate*100)+'%'):'';
@@ -2046,6 +2154,7 @@ async function startProxy(config = {}) {
2046
2154
  loadHistoryFromDisk();
2047
2155
  // Flush history on shutdown
2048
2156
  const handleShutdown = () => {
2157
+ meshHandle.stop();
2049
2158
  shutdownHistory();
2050
2159
  process.exit(0);
2051
2160
  };
@@ -2054,11 +2163,159 @@ async function startProxy(config = {}) {
2054
2163
  const configPath = getProxyConfigPath();
2055
2164
  let proxyConfig = await loadProxyConfig(configPath, log);
2056
2165
  const cooldownManager = new CooldownManager(getCooldownConfig(proxyConfig));
2166
+ // === Startup config validation (Task 4) ===
2167
+ try {
2168
+ const userConfig = (0, config_js_1.loadConfig)();
2169
+ // Check if config was just created (created_at within 5s of now)
2170
+ const createdAt = new Date(userConfig.created_at).getTime();
2171
+ const now = Date.now();
2172
+ if (Math.abs(now - createdAt) < 5000) {
2173
+ console.warn('[RelayPlane] WARNING: Fresh config detected — previous config may have been deleted');
2174
+ }
2175
+ // Check if credentials exist but config doesn't reference them
2176
+ if ((0, config_js_1.hasValidCredentials)() && !userConfig.api_key) {
2177
+ console.warn('[RelayPlane] WARNING: credentials.json exists but config has no API key reference');
2178
+ }
2179
+ // Auto-enable telemetry for authenticated users
2180
+ if ((0, config_js_1.hasValidCredentials)() && !userConfig.telemetry_enabled) {
2181
+ // Already handled in loadConfig() for fresh configs, but handle existing configs too
2182
+ }
2183
+ // Validate expected fields
2184
+ if (!userConfig.device_id || !userConfig.created_at || userConfig.config_version === undefined) {
2185
+ console.warn('[RelayPlane] WARNING: Config is missing expected fields');
2186
+ }
2187
+ }
2188
+ catch (err) {
2189
+ console.warn(`[RelayPlane] Config validation error: ${err}`);
2190
+ }
2191
+ // Initialize mesh learning layer
2192
+ const meshConfig = (0, config_js_1.getMeshConfig)();
2193
+ const userConfig = (0, config_js_1.loadConfig)();
2194
+ const meshHandle = _meshHandle = (0, index_js_1.initMeshLayer)({
2195
+ enabled: meshConfig.enabled,
2196
+ endpoint: meshConfig.endpoint,
2197
+ sync_interval_ms: meshConfig.sync_interval_ms,
2198
+ contribute: meshConfig.contribute,
2199
+ }, userConfig.api_key);
2200
+ // Initialize budget manager
2201
+ const budgetManager = (0, budget_js_1.getBudgetManager)(proxyConfig.budget);
2202
+ if (proxyConfig.budget?.enabled) {
2203
+ try {
2204
+ budgetManager.init();
2205
+ log('Budget manager initialized');
2206
+ }
2207
+ catch (err) {
2208
+ log(`Budget manager init failed: ${err}`);
2209
+ }
2210
+ }
2211
+ // Initialize anomaly detector
2212
+ const anomalyDetector = (0, anomaly_js_1.getAnomalyDetector)(proxyConfig.anomaly);
2213
+ // Initialize alert manager
2214
+ const alertManager = (0, alerts_js_1.getAlertManager)(proxyConfig.alerts);
2215
+ if (proxyConfig.alerts?.enabled) {
2216
+ try {
2217
+ alertManager.init();
2218
+ log('Alert manager initialized');
2219
+ }
2220
+ catch (err) {
2221
+ log(`Alert manager init failed: ${err}`);
2222
+ }
2223
+ }
2224
+ // Downgrade config
2225
+ let downgradeConfig = {
2226
+ ...downgrade_js_1.DEFAULT_DOWNGRADE_CONFIG,
2227
+ ...(proxyConfig.downgrade ?? {}),
2228
+ };
2229
+ /**
2230
+ * Pre-request budget check + auto-downgrade.
2231
+ * Returns the (possibly downgraded) model and extra response headers.
2232
+ * If the request should be blocked, returns { blocked: true }.
2233
+ */
2234
+ function preRequestBudgetCheck(model, estimatedCost) {
2235
+ const headers = {};
2236
+ let finalModel = model;
2237
+ let downgraded = false;
2238
+ // Budget check
2239
+ const budgetResult = budgetManager.checkBudget(estimatedCost);
2240
+ if (budgetResult.breached) {
2241
+ // Fire breach alert
2242
+ const limit = budgetResult.breachType === 'hourly'
2243
+ ? budgetManager.getConfig().hourlyUsd
2244
+ : budgetManager.getConfig().dailyUsd;
2245
+ const spend = budgetResult.breachType === 'hourly'
2246
+ ? budgetResult.currentHourlySpend
2247
+ : budgetResult.currentDailySpend;
2248
+ alertManager.fireBreach(budgetResult.breachType, spend, limit);
2249
+ if (budgetResult.action === 'block') {
2250
+ return { blocked: true, model: finalModel, headers, downgraded: false };
2251
+ }
2252
+ if (budgetResult.action === 'downgrade') {
2253
+ const dr = (0, downgrade_js_1.checkDowngrade)(finalModel, 100, downgradeConfig);
2254
+ if (dr.downgraded) {
2255
+ finalModel = dr.newModel;
2256
+ downgraded = true;
2257
+ (0, downgrade_js_1.applyDowngradeHeaders)(headers, dr);
2258
+ }
2259
+ }
2260
+ }
2261
+ // Fire threshold alerts
2262
+ for (const threshold of budgetResult.thresholdsCrossed) {
2263
+ alertManager.fireThreshold(threshold, (budgetResult.currentDailySpend / budgetManager.getConfig().dailyUsd) * 100, budgetResult.currentDailySpend, budgetManager.getConfig().dailyUsd);
2264
+ budgetManager.markThresholdFired(threshold);
2265
+ }
2266
+ // Auto-downgrade based on budget percentage (even if not breached)
2267
+ if (!downgraded && downgradeConfig.enabled) {
2268
+ const pct = budgetManager.getConfig().dailyUsd > 0
2269
+ ? (budgetResult.currentDailySpend / budgetManager.getConfig().dailyUsd) * 100
2270
+ : 0;
2271
+ const dr = (0, downgrade_js_1.checkDowngrade)(finalModel, pct, downgradeConfig);
2272
+ if (dr.downgraded) {
2273
+ finalModel = dr.newModel;
2274
+ downgraded = true;
2275
+ (0, downgrade_js_1.applyDowngradeHeaders)(headers, dr);
2276
+ }
2277
+ }
2278
+ return { blocked: false, model: finalModel, headers, downgraded };
2279
+ }
2280
+ /**
2281
+ * Post-request: record spend, run anomaly detection, fire anomaly alerts.
2282
+ */
2283
+ function postRequestRecord(model, tokensIn, tokensOut, costUsd) {
2284
+ // Record spend
2285
+ budgetManager.recordSpend(costUsd, model);
2286
+ // Anomaly detection
2287
+ const anomalyResult = anomalyDetector.recordAndAnalyze({
2288
+ model,
2289
+ tokensIn,
2290
+ tokensOut,
2291
+ costUsd,
2292
+ });
2293
+ if (anomalyResult.detected) {
2294
+ for (const anomaly of anomalyResult.anomalies) {
2295
+ alertManager.fireAnomaly(anomaly);
2296
+ }
2297
+ }
2298
+ }
2299
+ // Initialize response cache
2300
+ const responseCache = (0, response_cache_js_1.getResponseCache)(proxyConfig.cache);
2301
+ if (proxyConfig.cache?.enabled !== false) {
2302
+ try {
2303
+ responseCache.init();
2304
+ log('Response cache initialized');
2305
+ }
2306
+ catch (err) {
2307
+ log(`Response cache init failed: ${err}`);
2308
+ }
2309
+ }
2057
2310
  let configWatcher = null;
2058
2311
  let configReloadTimer = null;
2059
2312
  const reloadConfig = async () => {
2060
2313
  proxyConfig = await loadProxyConfig(configPath, log);
2061
2314
  cooldownManager.updateConfig(getCooldownConfig(proxyConfig));
2315
+ budgetManager.updateConfig({ ...budgetManager.getConfig(), ...(proxyConfig.budget ?? {}) });
2316
+ anomalyDetector.updateConfig({ ...anomalyDetector.getConfig(), ...(proxyConfig.anomaly ?? {}) });
2317
+ alertManager.updateConfig({ ...alertManager.getConfig(), ...(proxyConfig.alerts ?? {}) });
2318
+ downgradeConfig = { ...downgrade_js_1.DEFAULT_DOWNGRADE_CONFIG, ...(proxyConfig.downgrade ?? {}) };
2062
2319
  log(`Reloaded config from ${configPath}`);
2063
2320
  };
2064
2321
  const scheduleConfigReload = () => {
@@ -2083,7 +2340,8 @@ async function startProxy(config = {}) {
2083
2340
  // Initialize RelayPlane
2084
2341
  const relay = new core_1.RelayPlane({ dbPath: config.dbPath });
2085
2342
  // Startup migration: clear default routing rules so complexity config takes priority
2086
- const clearedCount = relay.routing.clearDefaultRules();
2343
+ const clearDefaultRules = relay.routing.clearDefaultRules;
2344
+ const clearedCount = typeof clearDefaultRules === 'function' ? clearDefaultRules.call(relay.routing) : 0;
2087
2345
  if (clearedCount > 0) {
2088
2346
  log(`Cleared ${clearedCount} default routing rules (complexity config takes priority)`);
2089
2347
  }
@@ -2130,6 +2388,13 @@ async function startProxy(config = {}) {
2130
2388
  }));
2131
2389
  return;
2132
2390
  }
2391
+ if (req.method === 'GET' && pathname === '/v1/version-status') {
2392
+ const latest = await getLatestProxyVersion();
2393
+ const status = (0, version_status_js_1.getVersionStatus)(PROXY_VERSION, latest);
2394
+ res.writeHead(200, { 'Content-Type': 'application/json', 'Cache-Control': 'public, max-age=60' });
2395
+ res.end(JSON.stringify(status));
2396
+ return;
2397
+ }
2133
2398
  // === Control endpoints ===
2134
2399
  if (pathname.startsWith('/control/')) {
2135
2400
  if (req.method === 'POST' && pathname === '/control/enable') {
@@ -2196,6 +2461,36 @@ async function startProxy(config = {}) {
2196
2461
  return;
2197
2462
  }
2198
2463
  }
2464
+ if (req.method === 'POST' && pathname === '/control/kill') {
2465
+ try {
2466
+ const body = await readJsonBody(req);
2467
+ if (body.all) {
2468
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2469
+ res.end(JSON.stringify({
2470
+ killed: 0,
2471
+ sessions: [],
2472
+ note: 'Local proxy mode: session kill not applicable'
2473
+ }));
2474
+ }
2475
+ else if (body.sessionKey) {
2476
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2477
+ res.end(JSON.stringify({
2478
+ killed: 1,
2479
+ sessions: [body.sessionKey],
2480
+ note: 'Rate limits reset for session'
2481
+ }));
2482
+ }
2483
+ else {
2484
+ res.writeHead(400, { 'Content-Type': 'application/json' });
2485
+ res.end(JSON.stringify({ error: 'Provide sessionKey or all=true' }));
2486
+ }
2487
+ }
2488
+ catch {
2489
+ res.writeHead(400, { 'Content-Type': 'application/json' });
2490
+ res.end(JSON.stringify({ error: 'Invalid JSON' }));
2491
+ }
2492
+ return;
2493
+ }
2199
2494
  // === Telemetry endpoints for dashboard ===
2200
2495
  if (pathname.startsWith('/v1/telemetry/')) {
2201
2496
  const telemetryPath = pathname.replace('/v1/telemetry/', '');
@@ -2244,7 +2539,9 @@ async function startProxy(config = {}) {
2244
2539
  const offset = parseInt(params.get('offset') || '0', 10);
2245
2540
  const sorted = [...requestHistory].reverse();
2246
2541
  const runs = sorted.slice(offset, offset + limit).map(r => {
2247
- const origCost = (0, telemetry_js_1.estimateCost)('claude-opus-4-6', r.tokensIn, r.tokensOut);
2542
+ // Savings should reflect routing decisions only — pass same cache tokens to baseline
2543
+ // so the cache discount doesn't get counted as "savings from routing"
2544
+ const origCost = (0, telemetry_js_1.estimateCost)('claude-opus-4-6', r.tokensIn, r.tokensOut, r.cacheCreationTokens || undefined, r.cacheReadTokens || undefined);
2248
2545
  const perRunSavings = Math.max(0, origCost - r.costUsd);
2249
2546
  return {
2250
2547
  id: r.id,
@@ -2264,6 +2561,8 @@ async function startProxy(config = {}) {
2264
2561
  latencyMs: r.latencyMs,
2265
2562
  tokensIn: r.tokensIn,
2266
2563
  tokensOut: r.tokensOut,
2564
+ cacheCreationTokens: r.cacheCreationTokens ?? 0,
2565
+ cacheReadTokens: r.cacheReadTokens ?? 0,
2267
2566
  savings: Math.round(perRunSavings * 10000) / 10000,
2268
2567
  escalated: r.escalated,
2269
2568
  };
@@ -2281,7 +2580,9 @@ async function startProxy(config = {}) {
2281
2580
  let totalSavedAmount = 0;
2282
2581
  const byDayMap = new Map();
2283
2582
  for (const r of requestHistory) {
2284
- const origCost = (0, telemetry_js_1.estimateCost)(OPUS_BASELINE, r.tokensIn, r.tokensOut);
2583
+ // Pass same cache tokens to baseline so savings only reflect routing decisions,
2584
+ // not prompt-cache discounts (those happen regardless of which model is chosen).
2585
+ const origCost = (0, telemetry_js_1.estimateCost)(OPUS_BASELINE, r.tokensIn, r.tokensOut, r.cacheCreationTokens || undefined, r.cacheReadTokens || undefined);
2285
2586
  const actualCost = r.costUsd;
2286
2587
  const saved = Math.max(0, origCost - actualCost);
2287
2588
  totalOriginalCost += origCost;
@@ -2372,6 +2673,24 @@ async function startProxy(config = {}) {
2372
2673
  res.end(getConfigDashboardHTML());
2373
2674
  return;
2374
2675
  }
2676
+ // === Mesh stats endpoint ===
2677
+ if (req.method === 'GET' && pathname === '/v1/mesh/stats') {
2678
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2679
+ res.end(JSON.stringify(meshHandle.getStats()));
2680
+ return;
2681
+ }
2682
+ if (req.method === 'POST' && pathname === '/v1/mesh/sync') {
2683
+ try {
2684
+ const result = await meshHandle.forceSync();
2685
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2686
+ res.end(JSON.stringify({ sync: result }));
2687
+ }
2688
+ catch (err) {
2689
+ res.writeHead(500, { 'Content-Type': 'application/json' });
2690
+ res.end(JSON.stringify({ sync: { error: err.message } }));
2691
+ }
2692
+ return;
2693
+ }
2375
2694
  if (req.method === 'GET' && pathname === '/v1/config') {
2376
2695
  try {
2377
2696
  const raw = await fs.promises.readFile(getProxyConfigPath(), 'utf8');
@@ -2511,6 +2830,48 @@ async function startProxy(config = {}) {
2511
2830
  log(`Config routing.mode=auto: overriding passthrough → auto for model ${requestedModel}`);
2512
2831
  }
2513
2832
  const isStreaming = requestBody['stream'] === true;
2833
+ // ── Response Cache: check for cached response ──
2834
+ const cacheBypass = responseCache.shouldBypass(requestBody);
2835
+ let cacheHash;
2836
+ if (!cacheBypass) {
2837
+ cacheHash = responseCache.computeKey(requestBody);
2838
+ const cached = responseCache.get(cacheHash);
2839
+ if (cached) {
2840
+ try {
2841
+ const cachedData = JSON.parse(cached);
2842
+ const cacheUsage = cachedData?.usage;
2843
+ const cacheCost = (0, telemetry_js_1.estimateCost)(requestBody['model'] ?? '', cacheUsage?.input_tokens ?? 0, cacheUsage?.output_tokens ?? 0);
2844
+ responseCache.recordHit(cacheCost, 0);
2845
+ // Replay cached streaming response as SSE
2846
+ if (isStreaming && cachedData._relayplaneStreamCache) {
2847
+ res.writeHead(200, {
2848
+ 'Content-Type': 'text/event-stream',
2849
+ 'Cache-Control': 'no-cache',
2850
+ 'Connection': 'keep-alive',
2851
+ 'X-RelayPlane-Cache': 'HIT',
2852
+ });
2853
+ res.end(cachedData.ssePayload);
2854
+ }
2855
+ else {
2856
+ res.writeHead(200, {
2857
+ 'Content-Type': 'application/json',
2858
+ 'X-RelayPlane-Cache': 'HIT',
2859
+ });
2860
+ res.end(cached);
2861
+ }
2862
+ log(`Cache HIT for ${requestBody['model']} (hash: ${cacheHash.slice(0, 8)})`);
2863
+ return;
2864
+ }
2865
+ catch {
2866
+ // Corrupt cache entry, continue to provider
2867
+ }
2868
+ }
2869
+ responseCache.recordMiss();
2870
+ }
2871
+ else {
2872
+ responseCache.recordBypass();
2873
+ }
2874
+ // ── End cache check ──
2514
2875
  const messages = Array.isArray(requestBody['messages'])
2515
2876
  ? requestBody['messages']
2516
2877
  : [];
@@ -2619,6 +2980,47 @@ async function startProxy(config = {}) {
2619
2980
  res.end(JSON.stringify({ error: `Provider ${targetProvider} is temporarily cooled down` }));
2620
2981
  return;
2621
2982
  }
2983
+ // ── Budget check + auto-downgrade ──
2984
+ const budgetExtraHeaders = {};
2985
+ {
2986
+ const budgetCheck = preRequestBudgetCheck(targetModel || requestedModel);
2987
+ if (budgetCheck.blocked) {
2988
+ res.writeHead(429, { 'Content-Type': 'application/json' });
2989
+ res.end(JSON.stringify({
2990
+ error: 'Budget limit exceeded. Request blocked.',
2991
+ type: 'budget_exceeded',
2992
+ }));
2993
+ return;
2994
+ }
2995
+ if (budgetCheck.downgraded) {
2996
+ log(`Budget downgrade: ${targetModel || requestedModel} → ${budgetCheck.model}`);
2997
+ targetModel = budgetCheck.model;
2998
+ if (requestBody)
2999
+ requestBody['model'] = targetModel;
3000
+ }
3001
+ Object.assign(budgetExtraHeaders, budgetCheck.headers);
3002
+ }
3003
+ // ── End budget check ──
3004
+ // ── Rate limit check ──
3005
+ const workspaceId = 'local'; // Local proxy uses single workspace
3006
+ const rateLimit = (0, rate_limiter_js_1.checkLimit)(workspaceId, targetModel);
3007
+ if (!rateLimit.allowed) {
3008
+ console.error(`[RATE LIMIT] ${targetModel} limit reached for workspace: ${workspaceId}`);
3009
+ res.writeHead(429, {
3010
+ 'Content-Type': 'application/json',
3011
+ 'Retry-After': String(rateLimit.retryAfter || 60),
3012
+ 'X-RelayPlane-RateLimit-Limit': String(rateLimit.limit),
3013
+ 'X-RelayPlane-RateLimit-Remaining': '0',
3014
+ 'X-RelayPlane-RateLimit-Reset': String(Math.ceil(rateLimit.resetAt / 1000))
3015
+ });
3016
+ res.end(JSON.stringify({
3017
+ error: `Rate limit exceeded for ${targetModel}. Max ${rateLimit.limit} requests per minute.`,
3018
+ type: 'rate_limit_exceeded',
3019
+ retry_after: rateLimit.retryAfter || 60
3020
+ }));
3021
+ return;
3022
+ }
3023
+ // ── End rate limit check ──
2622
3024
  const startTime = Date.now();
2623
3025
  let nativeResponseData;
2624
3026
  try {
@@ -2688,11 +3090,16 @@ async function startProxy(config = {}) {
2688
3090
  'Content-Type': 'text/event-stream',
2689
3091
  'Cache-Control': 'no-cache',
2690
3092
  'Connection': 'keep-alive',
3093
+ 'X-RelayPlane-Cache': cacheBypass ? 'BYPASS' : 'MISS',
2691
3094
  ...nativeStreamRpHeaders,
2692
3095
  });
2693
3096
  const reader = providerResponse.body?.getReader();
2694
3097
  let streamTokensIn = 0;
2695
3098
  let streamTokensOut = 0;
3099
+ let streamCacheCreation = 0;
3100
+ let streamCacheRead = 0;
3101
+ // Buffer raw SSE chunks for cache storage
3102
+ const rawChunks = [];
2696
3103
  if (reader) {
2697
3104
  const decoder = new TextDecoder();
2698
3105
  let sseBuffer = '';
@@ -2703,6 +3110,8 @@ async function startProxy(config = {}) {
2703
3110
  break;
2704
3111
  const chunk = decoder.decode(value, { stream: true });
2705
3112
  res.write(chunk);
3113
+ if (cacheHash && !cacheBypass)
3114
+ rawChunks.push(chunk);
2706
3115
  // Parse SSE events to extract usage from message_delta / message_stop
2707
3116
  sseBuffer += chunk;
2708
3117
  const lines = sseBuffer.split('\n');
@@ -2715,9 +3124,11 @@ async function startProxy(config = {}) {
2715
3124
  if (evt.type === 'message_delta' && evt.usage) {
2716
3125
  streamTokensOut = evt.usage.output_tokens ?? streamTokensOut;
2717
3126
  }
2718
- // Anthropic: message_start has usage.input_tokens
3127
+ // Anthropic: message_start has usage.input_tokens + cache tokens
2719
3128
  if (evt.type === 'message_start' && evt.message?.usage) {
2720
3129
  streamTokensIn = evt.message.usage.input_tokens ?? streamTokensIn;
3130
+ streamCacheCreation = evt.message.usage.cache_creation_input_tokens ?? 0;
3131
+ streamCacheRead = evt.message.usage.cache_read_input_tokens ?? 0;
2721
3132
  }
2722
3133
  // OpenAI format: choices with usage
2723
3134
  if (evt.usage) {
@@ -2736,15 +3147,45 @@ async function startProxy(config = {}) {
2736
3147
  reader.releaseLock();
2737
3148
  }
2738
3149
  }
3150
+ // ── Cache: store streaming response as raw SSE payload ──
3151
+ if (cacheHash && !cacheBypass && rawChunks.length > 0) {
3152
+ const streamPayload = JSON.stringify({
3153
+ _relayplaneStreamCache: true,
3154
+ ssePayload: rawChunks.join(''),
3155
+ usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead },
3156
+ });
3157
+ responseCache.set(cacheHash, streamPayload, {
3158
+ model: targetModel || requestedModel,
3159
+ tokensIn: streamTokensIn,
3160
+ tokensOut: streamTokensOut,
3161
+ costUsd: (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined),
3162
+ taskType,
3163
+ });
3164
+ log(`Cache STORE (stream) for ${targetModel || requestedModel} (hash: ${cacheHash.slice(0, 8)})`);
3165
+ }
2739
3166
  // Store streaming token counts so telemetry can use them
2740
- nativeResponseData = { usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut } };
3167
+ nativeResponseData = { usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead } };
2741
3168
  res.end();
2742
3169
  }
2743
3170
  else {
2744
3171
  nativeResponseData = await providerResponse.json();
2745
3172
  const nativeRespModel = checkResponseModelMismatch(nativeResponseData, targetModel || requestedModel, targetProvider, log);
2746
3173
  const nativeRpHeaders = buildRelayPlaneResponseHeaders(targetModel || requestedModel, originalModel ?? 'unknown', complexity, targetProvider, routingMode);
2747
- res.writeHead(providerResponse.status, { 'Content-Type': 'application/json', ...nativeRpHeaders });
3174
+ // ── Cache: store non-streaming response ──
3175
+ const nativeCacheHeader = cacheBypass ? 'BYPASS' : 'MISS';
3176
+ if (cacheHash && !cacheBypass) {
3177
+ const nativeRespJson = JSON.stringify(nativeResponseData);
3178
+ const nativeUsage = nativeResponseData?.usage;
3179
+ responseCache.set(cacheHash, nativeRespJson, {
3180
+ model: targetModel || requestedModel,
3181
+ tokensIn: nativeUsage?.input_tokens ?? 0,
3182
+ tokensOut: nativeUsage?.output_tokens ?? 0,
3183
+ costUsd: (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeUsage?.input_tokens ?? 0, nativeUsage?.output_tokens ?? 0, nativeUsage?.cache_creation_input_tokens || undefined, nativeUsage?.cache_read_input_tokens || undefined),
3184
+ taskType,
3185
+ });
3186
+ log(`Cache STORE for ${targetModel || requestedModel} (hash: ${cacheHash.slice(0, 8)})`);
3187
+ }
3188
+ res.writeHead(providerResponse.status, { 'Content-Type': 'application/json', 'X-RelayPlane-Cache': nativeCacheHeader, ...nativeRpHeaders });
2748
3189
  res.end(JSON.stringify(nativeResponseData));
2749
3190
  }
2750
3191
  }
@@ -2754,18 +3195,31 @@ async function startProxy(config = {}) {
2754
3195
  // nativeResponseData holds response JSON for non-streaming, or { usage: { input_tokens, output_tokens } }
2755
3196
  // synthesised from SSE events for streaming
2756
3197
  const nativeUsageData = nativeResponseData?.usage;
2757
- const nativeTokIn = nativeUsageData?.input_tokens ?? nativeUsageData?.prompt_tokens ?? 0;
3198
+ const nativeBaseTokIn = nativeUsageData?.input_tokens ?? nativeUsageData?.prompt_tokens ?? 0;
2758
3199
  const nativeTokOut = nativeUsageData?.output_tokens ?? nativeUsageData?.completion_tokens ?? 0;
2759
- updateLastHistoryEntry(nativeTokIn, nativeTokOut, (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeTokIn, nativeTokOut));
3200
+ const nativeCacheCreation = nativeUsageData?.cache_creation_input_tokens ?? 0;
3201
+ const nativeCacheRead = nativeUsageData?.cache_read_input_tokens ?? 0;
3202
+ // Include cache tokens in displayed/recorded token count
3203
+ const nativeTokIn = nativeBaseTokIn + nativeCacheCreation + nativeCacheRead;
3204
+ // Cost calculation expects inputTokens to include cache tokens when cache params are provided
3205
+ const nativeCostUsd = (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCacheCreation || undefined, nativeCacheRead || undefined);
3206
+ updateLastHistoryEntry(nativeTokIn, nativeTokOut, nativeCostUsd, undefined, nativeCacheCreation || undefined, nativeCacheRead || undefined);
3207
+ // ── Post-request: budget spend + anomaly detection ──
3208
+ postRequestRecord(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCostUsd);
2760
3209
  if (recordTelemetry) {
2761
3210
  relay
2762
3211
  .run({
2763
3212
  prompt: promptText.slice(0, 500),
2764
3213
  taskType,
2765
3214
  model: `${targetProvider}:${targetModel || requestedModel}`,
3215
+ })
3216
+ .then((runResult) => {
3217
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
3218
+ relay.patchRunTokens(runResult.runId, nativeTokIn, nativeTokOut, nativeCostUsd);
2766
3219
  })
2767
3220
  .catch(() => { });
2768
- sendCloudTelemetry(taskType, targetModel || requestedModel, nativeTokIn, nativeTokOut, durationMs, true, undefined, originalModel ?? undefined);
3221
+ sendCloudTelemetry(taskType, targetModel || requestedModel, nativeTokIn, nativeTokOut, durationMs, true, undefined, originalModel ?? undefined, nativeCacheCreation || undefined, nativeCacheRead || undefined);
3222
+ meshCapture(targetModel || requestedModel, targetProvider, taskType, nativeTokIn, nativeTokOut, (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCacheCreation || undefined, nativeCacheRead || undefined), durationMs, true);
2769
3223
  }
2770
3224
  }
2771
3225
  catch (err) {
@@ -2847,6 +3301,47 @@ async function startProxy(config = {}) {
2847
3301
  return;
2848
3302
  }
2849
3303
  const isStreaming = request.stream === true;
3304
+ // ── Response Cache: check for cached response (chat/completions) ──
3305
+ const chatCacheBypass = responseCache.shouldBypass(request);
3306
+ let chatCacheHash;
3307
+ if (!chatCacheBypass) {
3308
+ chatCacheHash = responseCache.computeKey(request);
3309
+ const chatCached = responseCache.get(chatCacheHash);
3310
+ if (chatCached) {
3311
+ try {
3312
+ const chatCachedData = JSON.parse(chatCached);
3313
+ const chatCacheUsage = chatCachedData?.usage;
3314
+ const chatCacheCost = (0, telemetry_js_1.estimateCost)(request.model ?? '', chatCacheUsage?.prompt_tokens ?? chatCacheUsage?.input_tokens ?? 0, chatCacheUsage?.completion_tokens ?? chatCacheUsage?.output_tokens ?? 0);
3315
+ responseCache.recordHit(chatCacheCost, 0);
3316
+ if (isStreaming && chatCachedData._relayplaneStreamCache) {
3317
+ res.writeHead(200, {
3318
+ 'Content-Type': 'text/event-stream',
3319
+ 'Cache-Control': 'no-cache',
3320
+ 'Connection': 'keep-alive',
3321
+ 'X-RelayPlane-Cache': 'HIT',
3322
+ });
3323
+ res.end(chatCachedData.ssePayload);
3324
+ }
3325
+ else {
3326
+ res.writeHead(200, {
3327
+ 'Content-Type': 'application/json',
3328
+ 'X-RelayPlane-Cache': 'HIT',
3329
+ });
3330
+ res.end(chatCached);
3331
+ }
3332
+ log(`Cache HIT for chat/completions ${request.model} (hash: ${chatCacheHash.slice(0, 8)})`);
3333
+ return;
3334
+ }
3335
+ catch {
3336
+ // Corrupt, continue
3337
+ }
3338
+ }
3339
+ responseCache.recordMiss();
3340
+ }
3341
+ else {
3342
+ responseCache.recordBypass();
3343
+ }
3344
+ // ── End cache check ──
2850
3345
  const bypassRouting = !relayplaneEnabled || relayplaneBypass;
2851
3346
  // Extract routing mode from model name
2852
3347
  const originalRequestedModel = request.model;
@@ -3065,10 +3560,48 @@ async function startProxy(config = {}) {
3065
3560
  }
3066
3561
  apiKey = apiKeyResult.apiKey;
3067
3562
  }
3563
+ // ── Budget check + auto-downgrade (chat/completions) ──
3564
+ {
3565
+ const chatBudgetCheck = preRequestBudgetCheck(targetModel);
3566
+ if (chatBudgetCheck.blocked) {
3567
+ res.writeHead(429, { 'Content-Type': 'application/json' });
3568
+ res.end(JSON.stringify({
3569
+ error: 'Budget limit exceeded. Request blocked.',
3570
+ type: 'budget_exceeded',
3571
+ }));
3572
+ return;
3573
+ }
3574
+ if (chatBudgetCheck.downgraded) {
3575
+ log(`Budget downgrade: ${targetModel} → ${chatBudgetCheck.model}`);
3576
+ targetModel = chatBudgetCheck.model;
3577
+ request.model = targetModel;
3578
+ }
3579
+ }
3580
+ // ── End budget check ──
3581
+ // ── Rate limit check ──
3582
+ const chatWorkspaceId = 'local'; // Local proxy uses single workspace
3583
+ const chatRateLimit = (0, rate_limiter_js_1.checkLimit)(chatWorkspaceId, targetModel);
3584
+ if (!chatRateLimit.allowed) {
3585
+ console.error(`[RATE LIMIT] ${targetModel} limit reached for workspace: ${chatWorkspaceId}`);
3586
+ res.writeHead(429, {
3587
+ 'Content-Type': 'application/json',
3588
+ 'Retry-After': String(chatRateLimit.retryAfter || 60),
3589
+ 'X-RelayPlane-RateLimit-Limit': String(chatRateLimit.limit),
3590
+ 'X-RelayPlane-RateLimit-Remaining': '0',
3591
+ 'X-RelayPlane-RateLimit-Reset': String(Math.ceil(chatRateLimit.resetAt / 1000))
3592
+ });
3593
+ res.end(JSON.stringify({
3594
+ error: `Rate limit exceeded for ${targetModel}. Max ${chatRateLimit.limit} requests per minute.`,
3595
+ type: 'rate_limit_exceeded',
3596
+ retry_after: chatRateLimit.retryAfter || 60
3597
+ }));
3598
+ return;
3599
+ }
3600
+ // ── End rate limit check ──
3068
3601
  const startTime = Date.now();
3069
3602
  // Handle streaming vs non-streaming
3070
3603
  if (isStreaming) {
3071
- await handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, useCascade ? 'cascade' : routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity);
3604
+ await handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, useCascade ? 'cascade' : routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatCacheHash, chatCacheBypass);
3072
3605
  }
3073
3606
  else {
3074
3607
  if (useCascade && cascadeConfig) {
@@ -3105,8 +3638,10 @@ async function startProxy(config = {}) {
3105
3638
  const cascadeUsage = responseData?.usage;
3106
3639
  const cascadeTokensIn = cascadeUsage?.input_tokens ?? cascadeUsage?.prompt_tokens ?? 0;
3107
3640
  const cascadeTokensOut = cascadeUsage?.output_tokens ?? cascadeUsage?.completion_tokens ?? 0;
3108
- const cascadeCost = (0, telemetry_js_1.estimateCost)(cascadeResult.model, cascadeTokensIn, cascadeTokensOut);
3109
- updateLastHistoryEntry(cascadeTokensIn, cascadeTokensOut, cascadeCost, chatCascadeRespModel);
3641
+ const cascadeCacheCreation = cascadeUsage?.cache_creation_input_tokens || undefined;
3642
+ const cascadeCacheRead = cascadeUsage?.cache_read_input_tokens || undefined;
3643
+ const cascadeCost = (0, telemetry_js_1.estimateCost)(cascadeResult.model, cascadeTokensIn, cascadeTokensOut, cascadeCacheCreation, cascadeCacheRead);
3644
+ updateLastHistoryEntry(cascadeTokensIn, cascadeTokensOut, cascadeCost, chatCascadeRespModel, cascadeCacheCreation, cascadeCacheRead);
3110
3645
  if (recordTelemetry) {
3111
3646
  try {
3112
3647
  const runResult = await relay.run({
@@ -3114,6 +3649,8 @@ async function startProxy(config = {}) {
3114
3649
  taskType,
3115
3650
  model: `${cascadeResult.provider}:${cascadeResult.model}`,
3116
3651
  });
3652
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
3653
+ relay.patchRunTokens(runResult.runId, cascadeTokensIn, cascadeTokensOut, cascadeCost);
3117
3654
  responseData['_relayplane'] = {
3118
3655
  runId: runResult.runId,
3119
3656
  routedTo: `${cascadeResult.provider}/${cascadeResult.model}`,
@@ -3128,7 +3665,8 @@ async function startProxy(config = {}) {
3128
3665
  catch (err) {
3129
3666
  log(`Failed to record run: ${err}`);
3130
3667
  }
3131
- sendCloudTelemetry(taskType, cascadeResult.model, cascadeTokensIn, cascadeTokensOut, durationMs, true, undefined, originalRequestedModel ?? undefined);
3668
+ sendCloudTelemetry(taskType, cascadeResult.model, cascadeTokensIn, cascadeTokensOut, durationMs, true, undefined, originalRequestedModel ?? undefined, cascadeCacheCreation, cascadeCacheRead);
3669
+ meshCapture(cascadeResult.model, cascadeResult.provider, taskType, cascadeTokensIn, cascadeTokensOut, cascadeCost, durationMs, true);
3132
3670
  }
3133
3671
  const chatCascadeRpHeaders = buildRelayPlaneResponseHeaders(cascadeResult.model, originalRequestedModel ?? 'unknown', complexity, cascadeResult.provider, 'cascade');
3134
3672
  res.writeHead(200, { 'Content-Type': 'application/json', ...chatCascadeRpHeaders });
@@ -3152,6 +3690,74 @@ async function startProxy(config = {}) {
3152
3690
  }
3153
3691
  }
3154
3692
  });
3693
+ // ── Health Watchdog ──
3694
+ let watchdogFailures = 0;
3695
+ const WATCHDOG_MAX_FAILURES = 3;
3696
+ const WATCHDOG_INTERVAL_MS = 15_000; // Must be < WatchdogSec (30s) to avoid false kills
3697
+ let watchdogTimer = null;
3698
+ /**
3699
+ * sd_notify: write to $NOTIFY_SOCKET for systemd watchdog integration
3700
+ */
3701
+ function sdNotify(state) {
3702
+ const notifySocket = process.env['NOTIFY_SOCKET'];
3703
+ if (!notifySocket)
3704
+ return;
3705
+ try {
3706
+ const dgram = require('node:dgram');
3707
+ const client = dgram.createSocket('unix_dgram');
3708
+ const buf = Buffer.from(state);
3709
+ client.send(buf, 0, buf.length, notifySocket, () => {
3710
+ client.close();
3711
+ });
3712
+ }
3713
+ catch (err) {
3714
+ log(`sd_notify error: ${err}`);
3715
+ }
3716
+ }
3717
+ function startWatchdog() {
3718
+ // Notify systemd we're ready
3719
+ sdNotify('READY=1');
3720
+ watchdogTimer = setInterval(async () => {
3721
+ try {
3722
+ const controller = new AbortController();
3723
+ const timeout = setTimeout(() => controller.abort(), 5000);
3724
+ const res = await fetch(`http://${host}:${port}/health`, { signal: controller.signal });
3725
+ clearTimeout(timeout);
3726
+ if (res.ok) {
3727
+ watchdogFailures = 0;
3728
+ // Notify systemd watchdog we're alive
3729
+ sdNotify('WATCHDOG=1');
3730
+ }
3731
+ else {
3732
+ watchdogFailures++;
3733
+ console.error(`[RelayPlane] Watchdog: health check returned ${res.status} (failure ${watchdogFailures}/${WATCHDOG_MAX_FAILURES})`);
3734
+ }
3735
+ }
3736
+ catch (err) {
3737
+ watchdogFailures++;
3738
+ console.error(`[RelayPlane] Watchdog: health check failed (failure ${watchdogFailures}/${WATCHDOG_MAX_FAILURES}): ${err}`);
3739
+ }
3740
+ if (watchdogFailures >= WATCHDOG_MAX_FAILURES) {
3741
+ console.error('[RelayPlane] CRITICAL: 3 consecutive watchdog failures. Attempting graceful restart...');
3742
+ sdNotify('STOPPING=1');
3743
+ // Close server and exit — systemd Restart=always will restart us
3744
+ server.close(() => {
3745
+ process.exit(1);
3746
+ });
3747
+ // Force exit after 10s if graceful close hangs
3748
+ setTimeout(() => process.exit(1), 10_000).unref();
3749
+ }
3750
+ }, WATCHDOG_INTERVAL_MS);
3751
+ watchdogTimer.unref();
3752
+ }
3753
+ // Clean up watchdog on shutdown
3754
+ const origHandleShutdown = () => {
3755
+ if (watchdogTimer)
3756
+ clearInterval(watchdogTimer);
3757
+ sdNotify('STOPPING=1');
3758
+ };
3759
+ process.on('SIGINT', origHandleShutdown);
3760
+ process.on('SIGTERM', origHandleShutdown);
3155
3761
  return new Promise((resolve, reject) => {
3156
3762
  server.on('error', reject);
3157
3763
  server.listen(port, host, () => {
@@ -3164,6 +3770,8 @@ async function startProxy(config = {}) {
3164
3770
  console.log(` Models: relayplane:auto, relayplane:cost, relayplane:fast, relayplane:quality`);
3165
3771
  console.log(` Auth: Passthrough for Anthropic, env vars for other providers`);
3166
3772
  console.log(` Streaming: ✅ Enabled`);
3773
+ startWatchdog();
3774
+ log('Health watchdog started (30s interval, sd_notify enabled)');
3167
3775
  resolve(server);
3168
3776
  });
3169
3777
  });
@@ -3221,7 +3829,7 @@ async function executeNonStreamingProviderRequest(request, targetProvider, targe
3221
3829
  }
3222
3830
  return { responseData, ok: true, status: 200 };
3223
3831
  }
3224
- async function handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple') {
3832
+ async function handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', cacheHash, cacheBypass) {
3225
3833
  let providerResponse;
3226
3834
  try {
3227
3835
  switch (targetProvider) {
@@ -3274,9 +3882,13 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3274
3882
  'Connection': 'keep-alive',
3275
3883
  ...streamRpHeaders,
3276
3884
  });
3277
- // Track token usage from streaming events
3885
+ // Track token usage from streaming events (including Anthropic prompt cache tokens)
3278
3886
  let streamTokensIn = 0;
3279
3887
  let streamTokensOut = 0;
3888
+ let streamCacheCreation = 0;
3889
+ let streamCacheRead = 0;
3890
+ const shouldCacheStream = !!(cacheHash && !cacheBypass);
3891
+ const rawChunks = [];
3280
3892
  try {
3281
3893
  // Stream the response based on provider format
3282
3894
  switch (targetProvider) {
@@ -3284,7 +3896,10 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3284
3896
  // Convert Anthropic stream to OpenAI format
3285
3897
  for await (const chunk of convertAnthropicStream(providerResponse, targetModel)) {
3286
3898
  res.write(chunk);
3287
- // Parse OpenAI-format chunks for usage (emitted at end of stream)
3899
+ if (shouldCacheStream)
3900
+ rawChunks.push(chunk);
3901
+ // Parse OpenAI-format chunks for usage — the converter embeds
3902
+ // cache_creation_tokens and cache_read_tokens from message_start.
3288
3903
  try {
3289
3904
  const lines = chunk.split('\n');
3290
3905
  for (const line of lines) {
@@ -3293,6 +3908,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3293
3908
  if (evt.usage) {
3294
3909
  streamTokensIn = evt.usage.prompt_tokens ?? streamTokensIn;
3295
3910
  streamTokensOut = evt.usage.completion_tokens ?? streamTokensOut;
3911
+ streamCacheCreation = evt.usage.cache_creation_tokens ?? streamCacheCreation;
3912
+ streamCacheRead = evt.usage.cache_read_tokens ?? streamCacheRead;
3296
3913
  }
3297
3914
  }
3298
3915
  }
@@ -3304,6 +3921,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3304
3921
  // Convert Gemini stream to OpenAI format
3305
3922
  for await (const chunk of convertGeminiStream(providerResponse, targetModel)) {
3306
3923
  res.write(chunk);
3924
+ if (shouldCacheStream)
3925
+ rawChunks.push(chunk);
3307
3926
  try {
3308
3927
  const lines = chunk.split('\n');
3309
3928
  for (const line of lines) {
@@ -3323,6 +3942,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3323
3942
  // xAI, OpenRouter, DeepSeek, Groq, OpenAI all use OpenAI-compatible streaming format
3324
3943
  for await (const chunk of pipeOpenAIStream(providerResponse)) {
3325
3944
  res.write(chunk);
3945
+ if (shouldCacheStream)
3946
+ rawChunks.push(chunk);
3326
3947
  try {
3327
3948
  const lines = chunk.split('\n');
3328
3949
  for (const line of lines) {
@@ -3342,15 +3963,43 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3342
3963
  catch (err) {
3343
3964
  log(`Streaming error: ${err}`);
3344
3965
  }
3966
+ // ── Cache: store streaming response ──
3967
+ if (shouldCacheStream && cacheHash && rawChunks.length > 0) {
3968
+ const responseCache = (0, response_cache_js_1.getResponseCache)();
3969
+ const streamPayload = JSON.stringify({
3970
+ _relayplaneStreamCache: true,
3971
+ ssePayload: rawChunks.join(''),
3972
+ usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, prompt_tokens: streamTokensIn, completion_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead },
3973
+ });
3974
+ responseCache.set(cacheHash, streamPayload, {
3975
+ model: targetModel,
3976
+ tokensIn: streamTokensIn,
3977
+ tokensOut: streamTokensOut,
3978
+ costUsd: (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined),
3979
+ taskType,
3980
+ });
3981
+ log(`Cache STORE (stream) for chat/completions ${targetModel} (hash: ${cacheHash.slice(0, 8)})`);
3982
+ }
3345
3983
  if (cooldownsEnabled) {
3346
3984
  cooldownManager.recordSuccess(targetProvider);
3347
3985
  }
3348
3986
  const durationMs = Date.now() - startTime;
3349
3987
  // Always log the request for stats/telemetry tracking
3350
3988
  logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, true, routingMode, undefined, taskType, complexity);
3351
- // Update token/cost info on the history entry
3352
- const streamCost = (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut);
3353
- updateLastHistoryEntry(streamTokensIn, streamTokensOut, streamCost);
3989
+ // Update token/cost info on the history entry (with cache token discount)
3990
+ const streamCost = (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined);
3991
+ updateLastHistoryEntry(streamTokensIn, streamTokensOut, streamCost, undefined, streamCacheCreation || undefined, streamCacheRead || undefined);
3992
+ // ── Post-request: budget spend + anomaly detection ──
3993
+ try {
3994
+ (0, budget_js_1.getBudgetManager)().recordSpend(streamCost, targetModel);
3995
+ const anomalyResult = (0, anomaly_js_1.getAnomalyDetector)().recordAndAnalyze({ model: targetModel, tokensIn: streamTokensIn, tokensOut: streamTokensOut, costUsd: streamCost });
3996
+ if (anomalyResult.detected) {
3997
+ for (const anomaly of anomalyResult.anomalies) {
3998
+ (0, alerts_js_1.getAlertManager)().fireAnomaly(anomaly);
3999
+ }
4000
+ }
4001
+ }
4002
+ catch { /* budget/anomaly should never block */ }
3354
4003
  if (recordTelemetry) {
3355
4004
  // Record the run (non-blocking)
3356
4005
  relay
@@ -3360,12 +4009,15 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3360
4009
  model: `${targetProvider}:${targetModel}`,
3361
4010
  })
3362
4011
  .then((runResult) => {
4012
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
4013
+ relay.patchRunTokens(runResult.runId, streamTokensIn, streamTokensOut, streamCost);
3363
4014
  log(`Completed streaming in ${durationMs}ms, runId: ${runResult.runId}`);
3364
4015
  })
3365
4016
  .catch((err) => {
3366
4017
  log(`Failed to record run: ${err}`);
3367
4018
  });
3368
- sendCloudTelemetry(taskType, targetModel, streamTokensIn, streamTokensOut, durationMs, true, undefined, request.model ?? undefined);
4019
+ sendCloudTelemetry(taskType, targetModel, streamTokensIn, streamTokensOut, durationMs, true, undefined, request.model ?? undefined, streamCacheCreation || undefined, streamCacheRead || undefined);
4020
+ meshCapture(targetModel, targetProvider, taskType, streamTokensIn, streamTokensOut, streamCost, durationMs, true);
3369
4021
  }
3370
4022
  res.end();
3371
4023
  }
@@ -3407,12 +4059,25 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
3407
4059
  const nonStreamRespModel = checkResponseModelMismatch(responseData, targetModel, targetProvider, log);
3408
4060
  // Log the successful request
3409
4061
  logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, true, routingMode, undefined, taskType, complexity);
3410
- // Update token/cost info
4062
+ // Update token/cost info (including Anthropic prompt cache tokens)
3411
4063
  const usage = responseData?.usage;
3412
4064
  const tokensIn = usage?.input_tokens ?? usage?.prompt_tokens ?? 0;
3413
4065
  const tokensOut = usage?.output_tokens ?? usage?.completion_tokens ?? 0;
3414
- const cost = (0, telemetry_js_1.estimateCost)(targetModel, tokensIn, tokensOut);
3415
- updateLastHistoryEntry(tokensIn, tokensOut, cost, nonStreamRespModel);
4066
+ const cacheCreationTokens = usage?.cache_creation_input_tokens ?? 0;
4067
+ const cacheReadTokens = usage?.cache_read_input_tokens ?? 0;
4068
+ const cost = (0, telemetry_js_1.estimateCost)(targetModel, tokensIn, tokensOut, cacheCreationTokens || undefined, cacheReadTokens || undefined);
4069
+ updateLastHistoryEntry(tokensIn, tokensOut, cost, nonStreamRespModel, cacheCreationTokens || undefined, cacheReadTokens || undefined);
4070
+ // ── Post-request: budget spend + anomaly detection ──
4071
+ try {
4072
+ (0, budget_js_1.getBudgetManager)().recordSpend(cost, targetModel);
4073
+ const anomalyResult = (0, anomaly_js_1.getAnomalyDetector)().recordAndAnalyze({ model: targetModel, tokensIn, tokensOut, costUsd: cost });
4074
+ if (anomalyResult.detected) {
4075
+ for (const anomaly of anomalyResult.anomalies) {
4076
+ (0, alerts_js_1.getAlertManager)().fireAnomaly(anomaly);
4077
+ }
4078
+ }
4079
+ }
4080
+ catch { /* budget/anomaly should never block */ }
3416
4081
  if (recordTelemetry) {
3417
4082
  // Record the run in RelayPlane
3418
4083
  try {
@@ -3421,6 +4086,8 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
3421
4086
  taskType,
3422
4087
  model: `${targetProvider}:${targetModel}`,
3423
4088
  });
4089
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
4090
+ relay.patchRunTokens(runResult.runId, tokensIn, tokensOut, cost);
3424
4091
  // Add routing metadata to response
3425
4092
  responseData['_relayplane'] = {
3426
4093
  runId: runResult.runId,
@@ -3435,15 +4102,34 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
3435
4102
  catch (err) {
3436
4103
  log(`Failed to record run: ${err}`);
3437
4104
  }
3438
- // Extract token counts from response if available (Anthropic/OpenAI format)
3439
- const usage = responseData?.usage;
3440
- const tokensIn = usage?.input_tokens ?? usage?.prompt_tokens ?? 0;
3441
- const tokensOut = usage?.output_tokens ?? usage?.completion_tokens ?? 0;
3442
- sendCloudTelemetry(taskType, targetModel, tokensIn, tokensOut, durationMs, true);
4105
+ // Extract token counts from response if available (Anthropic/OpenAI format, including cache)
4106
+ const innerUsage = responseData?.usage;
4107
+ const innerTokIn = innerUsage?.input_tokens ?? innerUsage?.prompt_tokens ?? 0;
4108
+ const innerTokOut = innerUsage?.output_tokens ?? innerUsage?.completion_tokens ?? 0;
4109
+ const innerCacheCreation = innerUsage?.cache_creation_input_tokens ?? 0;
4110
+ const innerCacheRead = innerUsage?.cache_read_input_tokens ?? 0;
4111
+ sendCloudTelemetry(taskType, targetModel, innerTokIn, innerTokOut, durationMs, true, undefined, undefined, innerCacheCreation || undefined, innerCacheRead || undefined);
4112
+ meshCapture(targetModel, targetProvider, taskType, innerTokIn, innerTokOut, cost, durationMs, true);
4113
+ }
4114
+ // ── Cache: store non-streaming chat/completions response ──
4115
+ const chatRespCache = (0, response_cache_js_1.getResponseCache)();
4116
+ const chatReqAsRecord = request;
4117
+ const chatCacheBypassLocal = chatRespCache.shouldBypass(chatReqAsRecord);
4118
+ let chatCacheHeaderVal = chatCacheBypassLocal ? 'BYPASS' : 'MISS';
4119
+ if (!chatCacheBypassLocal) {
4120
+ const chatHashLocal = chatRespCache.computeKey(chatReqAsRecord);
4121
+ chatRespCache.set(chatHashLocal, JSON.stringify(responseData), {
4122
+ model: targetModel,
4123
+ tokensIn: tokensIn,
4124
+ tokensOut: tokensOut,
4125
+ costUsd: cost,
4126
+ taskType,
4127
+ });
4128
+ log(`Cache STORE for chat/completions ${targetModel} (hash: ${chatHashLocal.slice(0, 8)})`);
3443
4129
  }
3444
4130
  // Send response with RelayPlane routing headers
3445
4131
  const nonStreamRpHeaders = buildRelayPlaneResponseHeaders(targetModel, request.model ?? 'unknown', complexity, targetProvider, routingMode);
3446
- res.writeHead(200, { 'Content-Type': 'application/json', ...nonStreamRpHeaders });
4132
+ res.writeHead(200, { 'Content-Type': 'application/json', 'X-RelayPlane-Cache': chatCacheHeaderVal, ...nonStreamRpHeaders });
3447
4133
  res.end(JSON.stringify(responseData));
3448
4134
  }
3449
4135
  // Note: CLI entry point is in cli.ts