@relayplane/proxy 1.7.1 → 1.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -56,6 +56,8 @@ Object.defineProperty(exports, "__esModule", { value: true });
56
56
  exports.SMART_ALIASES = exports.RELAYPLANE_ALIASES = exports.MODEL_MAPPING = exports.DEFAULT_ENDPOINTS = exports.proxyStatsCollector = void 0;
57
57
  exports.getAvailableModelNames = getAvailableModelNames;
58
58
  exports.resolveModelAlias = resolveModelAlias;
59
+ exports.extractRequestContent = extractRequestContent;
60
+ exports.extractResponseText = extractResponseText;
59
61
  exports.parseModelSuffix = parseModelSuffix;
60
62
  exports.classifyComplexity = classifyComplexity;
61
63
  exports.shouldEscalate = shouldEscalate;
@@ -76,6 +78,7 @@ const budget_js_1 = require("./budget.js");
76
78
  const anomaly_js_1 = require("./anomaly.js");
77
79
  const alerts_js_1 = require("./alerts.js");
78
80
  const downgrade_js_1 = require("./downgrade.js");
81
+ const agent_tracker_js_1 = require("./agent-tracker.js");
79
82
  const version_status_js_1 = require("./utils/version-status.js");
80
83
  const PROXY_VERSION = (() => {
81
84
  try {
@@ -488,7 +491,7 @@ function shutdownHistory() {
488
491
  }
489
492
  flushHistoryBuffer();
490
493
  }
491
- function logRequest(originalModel, targetModel, provider, latencyMs, success, mode, escalated, taskType, complexity) {
494
+ function logRequest(originalModel, targetModel, provider, latencyMs, success, mode, escalated, taskType, complexity, agentFingerprint, agentId) {
492
495
  const timestamp = new Date().toISOString();
493
496
  const status = success ? '✓' : '✗';
494
497
  const escalateTag = escalated ? ' [ESCALATED]' : '';
@@ -531,6 +534,8 @@ function logRequest(originalModel, targetModel, provider, latencyMs, success, mo
531
534
  costUsd: 0,
532
535
  taskType: taskType || 'general',
533
536
  complexity: complexity || 'simple',
537
+ agentFingerprint,
538
+ agentId,
534
539
  };
535
540
  requestHistory.push(entry);
536
541
  if (requestHistory.length > MAX_HISTORY) {
@@ -539,7 +544,7 @@ function logRequest(originalModel, targetModel, provider, latencyMs, success, mo
539
544
  bufferHistoryEntry(entry);
540
545
  }
541
546
  /** Update the most recent history entry with token/cost info */
542
- function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel) {
547
+ function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel, cacheCreationTokens, cacheReadTokens, agentFingerprint, agentId, requestContent) {
543
548
  if (requestHistory.length > 0) {
544
549
  const last = requestHistory[requestHistory.length - 1];
545
550
  last.tokensIn = tokensIn;
@@ -548,8 +553,86 @@ function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel) {
548
553
  if (responseModel) {
549
554
  last.responseModel = responseModel;
550
555
  }
556
+ if (cacheCreationTokens !== undefined)
557
+ last.cacheCreationTokens = cacheCreationTokens;
558
+ if (cacheReadTokens !== undefined)
559
+ last.cacheReadTokens = cacheReadTokens;
560
+ if (agentFingerprint !== undefined)
561
+ last.agentFingerprint = agentFingerprint;
562
+ if (agentId !== undefined)
563
+ last.agentId = agentId;
564
+ if (requestContent)
565
+ last.requestContent = requestContent;
551
566
  }
552
567
  }
568
/**
 * Extract a loggable summary of a request body. Handles Anthropic and OpenAI formats.
 *
 * This is a best-effort logging helper: malformed input (a missing body, null
 * messages, null content parts) yields empty fields instead of an exception.
 *
 * @param {object} body - Parsed request payload. For Anthropic the system prompt
 *   is read from `body.system` (string or array of parts); otherwise it is read
 *   from the first message in `body.messages` whose role is `'system'`.
 * @param {boolean} isAnthropic - Selects where the system prompt is read from.
 * @returns {{systemPrompt: (string|undefined), userMessage: (string|undefined)}}
 *   `systemPrompt` is truncated to 200 characters; `userMessage` is the text of
 *   the last message with role `'user'`. Empty values are reported as `undefined`.
 */
function extractRequestContent(body, isAnthropic) {
    // Concatenate the text of `{type: 'text'}` parts. Bare strings are only kept
    // when requested; null or non-text parts contribute nothing.
    const textFromParts = (parts, keepBareStrings = false) => parts
        .map(part => {
            if (typeof part === 'string') {
                return keepBareStrings ? part : '';
            }
            return part?.type === 'text' ? (part.text ?? '') : '';
        })
        .join('');
    const payload = body ?? {};
    const messages = Array.isArray(payload.messages) ? payload.messages : [];
    let systemPrompt = '';
    if (isAnthropic) {
        if (typeof payload.system === 'string') {
            systemPrompt = payload.system;
        }
        else if (Array.isArray(payload.system)) {
            systemPrompt = textFromParts(payload.system, true);
        }
    }
    else {
        // Only the first system message is considered.
        const systemMsg = messages.find(m => m?.role === 'system');
        if (typeof systemMsg?.content === 'string') {
            systemPrompt = systemMsg.content;
        }
        else if (Array.isArray(systemMsg?.content)) {
            // System content may also arrive as an array of text parts.
            systemPrompt = textFromParts(systemMsg.content);
        }
    }
    let userMessage = '';
    // Walk backwards so the most recent user turn wins.
    for (let i = messages.length - 1; i >= 0; i--) {
        const msg = messages[i];
        if (msg?.role !== 'user') {
            continue;
        }
        if (typeof msg.content === 'string') {
            userMessage = msg.content;
        }
        else if (Array.isArray(msg.content)) {
            userMessage = textFromParts(msg.content);
        }
        break;
    }
    return {
        systemPrompt: systemPrompt ? systemPrompt.slice(0, 200) : undefined,
        userMessage: userMessage || undefined,
    };
}
618
/**
 * Extract assistant response text from a response payload.
 *
 * Always returns a string. The result is sliced and stored as a preview by the
 * caller, so a non-string value must never leak out, and malformed payloads
 * (null data, null content parts) must not throw.
 *
 * @param {object} responseData - Parsed provider response.
 * @param {boolean} isAnthropic - When true, read the `content` array of
 *   `{type: 'text', text}` parts; otherwise read `choices[0].message.content`.
 * @returns {string} The concatenated text, or `''` when none is available.
 */
function extractResponseText(responseData, isAnthropic) {
    if (isAnthropic) {
        const parts = responseData?.content;
        if (!Array.isArray(parts)) {
            return '';
        }
        return parts
            .filter(part => part?.type === 'text')
            .map(part => part.text ?? '')
            .join('');
    }
    const choices = responseData?.choices;
    const text = Array.isArray(choices) ? choices[0]?.message?.content : undefined;
    // Non-string content (null, arrays, objects) is reported as "no text".
    return typeof text === 'string' ? text : '';
}
553
636
  const DEFAULT_PROXY_CONFIG = {
554
637
  enabled: true,
555
638
  modelOverrides: {},
@@ -580,6 +663,11 @@ const DEFAULT_PROXY_CONFIG = {
580
663
  },
581
664
  },
582
665
  };
666
/** Module-level ref to active proxy config (set during startProxy) */
let _activeProxyConfig = {};
/**
 * Report whether request/response content may be captured for logging.
 *
 * Content logging is opt-out: it is enabled unless the active config sets
 * `dashboard.showRequestContent` to exactly `false`. A missing or nullish
 * config is treated the same as an empty one rather than throwing.
 *
 * @returns {boolean} `false` only when the setting is explicitly `false`.
 */
function isContentLoggingEnabled() {
    return _activeProxyConfig?.dashboard?.showRequestContent !== false;
}
583
671
  function getProxyConfigPath() {
584
672
  const customPath = process.env['RELAYPLANE_CONFIG_PATH'];
585
673
  if (customPath && customPath.trim())
@@ -773,6 +861,23 @@ function classifyComplexity(messages) {
773
861
  score += 1;
774
862
  if (andCount >= 5)
775
863
  score += 1;
864
+ // Calculate total tokens across ALL messages, not just last user message.
865
+ // For agent workloads (OpenClaw, aider, Claude Code) the last user message is
866
+ // often tiny while the real complexity lives in the 100K+ token context.
867
+ const allText = extractMessageText(messages);
868
+ const totalTokens = Math.ceil(allText.length / 4);
869
+ // Context size floor — use as a hard signal regardless of last-message score
870
+ if (totalTokens > 100000)
871
+ score += 5; // definitely complex
872
+ else if (totalTokens > 50000)
873
+ score += 3; // likely moderate+
874
+ else if (totalTokens > 20000)
875
+ score += 2;
876
+ // Message count signal — long conversations imply multi-step reasoning
877
+ if (messages.length > 50)
878
+ score += 2;
879
+ else if (messages.length > 20)
880
+ score += 1;
776
881
  if (score >= 4)
777
882
  return 'complex';
778
883
  if (score >= 2)
@@ -1505,11 +1610,13 @@ function convertAnthropicStreamEvent(eventType, eventData, messageId, model, too
1505
1610
  const msg = eventData['message'];
1506
1611
  baseChunk.id = msg?.['id'] || messageId;
1507
1612
  choice.delta = { role: 'assistant', content: '' };
1508
- // Pass through input token count from message_start
1613
+ // Pass through input token count from message_start (including cache tokens)
1509
1614
  const msgUsage = msg?.['usage'];
1510
1615
  if (msgUsage) {
1511
1616
  baseChunk['usage'] = {
1512
1617
  prompt_tokens: msgUsage['input_tokens'] ?? 0,
1618
+ cache_creation_tokens: msgUsage['cache_creation_input_tokens'] ?? 0,
1619
+ cache_read_tokens: msgUsage['cache_read_input_tokens'] ?? 0,
1513
1620
  };
1514
1621
  }
1515
1622
  return `data: ${JSON.stringify(baseChunk)}\n\n`;
@@ -1749,7 +1856,7 @@ function resolveExplicitModel(modelName) {
1749
1856
  function resolveConfigModel(modelName) {
1750
1857
  return resolveExplicitModel(modelName) ?? parsePreferredModel(modelName);
1751
1858
  }
1752
- function extractResponseText(responseData) {
1859
+ function extractResponseTextAuto(responseData) {
1753
1860
  const openAiChoices = responseData['choices'];
1754
1861
  if (openAiChoices && openAiChoices.length > 0) {
1755
1862
  const first = openAiChoices[0];
@@ -1917,7 +2024,7 @@ async function cascadeRequest(config, makeRequest, log) {
1917
2024
  const isLastModel = i === config.models.length - 1;
1918
2025
  try {
1919
2026
  const { responseData, provider, model: resolvedModel } = await makeRequest(model);
1920
- const text = extractResponseText(responseData);
2027
+ const text = extractResponseTextAuto(responseData);
1921
2028
  if (isLastModel || escalations >= config.maxEscalations) {
1922
2029
  return { responseData, provider, model: resolvedModel, escalations };
1923
2030
  }
@@ -1969,6 +2076,7 @@ td{padding:8px 12px;border-bottom:1px solid #111318}
1969
2076
  .vstat.unavailable{color:#a3a3a3;border-color:#52525b66;background:#18181b66}
1970
2077
  @media(max-width:768px){.col-tt,.col-cx{display:none}}
1971
2078
  .prov{display:flex;gap:16px;flex-wrap:wrap}.prov-item{display:flex;align-items:center;font-size:.85rem;background:#111318;padding:8px 14px;border-radius:8px;border:1px solid #1e293b}
2079
+ .rename-btn{background:none;border:none;cursor:pointer;font-size:.75rem;opacity:.5;padding:2px}.rename-btn:hover{opacity:1}
1972
2080
  </style></head><body>
1973
2081
  <div class="header"><div><h1>⚡ RelayPlane Dashboard</h1></div><div class="meta"><a href="/dashboard/config">Config</a> · <span id="ver"></span><span id="vstat" class="vstat unavailable">Unable to check</span> · up <span id="uptime"></span> · refreshes every 5s</div></div>
1974
2082
  <div class="cards">
@@ -1979,9 +2087,11 @@ td{padding:8px 12px;border-bottom:1px solid #111318}
1979
2087
  </div>
1980
2088
  <div class="section"><h2>Model Breakdown</h2>
1981
2089
  <table><thead><tr><th>Model</th><th>Requests</th><th>Cost</th><th>% of Total</th></tr></thead><tbody id="models"></tbody></table></div>
2090
+ <div class="section"><h2>Agent Cost Breakdown</h2>
2091
+ <table><thead><tr><th>Agent</th><th>Requests</th><th>Total Cost</th><th>Last Active</th><th></th></tr></thead><tbody id="agents"></tbody></table></div>
1982
2092
  <div class="section"><h2>Provider Status</h2><div class="prov" id="providers"></div></div>
1983
2093
  <div class="section"><h2>Recent Runs</h2>
1984
- <table><thead><tr><th>Time</th><th>Model</th><th class="col-tt">Task Type</th><th class="col-cx">Complexity</th><th>Tokens In</th><th>Tokens Out</th><th>Cost</th><th>Latency</th><th>Status</th></tr></thead><tbody id="runs"></tbody></table></div>
2094
+ <table><thead><tr><th>Time</th><th>Model</th><th class="col-tt">Task Type</th><th class="col-cx">Complexity</th><th>Tokens In</th><th>Tokens Out</th><th class="col-cache">Cache Create</th><th class="col-cache">Cache Read</th><th>Cost</th><th>Latency</th><th>Status</th></tr></thead><tbody id="runs"></tbody></table></div>
1985
2095
  <script>
1986
2096
  const $ = id => document.getElementById(id);
1987
2097
  function fmt(n,d=2){return typeof n==='number'?n.toFixed(d):'-'}
@@ -1989,12 +2099,13 @@ function fmtTime(s){const d=new Date(s);return d.toLocaleTimeString()}
1989
2099
  function dur(s){const h=Math.floor(s/3600),m=Math.floor(s%3600/60);return h?h+'h '+m+'m':m+'m'}
1990
2100
  async function load(){
1991
2101
  try{
1992
- const [health,stats,runsR,sav,provH]=await Promise.all([
2102
+ const [health,stats,runsR,sav,provH,agentsR]=await Promise.all([
1993
2103
  fetch('/health').then(r=>r.json()),
1994
2104
  fetch('/v1/telemetry/stats').then(r=>r.json()),
1995
2105
  fetch('/v1/telemetry/runs?limit=20').then(r=>r.json()),
1996
2106
  fetch('/v1/telemetry/savings').then(r=>r.json()),
1997
- fetch('/v1/telemetry/health').then(r=>r.json())
2107
+ fetch('/v1/telemetry/health').then(r=>r.json()),
2108
+ fetch('/api/agents').then(r=>r.json()).catch(()=>({agents:[]}))
1998
2109
  ]);
1999
2110
  $('ver').textContent='v'+health.version;
2000
2111
  $('uptime').textContent=dur(health.uptime);
@@ -2021,9 +2132,26 @@ async function load(){
2021
2132
  ).join('')||'<tr><td colspan=4 style="color:#64748b">No data yet</td></tr>';
2022
2133
  function ttCls(t){const m={code_generation:'tt-code',analysis:'tt-analysis',summarization:'tt-summarization',question_answering:'tt-qa'};return m[t]||'tt-general'}
2023
2134
  function cxCls(c){const m={simple:'cx-simple',moderate:'cx-moderate',complex:'cx-complex'};return m[c]||'cx-simple'}
2024
- $('runs').innerHTML=(runsR.runs||[]).map(r=>
2025
- '<tr><td>'+fmtTime(r.started_at)+'</td><td>'+r.model+'</td><td class="col-tt"><span class="badge '+ttCls(r.taskType)+'">'+(r.taskType||'general').replace(/_/g,' ')+'</span></td><td class="col-cx"><span class="badge '+cxCls(r.complexity)+'">'+(r.complexity||'simple')+'</span></td><td>'+(r.tokensIn||0)+'</td><td>'+(r.tokensOut||0)+'</td><td>$'+fmt(r.costUsd,4)+'</td><td>'+r.latencyMs+'ms</td><td><span class="badge '+(r.status==='success'?'ok':'err')+'">'+r.status+'</span></td></tr>'
2026
- ).join('')||'<tr><td colspan=9 style="color:#64748b">No runs yet</td></tr>';
2135
+ function esc(s){if(!s)return'';return s.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')}
2136
+ $('runs').innerHTML=(runsR.runs||[]).map((r,i)=>{
2137
+ const row='<tr style="cursor:pointer" onclick="toggleDetail('+i+')" title="Click to expand"><td>'+fmtTime(r.started_at)+'</td><td>'+r.model+'</td><td class="col-tt"><span class="badge '+ttCls(r.taskType)+'">'+(r.taskType||'general').replace(/_/g,' ')+'</span></td><td class="col-cx"><span class="badge '+cxCls(r.complexity)+'">'+(r.complexity||'simple')+'</span></td><td>'+(r.tokensIn||0)+'</td><td>'+(r.tokensOut||0)+'</td><td class="col-cache" style="color:#60a5fa">'+(r.cacheCreationTokens||0)+'</td><td class="col-cache" style="color:#34d399">'+(r.cacheReadTokens||0)+'</td><td>$'+fmt(r.costUsd,4)+'</td><td>'+r.latencyMs+'ms</td><td><span class="badge '+(r.status==='success'?'ok':'err')+'">'+r.status+'</span></td></tr>';
2138
+ const c=r.requestContent||{};
2139
+ let detail='<tr id="run-detail-'+i+'" style="display:none"><td colspan="11" style="padding:16px;background:#111217;border-bottom:1px solid #1e293b">';
2140
+ if(c.systemPrompt||c.userMessage||c.responsePreview){
2141
+ if(c.systemPrompt) detail+='<div style="color:#64748b;font-size:.85rem;margin-bottom:10px;font-style:italic"><strong style="color:#94a3b8">System:</strong> '+esc(c.systemPrompt)+'</div>';
2142
+ if(c.userMessage) detail+='<div style="background:#1a1c23;border:1px solid #1e293b;border-radius:8px;padding:12px;margin-bottom:10px"><strong style="color:#94a3b8;font-size:.8rem">User Message</strong><div style="margin-top:6px;white-space:pre-wrap">'+esc(c.userMessage)+'</div></div>';
2143
+ if(c.responsePreview) detail+='<div style="background:#1a1c23;border:1px solid #1e293b;border-radius:8px;padding:12px;margin-bottom:10px"><strong style="color:#94a3b8;font-size:.8rem">Response Preview</strong><div style="margin-top:6px;white-space:pre-wrap">'+esc(c.responsePreview)+'</div></div>';
2144
+ detail+='<button onclick="event.stopPropagation();loadFullResponse(\''+r.id+'\','+i+')" id="full-btn-'+i+'" style="background:#1e293b;color:#e2e8f0;border:1px solid #334155;padding:6px 12px;border-radius:6px;cursor:pointer;font-size:.8rem">Show full response</button><pre id="full-resp-'+i+'" style="display:none;white-space:pre-wrap;margin-top:10px;background:#0d0e11;border:1px solid #1e293b;border-radius:8px;padding:12px;max-height:400px;overflow:auto;font-size:.8rem"></pre>';
2145
+ } else {
2146
+ detail+='<span style="color:#64748b">No content captured for this request</span>';
2147
+ }
2148
+ detail+='</td></tr>';
2149
+ return row+detail;
2150
+ }).join('')||'<tr><td colspan=11 style="color:#64748b">No runs yet</td></tr>';
2151
+ const agents=(agentsR.agents||[]).sort((a,b)=>(b.totalCost||0)-(a.totalCost||0));
2152
+ $('agents').innerHTML=agents.length?agents.map(a=>
2153
+ '<tr><td><span class="agent-name" data-fp="'+a.fingerprint+'">'+a.name+'</span> <button class="rename-btn" onclick="renameAgent(\''+a.fingerprint+'\',\''+a.name.replace(/'/g,"\\'")+'\')">✏️</button></td><td>'+a.totalRequests+'</td><td>$'+fmt(a.totalCost,4)+'</td><td>'+fmtTime(a.lastSeen)+'</td><td style="font-size:.7rem;color:#64748b" title="'+a.systemPromptPreview+'">'+a.fingerprint+'</td></tr>'
2154
+ ).join(''):'<tr><td colspan=5 style="color:#64748b">No agents detected yet</td></tr>';
2027
2155
  $('providers').innerHTML=(provH.providers||[]).map(p=>{
2028
2156
  const dotClass = p.status==='healthy'?'up':(p.status==='degraded'?'warn':'down');
2029
2157
  const rate = p.successRate!==undefined?(' '+Math.round(p.successRate*100)+'%'):'';
@@ -2031,8 +2159,27 @@ async function load(){
2031
2159
  }).join('');
2032
2160
  }catch(e){console.error(e)}
2033
2161
  }
2162
+ async function renameAgent(fp,currentName){
2163
+ const name=prompt('Rename agent:',currentName);
2164
+ if(!name||name===currentName)return;
2165
+ await fetch('/api/agents/rename',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({fingerprint:fp,name:name})});
2166
+ load();
2167
+ }
2168
+ function toggleDetail(i){var d=document.getElementById('run-detail-'+i);d.style.display=d.style.display==='none'?'table-row':'none'}
2169
+ async function loadFullResponse(runId,i){
2170
+ const btn=document.getElementById('full-btn-'+i);
2171
+ const pre=document.getElementById('full-resp-'+i);
2172
+ if(pre.style.display!=='none'){pre.style.display='none';btn.textContent='Show full response';return}
2173
+ btn.textContent='Loading...';
2174
+ try{
2175
+ const data=await fetch('/api/runs/'+runId).then(r=>r.json());
2176
+ const full=data.requestContent&&data.requestContent.fullResponse;
2177
+ if(full){pre.textContent=full;pre.style.display='block';btn.textContent='Hide full response'}
2178
+ else{btn.textContent='No full response available'}
2179
+ }catch{btn.textContent='Error loading response'}
2180
+ }
2034
2181
  load();setInterval(load,5000);
2035
- </script></body></html>`;
2182
+ </script><footer style="text-align:center;padding:20px 0;color:#475569;font-size:.75rem;border-top:1px solid #1e293b;margin-top:20px">🔒 Request content stays on your machine. Never sent to cloud.</footer></body></html>`;
2036
2183
  }
2037
2184
  function getConfigDashboardHTML() {
2038
2185
  return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1"><title>RelayPlane Config</title>
@@ -2129,8 +2276,10 @@ async function startProxy(config = {}) {
2129
2276
  };
2130
2277
  // Load persistent history from disk
2131
2278
  loadHistoryFromDisk();
2279
+ (0, agent_tracker_js_1.loadAgentRegistry)();
2132
2280
  // Flush history on shutdown
2133
2281
  const handleShutdown = () => {
2282
+ (0, agent_tracker_js_1.flushAgentRegistry)();
2134
2283
  meshHandle.stop();
2135
2284
  shutdownHistory();
2136
2285
  process.exit(0);
@@ -2139,6 +2288,7 @@ async function startProxy(config = {}) {
2139
2288
  process.on('SIGTERM', handleShutdown);
2140
2289
  const configPath = getProxyConfigPath();
2141
2290
  let proxyConfig = await loadProxyConfig(configPath, log);
2291
+ _activeProxyConfig = proxyConfig;
2142
2292
  const cooldownManager = new CooldownManager(getCooldownConfig(proxyConfig));
2143
2293
  // === Startup config validation (Task 4) ===
2144
2294
  try {
@@ -2516,7 +2666,9 @@ async function startProxy(config = {}) {
2516
2666
  const offset = parseInt(params.get('offset') || '0', 10);
2517
2667
  const sorted = [...requestHistory].reverse();
2518
2668
  const runs = sorted.slice(offset, offset + limit).map(r => {
2519
- const origCost = (0, telemetry_js_1.estimateCost)('claude-opus-4-6', r.tokensIn, r.tokensOut);
2669
+ // Savings should reflect routing decisions only — pass same cache tokens to baseline
2670
+ // so the cache discount doesn't get counted as "savings from routing"
2671
+ const origCost = (0, telemetry_js_1.estimateCost)('claude-opus-4-6', r.tokensIn, r.tokensOut, r.cacheCreationTokens || undefined, r.cacheReadTokens || undefined);
2520
2672
  const perRunSavings = Math.max(0, origCost - r.costUsd);
2521
2673
  return {
2522
2674
  id: r.id,
@@ -2536,8 +2688,16 @@ async function startProxy(config = {}) {
2536
2688
  latencyMs: r.latencyMs,
2537
2689
  tokensIn: r.tokensIn,
2538
2690
  tokensOut: r.tokensOut,
2691
+ cacheCreationTokens: r.cacheCreationTokens ?? 0,
2692
+ cacheReadTokens: r.cacheReadTokens ?? 0,
2539
2693
  savings: Math.round(perRunSavings * 10000) / 10000,
2540
2694
  escalated: r.escalated,
2695
+ requestContent: r.requestContent ? {
2696
+ systemPrompt: r.requestContent.systemPrompt,
2697
+ userMessage: r.requestContent.userMessage,
2698
+ responsePreview: r.requestContent.responsePreview,
2699
+ // fullResponse excluded from list endpoint to keep payloads small
2700
+ } : undefined,
2541
2701
  };
2542
2702
  });
2543
2703
  res.writeHead(200, { 'Content-Type': 'application/json' });
@@ -2553,7 +2713,9 @@ async function startProxy(config = {}) {
2553
2713
  let totalSavedAmount = 0;
2554
2714
  const byDayMap = new Map();
2555
2715
  for (const r of requestHistory) {
2556
- const origCost = (0, telemetry_js_1.estimateCost)(OPUS_BASELINE, r.tokensIn, r.tokensOut);
2716
+ // Pass same cache tokens to baseline so savings only reflect routing decisions,
2717
+ // not prompt-cache discounts (those happen regardless of which model is chosen).
2718
+ const origCost = (0, telemetry_js_1.estimateCost)(OPUS_BASELINE, r.tokensIn, r.tokensOut, r.cacheCreationTokens || undefined, r.cacheReadTokens || undefined);
2557
2719
  const actualCost = r.costUsd;
2558
2720
  const saved = Math.max(0, origCost - actualCost);
2559
2721
  totalOriginalCost += origCost;
@@ -2633,6 +2795,63 @@ async function startProxy(config = {}) {
2633
2795
  res.end(JSON.stringify({ error: 'Not found' }));
2634
2796
  return;
2635
2797
  }
2798
+ // === Agent tracking API ===
2799
+ // === /api/runs/:id — full request/response content for a single run ===
2800
+ const runsIdMatch = pathname.match(/^\/api\/runs\/(.+)$/);
2801
+ if (req.method === 'GET' && runsIdMatch) {
2802
+ const runId = runsIdMatch[1];
2803
+ const run = requestHistory.find(r => r.id === runId);
2804
+ if (!run) {
2805
+ res.writeHead(404, { 'Content-Type': 'application/json' });
2806
+ res.end(JSON.stringify({ error: 'Run not found' }));
2807
+ return;
2808
+ }
2809
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2810
+ res.end(JSON.stringify({
2811
+ id: run.id,
2812
+ model: run.targetModel,
2813
+ provider: run.provider,
2814
+ timestamp: run.timestamp,
2815
+ tokensIn: run.tokensIn,
2816
+ tokensOut: run.tokensOut,
2817
+ costUsd: run.costUsd,
2818
+ latencyMs: run.latencyMs,
2819
+ success: run.success,
2820
+ requestContent: run.requestContent,
2821
+ }));
2822
+ return;
2823
+ }
2824
+ if (req.method === 'GET' && pathname === '/api/agents') {
2825
+ const summaries = (0, agent_tracker_js_1.getAgentSummaries)(requestHistory);
2826
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2827
+ res.end(JSON.stringify({ agents: summaries }));
2828
+ return;
2829
+ }
2830
+ if (req.method === 'POST' && pathname === '/api/agents/rename') {
2831
+ try {
2832
+ const body = await readJsonBody(req);
2833
+ const fingerprint = body['fingerprint'];
2834
+ const name = body['name'];
2835
+ if (!fingerprint || !name) {
2836
+ res.writeHead(400, { 'Content-Type': 'application/json' });
2837
+ res.end(JSON.stringify({ error: 'Missing fingerprint or name' }));
2838
+ return;
2839
+ }
2840
+ const ok = (0, agent_tracker_js_1.renameAgent)(fingerprint, name);
2841
+ if (!ok) {
2842
+ res.writeHead(404, { 'Content-Type': 'application/json' });
2843
+ res.end(JSON.stringify({ error: 'Agent not found' }));
2844
+ return;
2845
+ }
2846
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2847
+ res.end(JSON.stringify({ ok: true }));
2848
+ }
2849
+ catch {
2850
+ res.writeHead(400, { 'Content-Type': 'application/json' });
2851
+ res.end(JSON.stringify({ error: 'Invalid JSON' }));
2852
+ }
2853
+ return;
2854
+ }
2636
2855
  // === Dashboard ===
2637
2856
  if (req.method === 'GET' && (pathname === '/' || pathname === '/dashboard')) {
2638
2857
  res.writeHead(200, { 'Content-Type': 'text/html' });
@@ -2713,6 +2932,14 @@ async function startProxy(config = {}) {
2713
2932
  res.end(JSON.stringify({ error: 'Invalid JSON' }));
2714
2933
  return;
2715
2934
  }
2935
+ // Extract agent fingerprint and explicit agent ID
2936
+ const nativeSystemPrompt = (0, agent_tracker_js_1.extractSystemPromptFromBody)(requestBody);
2937
+ const nativeExplicitAgentId = getHeaderValue(req, 'x-relayplane-agent') || undefined;
2938
+ let nativeAgentFingerprint;
2939
+ if (nativeSystemPrompt) {
2940
+ const agentResult = (0, agent_tracker_js_1.trackAgent)(nativeSystemPrompt, 0, nativeExplicitAgentId);
2941
+ nativeAgentFingerprint = agentResult.fingerprint;
2942
+ }
2716
2943
  const originalModel = requestBody['model'];
2717
2944
  let requestedModel = headerModelOverride ?? originalModel ?? '';
2718
2945
  if (headerModelOverride) {
@@ -3151,7 +3378,7 @@ async function startProxy(config = {}) {
3151
3378
  model: targetModel || requestedModel,
3152
3379
  tokensIn: nativeUsage?.input_tokens ?? 0,
3153
3380
  tokensOut: nativeUsage?.output_tokens ?? 0,
3154
- costUsd: (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeUsage?.input_tokens ?? 0, nativeUsage?.output_tokens ?? 0),
3381
+ costUsd: (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeUsage?.input_tokens ?? 0, nativeUsage?.output_tokens ?? 0, nativeUsage?.cache_creation_input_tokens || undefined, nativeUsage?.cache_read_input_tokens || undefined),
3155
3382
  taskType,
3156
3383
  });
3157
3384
  log(`Cache STORE for ${targetModel || requestedModel} (hash: ${cacheHash.slice(0, 8)})`);
@@ -3174,7 +3401,22 @@ async function startProxy(config = {}) {
3174
3401
  const nativeTokIn = nativeBaseTokIn + nativeCacheCreation + nativeCacheRead;
3175
3402
  // Cost calculation expects inputTokens to include cache tokens when cache params are provided
3176
3403
  const nativeCostUsd = (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCacheCreation || undefined, nativeCacheRead || undefined);
3177
- updateLastHistoryEntry(nativeTokIn, nativeTokOut, nativeCostUsd);
3404
+ // Build request content if logging enabled
3405
+ let nativeContentData;
3406
+ if (isContentLoggingEnabled()) {
3407
+ const extracted = extractRequestContent(requestBody, true);
3408
+ const responseText = nativeResponseData ? extractResponseText(nativeResponseData, true) : '';
3409
+ nativeContentData = {
3410
+ ...extracted,
3411
+ responsePreview: responseText ? responseText.slice(0, 500) : undefined,
3412
+ fullResponse: responseText || undefined,
3413
+ };
3414
+ }
3415
+ updateLastHistoryEntry(nativeTokIn, nativeTokOut, nativeCostUsd, undefined, nativeCacheCreation || undefined, nativeCacheRead || undefined, nativeAgentFingerprint, nativeExplicitAgentId, nativeContentData);
3416
+ // Update agent cost now that we know the actual cost
3417
+ if (nativeAgentFingerprint && nativeAgentFingerprint !== 'unknown') {
3418
+ (0, agent_tracker_js_1.updateAgentCost)(nativeAgentFingerprint, nativeCostUsd);
3419
+ }
3178
3420
  // ── Post-request: budget spend + anomaly detection ──
3179
3421
  postRequestRecord(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCostUsd);
3180
3422
  if (recordTelemetry) {
@@ -3183,6 +3425,10 @@ async function startProxy(config = {}) {
3183
3425
  prompt: promptText.slice(0, 500),
3184
3426
  taskType,
3185
3427
  model: `${targetProvider}:${targetModel || requestedModel}`,
3428
+ })
3429
+ .then((runResult) => {
3430
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
3431
+ relay.patchRunTokens(runResult.runId, nativeTokIn, nativeTokOut, nativeCostUsd);
3186
3432
  })
3187
3433
  .catch(() => { });
3188
3434
  sendCloudTelemetry(taskType, targetModel || requestedModel, nativeTokIn, nativeTokOut, durationMs, true, undefined, originalModel ?? undefined, nativeCacheCreation || undefined, nativeCacheRead || undefined);
@@ -3268,6 +3514,14 @@ async function startProxy(config = {}) {
3268
3514
  return;
3269
3515
  }
3270
3516
  const isStreaming = request.stream === true;
3517
+ // Extract agent fingerprint for chat/completions
3518
+ const chatSystemPrompt = (0, agent_tracker_js_1.extractSystemPromptFromBody)(request);
3519
+ const chatExplicitAgentId = getHeaderValue(req, 'x-relayplane-agent') || undefined;
3520
+ let chatAgentFingerprint;
3521
+ if (chatSystemPrompt) {
3522
+ const agentResult = (0, agent_tracker_js_1.trackAgent)(chatSystemPrompt, 0, chatExplicitAgentId);
3523
+ chatAgentFingerprint = agentResult.fingerprint;
3524
+ }
3271
3525
  // ── Response Cache: check for cached response (chat/completions) ──
3272
3526
  const chatCacheBypass = responseCache.shouldBypass(request);
3273
3527
  let chatCacheHash;
@@ -3568,7 +3822,7 @@ async function startProxy(config = {}) {
3568
3822
  const startTime = Date.now();
3569
3823
  // Handle streaming vs non-streaming
3570
3824
  if (isStreaming) {
3571
- await handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, useCascade ? 'cascade' : routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatCacheHash, chatCacheBypass);
3825
+ await handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, useCascade ? 'cascade' : routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatCacheHash, chatCacheBypass, chatAgentFingerprint, chatExplicitAgentId);
3572
3826
  }
3573
3827
  else {
3574
3828
  if (useCascade && cascadeConfig) {
@@ -3605,8 +3859,12 @@ async function startProxy(config = {}) {
3605
3859
  const cascadeUsage = responseData?.usage;
3606
3860
  const cascadeTokensIn = cascadeUsage?.input_tokens ?? cascadeUsage?.prompt_tokens ?? 0;
3607
3861
  const cascadeTokensOut = cascadeUsage?.output_tokens ?? cascadeUsage?.completion_tokens ?? 0;
3608
- const cascadeCost = (0, telemetry_js_1.estimateCost)(cascadeResult.model, cascadeTokensIn, cascadeTokensOut);
3609
- updateLastHistoryEntry(cascadeTokensIn, cascadeTokensOut, cascadeCost, chatCascadeRespModel);
3862
+ const cascadeCacheCreation = cascadeUsage?.cache_creation_input_tokens || undefined;
3863
+ const cascadeCacheRead = cascadeUsage?.cache_read_input_tokens || undefined;
3864
+ const cascadeCost = (0, telemetry_js_1.estimateCost)(cascadeResult.model, cascadeTokensIn, cascadeTokensOut, cascadeCacheCreation, cascadeCacheRead);
3865
+ updateLastHistoryEntry(cascadeTokensIn, cascadeTokensOut, cascadeCost, chatCascadeRespModel, cascadeCacheCreation, cascadeCacheRead, chatAgentFingerprint, chatExplicitAgentId);
3866
+ if (chatAgentFingerprint && chatAgentFingerprint !== 'unknown')
3867
+ (0, agent_tracker_js_1.updateAgentCost)(chatAgentFingerprint, cascadeCost);
3610
3868
  if (recordTelemetry) {
3611
3869
  try {
3612
3870
  const runResult = await relay.run({
@@ -3614,6 +3872,8 @@ async function startProxy(config = {}) {
3614
3872
  taskType,
3615
3873
  model: `${cascadeResult.provider}:${cascadeResult.model}`,
3616
3874
  });
3875
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
3876
+ relay.patchRunTokens(runResult.runId, cascadeTokensIn, cascadeTokensOut, cascadeCost);
3617
3877
  responseData['_relayplane'] = {
3618
3878
  runId: runResult.runId,
3619
3879
  routedTo: `${cascadeResult.provider}/${cascadeResult.model}`,
@@ -3628,7 +3888,7 @@ async function startProxy(config = {}) {
3628
3888
  catch (err) {
3629
3889
  log(`Failed to record run: ${err}`);
3630
3890
  }
3631
- sendCloudTelemetry(taskType, cascadeResult.model, cascadeTokensIn, cascadeTokensOut, durationMs, true, undefined, originalRequestedModel ?? undefined);
3891
+ sendCloudTelemetry(taskType, cascadeResult.model, cascadeTokensIn, cascadeTokensOut, durationMs, true, undefined, originalRequestedModel ?? undefined, cascadeCacheCreation, cascadeCacheRead);
3632
3892
  meshCapture(cascadeResult.model, cascadeResult.provider, taskType, cascadeTokensIn, cascadeTokensOut, cascadeCost, durationMs, true);
3633
3893
  }
3634
3894
  const chatCascadeRpHeaders = buildRelayPlaneResponseHeaders(cascadeResult.model, originalRequestedModel ?? 'unknown', complexity, cascadeResult.provider, 'cascade');
@@ -3649,7 +3909,7 @@ async function startProxy(config = {}) {
3649
3909
  }
3650
3910
  }
3651
3911
  else {
3652
- await handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity);
3912
+ await handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatAgentFingerprint, chatExplicitAgentId);
3653
3913
  }
3654
3914
  }
3655
3915
  });
@@ -3792,7 +4052,7 @@ async function executeNonStreamingProviderRequest(request, targetProvider, targe
3792
4052
  }
3793
4053
  return { responseData, ok: true, status: 200 };
3794
4054
  }
3795
- async function handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', cacheHash, cacheBypass) {
4055
+ async function handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', cacheHash, cacheBypass, agentFingerprint, agentId) {
3796
4056
  let providerResponse;
3797
4057
  try {
3798
4058
  switch (targetProvider) {
@@ -3845,9 +4105,11 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3845
4105
  'Connection': 'keep-alive',
3846
4106
  ...streamRpHeaders,
3847
4107
  });
3848
- // Track token usage from streaming events
4108
+ // Track token usage from streaming events (including Anthropic prompt cache tokens)
3849
4109
  let streamTokensIn = 0;
3850
4110
  let streamTokensOut = 0;
4111
+ let streamCacheCreation = 0;
4112
+ let streamCacheRead = 0;
3851
4113
  const shouldCacheStream = !!(cacheHash && !cacheBypass);
3852
4114
  const rawChunks = [];
3853
4115
  try {
@@ -3859,7 +4121,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3859
4121
  res.write(chunk);
3860
4122
  if (shouldCacheStream)
3861
4123
  rawChunks.push(chunk);
3862
- // Parse OpenAI-format chunks for usage (emitted at end of stream)
4124
+ // Parse OpenAI-format chunks for usage the converter embeds
4125
+ // cache_creation_tokens and cache_read_tokens from message_start.
3863
4126
  try {
3864
4127
  const lines = chunk.split('\n');
3865
4128
  for (const line of lines) {
@@ -3868,6 +4131,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3868
4131
  if (evt.usage) {
3869
4132
  streamTokensIn = evt.usage.prompt_tokens ?? streamTokensIn;
3870
4133
  streamTokensOut = evt.usage.completion_tokens ?? streamTokensOut;
4134
+ streamCacheCreation = evt.usage.cache_creation_tokens ?? streamCacheCreation;
4135
+ streamCacheRead = evt.usage.cache_read_tokens ?? streamCacheRead;
3871
4136
  }
3872
4137
  }
3873
4138
  }
@@ -3927,13 +4192,13 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3927
4192
  const streamPayload = JSON.stringify({
3928
4193
  _relayplaneStreamCache: true,
3929
4194
  ssePayload: rawChunks.join(''),
3930
- usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, prompt_tokens: streamTokensIn, completion_tokens: streamTokensOut },
4195
+ usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, prompt_tokens: streamTokensIn, completion_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead },
3931
4196
  });
3932
4197
  responseCache.set(cacheHash, streamPayload, {
3933
4198
  model: targetModel,
3934
4199
  tokensIn: streamTokensIn,
3935
4200
  tokensOut: streamTokensOut,
3936
- costUsd: (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut),
4201
+ costUsd: (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined),
3937
4202
  taskType,
3938
4203
  });
3939
4204
  log(`Cache STORE (stream) for chat/completions ${targetModel} (hash: ${cacheHash.slice(0, 8)})`);
@@ -3944,9 +4209,11 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3944
4209
  const durationMs = Date.now() - startTime;
3945
4210
  // Always log the request for stats/telemetry tracking
3946
4211
  logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, true, routingMode, undefined, taskType, complexity);
3947
- // Update token/cost info on the history entry
3948
- const streamCost = (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut);
3949
- updateLastHistoryEntry(streamTokensIn, streamTokensOut, streamCost);
4212
+ // Update token/cost info on the history entry (with cache token discount)
4213
+ const streamCost = (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined);
4214
+ updateLastHistoryEntry(streamTokensIn, streamTokensOut, streamCost, undefined, streamCacheCreation || undefined, streamCacheRead || undefined, agentFingerprint, agentId);
4215
+ if (agentFingerprint && agentFingerprint !== 'unknown')
4216
+ (0, agent_tracker_js_1.updateAgentCost)(agentFingerprint, streamCost);
3950
4217
  // ── Post-request: budget spend + anomaly detection ──
3951
4218
  try {
3952
4219
  (0, budget_js_1.getBudgetManager)().recordSpend(streamCost, targetModel);
@@ -3967,12 +4234,14 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3967
4234
  model: `${targetProvider}:${targetModel}`,
3968
4235
  })
3969
4236
  .then((runResult) => {
4237
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
4238
+ relay.patchRunTokens(runResult.runId, streamTokensIn, streamTokensOut, streamCost);
3970
4239
  log(`Completed streaming in ${durationMs}ms, runId: ${runResult.runId}`);
3971
4240
  })
3972
4241
  .catch((err) => {
3973
4242
  log(`Failed to record run: ${err}`);
3974
4243
  });
3975
- sendCloudTelemetry(taskType, targetModel, streamTokensIn, streamTokensOut, durationMs, true, undefined, request.model ?? undefined);
4244
+ sendCloudTelemetry(taskType, targetModel, streamTokensIn, streamTokensOut, durationMs, true, undefined, request.model ?? undefined, streamCacheCreation || undefined, streamCacheRead || undefined);
3976
4245
  meshCapture(targetModel, targetProvider, taskType, streamTokensIn, streamTokensOut, streamCost, durationMs, true);
3977
4246
  }
3978
4247
  res.end();
@@ -3980,7 +4249,7 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3980
4249
  /**
3981
4250
  * Handle non-streaming request
3982
4251
  */
3983
- async function handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple') {
4252
+ async function handleNonStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', agentFingerprint, agentId) {
3984
4253
  let responseData;
3985
4254
  try {
3986
4255
  const result = await executeNonStreamingProviderRequest(request, targetProvider, targetModel, apiKey, ctx);
@@ -4015,12 +4284,16 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
4015
4284
  const nonStreamRespModel = checkResponseModelMismatch(responseData, targetModel, targetProvider, log);
4016
4285
  // Log the successful request
4017
4286
  logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, true, routingMode, undefined, taskType, complexity);
4018
- // Update token/cost info
4287
+ // Update token/cost info (including Anthropic prompt cache tokens)
4019
4288
  const usage = responseData?.usage;
4020
4289
  const tokensIn = usage?.input_tokens ?? usage?.prompt_tokens ?? 0;
4021
4290
  const tokensOut = usage?.output_tokens ?? usage?.completion_tokens ?? 0;
4022
- const cost = (0, telemetry_js_1.estimateCost)(targetModel, tokensIn, tokensOut);
4023
- updateLastHistoryEntry(tokensIn, tokensOut, cost, nonStreamRespModel);
4291
+ const cacheCreationTokens = usage?.cache_creation_input_tokens ?? 0;
4292
+ const cacheReadTokens = usage?.cache_read_input_tokens ?? 0;
4293
+ const cost = (0, telemetry_js_1.estimateCost)(targetModel, tokensIn, tokensOut, cacheCreationTokens || undefined, cacheReadTokens || undefined);
4294
+ updateLastHistoryEntry(tokensIn, tokensOut, cost, nonStreamRespModel, cacheCreationTokens || undefined, cacheReadTokens || undefined, agentFingerprint, agentId);
4295
+ if (agentFingerprint && agentFingerprint !== 'unknown')
4296
+ (0, agent_tracker_js_1.updateAgentCost)(agentFingerprint, cost);
4024
4297
  // ── Post-request: budget spend + anomaly detection ──
4025
4298
  try {
4026
4299
  (0, budget_js_1.getBudgetManager)().recordSpend(cost, targetModel);
@@ -4040,6 +4313,8 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
4040
4313
  taskType,
4041
4314
  model: `${targetProvider}:${targetModel}`,
4042
4315
  });
4316
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
4317
+ relay.patchRunTokens(runResult.runId, tokensIn, tokensOut, cost);
4043
4318
  // Add routing metadata to response
4044
4319
  responseData['_relayplane'] = {
4045
4320
  runId: runResult.runId,
@@ -4054,12 +4329,14 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
4054
4329
  catch (err) {
4055
4330
  log(`Failed to record run: ${err}`);
4056
4331
  }
4057
- // Extract token counts from response if available (Anthropic/OpenAI format)
4058
- const usage = responseData?.usage;
4059
- const tokensIn = usage?.input_tokens ?? usage?.prompt_tokens ?? 0;
4060
- const tokensOut = usage?.output_tokens ?? usage?.completion_tokens ?? 0;
4061
- sendCloudTelemetry(taskType, targetModel, tokensIn, tokensOut, durationMs, true);
4062
- meshCapture(targetModel, targetProvider, taskType, tokensIn, tokensOut, cost, durationMs, true);
4332
+ // Extract token counts from response if available (Anthropic/OpenAI format, including cache)
4333
+ const innerUsage = responseData?.usage;
4334
+ const innerTokIn = innerUsage?.input_tokens ?? innerUsage?.prompt_tokens ?? 0;
4335
+ const innerTokOut = innerUsage?.output_tokens ?? innerUsage?.completion_tokens ?? 0;
4336
+ const innerCacheCreation = innerUsage?.cache_creation_input_tokens ?? 0;
4337
+ const innerCacheRead = innerUsage?.cache_read_input_tokens ?? 0;
4338
+ sendCloudTelemetry(taskType, targetModel, innerTokIn, innerTokOut, durationMs, true, undefined, undefined, innerCacheCreation || undefined, innerCacheRead || undefined);
4339
+ meshCapture(targetModel, targetProvider, taskType, innerTokIn, innerTokOut, cost, durationMs, true);
4063
4340
  }
4064
4341
  // ── Cache: store non-streaming chat/completions response ──
4065
4342
  const chatRespCache = (0, response_cache_js_1.getResponseCache)();