@relayplane/proxy 1.5.46 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. package/README.md +297 -20
  2. package/assets/relayplane-proxy.service +20 -0
  3. package/dist/alerts.d.ts +72 -0
  4. package/dist/alerts.d.ts.map +1 -0
  5. package/dist/alerts.js +290 -0
  6. package/dist/alerts.js.map +1 -0
  7. package/dist/anomaly.d.ts +65 -0
  8. package/dist/anomaly.d.ts.map +1 -0
  9. package/dist/anomaly.js +193 -0
  10. package/dist/anomaly.js.map +1 -0
  11. package/dist/budget.d.ts +98 -0
  12. package/dist/budget.d.ts.map +1 -0
  13. package/dist/budget.js +356 -0
  14. package/dist/budget.js.map +1 -0
  15. package/dist/cli.js +512 -93
  16. package/dist/cli.js.map +1 -1
  17. package/dist/config.d.ts +28 -2
  18. package/dist/config.d.ts.map +1 -1
  19. package/dist/config.js +122 -24
  20. package/dist/config.js.map +1 -1
  21. package/dist/downgrade.d.ts +37 -0
  22. package/dist/downgrade.d.ts.map +1 -0
  23. package/dist/downgrade.js +79 -0
  24. package/dist/downgrade.js.map +1 -0
  25. package/dist/mesh/capture.d.ts +11 -0
  26. package/dist/mesh/capture.d.ts.map +1 -0
  27. package/dist/mesh/capture.js +43 -0
  28. package/dist/mesh/capture.js.map +1 -0
  29. package/dist/mesh/fitness.d.ts +14 -0
  30. package/dist/mesh/fitness.d.ts.map +1 -0
  31. package/dist/mesh/fitness.js +40 -0
  32. package/dist/mesh/fitness.js.map +1 -0
  33. package/dist/mesh/index.d.ts +39 -0
  34. package/dist/mesh/index.d.ts.map +1 -0
  35. package/dist/mesh/index.js +118 -0
  36. package/dist/mesh/index.js.map +1 -0
  37. package/dist/mesh/store.d.ts +30 -0
  38. package/dist/mesh/store.d.ts.map +1 -0
  39. package/dist/mesh/store.js +174 -0
  40. package/dist/mesh/store.js.map +1 -0
  41. package/dist/mesh/sync.d.ts +37 -0
  42. package/dist/mesh/sync.d.ts.map +1 -0
  43. package/dist/mesh/sync.js +154 -0
  44. package/dist/mesh/sync.js.map +1 -0
  45. package/dist/mesh/types.d.ts +57 -0
  46. package/dist/mesh/types.d.ts.map +1 -0
  47. package/dist/mesh/types.js +7 -0
  48. package/dist/mesh/types.js.map +1 -0
  49. package/dist/rate-limiter.d.ts +64 -0
  50. package/dist/rate-limiter.d.ts.map +1 -0
  51. package/dist/rate-limiter.js +159 -0
  52. package/dist/rate-limiter.js.map +1 -0
  53. package/dist/relay-config.d.ts +9 -0
  54. package/dist/relay-config.d.ts.map +1 -1
  55. package/dist/relay-config.js +2 -0
  56. package/dist/relay-config.js.map +1 -1
  57. package/dist/response-cache.d.ts +139 -0
  58. package/dist/response-cache.d.ts.map +1 -0
  59. package/dist/response-cache.js +515 -0
  60. package/dist/response-cache.js.map +1 -0
  61. package/dist/server.d.ts.map +1 -1
  62. package/dist/server.js +5 -1
  63. package/dist/server.js.map +1 -1
  64. package/dist/standalone-proxy.d.ts +2 -1
  65. package/dist/standalone-proxy.d.ts.map +1 -1
  66. package/dist/standalone-proxy.js +736 -50
  67. package/dist/standalone-proxy.js.map +1 -1
  68. package/dist/telemetry.d.ts.map +1 -1
  69. package/dist/telemetry.js +21 -5
  70. package/dist/telemetry.js.map +1 -1
  71. package/dist/utils/model-suggestions.d.ts.map +1 -1
  72. package/dist/utils/model-suggestions.js +19 -2
  73. package/dist/utils/model-suggestions.js.map +1 -1
  74. package/dist/utils/version-status.d.ts +9 -0
  75. package/dist/utils/version-status.d.ts.map +1 -0
  76. package/dist/utils/version-status.js +28 -0
  77. package/dist/utils/version-status.js.map +1 -0
  78. package/package.json +7 -3
@@ -67,7 +67,16 @@ const path = __importStar(require("node:path"));
67
67
  const core_1 = require("@relayplane/core");
68
68
  const model_suggestions_js_1 = require("./utils/model-suggestions.js");
69
69
  const telemetry_js_1 = require("./telemetry.js");
70
+ const config_js_1 = require("./config.js");
71
+ const index_js_1 = require("./mesh/index.js");
72
+ const response_cache_js_1 = require("./response-cache.js");
70
73
  const stats_js_1 = require("./stats.js");
74
+ const rate_limiter_js_1 = require("./rate-limiter.js");
75
+ const budget_js_1 = require("./budget.js");
76
+ const anomaly_js_1 = require("./anomaly.js");
77
+ const alerts_js_1 = require("./alerts.js");
78
+ const downgrade_js_1 = require("./downgrade.js");
79
+ const version_status_js_1 = require("./utils/version-status.js");
71
80
  const PROXY_VERSION = (() => {
72
81
  try {
73
82
  const pkgPath = path.join(__dirname, '..', 'package.json');
@@ -77,8 +86,54 @@ const PROXY_VERSION = (() => {
77
86
  return '0.0.0';
78
87
  }
79
88
  })();
89
+ let latestProxyVersionCache = { value: null, checkedAt: 0 };
90
+ const LATEST_PROXY_VERSION_TTL_MS = 30 * 60 * 1000;
91
+ async function getLatestProxyVersion() {
92
+ const now = Date.now();
93
+ if (now - latestProxyVersionCache.checkedAt < LATEST_PROXY_VERSION_TTL_MS) {
94
+ return latestProxyVersionCache.value;
95
+ }
96
+ try {
97
+ const controller = new AbortController();
98
+ const timeout = setTimeout(() => controller.abort(), 2500);
99
+ const res = await fetch('https://registry.npmjs.org/@relayplane/proxy/latest', {
100
+ signal: controller.signal,
101
+ headers: { Accept: 'application/json' },
102
+ });
103
+ clearTimeout(timeout);
104
+ if (!res.ok) {
105
+ latestProxyVersionCache = { value: null, checkedAt: now };
106
+ return null;
107
+ }
108
+ const data = await res.json();
109
+ const latest = data.version ?? null;
110
+ latestProxyVersionCache = { value: latest, checkedAt: now };
111
+ return latest;
112
+ }
113
+ catch {
114
+ latestProxyVersionCache = { value: null, checkedAt: now };
115
+ return null;
116
+ }
117
+ }
80
118
  /** Shared stats collector instance for the proxy server */
81
119
  exports.proxyStatsCollector = new stats_js_1.StatsCollector();
120
+ /** Shared mesh handle — set during startProxy() */
121
+ let _meshHandle = null;
122
+ /** Capture a request into the mesh (fire-and-forget, never blocks) */
123
+ function meshCapture(model, provider, taskType, tokensIn, tokensOut, costUsd, latencyMs, success, errorType) {
124
+ if (!_meshHandle)
125
+ return;
126
+ try {
127
+ _meshHandle.captureRequest({
128
+ model, provider, task_type: taskType,
129
+ input_tokens: tokensIn, output_tokens: tokensOut,
130
+ cost_usd: costUsd, latency_ms: latencyMs,
131
+ success, error_type: errorType,
132
+ timestamp: new Date().toISOString(),
133
+ });
134
+ }
135
+ catch { }
136
+ }
82
137
  /**
83
138
  * Default provider endpoints
84
139
  */
@@ -171,10 +226,10 @@ exports.SMART_ALIASES = {
171
226
  * Send a telemetry event to the cloud (anonymous or authenticated).
172
227
  * Non-blocking — errors are silently swallowed.
173
228
  */
174
- function sendCloudTelemetry(taskType, model, tokensIn, tokensOut, latencyMs, success, costUsd, requestedModel) {
229
+ function sendCloudTelemetry(taskType, model, tokensIn, tokensOut, latencyMs, success, costUsd, requestedModel, cacheCreationTokens, cacheReadTokens) {
175
230
  try {
176
- const cost = costUsd ?? (0, telemetry_js_1.estimateCost)(model, tokensIn, tokensOut);
177
- (0, telemetry_js_1.recordTelemetry)({
231
+ const cost = costUsd ?? (0, telemetry_js_1.estimateCost)(model, tokensIn, tokensOut, cacheCreationTokens, cacheReadTokens);
232
+ const event = {
178
233
  task_type: taskType,
179
234
  model,
180
235
  tokens_in: tokensIn,
@@ -183,7 +238,21 @@ function sendCloudTelemetry(taskType, model, tokensIn, tokensOut, latencyMs, suc
183
238
  success,
184
239
  cost_usd: cost,
185
240
  requested_model: requestedModel,
186
- });
241
+ cache_creation_tokens: cacheCreationTokens,
242
+ cache_read_tokens: cacheReadTokens,
243
+ };
244
+ // Record locally (writes to telemetry.jsonl + queues upload if telemetry_enabled)
245
+ (0, telemetry_js_1.recordTelemetry)(event);
246
+ // Ensure cloud upload even if local telemetry_enabled is false
247
+ // recordCloudTelemetry skips queueForUpload when telemetry is disabled,
248
+ // but cloud dashboard needs these events regardless of local config
249
+ if (!(0, config_js_1.isTelemetryEnabled)()) {
250
+ (0, telemetry_js_1.queueForUpload)({
251
+ ...event,
252
+ device_id: (0, config_js_1.getDeviceId)(),
253
+ timestamp: new Date().toISOString(),
254
+ });
255
+ }
187
256
  }
188
257
  catch {
189
258
  // Telemetry should never break the proxy
@@ -220,15 +289,15 @@ function resolveModelAlias(model) {
220
289
  * Uses Haiku 3.5 for cost optimization, upgrades based on learned rules
221
290
  */
222
291
  const DEFAULT_ROUTING = {
223
- code_generation: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
224
- code_review: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
225
- summarization: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
226
- analysis: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
227
- creative_writing: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
228
- data_extraction: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
229
- translation: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
230
- question_answering: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
231
- general: { provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
292
+ code_generation: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
293
+ code_review: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
294
+ summarization: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
295
+ analysis: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
296
+ creative_writing: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
297
+ data_extraction: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
298
+ translation: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
299
+ question_answering: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
300
+ general: { provider: 'anthropic', model: 'claude-sonnet-4-6' },
232
301
  };
233
302
  const UNCERTAINTY_PATTERNS = [
234
303
  /i'?m not (entirely |completely |really )?sure/i,
@@ -470,7 +539,7 @@ function logRequest(originalModel, targetModel, provider, latencyMs, success, mo
470
539
  bufferHistoryEntry(entry);
471
540
  }
472
541
  /** Update the most recent history entry with token/cost info */
473
- function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel) {
542
+ function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel, cacheCreationTokens, cacheReadTokens) {
474
543
  if (requestHistory.length > 0) {
475
544
  const last = requestHistory[requestHistory.length - 1];
476
545
  last.tokensIn = tokensIn;
@@ -479,6 +548,10 @@ function updateLastHistoryEntry(tokensIn, tokensOut, costUsd, responseModel) {
479
548
  if (responseModel) {
480
549
  last.responseModel = responseModel;
481
550
  }
551
+ if (cacheCreationTokens !== undefined)
552
+ last.cacheCreationTokens = cacheCreationTokens;
553
+ if (cacheReadTokens !== undefined)
554
+ last.cacheReadTokens = cacheReadTokens;
482
555
  }
483
556
  }
484
557
  const DEFAULT_PROXY_CONFIG = {
@@ -489,7 +562,6 @@ const DEFAULT_PROXY_CONFIG = {
489
562
  cascade: {
490
563
  enabled: true,
491
564
  models: [
492
- 'claude-haiku-4-5',
493
565
  'claude-sonnet-4-6',
494
566
  'claude-opus-4-6',
495
567
  ],
@@ -498,7 +570,7 @@ const DEFAULT_PROXY_CONFIG = {
498
570
  },
499
571
  complexity: {
500
572
  enabled: true,
501
- simple: 'claude-haiku-4-5',
573
+ simple: 'claude-sonnet-4-6',
502
574
  moderate: 'claude-sonnet-4-6',
503
575
  complex: 'claude-opus-4-6',
504
576
  },
@@ -705,6 +777,23 @@ function classifyComplexity(messages) {
705
777
  score += 1;
706
778
  if (andCount >= 5)
707
779
  score += 1;
780
+ // Calculate total tokens across ALL messages, not just last user message.
781
+ // For agent workloads (OpenClaw, aider, Claude Code) the last user message is
782
+ // often tiny while the real complexity lives in the 100K+ token context.
783
+ const allText = extractMessageText(messages);
784
+ const totalTokens = Math.ceil(allText.length / 4);
785
+ // Context size floor — use as a hard signal regardless of last-message score
786
+ if (totalTokens > 100000)
787
+ score += 5; // definitely complex
788
+ else if (totalTokens > 50000)
789
+ score += 3; // likely moderate+
790
+ else if (totalTokens > 20000)
791
+ score += 2;
792
+ // Message count signal — long conversations imply multi-step reasoning
793
+ if (messages.length > 50)
794
+ score += 2;
795
+ else if (messages.length > 20)
796
+ score += 1;
708
797
  if (score >= 4)
709
798
  return 'complex';
710
799
  if (score >= 2)
@@ -1437,11 +1526,13 @@ function convertAnthropicStreamEvent(eventType, eventData, messageId, model, too
1437
1526
  const msg = eventData['message'];
1438
1527
  baseChunk.id = msg?.['id'] || messageId;
1439
1528
  choice.delta = { role: 'assistant', content: '' };
1440
- // Pass through input token count from message_start
1529
+ // Pass through input token count from message_start (including cache tokens)
1441
1530
  const msgUsage = msg?.['usage'];
1442
1531
  if (msgUsage) {
1443
1532
  baseChunk['usage'] = {
1444
1533
  prompt_tokens: msgUsage['input_tokens'] ?? 0,
1534
+ cache_creation_tokens: msgUsage['cache_creation_input_tokens'] ?? 0,
1535
+ cache_read_tokens: msgUsage['cache_read_input_tokens'] ?? 0,
1445
1536
  };
1446
1537
  }
1447
1538
  return `data: ${JSON.stringify(baseChunk)}\n\n`;
@@ -1895,10 +1986,14 @@ td{padding:8px 12px;border-bottom:1px solid #111318}
1895
1986
  .badge.ok{background:#052e1633;color:#34d399}.badge.err{background:#2d0a0a;color:#ef4444}
1896
1987
  .badge.tt-code{background:#1e3a5f;color:#60a5fa}.badge.tt-analysis{background:#3b1f6e;color:#a78bfa}.badge.tt-summarization{background:#1a3a2a;color:#6ee7b7}.badge.tt-qa{background:#3a2f1e;color:#fbbf24}.badge.tt-general{background:#1e293b;color:#94a3b8}
1897
1988
  .badge.cx-simple{background:#052e1633;color:#34d399}.badge.cx-moderate{background:#2d2a0a;color:#fbbf24}.badge.cx-complex{background:#2d0a0a;color:#ef4444}
1989
+ .vstat{display:inline-flex;align-items:center;gap:6px;margin-left:8px;padding:1px 8px;border-radius:999px;border:1px solid #334155;font-size:.72rem}
1990
+ .vstat.current{color:#94a3b8;border-color:#334155;background:#0f172a66}
1991
+ .vstat.outdated{color:#fbbf24;border-color:#f59e0b55;background:#3a2f1e66}
1992
+ .vstat.unavailable{color:#a3a3a3;border-color:#52525b66;background:#18181b66}
1898
1993
  @media(max-width:768px){.col-tt,.col-cx{display:none}}
1899
1994
  .prov{display:flex;gap:16px;flex-wrap:wrap}.prov-item{display:flex;align-items:center;font-size:.85rem;background:#111318;padding:8px 14px;border-radius:8px;border:1px solid #1e293b}
1900
1995
  </style></head><body>
1901
- <div class="header"><div><h1>⚡ RelayPlane Dashboard</h1></div><div class="meta"><a href="/dashboard/config">Config</a> · <span id="ver"></span> · up <span id="uptime"></span> · refreshes every 5s</div></div>
1996
+ <div class="header"><div><h1>⚡ RelayPlane Dashboard</h1></div><div class="meta"><a href="/dashboard/config">Config</a> · <span id="ver"></span><span id="vstat" class="vstat unavailable">Unable to check</span> · up <span id="uptime"></span> · refreshes every 5s</div></div>
1902
1997
  <div class="cards">
1903
1998
  <div class="card"><div class="label">Total Requests</div><div class="value" id="totalReq">—</div></div>
1904
1999
  <div class="card"><div class="label">Total Cost</div><div class="value" id="totalCost">—</div></div>
@@ -1909,7 +2004,7 @@ td{padding:8px 12px;border-bottom:1px solid #111318}
1909
2004
  <table><thead><tr><th>Model</th><th>Requests</th><th>Cost</th><th>% of Total</th></tr></thead><tbody id="models"></tbody></table></div>
1910
2005
  <div class="section"><h2>Provider Status</h2><div class="prov" id="providers"></div></div>
1911
2006
  <div class="section"><h2>Recent Runs</h2>
1912
- <table><thead><tr><th>Time</th><th>Model</th><th class="col-tt">Task Type</th><th class="col-cx">Complexity</th><th>Tokens In</th><th>Tokens Out</th><th>Cost</th><th>Latency</th><th>Status</th></tr></thead><tbody id="runs"></tbody></table></div>
2007
+ <table><thead><tr><th>Time</th><th>Model</th><th class="col-tt">Task Type</th><th class="col-cx">Complexity</th><th>Tokens In</th><th>Tokens Out</th><th class="col-cache">Cache Create</th><th class="col-cache">Cache Read</th><th>Cost</th><th>Latency</th><th>Status</th></tr></thead><tbody id="runs"></tbody></table></div>
1913
2008
  <script>
1914
2009
  const $ = id => document.getElementById(id);
1915
2010
  function fmt(n,d=2){return typeof n==='number'?n.toFixed(d):'-'}
@@ -1926,6 +2021,19 @@ async function load(){
1926
2021
  ]);
1927
2022
  $('ver').textContent='v'+health.version;
1928
2023
  $('uptime').textContent=dur(health.uptime);
2024
+
2025
+ const versionStatus = await fetch('/v1/version-status').then(r=>r.json()).catch(()=>({state:'unavailable', current: health.version, latest: null}));
2026
+ const vEl = $('vstat');
2027
+ if (vEl) {
2028
+ vEl.className = 'vstat ' + (versionStatus.state === 'outdated' ? 'outdated' : versionStatus.state === 'up-to-date' ? 'current' : 'unavailable');
2029
+ if (versionStatus.state === 'outdated') {
2030
+ vEl.textContent = 'Update available · v' + versionStatus.current + ' → v' + versionStatus.latest;
2031
+ } else if (versionStatus.state === 'up-to-date') {
2032
+ vEl.textContent = 'Up to date · v' + versionStatus.current;
2033
+ } else {
2034
+ vEl.textContent = 'Unable to check · v' + versionStatus.current;
2035
+ }
2036
+ }
1929
2037
  const total=stats.summary?.totalEvents||0;
1930
2038
  $('totalReq').textContent=total;
1931
2039
  $('totalCost').textContent='$'+fmt(stats.summary?.totalCostUsd??0,4);
@@ -1937,8 +2045,8 @@ async function load(){
1937
2045
  function ttCls(t){const m={code_generation:'tt-code',analysis:'tt-analysis',summarization:'tt-summarization',question_answering:'tt-qa'};return m[t]||'tt-general'}
1938
2046
  function cxCls(c){const m={simple:'cx-simple',moderate:'cx-moderate',complex:'cx-complex'};return m[c]||'cx-simple'}
1939
2047
  $('runs').innerHTML=(runsR.runs||[]).map(r=>
1940
- '<tr><td>'+fmtTime(r.started_at)+'</td><td>'+r.model+'</td><td class="col-tt"><span class="badge '+ttCls(r.taskType)+'">'+(r.taskType||'general').replace(/_/g,' ')+'</span></td><td class="col-cx"><span class="badge '+cxCls(r.complexity)+'">'+(r.complexity||'simple')+'</span></td><td>'+(r.tokensIn||0)+'</td><td>'+(r.tokensOut||0)+'</td><td>$'+fmt(r.costUsd,4)+'</td><td>'+r.latencyMs+'ms</td><td><span class="badge '+(r.status==='success'?'ok':'err')+'">'+r.status+'</span></td></tr>'
1941
- ).join('')||'<tr><td colspan=9 style="color:#64748b">No runs yet</td></tr>';
2048
+ '<tr><td>'+fmtTime(r.started_at)+'</td><td>'+r.model+'</td><td class="col-tt"><span class="badge '+ttCls(r.taskType)+'">'+(r.taskType||'general').replace(/_/g,' ')+'</span></td><td class="col-cx"><span class="badge '+cxCls(r.complexity)+'">'+(r.complexity||'simple')+'</span></td><td>'+(r.tokensIn||0)+'</td><td>'+(r.tokensOut||0)+'</td><td class="col-cache" style="color:#60a5fa">'+(r.cacheCreationTokens||0)+'</td><td class="col-cache" style="color:#34d399">'+(r.cacheReadTokens||0)+'</td><td>$'+fmt(r.costUsd,4)+'</td><td>'+r.latencyMs+'ms</td><td><span class="badge '+(r.status==='success'?'ok':'err')+'">'+r.status+'</span></td></tr>'
2049
+ ).join('')||'<tr><td colspan=11 style="color:#64748b">No runs yet</td></tr>';
1942
2050
  $('providers').innerHTML=(provH.providers||[]).map(p=>{
1943
2051
  const dotClass = p.status==='healthy'?'up':(p.status==='degraded'?'warn':'down');
1944
2052
  const rate = p.successRate!==undefined?(' '+Math.round(p.successRate*100)+'%'):'';
@@ -2046,6 +2154,7 @@ async function startProxy(config = {}) {
2046
2154
  loadHistoryFromDisk();
2047
2155
  // Flush history on shutdown
2048
2156
  const handleShutdown = () => {
2157
+ meshHandle.stop();
2049
2158
  shutdownHistory();
2050
2159
  process.exit(0);
2051
2160
  };
@@ -2054,11 +2163,159 @@ async function startProxy(config = {}) {
2054
2163
  const configPath = getProxyConfigPath();
2055
2164
  let proxyConfig = await loadProxyConfig(configPath, log);
2056
2165
  const cooldownManager = new CooldownManager(getCooldownConfig(proxyConfig));
2166
+ // === Startup config validation (Task 4) ===
2167
+ try {
2168
+ const userConfig = (0, config_js_1.loadConfig)();
2169
+ // Check if config was just created (created_at within 5s of now)
2170
+ const createdAt = new Date(userConfig.created_at).getTime();
2171
+ const now = Date.now();
2172
+ if (Math.abs(now - createdAt) < 5000) {
2173
+ console.warn('[RelayPlane] WARNING: Fresh config detected — previous config may have been deleted');
2174
+ }
2175
+ // Check if credentials exist but config doesn't reference them
2176
+ if ((0, config_js_1.hasValidCredentials)() && !userConfig.api_key) {
2177
+ console.warn('[RelayPlane] WARNING: credentials.json exists but config has no API key reference');
2178
+ }
2179
+ // Auto-enable telemetry for authenticated users
2180
+ if ((0, config_js_1.hasValidCredentials)() && !userConfig.telemetry_enabled) {
2181
+ // Already handled in loadConfig() for fresh configs, but handle existing configs too
2182
+ }
2183
+ // Validate expected fields
2184
+ if (!userConfig.device_id || !userConfig.created_at || userConfig.config_version === undefined) {
2185
+ console.warn('[RelayPlane] WARNING: Config is missing expected fields');
2186
+ }
2187
+ }
2188
+ catch (err) {
2189
+ console.warn(`[RelayPlane] Config validation error: ${err}`);
2190
+ }
2191
+ // Initialize mesh learning layer
2192
+ const meshConfig = (0, config_js_1.getMeshConfig)();
2193
+ const userConfig = (0, config_js_1.loadConfig)();
2194
+ const meshHandle = _meshHandle = (0, index_js_1.initMeshLayer)({
2195
+ enabled: meshConfig.enabled,
2196
+ endpoint: meshConfig.endpoint,
2197
+ sync_interval_ms: meshConfig.sync_interval_ms,
2198
+ contribute: meshConfig.contribute,
2199
+ }, userConfig.api_key);
2200
+ // Initialize budget manager
2201
+ const budgetManager = (0, budget_js_1.getBudgetManager)(proxyConfig.budget);
2202
+ if (proxyConfig.budget?.enabled) {
2203
+ try {
2204
+ budgetManager.init();
2205
+ log('Budget manager initialized');
2206
+ }
2207
+ catch (err) {
2208
+ log(`Budget manager init failed: ${err}`);
2209
+ }
2210
+ }
2211
+ // Initialize anomaly detector
2212
+ const anomalyDetector = (0, anomaly_js_1.getAnomalyDetector)(proxyConfig.anomaly);
2213
+ // Initialize alert manager
2214
+ const alertManager = (0, alerts_js_1.getAlertManager)(proxyConfig.alerts);
2215
+ if (proxyConfig.alerts?.enabled) {
2216
+ try {
2217
+ alertManager.init();
2218
+ log('Alert manager initialized');
2219
+ }
2220
+ catch (err) {
2221
+ log(`Alert manager init failed: ${err}`);
2222
+ }
2223
+ }
2224
+ // Downgrade config
2225
+ let downgradeConfig = {
2226
+ ...downgrade_js_1.DEFAULT_DOWNGRADE_CONFIG,
2227
+ ...(proxyConfig.downgrade ?? {}),
2228
+ };
2229
+ /**
2230
+ * Pre-request budget check + auto-downgrade.
2231
+ * Returns the (possibly downgraded) model and extra response headers.
2232
+ * If the request should be blocked, returns { blocked: true }.
2233
+ */
2234
+ function preRequestBudgetCheck(model, estimatedCost) {
2235
+ const headers = {};
2236
+ let finalModel = model;
2237
+ let downgraded = false;
2238
+ // Budget check
2239
+ const budgetResult = budgetManager.checkBudget(estimatedCost);
2240
+ if (budgetResult.breached) {
2241
+ // Fire breach alert
2242
+ const limit = budgetResult.breachType === 'hourly'
2243
+ ? budgetManager.getConfig().hourlyUsd
2244
+ : budgetManager.getConfig().dailyUsd;
2245
+ const spend = budgetResult.breachType === 'hourly'
2246
+ ? budgetResult.currentHourlySpend
2247
+ : budgetResult.currentDailySpend;
2248
+ alertManager.fireBreach(budgetResult.breachType, spend, limit);
2249
+ if (budgetResult.action === 'block') {
2250
+ return { blocked: true, model: finalModel, headers, downgraded: false };
2251
+ }
2252
+ if (budgetResult.action === 'downgrade') {
2253
+ const dr = (0, downgrade_js_1.checkDowngrade)(finalModel, 100, downgradeConfig);
2254
+ if (dr.downgraded) {
2255
+ finalModel = dr.newModel;
2256
+ downgraded = true;
2257
+ (0, downgrade_js_1.applyDowngradeHeaders)(headers, dr);
2258
+ }
2259
+ }
2260
+ }
2261
+ // Fire threshold alerts
2262
+ for (const threshold of budgetResult.thresholdsCrossed) {
2263
+ alertManager.fireThreshold(threshold, (budgetResult.currentDailySpend / budgetManager.getConfig().dailyUsd) * 100, budgetResult.currentDailySpend, budgetManager.getConfig().dailyUsd);
2264
+ budgetManager.markThresholdFired(threshold);
2265
+ }
2266
+ // Auto-downgrade based on budget percentage (even if not breached)
2267
+ if (!downgraded && downgradeConfig.enabled) {
2268
+ const pct = budgetManager.getConfig().dailyUsd > 0
2269
+ ? (budgetResult.currentDailySpend / budgetManager.getConfig().dailyUsd) * 100
2270
+ : 0;
2271
+ const dr = (0, downgrade_js_1.checkDowngrade)(finalModel, pct, downgradeConfig);
2272
+ if (dr.downgraded) {
2273
+ finalModel = dr.newModel;
2274
+ downgraded = true;
2275
+ (0, downgrade_js_1.applyDowngradeHeaders)(headers, dr);
2276
+ }
2277
+ }
2278
+ return { blocked: false, model: finalModel, headers, downgraded };
2279
+ }
2280
+ /**
2281
+ * Post-request: record spend, run anomaly detection, fire anomaly alerts.
2282
+ */
2283
+ function postRequestRecord(model, tokensIn, tokensOut, costUsd) {
2284
+ // Record spend
2285
+ budgetManager.recordSpend(costUsd, model);
2286
+ // Anomaly detection
2287
+ const anomalyResult = anomalyDetector.recordAndAnalyze({
2288
+ model,
2289
+ tokensIn,
2290
+ tokensOut,
2291
+ costUsd,
2292
+ });
2293
+ if (anomalyResult.detected) {
2294
+ for (const anomaly of anomalyResult.anomalies) {
2295
+ alertManager.fireAnomaly(anomaly);
2296
+ }
2297
+ }
2298
+ }
2299
+ // Initialize response cache
2300
+ const responseCache = (0, response_cache_js_1.getResponseCache)(proxyConfig.cache);
2301
+ if (proxyConfig.cache?.enabled !== false) {
2302
+ try {
2303
+ responseCache.init();
2304
+ log('Response cache initialized');
2305
+ }
2306
+ catch (err) {
2307
+ log(`Response cache init failed: ${err}`);
2308
+ }
2309
+ }
2057
2310
  let configWatcher = null;
2058
2311
  let configReloadTimer = null;
2059
2312
  const reloadConfig = async () => {
2060
2313
  proxyConfig = await loadProxyConfig(configPath, log);
2061
2314
  cooldownManager.updateConfig(getCooldownConfig(proxyConfig));
2315
+ budgetManager.updateConfig({ ...budgetManager.getConfig(), ...(proxyConfig.budget ?? {}) });
2316
+ anomalyDetector.updateConfig({ ...anomalyDetector.getConfig(), ...(proxyConfig.anomaly ?? {}) });
2317
+ alertManager.updateConfig({ ...alertManager.getConfig(), ...(proxyConfig.alerts ?? {}) });
2318
+ downgradeConfig = { ...downgrade_js_1.DEFAULT_DOWNGRADE_CONFIG, ...(proxyConfig.downgrade ?? {}) };
2062
2319
  log(`Reloaded config from ${configPath}`);
2063
2320
  };
2064
2321
  const scheduleConfigReload = () => {
@@ -2083,7 +2340,8 @@ async function startProxy(config = {}) {
2083
2340
  // Initialize RelayPlane
2084
2341
  const relay = new core_1.RelayPlane({ dbPath: config.dbPath });
2085
2342
  // Startup migration: clear default routing rules so complexity config takes priority
2086
- const clearedCount = relay.routing.clearDefaultRules();
2343
+ const clearDefaultRules = relay.routing.clearDefaultRules;
2344
+ const clearedCount = typeof clearDefaultRules === 'function' ? clearDefaultRules.call(relay.routing) : 0;
2087
2345
  if (clearedCount > 0) {
2088
2346
  log(`Cleared ${clearedCount} default routing rules (complexity config takes priority)`);
2089
2347
  }
@@ -2130,6 +2388,13 @@ async function startProxy(config = {}) {
2130
2388
  }));
2131
2389
  return;
2132
2390
  }
2391
+ if (req.method === 'GET' && pathname === '/v1/version-status') {
2392
+ const latest = await getLatestProxyVersion();
2393
+ const status = (0, version_status_js_1.getVersionStatus)(PROXY_VERSION, latest);
2394
+ res.writeHead(200, { 'Content-Type': 'application/json', 'Cache-Control': 'public, max-age=60' });
2395
+ res.end(JSON.stringify(status));
2396
+ return;
2397
+ }
2133
2398
  // === Control endpoints ===
2134
2399
  if (pathname.startsWith('/control/')) {
2135
2400
  if (req.method === 'POST' && pathname === '/control/enable') {
@@ -2196,6 +2461,36 @@ async function startProxy(config = {}) {
2196
2461
  return;
2197
2462
  }
2198
2463
  }
2464
+ if (req.method === 'POST' && pathname === '/control/kill') {
2465
+ try {
2466
+ const body = await readJsonBody(req);
2467
+ if (body.all) {
2468
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2469
+ res.end(JSON.stringify({
2470
+ killed: 0,
2471
+ sessions: [],
2472
+ note: 'Local proxy mode: session kill not applicable'
2473
+ }));
2474
+ }
2475
+ else if (body.sessionKey) {
2476
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2477
+ res.end(JSON.stringify({
2478
+ killed: 1,
2479
+ sessions: [body.sessionKey],
2480
+ note: 'Rate limits reset for session'
2481
+ }));
2482
+ }
2483
+ else {
2484
+ res.writeHead(400, { 'Content-Type': 'application/json' });
2485
+ res.end(JSON.stringify({ error: 'Provide sessionKey or all=true' }));
2486
+ }
2487
+ }
2488
+ catch {
2489
+ res.writeHead(400, { 'Content-Type': 'application/json' });
2490
+ res.end(JSON.stringify({ error: 'Invalid JSON' }));
2491
+ }
2492
+ return;
2493
+ }
2199
2494
  // === Telemetry endpoints for dashboard ===
2200
2495
  if (pathname.startsWith('/v1/telemetry/')) {
2201
2496
  const telemetryPath = pathname.replace('/v1/telemetry/', '');
@@ -2244,7 +2539,9 @@ async function startProxy(config = {}) {
2244
2539
  const offset = parseInt(params.get('offset') || '0', 10);
2245
2540
  const sorted = [...requestHistory].reverse();
2246
2541
  const runs = sorted.slice(offset, offset + limit).map(r => {
2247
- const origCost = (0, telemetry_js_1.estimateCost)('claude-opus-4-6', r.tokensIn, r.tokensOut);
2542
+ // Savings should reflect routing decisions only — pass same cache tokens to baseline
2543
+ // so the cache discount doesn't get counted as "savings from routing"
2544
+ const origCost = (0, telemetry_js_1.estimateCost)('claude-opus-4-6', r.tokensIn, r.tokensOut, r.cacheCreationTokens || undefined, r.cacheReadTokens || undefined);
2248
2545
  const perRunSavings = Math.max(0, origCost - r.costUsd);
2249
2546
  return {
2250
2547
  id: r.id,
@@ -2264,6 +2561,8 @@ async function startProxy(config = {}) {
2264
2561
  latencyMs: r.latencyMs,
2265
2562
  tokensIn: r.tokensIn,
2266
2563
  tokensOut: r.tokensOut,
2564
+ cacheCreationTokens: r.cacheCreationTokens ?? 0,
2565
+ cacheReadTokens: r.cacheReadTokens ?? 0,
2267
2566
  savings: Math.round(perRunSavings * 10000) / 10000,
2268
2567
  escalated: r.escalated,
2269
2568
  };
@@ -2281,7 +2580,9 @@ async function startProxy(config = {}) {
2281
2580
  let totalSavedAmount = 0;
2282
2581
  const byDayMap = new Map();
2283
2582
  for (const r of requestHistory) {
2284
- const origCost = (0, telemetry_js_1.estimateCost)(OPUS_BASELINE, r.tokensIn, r.tokensOut);
2583
+ // Pass same cache tokens to baseline so savings only reflect routing decisions,
2584
+ // not prompt-cache discounts (those happen regardless of which model is chosen).
2585
+ const origCost = (0, telemetry_js_1.estimateCost)(OPUS_BASELINE, r.tokensIn, r.tokensOut, r.cacheCreationTokens || undefined, r.cacheReadTokens || undefined);
2285
2586
  const actualCost = r.costUsd;
2286
2587
  const saved = Math.max(0, origCost - actualCost);
2287
2588
  totalOriginalCost += origCost;
@@ -2372,6 +2673,24 @@ async function startProxy(config = {}) {
2372
2673
  res.end(getConfigDashboardHTML());
2373
2674
  return;
2374
2675
  }
2676
+ // === Mesh stats endpoint ===
2677
+ if (req.method === 'GET' && pathname === '/v1/mesh/stats') {
2678
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2679
+ res.end(JSON.stringify(meshHandle.getStats()));
2680
+ return;
2681
+ }
2682
+ if (req.method === 'POST' && pathname === '/v1/mesh/sync') {
2683
+ try {
2684
+ const result = await meshHandle.forceSync();
2685
+ res.writeHead(200, { 'Content-Type': 'application/json' });
2686
+ res.end(JSON.stringify({ sync: result }));
2687
+ }
2688
+ catch (err) {
2689
+ res.writeHead(500, { 'Content-Type': 'application/json' });
2690
+ res.end(JSON.stringify({ sync: { error: err.message } }));
2691
+ }
2692
+ return;
2693
+ }
2375
2694
  if (req.method === 'GET' && pathname === '/v1/config') {
2376
2695
  try {
2377
2696
  const raw = await fs.promises.readFile(getProxyConfigPath(), 'utf8');
@@ -2511,6 +2830,48 @@ async function startProxy(config = {}) {
2511
2830
  log(`Config routing.mode=auto: overriding passthrough → auto for model ${requestedModel}`);
2512
2831
  }
2513
2832
  const isStreaming = requestBody['stream'] === true;
2833
+ // ── Response Cache: check for cached response ──
2834
+ const cacheBypass = responseCache.shouldBypass(requestBody);
2835
+ let cacheHash;
2836
+ if (!cacheBypass) {
2837
+ cacheHash = responseCache.computeKey(requestBody);
2838
+ const cached = responseCache.get(cacheHash);
2839
+ if (cached) {
2840
+ try {
2841
+ const cachedData = JSON.parse(cached);
2842
+ const cacheUsage = cachedData?.usage;
2843
+ const cacheCost = (0, telemetry_js_1.estimateCost)(requestBody['model'] ?? '', cacheUsage?.input_tokens ?? 0, cacheUsage?.output_tokens ?? 0);
2844
+ responseCache.recordHit(cacheCost, 0);
2845
+ // Replay cached streaming response as SSE
2846
+ if (isStreaming && cachedData._relayplaneStreamCache) {
2847
+ res.writeHead(200, {
2848
+ 'Content-Type': 'text/event-stream',
2849
+ 'Cache-Control': 'no-cache',
2850
+ 'Connection': 'keep-alive',
2851
+ 'X-RelayPlane-Cache': 'HIT',
2852
+ });
2853
+ res.end(cachedData.ssePayload);
2854
+ }
2855
+ else {
2856
+ res.writeHead(200, {
2857
+ 'Content-Type': 'application/json',
2858
+ 'X-RelayPlane-Cache': 'HIT',
2859
+ });
2860
+ res.end(cached);
2861
+ }
2862
+ log(`Cache HIT for ${requestBody['model']} (hash: ${cacheHash.slice(0, 8)})`);
2863
+ return;
2864
+ }
2865
+ catch {
2866
+ // Corrupt cache entry, continue to provider
2867
+ }
2868
+ }
2869
+ responseCache.recordMiss();
2870
+ }
2871
+ else {
2872
+ responseCache.recordBypass();
2873
+ }
2874
+ // ── End cache check ──
2514
2875
  const messages = Array.isArray(requestBody['messages'])
2515
2876
  ? requestBody['messages']
2516
2877
  : [];
@@ -2619,6 +2980,47 @@ async function startProxy(config = {}) {
2619
2980
  res.end(JSON.stringify({ error: `Provider ${targetProvider} is temporarily cooled down` }));
2620
2981
  return;
2621
2982
  }
2983
+ // ── Budget check + auto-downgrade ──
2984
+ const budgetExtraHeaders = {};
2985
+ {
2986
+ const budgetCheck = preRequestBudgetCheck(targetModel || requestedModel);
2987
+ if (budgetCheck.blocked) {
2988
+ res.writeHead(429, { 'Content-Type': 'application/json' });
2989
+ res.end(JSON.stringify({
2990
+ error: 'Budget limit exceeded. Request blocked.',
2991
+ type: 'budget_exceeded',
2992
+ }));
2993
+ return;
2994
+ }
2995
+ if (budgetCheck.downgraded) {
2996
+ log(`Budget downgrade: ${targetModel || requestedModel} → ${budgetCheck.model}`);
2997
+ targetModel = budgetCheck.model;
2998
+ if (requestBody)
2999
+ requestBody['model'] = targetModel;
3000
+ }
3001
+ Object.assign(budgetExtraHeaders, budgetCheck.headers);
3002
+ }
3003
+ // ── End budget check ──
3004
+ // ── Rate limit check ──
3005
+ const workspaceId = 'local'; // Local proxy uses single workspace
3006
+ const rateLimit = (0, rate_limiter_js_1.checkLimit)(workspaceId, targetModel);
3007
+ if (!rateLimit.allowed) {
3008
+ console.error(`[RATE LIMIT] ${targetModel} limit reached for workspace: ${workspaceId}`);
3009
+ res.writeHead(429, {
3010
+ 'Content-Type': 'application/json',
3011
+ 'Retry-After': String(rateLimit.retryAfter || 60),
3012
+ 'X-RelayPlane-RateLimit-Limit': String(rateLimit.limit),
3013
+ 'X-RelayPlane-RateLimit-Remaining': '0',
3014
+ 'X-RelayPlane-RateLimit-Reset': String(Math.ceil(rateLimit.resetAt / 1000))
3015
+ });
3016
+ res.end(JSON.stringify({
3017
+ error: `Rate limit exceeded for ${targetModel}. Max ${rateLimit.limit} requests per minute.`,
3018
+ type: 'rate_limit_exceeded',
3019
+ retry_after: rateLimit.retryAfter || 60
3020
+ }));
3021
+ return;
3022
+ }
3023
+ // ── End rate limit check ──
2622
3024
  const startTime = Date.now();
2623
3025
  let nativeResponseData;
2624
3026
  try {
@@ -2688,11 +3090,16 @@ async function startProxy(config = {}) {
2688
3090
  'Content-Type': 'text/event-stream',
2689
3091
  'Cache-Control': 'no-cache',
2690
3092
  'Connection': 'keep-alive',
3093
+ 'X-RelayPlane-Cache': cacheBypass ? 'BYPASS' : 'MISS',
2691
3094
  ...nativeStreamRpHeaders,
2692
3095
  });
2693
3096
  const reader = providerResponse.body?.getReader();
2694
3097
  let streamTokensIn = 0;
2695
3098
  let streamTokensOut = 0;
3099
+ let streamCacheCreation = 0;
3100
+ let streamCacheRead = 0;
3101
+ // Buffer raw SSE chunks for cache storage
3102
+ const rawChunks = [];
2696
3103
  if (reader) {
2697
3104
  const decoder = new TextDecoder();
2698
3105
  let sseBuffer = '';
@@ -2703,6 +3110,8 @@ async function startProxy(config = {}) {
2703
3110
  break;
2704
3111
  const chunk = decoder.decode(value, { stream: true });
2705
3112
  res.write(chunk);
3113
+ if (cacheHash && !cacheBypass)
3114
+ rawChunks.push(chunk);
2706
3115
  // Parse SSE events to extract usage from message_delta / message_stop
2707
3116
  sseBuffer += chunk;
2708
3117
  const lines = sseBuffer.split('\n');
@@ -2715,9 +3124,11 @@ async function startProxy(config = {}) {
2715
3124
  if (evt.type === 'message_delta' && evt.usage) {
2716
3125
  streamTokensOut = evt.usage.output_tokens ?? streamTokensOut;
2717
3126
  }
2718
- // Anthropic: message_start has usage.input_tokens
3127
+ // Anthropic: message_start has usage.input_tokens + cache tokens
2719
3128
  if (evt.type === 'message_start' && evt.message?.usage) {
2720
3129
  streamTokensIn = evt.message.usage.input_tokens ?? streamTokensIn;
3130
+ streamCacheCreation = evt.message.usage.cache_creation_input_tokens ?? 0;
3131
+ streamCacheRead = evt.message.usage.cache_read_input_tokens ?? 0;
2721
3132
  }
2722
3133
  // OpenAI format: choices with usage
2723
3134
  if (evt.usage) {
@@ -2736,15 +3147,45 @@ async function startProxy(config = {}) {
2736
3147
  reader.releaseLock();
2737
3148
  }
2738
3149
  }
3150
+ // ── Cache: store streaming response as raw SSE payload ──
3151
+ if (cacheHash && !cacheBypass && rawChunks.length > 0) {
3152
+ const streamPayload = JSON.stringify({
3153
+ _relayplaneStreamCache: true,
3154
+ ssePayload: rawChunks.join(''),
3155
+ usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead },
3156
+ });
3157
+ responseCache.set(cacheHash, streamPayload, {
3158
+ model: targetModel || requestedModel,
3159
+ tokensIn: streamTokensIn,
3160
+ tokensOut: streamTokensOut,
3161
+ costUsd: (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined),
3162
+ taskType,
3163
+ });
3164
+ log(`Cache STORE (stream) for ${targetModel || requestedModel} (hash: ${cacheHash.slice(0, 8)})`);
3165
+ }
2739
3166
  // Store streaming token counts so telemetry can use them
2740
- nativeResponseData = { usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut } };
3167
+ nativeResponseData = { usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead } };
2741
3168
  res.end();
2742
3169
  }
2743
3170
  else {
2744
3171
  nativeResponseData = await providerResponse.json();
2745
3172
  const nativeRespModel = checkResponseModelMismatch(nativeResponseData, targetModel || requestedModel, targetProvider, log);
2746
3173
  const nativeRpHeaders = buildRelayPlaneResponseHeaders(targetModel || requestedModel, originalModel ?? 'unknown', complexity, targetProvider, routingMode);
2747
- res.writeHead(providerResponse.status, { 'Content-Type': 'application/json', ...nativeRpHeaders });
3174
+ // ── Cache: store non-streaming response ──
3175
+ const nativeCacheHeader = cacheBypass ? 'BYPASS' : 'MISS';
3176
+ if (cacheHash && !cacheBypass) {
3177
+ const nativeRespJson = JSON.stringify(nativeResponseData);
3178
+ const nativeUsage = nativeResponseData?.usage;
3179
+ responseCache.set(cacheHash, nativeRespJson, {
3180
+ model: targetModel || requestedModel,
3181
+ tokensIn: nativeUsage?.input_tokens ?? 0,
3182
+ tokensOut: nativeUsage?.output_tokens ?? 0,
3183
+ costUsd: (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeUsage?.input_tokens ?? 0, nativeUsage?.output_tokens ?? 0, nativeUsage?.cache_creation_input_tokens || undefined, nativeUsage?.cache_read_input_tokens || undefined),
3184
+ taskType,
3185
+ });
3186
+ log(`Cache STORE for ${targetModel || requestedModel} (hash: ${cacheHash.slice(0, 8)})`);
3187
+ }
3188
+ res.writeHead(providerResponse.status, { 'Content-Type': 'application/json', 'X-RelayPlane-Cache': nativeCacheHeader, ...nativeRpHeaders });
2748
3189
  res.end(JSON.stringify(nativeResponseData));
2749
3190
  }
2750
3191
  }
@@ -2754,18 +3195,31 @@ async function startProxy(config = {}) {
2754
3195
  // nativeResponseData holds response JSON for non-streaming, or { usage: { input_tokens, output_tokens } }
2755
3196
  // synthesised from SSE events for streaming
2756
3197
  const nativeUsageData = nativeResponseData?.usage;
2757
- const nativeTokIn = nativeUsageData?.input_tokens ?? nativeUsageData?.prompt_tokens ?? 0;
3198
+ const nativeBaseTokIn = nativeUsageData?.input_tokens ?? nativeUsageData?.prompt_tokens ?? 0;
2758
3199
  const nativeTokOut = nativeUsageData?.output_tokens ?? nativeUsageData?.completion_tokens ?? 0;
2759
- updateLastHistoryEntry(nativeTokIn, nativeTokOut, (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeTokIn, nativeTokOut));
3200
+ const nativeCacheCreation = nativeUsageData?.cache_creation_input_tokens ?? 0;
3201
+ const nativeCacheRead = nativeUsageData?.cache_read_input_tokens ?? 0;
3202
+ // Include cache tokens in displayed/recorded token count
3203
+ const nativeTokIn = nativeBaseTokIn + nativeCacheCreation + nativeCacheRead;
3204
+ // Cost calculation expects inputTokens to include cache tokens when cache params are provided
3205
+ const nativeCostUsd = (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCacheCreation || undefined, nativeCacheRead || undefined);
3206
+ updateLastHistoryEntry(nativeTokIn, nativeTokOut, nativeCostUsd, undefined, nativeCacheCreation || undefined, nativeCacheRead || undefined);
3207
+ // ── Post-request: budget spend + anomaly detection ──
3208
+ postRequestRecord(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCostUsd);
2760
3209
  if (recordTelemetry) {
2761
3210
  relay
2762
3211
  .run({
2763
3212
  prompt: promptText.slice(0, 500),
2764
3213
  taskType,
2765
3214
  model: `${targetProvider}:${targetModel || requestedModel}`,
3215
+ })
3216
+ .then((runResult) => {
3217
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
3218
+ relay.patchRunTokens(runResult.runId, nativeTokIn, nativeTokOut, nativeCostUsd);
2766
3219
  })
2767
3220
  .catch(() => { });
2768
- sendCloudTelemetry(taskType, targetModel || requestedModel, nativeTokIn, nativeTokOut, durationMs, true, undefined, originalModel ?? undefined);
3221
+ sendCloudTelemetry(taskType, targetModel || requestedModel, nativeTokIn, nativeTokOut, durationMs, true, undefined, originalModel ?? undefined, nativeCacheCreation || undefined, nativeCacheRead || undefined);
3222
+ meshCapture(targetModel || requestedModel, targetProvider, taskType, nativeTokIn, nativeTokOut, (0, telemetry_js_1.estimateCost)(targetModel || requestedModel, nativeTokIn, nativeTokOut, nativeCacheCreation || undefined, nativeCacheRead || undefined), durationMs, true);
2769
3223
  }
2770
3224
  }
2771
3225
  catch (err) {
@@ -2847,6 +3301,47 @@ async function startProxy(config = {}) {
2847
3301
  return;
2848
3302
  }
2849
3303
  const isStreaming = request.stream === true;
3304
+ // ── Response Cache: check for cached response (chat/completions) ──
3305
+ const chatCacheBypass = responseCache.shouldBypass(request);
3306
+ let chatCacheHash;
3307
+ if (!chatCacheBypass) {
3308
+ chatCacheHash = responseCache.computeKey(request);
3309
+ const chatCached = responseCache.get(chatCacheHash);
3310
+ if (chatCached) {
3311
+ try {
3312
+ const chatCachedData = JSON.parse(chatCached);
3313
+ const chatCacheUsage = chatCachedData?.usage;
3314
+ const chatCacheCost = (0, telemetry_js_1.estimateCost)(request.model ?? '', chatCacheUsage?.prompt_tokens ?? chatCacheUsage?.input_tokens ?? 0, chatCacheUsage?.completion_tokens ?? chatCacheUsage?.output_tokens ?? 0);
3315
+ responseCache.recordHit(chatCacheCost, 0);
3316
+ if (isStreaming && chatCachedData._relayplaneStreamCache) {
3317
+ res.writeHead(200, {
3318
+ 'Content-Type': 'text/event-stream',
3319
+ 'Cache-Control': 'no-cache',
3320
+ 'Connection': 'keep-alive',
3321
+ 'X-RelayPlane-Cache': 'HIT',
3322
+ });
3323
+ res.end(chatCachedData.ssePayload);
3324
+ }
3325
+ else {
3326
+ res.writeHead(200, {
3327
+ 'Content-Type': 'application/json',
3328
+ 'X-RelayPlane-Cache': 'HIT',
3329
+ });
3330
+ res.end(chatCached);
3331
+ }
3332
+ log(`Cache HIT for chat/completions ${request.model} (hash: ${chatCacheHash.slice(0, 8)})`);
3333
+ return;
3334
+ }
3335
+ catch {
3336
+ // Corrupt, continue
3337
+ }
3338
+ }
3339
+ responseCache.recordMiss();
3340
+ }
3341
+ else {
3342
+ responseCache.recordBypass();
3343
+ }
3344
+ // ── End cache check ──
2850
3345
  const bypassRouting = !relayplaneEnabled || relayplaneBypass;
2851
3346
  // Extract routing mode from model name
2852
3347
  const originalRequestedModel = request.model;
@@ -3065,10 +3560,48 @@ async function startProxy(config = {}) {
3065
3560
  }
3066
3561
  apiKey = apiKeyResult.apiKey;
3067
3562
  }
3563
+ // ── Budget check + auto-downgrade (chat/completions) ──
3564
+ {
3565
+ const chatBudgetCheck = preRequestBudgetCheck(targetModel);
3566
+ if (chatBudgetCheck.blocked) {
3567
+ res.writeHead(429, { 'Content-Type': 'application/json' });
3568
+ res.end(JSON.stringify({
3569
+ error: 'Budget limit exceeded. Request blocked.',
3570
+ type: 'budget_exceeded',
3571
+ }));
3572
+ return;
3573
+ }
3574
+ if (chatBudgetCheck.downgraded) {
3575
+ log(`Budget downgrade: ${targetModel} → ${chatBudgetCheck.model}`);
3576
+ targetModel = chatBudgetCheck.model;
3577
+ request.model = targetModel;
3578
+ }
3579
+ }
3580
+ // ── End budget check ──
3581
+ // ── Rate limit check ──
3582
+ const chatWorkspaceId = 'local'; // Local proxy uses single workspace
3583
+ const chatRateLimit = (0, rate_limiter_js_1.checkLimit)(chatWorkspaceId, targetModel);
3584
+ if (!chatRateLimit.allowed) {
3585
+ console.error(`[RATE LIMIT] ${targetModel} limit reached for workspace: ${chatWorkspaceId}`);
3586
+ res.writeHead(429, {
3587
+ 'Content-Type': 'application/json',
3588
+ 'Retry-After': String(chatRateLimit.retryAfter || 60),
3589
+ 'X-RelayPlane-RateLimit-Limit': String(chatRateLimit.limit),
3590
+ 'X-RelayPlane-RateLimit-Remaining': '0',
3591
+ 'X-RelayPlane-RateLimit-Reset': String(Math.ceil(chatRateLimit.resetAt / 1000))
3592
+ });
3593
+ res.end(JSON.stringify({
3594
+ error: `Rate limit exceeded for ${targetModel}. Max ${chatRateLimit.limit} requests per minute.`,
3595
+ type: 'rate_limit_exceeded',
3596
+ retry_after: chatRateLimit.retryAfter || 60
3597
+ }));
3598
+ return;
3599
+ }
3600
+ // ── End rate limit check ──
3068
3601
  const startTime = Date.now();
3069
3602
  // Handle streaming vs non-streaming
3070
3603
  if (isStreaming) {
3071
- await handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, useCascade ? 'cascade' : routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity);
3604
+ await handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, useCascade ? 'cascade' : routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity, chatCacheHash, chatCacheBypass);
3072
3605
  }
3073
3606
  else {
3074
3607
  if (useCascade && cascadeConfig) {
@@ -3105,8 +3638,10 @@ async function startProxy(config = {}) {
3105
3638
  const cascadeUsage = responseData?.usage;
3106
3639
  const cascadeTokensIn = cascadeUsage?.input_tokens ?? cascadeUsage?.prompt_tokens ?? 0;
3107
3640
  const cascadeTokensOut = cascadeUsage?.output_tokens ?? cascadeUsage?.completion_tokens ?? 0;
3108
- const cascadeCost = (0, telemetry_js_1.estimateCost)(cascadeResult.model, cascadeTokensIn, cascadeTokensOut);
3109
- updateLastHistoryEntry(cascadeTokensIn, cascadeTokensOut, cascadeCost, chatCascadeRespModel);
3641
+ const cascadeCacheCreation = cascadeUsage?.cache_creation_input_tokens || undefined;
3642
+ const cascadeCacheRead = cascadeUsage?.cache_read_input_tokens || undefined;
3643
+ const cascadeCost = (0, telemetry_js_1.estimateCost)(cascadeResult.model, cascadeTokensIn, cascadeTokensOut, cascadeCacheCreation, cascadeCacheRead);
3644
+ updateLastHistoryEntry(cascadeTokensIn, cascadeTokensOut, cascadeCost, chatCascadeRespModel, cascadeCacheCreation, cascadeCacheRead);
3110
3645
  if (recordTelemetry) {
3111
3646
  try {
3112
3647
  const runResult = await relay.run({
@@ -3114,6 +3649,8 @@ async function startProxy(config = {}) {
3114
3649
  taskType,
3115
3650
  model: `${cascadeResult.provider}:${cascadeResult.model}`,
3116
3651
  });
3652
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
3653
+ relay.patchRunTokens(runResult.runId, cascadeTokensIn, cascadeTokensOut, cascadeCost);
3117
3654
  responseData['_relayplane'] = {
3118
3655
  runId: runResult.runId,
3119
3656
  routedTo: `${cascadeResult.provider}/${cascadeResult.model}`,
@@ -3128,7 +3665,8 @@ async function startProxy(config = {}) {
3128
3665
  catch (err) {
3129
3666
  log(`Failed to record run: ${err}`);
3130
3667
  }
3131
- sendCloudTelemetry(taskType, cascadeResult.model, cascadeTokensIn, cascadeTokensOut, durationMs, true, undefined, originalRequestedModel ?? undefined);
3668
+ sendCloudTelemetry(taskType, cascadeResult.model, cascadeTokensIn, cascadeTokensOut, durationMs, true, undefined, originalRequestedModel ?? undefined, cascadeCacheCreation, cascadeCacheRead);
3669
+ meshCapture(cascadeResult.model, cascadeResult.provider, taskType, cascadeTokensIn, cascadeTokensOut, cascadeCost, durationMs, true);
3132
3670
  }
3133
3671
  const chatCascadeRpHeaders = buildRelayPlaneResponseHeaders(cascadeResult.model, originalRequestedModel ?? 'unknown', complexity, cascadeResult.provider, 'cascade');
3134
3672
  res.writeHead(200, { 'Content-Type': 'application/json', ...chatCascadeRpHeaders });
@@ -3152,6 +3690,74 @@ async function startProxy(config = {}) {
3152
3690
  }
3153
3691
  }
3154
3692
  });
3693
+ // ── Health Watchdog ──
3694
+ let watchdogFailures = 0;
3695
+ const WATCHDOG_MAX_FAILURES = 3;
3696
+ const WATCHDOG_INTERVAL_MS = 15_000; // Must be < WatchdogSec (30s) to avoid false kills
3697
+ let watchdogTimer = null;
3698
+ /**
3699
+ * sd_notify: write to $NOTIFY_SOCKET for systemd watchdog integration
3700
+ */
3701
+ function sdNotify(state) {
3702
+ const notifySocket = process.env['NOTIFY_SOCKET'];
3703
+ if (!notifySocket)
3704
+ return;
3705
+ try {
3706
+ const dgram = require('node:dgram');
3707
+ const client = dgram.createSocket('unix_dgram');
3708
+ const buf = Buffer.from(state);
3709
+ client.send(buf, 0, buf.length, notifySocket, () => {
3710
+ client.close();
3711
+ });
3712
+ }
3713
+ catch (err) {
3714
+ log(`sd_notify error: ${err}`);
3715
+ }
3716
+ }
3717
+ function startWatchdog() {
3718
+ // Notify systemd we're ready
3719
+ sdNotify('READY=1');
3720
+ watchdogTimer = setInterval(async () => {
3721
+ try {
3722
+ const controller = new AbortController();
3723
+ const timeout = setTimeout(() => controller.abort(), 5000);
3724
+ const res = await fetch(`http://${host}:${port}/health`, { signal: controller.signal });
3725
+ clearTimeout(timeout);
3726
+ if (res.ok) {
3727
+ watchdogFailures = 0;
3728
+ // Notify systemd watchdog we're alive
3729
+ sdNotify('WATCHDOG=1');
3730
+ }
3731
+ else {
3732
+ watchdogFailures++;
3733
+ console.error(`[RelayPlane] Watchdog: health check returned ${res.status} (failure ${watchdogFailures}/${WATCHDOG_MAX_FAILURES})`);
3734
+ }
3735
+ }
3736
+ catch (err) {
3737
+ watchdogFailures++;
3738
+ console.error(`[RelayPlane] Watchdog: health check failed (failure ${watchdogFailures}/${WATCHDOG_MAX_FAILURES}): ${err}`);
3739
+ }
3740
+ if (watchdogFailures >= WATCHDOG_MAX_FAILURES) {
3741
+ console.error('[RelayPlane] CRITICAL: 3 consecutive watchdog failures. Attempting graceful restart...');
3742
+ sdNotify('STOPPING=1');
3743
+ // Close server and exit — systemd Restart=always will restart us
3744
+ server.close(() => {
3745
+ process.exit(1);
3746
+ });
3747
+ // Force exit after 10s if graceful close hangs
3748
+ setTimeout(() => process.exit(1), 10_000).unref();
3749
+ }
3750
+ }, WATCHDOG_INTERVAL_MS);
3751
+ watchdogTimer.unref();
3752
+ }
3753
+ // Clean up watchdog on shutdown
3754
+ const origHandleShutdown = () => {
3755
+ if (watchdogTimer)
3756
+ clearInterval(watchdogTimer);
3757
+ sdNotify('STOPPING=1');
3758
+ };
3759
+ process.on('SIGINT', origHandleShutdown);
3760
+ process.on('SIGTERM', origHandleShutdown);
3155
3761
  return new Promise((resolve, reject) => {
3156
3762
  server.on('error', reject);
3157
3763
  server.listen(port, host, () => {
@@ -3164,6 +3770,8 @@ async function startProxy(config = {}) {
3164
3770
  console.log(` Models: relayplane:auto, relayplane:cost, relayplane:fast, relayplane:quality`);
3165
3771
  console.log(` Auth: Passthrough for Anthropic, env vars for other providers`);
3166
3772
  console.log(` Streaming: ✅ Enabled`);
3773
+ startWatchdog();
3774
+ log('Health watchdog started (30s interval, sd_notify enabled)');
3167
3775
  resolve(server);
3168
3776
  });
3169
3777
  });
@@ -3221,7 +3829,7 @@ async function executeNonStreamingProviderRequest(request, targetProvider, targe
3221
3829
  }
3222
3830
  return { responseData, ok: true, status: 200 };
3223
3831
  }
3224
- async function handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple') {
3832
+ async function handleStreamingRequest(res, request, targetProvider, targetModel, apiKey, ctx, relay, promptText, taskType, confidence, routingMode, recordTelemetry, startTime, log, cooldownManager, cooldownsEnabled, complexity = 'simple', cacheHash, cacheBypass) {
3225
3833
  let providerResponse;
3226
3834
  try {
3227
3835
  switch (targetProvider) {
@@ -3274,9 +3882,13 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3274
3882
  'Connection': 'keep-alive',
3275
3883
  ...streamRpHeaders,
3276
3884
  });
3277
- // Track token usage from streaming events
3885
+ // Track token usage from streaming events (including Anthropic prompt cache tokens)
3278
3886
  let streamTokensIn = 0;
3279
3887
  let streamTokensOut = 0;
3888
+ let streamCacheCreation = 0;
3889
+ let streamCacheRead = 0;
3890
+ const shouldCacheStream = !!(cacheHash && !cacheBypass);
3891
+ const rawChunks = [];
3280
3892
  try {
3281
3893
  // Stream the response based on provider format
3282
3894
  switch (targetProvider) {
@@ -3284,7 +3896,10 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3284
3896
  // Convert Anthropic stream to OpenAI format
3285
3897
  for await (const chunk of convertAnthropicStream(providerResponse, targetModel)) {
3286
3898
  res.write(chunk);
3287
- // Parse OpenAI-format chunks for usage (emitted at end of stream)
3899
+ if (shouldCacheStream)
3900
+ rawChunks.push(chunk);
3901
+ // Parse OpenAI-format chunks for usage — the converter embeds
3902
+ // cache_creation_tokens and cache_read_tokens from message_start.
3288
3903
  try {
3289
3904
  const lines = chunk.split('\n');
3290
3905
  for (const line of lines) {
@@ -3293,6 +3908,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3293
3908
  if (evt.usage) {
3294
3909
  streamTokensIn = evt.usage.prompt_tokens ?? streamTokensIn;
3295
3910
  streamTokensOut = evt.usage.completion_tokens ?? streamTokensOut;
3911
+ streamCacheCreation = evt.usage.cache_creation_tokens ?? streamCacheCreation;
3912
+ streamCacheRead = evt.usage.cache_read_tokens ?? streamCacheRead;
3296
3913
  }
3297
3914
  }
3298
3915
  }
@@ -3304,6 +3921,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3304
3921
  // Convert Gemini stream to OpenAI format
3305
3922
  for await (const chunk of convertGeminiStream(providerResponse, targetModel)) {
3306
3923
  res.write(chunk);
3924
+ if (shouldCacheStream)
3925
+ rawChunks.push(chunk);
3307
3926
  try {
3308
3927
  const lines = chunk.split('\n');
3309
3928
  for (const line of lines) {
@@ -3323,6 +3942,8 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3323
3942
  // xAI, OpenRouter, DeepSeek, Groq, OpenAI all use OpenAI-compatible streaming format
3324
3943
  for await (const chunk of pipeOpenAIStream(providerResponse)) {
3325
3944
  res.write(chunk);
3945
+ if (shouldCacheStream)
3946
+ rawChunks.push(chunk);
3326
3947
  try {
3327
3948
  const lines = chunk.split('\n');
3328
3949
  for (const line of lines) {
@@ -3342,15 +3963,43 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3342
3963
  catch (err) {
3343
3964
  log(`Streaming error: ${err}`);
3344
3965
  }
3966
+ // ── Cache: store streaming response ──
3967
+ if (shouldCacheStream && cacheHash && rawChunks.length > 0) {
3968
+ const responseCache = (0, response_cache_js_1.getResponseCache)();
3969
+ const streamPayload = JSON.stringify({
3970
+ _relayplaneStreamCache: true,
3971
+ ssePayload: rawChunks.join(''),
3972
+ usage: { input_tokens: streamTokensIn, output_tokens: streamTokensOut, prompt_tokens: streamTokensIn, completion_tokens: streamTokensOut, cache_creation_input_tokens: streamCacheCreation, cache_read_input_tokens: streamCacheRead },
3973
+ });
3974
+ responseCache.set(cacheHash, streamPayload, {
3975
+ model: targetModel,
3976
+ tokensIn: streamTokensIn,
3977
+ tokensOut: streamTokensOut,
3978
+ costUsd: (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined),
3979
+ taskType,
3980
+ });
3981
+ log(`Cache STORE (stream) for chat/completions ${targetModel} (hash: ${cacheHash.slice(0, 8)})`);
3982
+ }
3345
3983
  if (cooldownsEnabled) {
3346
3984
  cooldownManager.recordSuccess(targetProvider);
3347
3985
  }
3348
3986
  const durationMs = Date.now() - startTime;
3349
3987
  // Always log the request for stats/telemetry tracking
3350
3988
  logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, true, routingMode, undefined, taskType, complexity);
3351
- // Update token/cost info on the history entry
3352
- const streamCost = (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut);
3353
- updateLastHistoryEntry(streamTokensIn, streamTokensOut, streamCost);
3989
+ // Update token/cost info on the history entry (with cache token discount)
3990
+ const streamCost = (0, telemetry_js_1.estimateCost)(targetModel, streamTokensIn, streamTokensOut, streamCacheCreation || undefined, streamCacheRead || undefined);
3991
+ updateLastHistoryEntry(streamTokensIn, streamTokensOut, streamCost, undefined, streamCacheCreation || undefined, streamCacheRead || undefined);
3992
+ // ── Post-request: budget spend + anomaly detection ──
3993
+ try {
3994
+ (0, budget_js_1.getBudgetManager)().recordSpend(streamCost, targetModel);
3995
+ const anomalyResult = (0, anomaly_js_1.getAnomalyDetector)().recordAndAnalyze({ model: targetModel, tokensIn: streamTokensIn, tokensOut: streamTokensOut, costUsd: streamCost });
3996
+ if (anomalyResult.detected) {
3997
+ for (const anomaly of anomalyResult.anomalies) {
3998
+ (0, alerts_js_1.getAlertManager)().fireAnomaly(anomaly);
3999
+ }
4000
+ }
4001
+ }
4002
+ catch { /* budget/anomaly should never block */ }
3354
4003
  if (recordTelemetry) {
3355
4004
  // Record the run (non-blocking)
3356
4005
  relay
@@ -3360,12 +4009,15 @@ async function handleStreamingRequest(res, request, targetProvider, targetModel,
3360
4009
  model: `${targetProvider}:${targetModel}`,
3361
4010
  })
3362
4011
  .then((runResult) => {
4012
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
4013
+ relay.patchRunTokens(runResult.runId, streamTokensIn, streamTokensOut, streamCost);
3363
4014
  log(`Completed streaming in ${durationMs}ms, runId: ${runResult.runId}`);
3364
4015
  })
3365
4016
  .catch((err) => {
3366
4017
  log(`Failed to record run: ${err}`);
3367
4018
  });
3368
- sendCloudTelemetry(taskType, targetModel, streamTokensIn, streamTokensOut, durationMs, true, undefined, request.model ?? undefined);
4019
+ sendCloudTelemetry(taskType, targetModel, streamTokensIn, streamTokensOut, durationMs, true, undefined, request.model ?? undefined, streamCacheCreation || undefined, streamCacheRead || undefined);
4020
+ meshCapture(targetModel, targetProvider, taskType, streamTokensIn, streamTokensOut, streamCost, durationMs, true);
3369
4021
  }
3370
4022
  res.end();
3371
4023
  }
@@ -3407,12 +4059,25 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
3407
4059
  const nonStreamRespModel = checkResponseModelMismatch(responseData, targetModel, targetProvider, log);
3408
4060
  // Log the successful request
3409
4061
  logRequest(request.model ?? 'unknown', targetModel, targetProvider, durationMs, true, routingMode, undefined, taskType, complexity);
3410
- // Update token/cost info
4062
+ // Update token/cost info (including Anthropic prompt cache tokens)
3411
4063
  const usage = responseData?.usage;
3412
4064
  const tokensIn = usage?.input_tokens ?? usage?.prompt_tokens ?? 0;
3413
4065
  const tokensOut = usage?.output_tokens ?? usage?.completion_tokens ?? 0;
3414
- const cost = (0, telemetry_js_1.estimateCost)(targetModel, tokensIn, tokensOut);
3415
- updateLastHistoryEntry(tokensIn, tokensOut, cost, nonStreamRespModel);
4066
+ const cacheCreationTokens = usage?.cache_creation_input_tokens ?? 0;
4067
+ const cacheReadTokens = usage?.cache_read_input_tokens ?? 0;
4068
+ const cost = (0, telemetry_js_1.estimateCost)(targetModel, tokensIn, tokensOut, cacheCreationTokens || undefined, cacheReadTokens || undefined);
4069
+ updateLastHistoryEntry(tokensIn, tokensOut, cost, nonStreamRespModel, cacheCreationTokens || undefined, cacheReadTokens || undefined);
4070
+ // ── Post-request: budget spend + anomaly detection ──
4071
+ try {
4072
+ (0, budget_js_1.getBudgetManager)().recordSpend(cost, targetModel);
4073
+ const anomalyResult = (0, anomaly_js_1.getAnomalyDetector)().recordAndAnalyze({ model: targetModel, tokensIn, tokensOut, costUsd: cost });
4074
+ if (anomalyResult.detected) {
4075
+ for (const anomaly of anomalyResult.anomalies) {
4076
+ (0, alerts_js_1.getAlertManager)().fireAnomaly(anomaly);
4077
+ }
4078
+ }
4079
+ }
4080
+ catch { /* budget/anomaly should never block */ }
3416
4081
  if (recordTelemetry) {
3417
4082
  // Record the run in RelayPlane
3418
4083
  try {
@@ -3421,6 +4086,8 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
3421
4086
  taskType,
3422
4087
  model: `${targetProvider}:${targetModel}`,
3423
4088
  });
4089
+ // Backfill token/cost data — relay.run() has no adapters so records NULLs
4090
+ relay.patchRunTokens(runResult.runId, tokensIn, tokensOut, cost);
3424
4091
  // Add routing metadata to response
3425
4092
  responseData['_relayplane'] = {
3426
4093
  runId: runResult.runId,
@@ -3435,15 +4102,34 @@ async function handleNonStreamingRequest(res, request, targetProvider, targetMod
3435
4102
  catch (err) {
3436
4103
  log(`Failed to record run: ${err}`);
3437
4104
  }
3438
- // Extract token counts from response if available (Anthropic/OpenAI format)
3439
- const usage = responseData?.usage;
3440
- const tokensIn = usage?.input_tokens ?? usage?.prompt_tokens ?? 0;
3441
- const tokensOut = usage?.output_tokens ?? usage?.completion_tokens ?? 0;
3442
- sendCloudTelemetry(taskType, targetModel, tokensIn, tokensOut, durationMs, true);
4105
+ // Extract token counts from response if available (Anthropic/OpenAI format, including cache)
4106
+ const innerUsage = responseData?.usage;
4107
+ const innerTokIn = innerUsage?.input_tokens ?? innerUsage?.prompt_tokens ?? 0;
4108
+ const innerTokOut = innerUsage?.output_tokens ?? innerUsage?.completion_tokens ?? 0;
4109
+ const innerCacheCreation = innerUsage?.cache_creation_input_tokens ?? 0;
4110
+ const innerCacheRead = innerUsage?.cache_read_input_tokens ?? 0;
4111
+ sendCloudTelemetry(taskType, targetModel, innerTokIn, innerTokOut, durationMs, true, undefined, undefined, innerCacheCreation || undefined, innerCacheRead || undefined);
4112
+ meshCapture(targetModel, targetProvider, taskType, innerTokIn, innerTokOut, cost, durationMs, true);
4113
+ }
4114
+ // ── Cache: store non-streaming chat/completions response ──
4115
+ const chatRespCache = (0, response_cache_js_1.getResponseCache)();
4116
+ const chatReqAsRecord = request;
4117
+ const chatCacheBypassLocal = chatRespCache.shouldBypass(chatReqAsRecord);
4118
+ let chatCacheHeaderVal = chatCacheBypassLocal ? 'BYPASS' : 'MISS';
4119
+ if (!chatCacheBypassLocal) {
4120
+ const chatHashLocal = chatRespCache.computeKey(chatReqAsRecord);
4121
+ chatRespCache.set(chatHashLocal, JSON.stringify(responseData), {
4122
+ model: targetModel,
4123
+ tokensIn: tokensIn,
4124
+ tokensOut: tokensOut,
4125
+ costUsd: cost,
4126
+ taskType,
4127
+ });
4128
+ log(`Cache STORE for chat/completions ${targetModel} (hash: ${chatHashLocal.slice(0, 8)})`);
3443
4129
  }
3444
4130
  // Send response with RelayPlane routing headers
3445
4131
  const nonStreamRpHeaders = buildRelayPlaneResponseHeaders(targetModel, request.model ?? 'unknown', complexity, targetProvider, routingMode);
3446
- res.writeHead(200, { 'Content-Type': 'application/json', ...nonStreamRpHeaders });
4132
+ res.writeHead(200, { 'Content-Type': 'application/json', 'X-RelayPlane-Cache': chatCacheHeaderVal, ...nonStreamRpHeaders });
3447
4133
  res.end(JSON.stringify(responseData));
3448
4134
  }
3449
4135
  // Note: CLI entry point is in cli.ts