npm - claude-code-cache-fix - Versions diffs - 3.9.0 → 4.1.0 - Mend

claude-code-cache-fix 3.9.0 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/README.md +128 -13
package/bin/claude-via-proxy.mjs +1 -0
package/bin/install-service.mjs +38 -12
package/package.json +1 -1
package/proxy/extensions/cache-telemetry.mjs +15 -3
package/proxy/extensions/signature-surface-hash.mjs +60 -0
package/proxy/extensions/thinking-block-sanitize.mjs +233 -19
package/proxy/extensions/usage-log.mjs +46 -1
package/proxy/extensions.json +18 -80
package/proxy/helpers.mjs +30 -0
package/proxy/pipeline.mjs +22 -1
package/proxy/server.mjs +136 -13
package/proxy/upstream.mjs +15 -1
package/templates/cache-fix-proxy.service.template +3 -0
package/templates/com.cnighswonger.cache-fix-proxy.plist.template +3 -0
package/tools/MANUAL-COMPACT.md +16 -1
package/tools/cache_analysis.py +229 -0

package/proxy/server.mjs CHANGED Viewed

@@ -3,9 +3,50 @@ import { pathToFileURL, URL } from "node:url";
 import config from "./config.mjs";
 import { forwardRequest } from "./upstream.mjs";
 import { streamResponse, createTelemetryRecord } from "./stream.mjs";
-import { loadExtensions, snapshotRegistry, runOnRequest, runOnResponseStart, runOnResponse } from "./pipeline.mjs";
+import { loadExtensions, snapshotRegistry, runOnRequest, runOnResponseStart, runOnResponse, getFailedExtensions } from "./pipeline.mjs";
 import { startWatcher } from "./watcher.mjs";
+// Debug logging — writes to ~/.claude/cache-fix-debug.log (override path with
+// CACHE_FIX_DEBUG_LOG). Self-gated on CACHE_FIX_DEBUG=1; a no-op otherwise.
+// Env is read on every call so tests (and operators flipping the flag at
+// runtime) see live behavior — same pattern as image-strip's #98 gate.
+import { appendFileSync, mkdirSync } from "node:fs";
+import { homedir } from "node:os";
+import { dirname, join } from "node:path";
+import util from "node:util";
/**
 * Resolve the file that debug output is appended to.
 *
 * The environment is consulted on every call, so changing
 * CACHE_FIX_DEBUG_LOG at runtime takes effect immediately.
 *
 * @returns {string} CACHE_FIX_DEBUG_LOG when it is set to a non-empty value,
 *   otherwise `<home>/.claude/cache-fix-debug.log`.
 */
function debugLogPath() {
  const override = process.env.CACHE_FIX_DEBUG_LOG;
  if (override) {
    return override;
  }
  return join(homedir(), ".claude", "cache-fix-debug.log");
}
// Never spread raw headers to the log: Authorization / x-api-key / cookies
// must never persist to disk. Same discipline as bootstrap-defense.mjs's
// audit-record contract — extract named scalars only.
const SENSITIVE_HEADERS = new Set([
  "authorization",
  "x-api-key",
  "cookie",
  "set-cookie",
  "proxy-authorization",
]);

/**
 * Copy a header map, masking the values of credential-bearing headers.
 *
 * Header names are matched case-insensitively against SENSITIVE_HEADERS; the
 * original key casing is preserved in the result. The input is not mutated.
 *
 * The copy is built with Object.fromEntries so every entry becomes an own
 * data property. Plain `out[name] = value` assignment would treat a header
 * literally named `__proto__` as a prototype write: a string value would be
 * silently dropped and an array value would replace the result's prototype.
 *
 * @param {Record<string, unknown> | null | undefined} headers - Header map;
 *   a nullish value is treated as empty.
 * @returns {Record<string, unknown>} New object with sensitive values
 *   replaced by the string "[REDACTED]".
 */
function redactHeaders(headers) {
  return Object.fromEntries(
    Object.entries(headers || {}).map(([name, value]) => [
      name,
      SENSITIVE_HEADERS.has(name.toLowerCase()) ? "[REDACTED]" : value,
    ]),
  );
}
/**
 * Append one timestamped line to the debug log.
 *
 * Does nothing unless CACHE_FIX_DEBUG is exactly "1"; the flag is read on
 * each call. Arguments are rendered with util.format. Directory creation and
 * the append are both best-effort: filesystem errors are ignored so that
 * debug logging can never break request handling.
 *
 * @param {...unknown} args - Values forwarded to util.format.
 * @returns {void}
 */
function debugLog(...args) {
  const enabled = process.env.CACHE_FIX_DEBUG === "1";
  if (!enabled) {
    return;
  }
  const target = debugLogPath();
  try {
    mkdirSync(dirname(target), { recursive: true });
  } catch {
    // Best effort: the append below is still attempted.
  }
  const stamp = new Date().toISOString();
  const entry = `[${stamp}] ${util.format(...args)}\n`;
  try {
    appendFileSync(target, entry);
  } catch {
    // Best effort: a failed write is dropped rather than surfaced.
  }
}
 function collectBody(req) {
   return new Promise((resolve, reject) => {
     const chunks = [];
@@ -74,7 +115,13 @@ async function handleMessages(clientReq, clientRes) {
   });
   const pre = await preForward(clientReq, clientRes, abortController, extSnapshot, "messages");
-  if (pre.handled) return;
+  if (pre.handled) {
+    debugLog("[PROXY] handled internally without upstream request",
+             "method:", clientReq.method, "url:", clientReq.url,
+             "status:", clientRes.statusCode,
+             "response headers:", redactHeaders(clientRes.getHeaders()));
+    return;
+  }
   const { parsed, forwardBody, meta } = pre;
   const requestedModel = parsed?.model || null;
@@ -88,6 +135,7 @@ async function handleMessages(clientReq, clientRes) {
       abortController.signal
     ));
   } catch (err) {
+    debugLog("[PROXY] forwardRequest error:", err.message);
     if (abortController.signal.aborted) return;
     clientRes.writeHead(502, { "content-type": "application/json" });
     clientRes.end(JSON.stringify({ error: "upstream_error", message: err.message }));
@@ -99,6 +147,11 @@ async function handleMessages(clientReq, clientRes) {
   // socket carried the request without each one re-instrumenting upstream.
   meta._upstreamConnectionId = upstreamConnectionId ?? null;
+  debugLog("[UPSTREAM -> PROXY -> CLAUDE] RESPONSE",
+           "status:", statusCode, "message:", upstreamRes.statusMessage,
+           "upstream headers:", redactHeaders(upstreamRes.headers),
+           "proxy headers:", redactHeaders(responseHeaders));
   if (extSnapshot.length > 0) {
     const resCtx = { status: statusCode, headers: responseHeaders, meta };
     await runOnResponseStart(resCtx, extSnapshot);
@@ -238,6 +291,21 @@ async function handleBootstrap(clientReq, clientRes) {
 }
 /**
  * Answer the health probe.
  *
  * Responds 503 with a "degraded" JSON body listing the failed extensions
  * when getFailedExtensions() reports any, and 200 {"status":"ok"} otherwise.
  *
  * @param {unknown} _req - Unused.
  * @param {{ writeHead: Function, end: Function }} res - Response to write to.
  * @returns {void}
  */
 function handleHealth(_req, res) {
   // Surface extension-load failures so callers (operators, monitoring) see
   // a degraded proxy state instead of a misleading "ok". See #196: a Node
   // ESM cache stale-import race silently broke thinking-block-sanitize v2
   // for 17 hours post-merge before anyone noticed. /health returning "ok"
   // through that window was load-bearing in the silence.
   const failedExtensions = getFailedExtensions();
   const degraded = failedExtensions.length > 0;
   const payload = degraded
     ? {
         status: "degraded",
         failed_extensions: failedExtensions,
         hint: "restart the proxy via your supervisor to recover (in-process reload cannot fix stale ESM cache; #196)",
       }
     : { status: "ok" };
   res.writeHead(degraded ? 503 : 200, { "content-type": "application/json" });
   res.end(JSON.stringify(payload));
 }
@@ -259,16 +327,44 @@ function handleNotFound(_req, res) {
  */
 export function createProxyServer() {
   return http.createServer((req, res) => {
-    if (req.method === "GET" && req.url === "/health") {
-      return handleHealth(req, res);
-    }
-    if (req.method === "POST" && req.url?.startsWith("/v1/messages")) {
-      return handleMessages(req, res);
-    }
-    if (req.url?.startsWith("/api/claude_cli/bootstrap")) {
-      return handleBootstrap(req, res);
-    }
-    handleNotFound(req, res);
+    // Async IIFE: handleMessages/handleBootstrap return promises, so we have
+    // to await them inside the try/catch — a bare return would let rejections
+    // escape to unhandledRejection and (on Node 15+) crash the process.
+    (async () => {
+      try {
+        debugLog("[CLAUDE -> PROXY] REQUEST",
+                 "method:", req.method, "url:", req.url,
+                 "headers:", redactHeaders(req.headers));
+        // Wrap res.write/res.end to log chunk-level activity when debug is on.
+        // These are sync monkey-patches; the inner debugLog self-gates so the
+        // overhead is negligible when CACHE_FIX_DEBUG is unset.
+        const originalWrite = res.write;
+        const originalEnd = res.end;
+        res.write = function (chunk, ...args) {
+          debugLog(`[PROXY -> CLAUDE] Send chunk. Size: ${chunk ? chunk.length : 0} bytes`);
+          return originalWrite.apply(res, [chunk, ...args]);
+        };
+        res.end = function (chunk, ...args) {
+          debugLog("[PROXY -> CLAUDE] Close connection (res.end)");
+          return originalEnd.apply(res, [chunk, ...args]);
+        };
+        if (req.method === "GET" && req.url === "/health") return handleHealth(req, res);
+        if (req.method === "POST" && req.url?.startsWith("/v1/messages")) return await handleMessages(req, res);
+        if (req.url?.startsWith("/api/claude_cli/bootstrap")) return await handleBootstrap(req, res);
+        debugLog("ERROR: handler not found for req.url=", req.url, "method=", req.method);
+        handleNotFound(req, res);
+      } catch (error) {
+        debugLog("REQUEST HANDLER ERROR:", error?.message, error?.stack);
+        // Generic body: do NOT echo error.message (may include internal paths,
+        // upstream URLs, or other server state).
+        if (!res.headersSent) {
+          res.writeHead(500, { "content-type": "application/json" });
+          res.end(JSON.stringify({ error: "internal_proxy_error" }));
+        }
+      }
+    })();
   });
 }
@@ -290,7 +386,34 @@ export async function startProxy(options = {}) {
   const bind = options.bind ?? config.bind;
   const extensionsDir = options.extensionsDir ?? config.extensionsDir;
   const extensionsConfig = options.extensionsConfig ?? config.extensionsConfig;
-  const watch = options.watch !== false;
+  // Hot-reload is opt-in as of v4.0.0 (#196). The in-process watcher is the
+  // only code path that triggers the Node ESM stale-import race; cold starts
+  // have an empty module cache and load extensions cleanly. Strict `=== "on"`
+  // means any other value (including "true"/"1"/"yes") is treated as off —
+  // the safe default. Note this is the opposite stance from
+  // CACHE_FIX_THINKING_SANITIZE (default-on; only literal "off" disables):
+  // a hot-reload enable is a footgun, so we require the operator to type the
+  // exact opt-in token; a sanitize disable is also a footgun (loses the
+  // wedge mitigation), so we require the exact disable token there.
+  const hotReloadOptIn = process.env.CACHE_FIX_HOT_RELOAD === "on";
+  const watch = options.watch !== false && hotReloadOptIn;
+  // Boot banner on stderr so the EFFECTIVE hot-reload mode is visible in the
+  // supervisor's log (journalctl --user / ~/Library/Logs/) without being
+  // noisy for monitoring tools that line-grep stderr. Keyed off the effective
+  // `watch` value, not the raw envvar, so an embedder calling startProxy({
+  // watch: false }) with the envvar set sees "off" (which is the truth — the
+  // watcher is suppressed regardless of envvar in that case). Supervisor-
+  // neutral wording — no version pin (lives in CHANGELOG/README instead).
+  if (watch) {
+    process.stderr.write(
+      "[cache-fix] hot-reload: on (CACHE_FIX_HOT_RELOAD=on) — long-running processes can hit a Node ESM stale-import race; see #196. Restart the proxy via your supervisor to recover.\n",
+    );
+  } else {
+    process.stderr.write(
+      "[cache-fix] hot-reload: off (set CACHE_FIX_HOT_RELOAD=on to enable). Extension changes require a supervisor-level proxy restart.\n",
+    );
+  }
   let watcher = null;
   try {

package/proxy/upstream.mjs CHANGED Viewed

@@ -183,9 +183,23 @@ function getAgent(isHTTPS, hostname) {
   return agent;
 }
// Build the upstream URL by concatenating the configured base (with any path
// component preserved) with the client request URL. The historical
// `new URL(clientReq.url, base)` approach is RFC 3986 relative-resolution,
// which drops the base's path component when the relative is path-absolute
// (`/v1/messages`). That breaks corp-proxy / mirror setups where the
// configured upstream is `https://corp-proxy.example.net/anthropic-mirror`
// — the request would land at `https://corp-proxy.example.net/v1/messages`
// with `/anthropic-mirror` silently dropped. See PR #188 / @nisqatsi.
/**
 * Join the configured upstream base and the client's request target.
 *
 * Every trailing "/" on the base is removed before joining, so a base written
 * as `https://host/mirror/` or `https://host/mirror//` yields the same path
 * as `https://host/mirror`. A client URL without a leading "/" gets one.
 *
 * @param {string} base - Absolute upstream base URL, optionally with a path.
 * @param {string} clientUrl - Request target (path plus optional query).
 * @returns {URL} The parsed absolute upstream URL.
 * @throws {TypeError} When the joined string is not a valid absolute URL.
 */
export function buildUpstreamUrl(base, clientUrl) {
  // Linear scan instead of a regex: strips any number of trailing slashes.
  let end = base.length;
  while (end > 0 && base[end - 1] === "/") {
    end -= 1;
  }
  const trimmedBase = base.slice(0, end);
  const relative = clientUrl.startsWith("/") ? clientUrl : `/${clientUrl}`;
  return new URL(trimmedBase + relative);
}
 export function forwardRequest(clientReq, body, signal) {
   return new Promise((resolve, reject) => {
-    const upstreamUrl = new URL(clientReq.url, config.upstream);
+    const upstreamUrl = buildUpstreamUrl(config.upstream, clientReq.url);
     const headers = buildUpstreamHeaders(clientReq.headers, upstreamUrl.hostname);
     if (body) {

package/templates/cache-fix-proxy.service.template CHANGED Viewed

@@ -10,7 +10,10 @@ Restart=on-failure
 RestartSec=5
 Environment=CACHE_FIX_PROXY_PORT={{PORT}}
 {{UPSTREAM_LINE}}
+{{CA_FILE_LINE}}
+{{REJECT_UNAUTHORIZED_LINE}}
 {{DEBUG_LINE}}
+{{HOT_RELOAD_LINE}}
 WorkingDirectory={{WORKING_DIR}}
 [Install]

package/templates/com.cnighswonger.cache-fix-proxy.plist.template CHANGED Viewed

@@ -14,7 +14,10 @@
         <key>CACHE_FIX_PROXY_PORT</key>
         <string>{{PORT}}</string>
 {{UPSTREAM_PLIST}}
+{{CA_FILE_PLIST}}
+{{REJECT_UNAUTHORIZED_PLIST}}
 {{DEBUG_PLIST}}
+{{HOT_RELOAD_PLIST}}
     </dict>
     <key>WorkingDirectory</key>
     <string>{{WORKING_DIR}}</string>

package/tools/MANUAL-COMPACT.md CHANGED Viewed

@@ -56,7 +56,7 @@ Always:
 ```
 ```
-Project directory: /home/manager/git_repos/kanfei_nowcast_e3b
+Project directory: ~/git_repos/your-project
 Auto-detected session: db11f377-4ca8-4fc3-9b6d-1069da58c1b2.jsonl
   Modified: 2026-04-19 13:26:42
   Size: 4.8M
@@ -155,6 +155,21 @@ The cold rebuild consumed ~15% Q5h in one call on our Max 5x account. After that
 **Total cost of a manual compact cycle:** roughly ~15% cold rebuild plus a few % for the Opus summarization. Compare to hitting the 1M wall and losing the session entirely.
+### Stale transcripts get swept (CC's `cleanupPeriodDays`)
+Heads up if you're treating the on-disk `.jsonl` as a "keep just in case" backup after `/clear`: it isn't durable. Claude Code maintains a transcript-retention setting `cleanupPeriodDays` in `~/.claude/settings.json` (default 30 days). CC runs a transcript cleanup at startup when its `~/.claude/.last-cleanup` sentinel is past the 24h freshness window — when that fires, CC walks every `.jsonl` under `~/.claude/projects/` and deletes any whose `mtime` is past the cutoff, along with the matching `<session-id>/` companion directory next to it. A session you compacted, `/clear`-ed, and stopped using ~31 days ago will be gone after the next launch that crosses the cleanup gate, even if you'd planned to grep it for context.
+Practical implications:
+- **If you need the post-compact JSONL preserved**, copy it out of `~/.claude/projects/` to a path that isn't subject to CC's cleanup — e.g. `~/snapshots/cc-jsonl-backups/`.
+- **A stopped session held in heal-and-await state is especially vulnerable** — it's idle by definition, so it crosses `cleanupPeriodDays` faster than an actively-used session whose appends keep mtime fresh. If you've stopped a session intending to resume later, either resume promptly, `touch` the `.jsonl` to refresh mtime, or copy it out of the tree.
+- Cleanup keys off `mtime`, and plain reads (`cat`/`grep`/`less`) don't refresh `mtime` — inspection doesn't extend retention.
+- **Raise the retention setting on every machine you use CC on.** Adding `"cleanupPeriodDays": 36500` (~100 years) to `~/.claude/settings.json` defangs the documented cleanup path entirely. There's no documented upper bound; the schema just wants a positive integer. The cleanup logic re-reads the setting at each sweep, so you can land this even on machines where prior sweeps already happened.
+**If a transcript was already swept** and you need to recover it, [`vsits/restore-claude-history-linux`](https://github.com/vsits/restore-claude-history-linux) (RCB) restores deleted `.jsonl` files from Linux filesystem snapshots — **ZFS**, **Btrfs**, or **Timeshift**. End-to-end-verified on Ubuntu 24.04; a real Btrfs dogfood confirmed a recovered transcript loads and resumes via `/resume` in a fresh CC session. macOS users have the same shape via the upstream [`garrettmoss/restore-claude-history`](https://github.com/garrettmoss/restore-claude-history) (Time Machine). Both tools also remind you to set `cleanupPeriodDays` afterward — otherwise the restored transcript gets re-swept on the next cleanup pass.
+Tracked upstream as [anthropics/claude-code#62272](https://github.com/anthropics/claude-code/issues/62272) — cache-fix doesn't touch this surface, but documenting it because manual-compact users are the population most likely to bank on the `.jsonl` sticking around.
 ### Summarizer model
 The tool defaults to `claude --print --model claude-opus-4-7` for the highest-fidelity summary. Override with the `MANUAL_COMPACT_MODEL` env var — e.g. `MANUAL_COMPACT_MODEL=claude-sonnet-4-6` to minimize Q5h impact, or to point at a different model if Opus is rate-limited or retired.

package/tools/cache_analysis.py ADDED Viewed

@@ -0,0 +1,229 @@
+"""Shared cache analysis helpers for hooks and MCP tools.
+Reference Python helper for consumers that want to read cache-fix's
+``quota-status`` output and reason about cache-state from a Claude Code
+transcript. Used by host-side hooks (e.g. ``~/.claude/hooks/
+context-advisor-analyze.py``) and MCP tools that need quota-aware
+behavior.
+Consumer pattern: copy or symlink this file into ``~/.claude/mcp/`` (or
+wherever your hook / tool expects to import from) and ``from cache_analysis
+import read_quota_status, analyze_transcript`` etc. The file ships in the
+cache-fix npm package's ``tools/`` directory; npm consumers can reference
+``node_modules/claude-code-cache-fix/tools/cache_analysis.py`` directly or
+copy it out for non-npm installs.
+The ``read_quota_status()`` helper handles both cache-fix v3.5.0+ (proxy
+mode, per-session split at ``~/.claude/quota-status/account.json``) and
+v3.4.x and earlier / preload mode (single global
+``~/.claude/quota-status.json``). See the README's "Migration:
+v3.4.x → v3.5.0+" section.
+"""
+import json
+import subprocess
+from datetime import datetime, timezone
+CACHE_TTL_5M = 300                # 5-minute ephemeral TTL
+CACHE_TTL_1H = 3600              # 1-hour extended TTL
+CONTEXT_THRESHOLD = 50_000        # Minimum tokens to recommend compact
+COMPACT_RESULT_ESTIMATE = 12_000  # Estimated tokens after compaction
+CACHE_CREATE_RATE_5M = 3.75       # Opus $/MTok for 5min cache writes
+CACHE_CREATE_RATE_1H = 7.50       # Opus $/MTok for 1h cache writes
def read_tail_lines(filepath, n=300):
    """Return the last ``n`` lines of ``filepath`` as a list of strings.

    The work is delegated to the external ``tail`` command with a 5-second
    timeout. Any exception while running it (for example a missing binary or
    a timeout) results in an empty list rather than an error.
    """
    try:
        command = ["tail", "-n", str(n), filepath]
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5,
        )
        tail_lines = completed.stdout.splitlines()
    except Exception:
        tail_lines = []
    return tail_lines
def parse_assistant_usage(lines):
    """Extract assistant messages with usage data from transcript lines.

    Each line is expected to be one JSON document. A line is skipped when it
    is blank, is not valid JSON, does not decode to an object, is not of
    ``type == "assistant"``, lacks a timestamp, or lacks an object-shaped
    ``message.usage``. Records whose input-side counts (cache creation, cache
    read, plain input) are all zero are skipped as well.

    Token counts that are missing, null, or non-numeric are treated as 0 so a
    malformed record cannot raise here or in the arithmetic done by callers.

    Returns a list of dicts with keys ``timestamp``, ``input_tokens``,
    ``cache_creation``, ``cache_read``, ``output_tokens``, ``total_in``,
    ``cr_1h`` and ``cr_5m``, in input order.
    """
    def _count(container, key):
        # bool is an int subclass; a JSON true/false is not a token count.
        value = container.get(key)
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return 0
        return value

    messages = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (``[]``, ``null``, a bare string)
        # has no ``.get``; skip it instead of raising AttributeError.
        if not isinstance(obj, dict) or obj.get("type") != "assistant":
            continue
        msg = obj.get("message")
        if not isinstance(msg, dict):
            continue
        usage = msg.get("usage")
        ts = obj.get("timestamp")
        if not isinstance(usage, dict) or not usage or not ts:
            continue
        cr = _count(usage, "cache_creation_input_tokens")
        rd = _count(usage, "cache_read_input_tokens")
        inp = _count(usage, "input_tokens")
        out = _count(usage, "output_tokens")
        if cr == 0 and rd == 0 and inp == 0:
            continue
        # Extract TTL tier breakdown if available
        cr_detail = usage.get("cache_creation")
        if not isinstance(cr_detail, dict):
            cr_detail = {}
        messages.append({
            "timestamp": ts,
            "input_tokens": inp,
            "cache_creation": cr,
            "cache_read": rd,
            "output_tokens": out,
            "total_in": cr + rd + inp,
            "cr_1h": _count(cr_detail, "ephemeral_1h_input_tokens"),
            "cr_5m": _count(cr_detail, "ephemeral_5m_input_tokens"),
        })
    return messages
def detect_cache_ttl(messages):
    """Infer the effective cache TTL from the most recent usage records.

    Only the last ten records are inspected. If any of them reports a
    positive ``cr_1h`` the 1-hour tier is returned; otherwise a positive
    ``cr_5m`` selects the 5-minute tier; with no breakdown at all the
    5-minute TTL is returned with the label ``"5m (default)"``.

    Returns a ``(ttl_seconds, tier_name)`` tuple.
    """
    window = messages[-10:]

    def _any_positive(field):
        return any(entry.get(field, 0) > 0 for entry in window)

    saw_1h = _any_positive("cr_1h")
    saw_5m = _any_positive("cr_5m")
    if saw_1h:
        return CACHE_TTL_1H, "1h"
    if not saw_5m:
        # No cache_creation breakdown available — conservative default
        return CACHE_TTL_5M, "5m (default)"
    return CACHE_TTL_5M, "5m"
def estimate_thinking_overhead(messages):
    """Estimate how many tokens of earlier thinking are replayed as input.

    Heuristic: thinking blocks from prior turns come back as input tokens, so
    the summed ``output_tokens`` of every record except the latest is used as
    the approximation. Fewer than two records yields 0.
    """
    earlier_turns = messages[:-1] if len(messages) >= 2 else []
    replayed = 0
    for turn in earlier_turns:
        replayed += turn["output_tokens"]
    return replayed
def format_tokens(n):
    """Render a token count compactly: ``"1.5M"``, ``"12k"`` or ``"999"``.

    Counts below 1,000 are returned via ``str``. Thousands are rounded to a
    whole ``k``. The switch to millions happens at 999,500 rather than
    1,000,000, because that is where whole-``k`` rounding would otherwise
    produce the out-of-range label ``"1000k"``.
    """
    if n >= 999_500:
        return f"{n / 1_000_000:.1f}M"
    if n >= 1_000:
        return f"{n / 1_000:.0f}k"
    return str(n)
def format_duration(seconds):
    """Render a duration in seconds as hours (``"1.5h"``) or minutes (``"12m"``).

    Values of one hour or more use one decimal place of hours; anything
    shorter is shown as whole minutes, truncated toward zero.
    """
    at_least_an_hour = seconds >= 3600
    if at_least_an_hour:
        hours = seconds / 3600
        return f"{hours:.1f}h"
    whole_minutes = int(seconds / 60)
    return f"{whole_minutes}m"
def estimate_savings(total_context, ttl_tier="5m"):
    """Estimate the dollar saving from compacting before a cold start.

    The cache-write rate follows the active TTL tier: tiers whose name starts
    with ``"1h"`` use ``CACHE_CREATE_RATE_1H``, everything else uses
    ``CACHE_CREATE_RATE_5M``. Pass the tier returned by ``detect_cache_ttl()``;
    the default keeps the 5m rate for backward compatibility.

    Returns the cost of re-caching ``total_context`` tokens minus the cost of
    caching ``COMPACT_RESULT_ESTIMATE`` tokens at the same rate.
    """
    if ttl_tier.startswith("1h"):
        per_mtok = CACHE_CREATE_RATE_1H
    else:
        per_mtok = CACHE_CREATE_RATE_5M

    def _write_cost(tokens):
        return (tokens / 1_000_000) * per_mtok

    return _write_cost(total_context) - _write_cost(COMPACT_RESULT_ESTIMATE)
def read_quota_status():
    """Read current quota utilization from cache-fix's quota-status file.

    Written by the cache-fix interceptor from API response headers. Path
    depends on cache-fix version:

      - v3.5.0+ (proxy mode, per-session split): ~/.claude/quota-status/account.json
      - v3.4.x and earlier (or preload mode): ~/.claude/quota-status.json (flat)

    Tries the v3.5.0+ path first, then falls back to the legacy flat path. A
    candidate is skipped — so the next one gets a chance — when it cannot be
    opened, is not valid UTF-8, is not valid JSON, or parses to something
    other than a dict (e.g. a partial write that lands as ``[]`` or
    ``null``). Callers therefore never receive a non-dict and never see a
    decode error from a corrupt file.

    Returns the parsed dict (five_hour/seven_day pct and whatever other
    fields cache-fix wrote), or None if no candidate yields a dict.
    """
    import os
    for quota_file in (
        os.path.expanduser("~/.claude/quota-status/account.json"),
        os.path.expanduser("~/.claude/quota-status.json"),
    ):
        try:
            with open(quota_file, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised while reading undecodable bytes.
            continue
        if isinstance(data, dict):
            return data
        # Valid JSON but wrong shape — try the next candidate.
    return None
def analyze_transcript(transcript_path):
    """Full analysis of a transcript. Returns a dict with all cache state info.

    Reads the last 300 lines of ``transcript_path``, extracts the assistant
    usage records, and derives: current context size, estimated thinking
    replay overhead, seconds since the last record, the detected cache TTL
    tier and whether it has lapsed, the share of recent input that was cache
    creation, the estimated saving from compacting (only when the cache has
    expired), and the current quota status.

    Returns None if analysis can't be performed: no lines, no usable usage
    records, or a last timestamp that is missing, not a string, or not
    ISO-8601.
    """
    lines = read_tail_lines(transcript_path, 300)
    if not lines:
        return None
    messages = parse_assistant_usage(lines)
    if not messages:
        return None
    last = messages[-1]
    try:
        last_ts = datetime.fromisoformat(last["timestamp"].replace("Z", "+00:00"))
    except (ValueError, KeyError, AttributeError, TypeError):
        # AttributeError/TypeError: the timestamp is not a string.
        return None
    if last_ts.utcoffset() is None:
        # An offset-less timestamp cannot be subtracted from the aware "now"
        # below. Assume UTC, matching the "Z" form handled above —
        # NOTE(review): confirm transcripts never carry local-time stamps.
        last_ts = last_ts.replace(tzinfo=timezone.utc)
    now = datetime.now(timezone.utc)
    gap_seconds = (now - last_ts).total_seconds()
    context_tokens = last["total_in"]
    thinking_overhead = estimate_thinking_overhead(messages)
    total_with_thinking = context_tokens + thinking_overhead
    ttl_seconds, ttl_tier = detect_cache_ttl(messages)
    cache_expired = gap_seconds > ttl_seconds
    # Last few turns' cache efficiency
    recent = messages[-5:] if len(messages) >= 5 else messages
    recent_cr = sum(m["cache_creation"] for m in recent)
    recent_total = sum(m["total_in"] for m in recent)
    cr_pct = (recent_cr / recent_total * 100) if recent_total else 0
    quota = read_quota_status()
    return {
        "context_tokens": context_tokens,
        "thinking_overhead": thinking_overhead,
        "total_with_thinking": total_with_thinking,
        "gap_seconds": gap_seconds,
        "cache_expired": cache_expired,
        "ttl_seconds": ttl_seconds,
        "ttl_tier": ttl_tier,
        "last_timestamp": last["timestamp"],
        "num_messages": len(messages),
        "recent_cr_pct": cr_pct,
        "savings": estimate_savings(total_with_thinking, ttl_tier) if cache_expired else 0,
        "quota": quota,
    }