claude-code-cache-fix 3.1.1 → 3.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -86,6 +86,43 @@ nohup cache-fix-proxy server > /tmp/cache-fix-proxy.log 2>&1 &
86
86
  echo 'export ANTHROPIC_BASE_URL=http://127.0.0.1:9801' >> ~/.bashrc
87
87
  ```
88
88
 
89
+ ### Docker
90
+
91
+ A multi-arch (amd64, arm64) container image is published to GitHub Container Registry on every release tag.
92
+
93
+ ```bash
94
+ docker run -d --name cache-fix-proxy \
95
+ --restart=always \
96
+ -p 9801:9801 \
97
+ ghcr.io/cnighswonger/claude-code-cache-fix:latest
98
+
99
+ # Then in your shell:
100
+ export ANTHROPIC_BASE_URL=http://127.0.0.1:9801
101
+ ```
102
+
103
+ Use `--restart=always` instead of the systemd healthcheck companion — Docker handles auto-recovery natively. Mount nothing; the container is stateless. Override the default port with `-e CACHE_FIX_PROXY_PORT=...`. Override the upstream (e.g. to chain through llm-relay) with `-e CACHE_FIX_PROXY_UPSTREAM=http://host.docker.internal:8080`. The image runs as the unprivileged `node` user (uid 1000) and exposes a `HEALTHCHECK` Docker can use for liveness.
104
+
105
+ For corporate environments behind an SSL-inspecting proxy, mount your CA bundle and set the env vars:
106
+
107
+ ```bash
108
+ docker run -d --name cache-fix-proxy --restart=always -p 9801:9801 \
109
+ -e HTTPS_PROXY=http://proxy.corp.example:8080 \
110
+ -e CACHE_FIX_PROXY_CA_FILE=/etc/ssl/corp-ca.pem \
111
+ -v /path/to/zscaler-root.pem:/etc/ssl/corp-ca.pem:ro \
112
+ ghcr.io/cnighswonger/claude-code-cache-fix:latest
113
+ ```
114
+
115
+ Image tags: `latest`, `3`, `3.2`, `3.2.1` (semver-ladder, so `3` always points to the newest 3.x). `latest` always tracks the newest tagged release.
116
+
117
+ **Linux note:** the `host.docker.internal` hostname used for chained upstreams resolves automatically on Docker Desktop (macOS / Windows). On plain Linux Docker Engine you usually need `--add-host=host.docker.internal:host-gateway` so the name resolves to the host bridge. Without it, the container's name lookup fails and the proxy can't reach the upstream service running on the host. Example chaining the cache-fix proxy through `llm-relay` running on the host:
118
+
119
+ ```bash
120
+ docker run -d --name cache-fix-proxy --restart=always -p 9801:9801 \
121
+ --add-host=host.docker.internal:host-gateway \
122
+ -e CACHE_FIX_PROXY_UPSTREAM=http://host.docker.internal:8080 \
123
+ ghcr.io/cnighswonger/claude-code-cache-fix:latest
124
+ ```
125
+
89
126
  ### Health check
90
127
 
91
128
  ```bash
@@ -297,6 +334,27 @@ export CACHE_FIX_IMAGE_KEEP_LAST=3
297
334
 
298
335
  Keeps images in the last 3 user messages, replaces older ones with a text placeholder. Only targets `tool_result` blocks — user-pasted images are never touched.
299
336
 
337
+ ### Oversized-image guard
338
+
339
+ ```bash
340
+ export CACHE_FIX_IMAGE_MAX_DIM=2000
341
+ ```
342
+
343
+ The Anthropic API enforces TWO image-related limits on multi-image requests, and the same error message can fire for either:
344
+
345
+ > `"An image in the conversation exceeds the dimension limit for many-image requests (2000px). Start a new session with fewer images."`
346
+
347
+ Each limit corresponds to its own pressure axis, and each axis has its own variable:
348
+
349
+ | Pressure | Variable | What it does |
350
+ |---|---|---|
351
+ | **Too many images in conversation** | `CACHE_FIX_IMAGE_KEEP_LAST=N` | Strips `tool_result` images from older user messages, keeping images only in the last N user messages. |
352
+ | **Any single image too large** | `CACHE_FIX_IMAGE_MAX_DIM=2000` | Replaces images exceeding the dimension limit with a forensic placeholder noting the original dimensions. Covers both user-message direct images and tool_result-nested images. |
353
+
354
+ The two compose: with both set, `KEEP_LAST` runs first (drops the count), then `MAX_DIM` runs on what remains (caps the size of the kept ones). Common triggers for the dimension axis: hi-res manuscript scans, retina screenshots, photos at full resolution.
355
+
356
+ Pure-JS PNG and JPEG header parsing — no native deps. Other formats (GIF, WebP, AVIF, BMP) pass through unchanged regardless of dimension. Fail-open: images whose dimensions can't be parsed (truncated header, unsupported format) are kept rather than stripped — better to send a request that might error than to strip a valid image we just couldn't measure.
357
+
300
358
  ## System prompt rewrite (preload mode, optional)
301
359
 
302
360
  The interceptor can rewrite Claude Code's `# Output efficiency` system-prompt section. Disabled by default. Enable with `CACHE_FIX_OUTPUT_EFFICIENCY_REPLACEMENT`. See [docs/output-efficiency-prompts.md](docs/output-efficiency-prompts.md) for the three known prompt variants and usage instructions.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-code-cache-fix",
3
- "version": "3.1.1",
3
+ "version": "3.2.1",
4
4
  "description": "Cache optimization proxy and interceptor for Claude Code. Fixes prompt cache bugs, stabilizes prefix, reduces quota burn.",
5
5
  "type": "module",
6
6
  "exports": "./preload.mjs",
@@ -1,6 +1,13 @@
1
+ import { parseImageDimensions } from "../image-dimensions.mjs";
2
+
1
3
  const KEEP_LAST = parseInt(process.env.CACHE_FIX_IMAGE_KEEP_LAST || "0", 10);
4
+ const MAX_DIM = parseInt(process.env.CACHE_FIX_IMAGE_MAX_DIM || "0", 10);
2
5
  const PLACEHOLDER = "[image stripped from history — file may still be on disk]";
3
6
 
7
/**
 * Build the text that replaces an image stripped for exceeding the size cap.
 *
 * @param {number} maxDim - The configured per-side pixel limit.
 * @param {number} w - Measured width of the stripped image, in pixels.
 * @param {number} h - Measured height of the stripped image, in pixels.
 * @returns {string} Placeholder text recording the limit and original size.
 */
function oversizedPlaceholder(maxDim, w, h) {
  const originalSize = `${w}x${h}px`;
  return `[image stripped — exceeded ${maxDim}px max dimension (was ${originalSize})]`;
}
10
+
4
11
  function stripOldToolResultImages(messages, keepLast) {
5
12
  if (!keepLast || keepLast <= 0 || !Array.isArray(messages)) {
6
13
  return { messages, stats: null };
@@ -58,26 +65,116 @@ function stripOldToolResultImages(messages, keepLast) {
58
65
  return { messages: strippedCount > 0 ? result : messages, stats };
59
66
  }
60
67
 
61
- export { stripOldToolResultImages, PLACEHOLDER };
68
/**
 * Replace oversized images with a text placeholder.
 *
 * Scans both direct `image` blocks in a message's content array and `image`
 * items nested inside a `tool_result` block's content array. An image is
 * replaced when its measured width or height exceeds `maxDim`.
 *
 * Fail-open: an image is kept whenever it cannot be measured — missing
 * source data or media type, a parser that returns nothing, or a parser that
 * throws. Malformed message entries (null, non-object, non-array content)
 * are passed through untouched rather than crashing the pass.
 *
 * Stripping by oversize prevents the Anthropic API error:
 *   "An image in the conversation exceeds the dimension limit for many-image
 *    requests (2000px). Start a new session with fewer images."
 *
 * The input is never mutated; changed messages/blocks are shallow copies.
 *
 * @param {Array} messages - Conversation messages.
 * @param {number} maxDim - Per-side pixel limit; falsy or <= 0 disables the pass.
 * @returns {{messages: Array, stats: ?{strippedCount: number, strippedBytes: number, estimatedTokens: number}}}
 *   The original `messages` reference and `stats: null` when nothing was
 *   stripped; otherwise the rewritten list plus counters.
 */
function stripOversizedImages(messages, maxDim) {
  if (!maxDim || maxDim <= 0 || !Array.isArray(messages)) {
    return { messages, stats: null };
  }

  let strippedCount = 0;
  let strippedBytes = 0;

  // Returns the same reference when the item is kept, a new text block when stripped.
  const maybeStrip = (item) => {
    if (item?.type !== "image") return item;
    const src = item.source;
    if (!src || !src.data || !src.media_type) return item;

    let dims;
    try {
      dims = parseImageDimensions(src.media_type, src.data);
    } catch {
      // Deliberate fail-open: an unparseable image is kept, not stripped.
      return item;
    }
    if (!dims) return item; // can't measure → keep
    if (dims.width <= maxDim && dims.height <= maxDim) return item;

    strippedCount++;
    strippedBytes += src.data.length;
    return { type: "text", text: oversizedPlaceholder(maxDim, dims.width, dims.height) };
  };

  // Handles one top-level content block: a direct image or a tool_result wrapper.
  const stripFromBlock = (block) => {
    if (block?.type === "image") return maybeStrip(block);
    if (block?.type === "tool_result" && Array.isArray(block.content)) {
      const newToolContent = block.content.map((item) => maybeStrip(item));
      const toolMutated = newToolContent.some((item, i) => item !== block.content[i]);
      return toolMutated ? { ...block, content: newToolContent } : block;
    }
    return block;
  };

  const result = messages.map((msg) => {
    // Optional chaining guards null / non-object entries in the list.
    if (!Array.isArray(msg?.content)) return msg;
    const newContent = msg.content.map((block) => stripFromBlock(block));
    const mutated = newContent.some((block, i) => block !== msg.content[i]);
    return mutated ? { ...msg, content: newContent } : msg;
  });

  const stats = strippedCount > 0
    ? { strippedCount, strippedBytes, estimatedTokens: Math.ceil(strippedBytes * 0.125) }
    : null;

  return { messages: strippedCount > 0 ? result : messages, stats };
}
134
+
135
+ export { stripOldToolResultImages, stripOversizedImages, PLACEHOLDER, oversizedPlaceholder };
62
136
 
63
137
  export default {
64
138
  name: "image-strip",
65
- description: "Strip base64 images from old tool results to reduce token waste",
139
+ description:
140
+ "Strip base64 images from old tool results AND optionally strip oversized images that would trigger Anthropic's many-image dimension limit",
66
141
  enabled: false,
67
142
  order: 150,
68
143
 
69
144
  async onRequest(ctx) {
70
145
  const keepLast = parseInt(ctx.meta.imageKeepLast ?? KEEP_LAST, 10);
71
- if (!keepLast || keepLast <= 0) return;
146
+ const maxDim = parseInt(ctx.meta.imageMaxDim ?? MAX_DIM, 10);
147
+ if ((!keepLast || keepLast <= 0) && (!maxDim || maxDim <= 0)) return;
72
148
  if (!ctx.body.messages) return;
73
149
 
74
- const { messages, stats } = stripOldToolResultImages(ctx.body.messages, keepLast);
75
- if (stats) {
150
+ let messages = ctx.body.messages;
151
+ const logParts = [];
152
+
153
+ // Pass 1: existing keep_last behavior. Sets ctx.meta.imageStripStats with
154
+ // the same shape as before this PR — back-compat preserved.
155
+ if (keepLast > 0) {
156
+ const r = stripOldToolResultImages(messages, keepLast);
157
+ if (r.stats) {
158
+ messages = r.messages;
159
+ ctx.meta.imageStripStats = r.stats;
160
+ logParts.push(`keep_last: ${r.stats.strippedCount} stripped (~${r.stats.estimatedTokens} tokens saved)`);
161
+ }
162
+ }
163
+
164
+ // Pass 2: new max_dim behavior. Stats land on a new field so consumers
165
+ // already reading imageStripStats don't see a shape change.
166
+ if (maxDim > 0) {
167
+ const r = stripOversizedImages(messages, maxDim);
168
+ if (r.stats) {
169
+ messages = r.messages;
170
+ ctx.meta.imageStripOversizedStats = r.stats;
171
+ logParts.push(`max_dim: ${r.stats.strippedCount} oversized stripped (~${r.stats.estimatedTokens} tokens saved)`);
172
+ }
173
+ }
174
+
175
+ if (logParts.length > 0) {
76
176
  ctx.body.messages = messages;
77
- ctx.meta.imageStripStats = stats;
78
- process.stderr.write(
79
- `[image-strip] stripped ${stats.strippedCount} images (~${stats.estimatedTokens} tokens saved)\n`
80
- );
177
+ process.stderr.write(`[image-strip] ${logParts.join("; ")}\n`);
81
178
  }
82
179
  },
83
180
  };
@@ -0,0 +1,385 @@
1
+ // overage-warning — emit a one-time warning per Q5h-window threshold
2
+ // crossing when Anthropic's response headers indicate the user is
3
+ // approaching or has crossed the overage threshold.
4
+ //
5
+ // Advisory only. No request mutation. Two outputs:
6
+ // 1. stderr line prefixed `[overage-warning]` for proxy journals/logs
7
+ // 2. structured JSON record appended to `~/.claude/overage-warnings.jsonl`
8
+ //
9
+ // Activation: `enabled: true` in extensions.json (this extension is
10
+ // always loaded), gated at runtime by `CACHE_FIX_OVERAGE_WARNING=1`.
11
+ // Matches the prefix-diff pattern (env-var-only opt-in).
12
+ //
13
+ // See `docs/directives/proxy-overage-cost-warning.md` for the full design.
14
+
15
+ import { appendFile, mkdir } from "node:fs/promises";
16
+ import { join, dirname } from "node:path";
17
+ import { homedir } from "node:os";
18
+
19
+ import { WEIGHTED_TOKEN_COST_USD_COARSE } from "../rates.mjs";
20
+
21
// Runtime flags are looked up in process.env on every call instead of being
// captured at module load, so a caller (or a test) can flip them between
// invocations. The cost is a single environment lookup per call.

/** @returns {boolean} True only when CACHE_FIX_OVERAGE_WARNING is exactly "1". */
function isEnabled() {
  const { CACHE_FIX_OVERAGE_WARNING: flag } = process.env;
  return flag === "1";
}
27
/** @returns {boolean} True only when CACHE_FIX_OVERAGE_WARNING_QUIET is exactly "1". */
function isQuiet() {
  const quietFlag = process.env.CACHE_FIX_OVERAGE_WARNING_QUIET;
  return quietFlag === "1";
}
30
/** @returns {boolean} True only when CACHE_FIX_DEBUG is exactly "1". */
function isDebug() {
  const { CACHE_FIX_DEBUG: debugFlag } = process.env;
  return debugFlag === "1";
}
33
+
34
/**
 * Write a diagnostic line to stderr, but only while debug mode is on.
 *
 * @param {string} msg - Text appended after the `[overage-warning] DEBUG:` prefix.
 */
function debug(msg) {
  if (!isDebug()) return;
  process.stderr.write(`[overage-warning] DEBUG: ${msg}\n`);
}
37
+
38
// --- Module-scope state ---
//
// Two pieces of state live for the lifetime of the module:
//
//   * `_window` — a sliding list of samples shaped
//     { t, q5h, input, cache_creation, cache_read, output }, from which the
//     burn rate is derived.
//   * `_dedupThresholds` — the thresholds already warned about in the Q5h
//     window identified by `_dedupWindowResetsAt` (the q5h_resets_at value).
//     A different q5h_resets_at means a new window and a fresh dedup set.

// Samples older than this many milliseconds fall out of the window.
const WINDOW_MS = 15 * 60 * 1000;
// Hard cap on how many samples the window retains.
const WINDOW_MAX_SAMPLES = 60;
// Minimum number of samples before a projection is attempted.
const WARM_UP_MIN_SAMPLES = 3;

const _window = [];
let _dedupWindowResetsAt = 0;
let _dedupThresholds = new Set();

/**
 * Return all module state to its initial values. The sample array is emptied
 * in place (its identity is kept); the dedup set is replaced with a new one.
 */
function resetState() {
  _window.splice(0, _window.length);
  _dedupThresholds = new Set();
  _dedupWindowResetsAt = 0;
}
60
+
61
+ // --- Pure functions (test seam) ---
62
+
63
/**
 * Decide from response headers whether an overage warning is eligible, and
 * extract the values needed to report it.
 *
 * Accepts either a plain object keyed by header name or any container that
 * exposes a `get(name)` method (for example a fetch `Headers` instance).
 * Values may be strings, numbers, or arrays of strings; arrays are joined
 * with ", " and everything else is coerced with `String()`.
 *
 * Eligibility requires both:
 *   - a unified status (falling back to the 5h status) of `allowed_warning`
 *     or `throttled`, and
 *   - a parseable `7d-surpassed-threshold` header.
 *
 * @param {object} headers - Response headers.
 * @returns {{eligible: false} | {eligible: true, trigger: object, snapshot: object, raw: object}}
 *   `trigger` holds status, surpassed_threshold, overage_status and
 *   upgrade_paths (array). `snapshot` holds q5h_pct / q7d_pct (utilization
 *   × 100, rounded, or null when absent) and q5h_resets_at (0 when absent).
 *   `raw` holds the unrounded q5h_util and q5h_resets_at.
 */
export function parseTriggerFromHeaders(headers) {
  if (!headers || typeof headers !== "object") return { eligible: false };

  // Normalize every lookup to a string so later parsing and `.split` are safe.
  const get = (k) => {
    const raw = typeof headers.get === "function" ? headers.get(k) : headers[k];
    if (raw === undefined || raw === null) return "";
    return Array.isArray(raw) ? raw.join(", ") : String(raw);
  };
  // Float header → number, or null when absent / not finite.
  const num = (k) => {
    const v = get(k);
    if (!v) return null;
    const n = parseFloat(v);
    return Number.isFinite(n) ? n : null;
  };
  // Integer header → number, or 0 when absent / not finite.
  const intOf = (k) => {
    const v = get(k);
    if (!v) return 0;
    const n = parseInt(v, 10);
    return Number.isFinite(n) ? n : 0;
  };

  const status =
    get("anthropic-ratelimit-unified-status") ||
    get("anthropic-ratelimit-unified-5h-status");
  const surpassed = num("anthropic-ratelimit-unified-7d-surpassed-threshold");
  const overage_status = get("anthropic-ratelimit-unified-overage-status") || "unknown";
  const upgrade_paths_raw = get("anthropic-ratelimit-unified-upgrade-paths");
  const q5h_util = num("anthropic-ratelimit-unified-5h-utilization");
  const q7d_util = num("anthropic-ratelimit-unified-7d-utilization");
  const q5h_resets_at = intOf("anthropic-ratelimit-unified-5h-reset");

  // Trigger gates: warning-class status AND a usable surpassed-threshold value.
  const isWarn = status === "allowed_warning" || status === "throttled";
  if (!isWarn) return { eligible: false };
  if (surpassed === null) return { eligible: false };

  const upgrade_paths = upgrade_paths_raw
    ? upgrade_paths_raw.split(",").map((s) => s.trim()).filter(Boolean)
    : [];

  return {
    eligible: true,
    trigger: {
      status,
      surpassed_threshold: surpassed,
      overage_status,
      upgrade_paths,
    },
    snapshot: {
      q5h_pct: q5h_util !== null ? Math.round(q5h_util * 100) : null,
      q7d_pct: q7d_util !== null ? Math.round(q7d_util * 100) : null,
      q5h_resets_at,
    },
    raw: {
      q5h_util,
      q5h_resets_at,
    },
  };
}
118
+
119
/**
 * Build the dedup-set key for a threshold inside a given Q5h window.
 *
 * @param {number} threshold - The surpassed-threshold value.
 * @param {number} q5h_resets_at - The window's reset marker.
 * @returns {string} `<threshold>@<q5h_resets_at>`.
 */
export function dedupKey(threshold, q5h_resets_at) {
  const separator = "@";
  return "".concat(threshold, separator, q5h_resets_at);
}
122
+
123
/**
 * Derive a burn-rate projection from quota samples.
 *
 * Samples older than WINDOW_MS relative to `now` are ignored. The first and
 * last remaining samples are taken as the window's endpoints, so the list is
 * expected in chronological order.
 *
 * Result fields:
 *   - `min_to_100`: rounded minutes until utilization reaches 1.0, or null
 *     when utilization did not rise across the window.
 *   - `tokens_per_min`: rounded token throughput across the window.
 *   - `cost_per_hr_usd_coarse`: hourly cost estimate (two decimals), or null
 *     when utilization did not rise.
 *   - `window_samples`: number of non-expired samples.
 *   - `window_minutes`: endpoint span in minutes (one decimal).
 *
 * When fewer than WARM_UP_MIN_SAMPLES samples remain, or the endpoints span
 * no time, every projection field is null and `window_minutes` is 0.
 *
 * @param {Array<{t: number, q5h: number, input?: number, cache_creation?: number, cache_read?: number, output?: number}>} samples
 * @param {number} [now=Date.now()] - Reference time in ms since epoch.
 * @returns {{min_to_100: ?number, tokens_per_min: ?number, cost_per_hr_usd_coarse: ?number, window_samples: number, window_minutes: number}}
 */
export function computeProjection(samples, now = Date.now()) {
  // Callers may already have pruned; filtering again is purely defensive.
  const recent = samples.filter((s) => now - s.t <= WINDOW_MS);
  const count = recent.length;

  const unavailable = () => ({
    min_to_100: null,
    tokens_per_min: null,
    cost_per_hr_usd_coarse: null,
    window_samples: count,
    window_minutes: 0,
  });

  if (count < WARM_UP_MIN_SAMPLES) return unavailable();

  const first = recent[0];
  const last = recent[count - 1];
  const spanMinutes = (last.t - first.t) / 60_000;
  if (spanMinutes <= 0) return unavailable();

  const utilPerMin = (last.q5h - first.q5h) / spanMinutes;
  const isRising = utilPerMin > 0;

  const min_to_100 = isRising
    ? Math.max(0, Math.round((1 - last.q5h) / utilPerMin))
    : null;

  // Every sample carries its own per-call token counts; add them all up.
  let totalTokens = 0;
  for (const s of recent) {
    totalTokens =
      totalTokens + (s.input || 0) + (s.cache_creation || 0) + (s.cache_read || 0) + (s.output || 0);
  }
  const tokens_per_min = totalTokens / spanMinutes;

  let cost_per_hr_usd_coarse = null;
  if (isRising) {
    cost_per_hr_usd_coarse = +(tokens_per_min * 60 * WEIGHTED_TOKEN_COST_USD_COARSE).toFixed(2);
  }

  return {
    min_to_100,
    tokens_per_min: Math.round(tokens_per_min),
    cost_per_hr_usd_coarse,
    window_samples: count,
    window_minutes: +spanMinutes.toFixed(1),
  };
}
185
+
186
/**
 * Render the human-readable `[overage-warning]` line written to stderr.
 *
 * A utilization percentage that is unknown (null/undefined, i.e. the header
 * was absent) is rendered as `n/a` rather than `null%`. A missing or
 * non-array `upgrade_paths` is treated as an empty list.
 *
 * @param {object} args
 * @param {string} args.ts - Timestamp to embed in the line.
 * @param {object} args.trigger - Needs `surpassed_threshold` and `upgrade_paths`.
 * @param {object} args.snapshot - Needs `q5h_pct` and `q7d_pct`.
 * @param {?object} args.projection - Projection with `min_to_100` and
 *   `cost_per_hr_usd_coarse`; when absent or either field is missing the
 *   "projection unavailable" variant is produced.
 * @returns {string} One line of text with no trailing newline.
 */
export function formatStderrLine({ ts, trigger, snapshot, projection }) {
  const pct = (value) => (value === null || value === undefined ? "n/a" : `${value}%`);

  const paths = Array.isArray(trigger.upgrade_paths) ? trigger.upgrade_paths : [];
  const upgrade = paths.length ? paths.join(", ") : "(none)";

  const head = `[overage-warning] ${ts} Q5h=${pct(snapshot.q5h_pct)} Q7d=${pct(snapshot.q7d_pct)} (surpassed ${trigger.surpassed_threshold})`;

  // `!= null` also rejects undefined, which would otherwise crash on toFixed.
  const hasProjection =
    projection != null &&
    projection.min_to_100 != null &&
    projection.cost_per_hr_usd_coarse != null;

  if (hasProjection) {
    return `${head} — projected 100% in ~${projection.min_to_100} min, estimated continued burn ≈ $${projection.cost_per_hr_usd_coarse.toFixed(2)}/hr at API rates (coarse). Upgrade paths: ${upgrade}.`;
  }
  return `${head} — projection unavailable (warming up). Upgrade paths: ${upgrade}.`;
}
196
+
197
/**
 * Assemble the structured record that is serialized into the JSONL log.
 *
 * Only the known fields of `trigger` and `snapshot` are copied, so extra
 * properties on the inputs never reach the record. When `projection` is
 * falsy, a placeholder with null projection fields and zero window counters
 * is used instead.
 *
 * @param {object} args
 * @param {string} args.ts - Timestamp for the record.
 * @param {object} args.trigger - status, surpassed_threshold, overage_status, upgrade_paths.
 * @param {object} args.snapshot - q5h_pct, q7d_pct, q5h_resets_at.
 * @param {?object} args.projection - Projection object, passed through as-is.
 * @returns {{ts: string, trigger: object, snapshot: object, projection: object}}
 */
export function formatJsonlRecord({ ts, trigger, snapshot, projection }) {
  const { status, surpassed_threshold, overage_status, upgrade_paths } = trigger;
  const { q5h_pct, q7d_pct, q5h_resets_at } = snapshot;

  const emptyProjection = {
    min_to_100: null,
    tokens_per_min: null,
    cost_per_hr_usd_coarse: null,
    window_samples: 0,
    window_minutes: 0,
  };

  return {
    ts,
    trigger: { status, surpassed_threshold, overage_status, upgrade_paths },
    snapshot: { q5h_pct, q7d_pct, q5h_resets_at },
    projection: projection || emptyProjection,
  };
}
220
+
221
+ // --- Window management ---
222
+
223
/**
 * Append a sample to `state.window`, then trim the window in place.
 *
 * Trimming happens in two steps: leading samples whose timestamp is more
 * than WINDOW_MS older than the new sample are dropped, and then the oldest
 * entries are dropped until at most WINDOW_MAX_SAMPLES remain.
 *
 * @param {{window: Array<{t: number}>}} state - Holder of the sample array.
 * @param {{t: number}} sample - The new sample; `t` is its timestamp in ms.
 */
export function recordSample(state, sample) {
  const samples = state.window;
  samples.push(sample);

  const oldestAllowed = sample.t - WINDOW_MS;
  while (samples.length > 0 && samples[0].t < oldestAllowed) {
    samples.shift();
  }

  const overflow = samples.length - WINDOW_MAX_SAMPLES;
  if (overflow > 0) {
    samples.splice(0, overflow);
  }
}
229
+
230
+ // --- Dedup helpers operating on module state ---
231
+
232
/**
 * Report whether a warning for this threshold is still allowed in the given
 * Q5h window, and record it as seen if so.
 *
 * When `q5h_resets_at` differs from the window currently tracked, the set of
 * seen thresholds is discarded first and the new window becomes current.
 *
 * @param {number} threshold - The surpassed-threshold value.
 * @param {number} q5h_resets_at - Identifier of the Q5h window.
 * @returns {boolean} True the first time a threshold is seen in a window,
 *   false on every repeat.
 */
function checkAndMarkDedup(threshold, q5h_resets_at) {
  const isNewWindow = q5h_resets_at !== _dedupWindowResetsAt;
  if (isNewWindow) {
    _dedupWindowResetsAt = q5h_resets_at;
    _dedupThresholds = new Set();
  }

  const key = dedupKey(threshold, q5h_resets_at);
  const firstSighting = !_dedupThresholds.has(key);
  if (firstSighting) {
    _dedupThresholds.add(key);
  }
  return firstSighting;
}
243
+
244
+ // --- I/O ---
245
+
246
/**
 * Append one record, serialized as a single JSON line, to
 * `overage-warnings.jsonl`.
 *
 * The target directory is chosen in this order: the `dir` argument, the
 * CACHE_FIX_OVERAGE_WARNING_DIR environment variable, then `.claude` under
 * the user's home directory. The directory is created (recursively) if it
 * does not exist yet.
 *
 * @param {object} record - Value to serialize.
 * @param {string} [dir] - Explicit output directory.
 * @returns {Promise<void>} Rejects if the directory or file cannot be written.
 */
async function appendJsonl(record, dir) {
  const targetDir =
    dir || process.env.CACHE_FIX_OVERAGE_WARNING_DIR || join(homedir(), ".claude");
  const targetFile = join(targetDir, "overage-warnings.jsonl");

  await mkdir(targetDir, { recursive: true });

  const line = `${JSON.stringify(record)}\n`;
  await appendFile(targetFile, line);
}
252
+
253
/**
 * Test helper: persist a record into a directory chosen by the caller.
 * Passing `dir` explicitly sidesteps the environment-variable lookup, so
 * concurrent tests do not contend over a shared env value.
 *
 * @param {object} record - Record to append.
 * @param {string} dir - Output directory.
 * @returns {Promise<void>}
 */
export async function writeRecord(record, dir) {
  const pending = appendJsonl(record, dir);
  await pending;
}
258
+
259
/**
 * Test helper: clear the module's in-memory state so each test starts from
 * a known baseline.
 */
export function _resetForTest() {
  return void resetState();
}
263
+
264
+ // --- Extension contract ---
265
+
266
/**
 * Extension object for the overage warning.
 *
 * Both hooks are no-ops unless the runtime enable flag is set, and both
 * swallow unexpected errors into the debug log so that this advisory feature
 * cannot break a response.
 *
 * Per-response state is kept on `ctx.meta`:
 *   - `_overageQuota`   — the 5h utilization read from the response headers.
 *   - `_overageWarning` — set only when the headers make this response
 *                         eligible to emit a warning.
 *   - `_overageSample`  — this response's own entry in the sampling window.
 */
export default {
  name: "overage-warning",
  description:
    "Emit one-time warning per Q5h-window threshold crossing when overage headers indicate trouble",
  enabled: true,
  order: 610,

  /**
   * Read quota headers when a response starts.
   *
   * @param {{headers: object, meta?: object}} ctx - Response context.
   */
  async onResponseStart(ctx) {
    if (!isEnabled()) return;
    if (!ctx || !ctx.headers) return;

    try {
      ctx.meta = ctx.meta || {};

      // Capture quota state whenever the header is present, whether or not
      // THIS response crosses a threshold: later responses need warm samples
      // to project from.
      const q5hRaw = ctx.headers["anthropic-ratelimit-unified-5h-utilization"];
      const q5hUtil = q5hRaw ? parseFloat(q5hRaw) : null;
      if (q5hUtil !== null && Number.isFinite(q5hUtil)) {
        ctx.meta._overageQuota = { q5h_util: q5hUtil };
      }

      // Eligibility latch — kept separate from sampling so that sampling
      // continues on responses that will never emit.
      const result = parseTriggerFromHeaders(ctx.headers);
      if (!result.eligible) return;
      ctx.meta._overageWarning = {
        eligible: true,
        emitted: false,
        trigger: result.trigger,
        snapshot: result.snapshot,
        raw: result.raw,
      };
    } catch (err) {
      debug(`onResponseStart unexpected: ${err?.message ?? err}`);
    }
  },

  /**
   * Collect a usage sample on `message_start`; on `message_delta`, update the
   * sample's output tokens and emit the warning if this response is eligible
   * and the threshold has not been reported in the current Q5h window.
   *
   * @param {{event: object, meta?: object}} ctx - Stream-event context.
   */
  async onStreamEvent(ctx) {
    if (!isEnabled()) return;
    if (!ctx || !ctx.event) return;

    try {
      // Sample collection — runs for every response that has a quota reading,
      // independent of whether this response emits.
      if (ctx.event.type === "message_start" && ctx.event.message?.usage) {
        const u = ctx.event.message.usage;
        const q5hUtil = ctx.meta?._overageQuota?.q5h_util;
        if (q5hUtil !== undefined && q5hUtil !== null) {
          const sample = {
            t: Date.now(),
            q5h: q5hUtil,
            input: u.input_tokens || 0,
            cache_creation: u.cache_creation_input_tokens || 0,
            cache_read: u.cache_read_input_tokens || 0,
            output: 0,
          };
          recordSample({ window: _window }, sample);
          // Keep a response-local reference. message_delta updates THIS
          // sample only — never the window's last entry, which may belong to
          // another response when streams interleave.
          ctx.meta._overageSample = sample;
        }
      }

      if (ctx.event.type === "message_delta") {
        // The usage counts on message_delta are cumulative for the message,
        // so the latest value replaces the previous one. Adding them up would
        // over-count whenever more than one message_delta carries usage.
        const ownSample = ctx.meta?._overageSample;
        if (ownSample && ctx.event.usage?.output_tokens) {
          ownSample.output = ctx.event.usage.output_tokens;
        }

        // Emission gate.
        const w = ctx.meta?._overageWarning;
        if (!w || !w.eligible || w.emitted) return;

        const allowed = checkAndMarkDedup(
          w.trigger.surpassed_threshold,
          w.snapshot.q5h_resets_at,
        );
        if (!allowed) {
          w.emitted = true;
          return;
        }

        const ts = new Date().toISOString();
        const projection = computeProjection(_window, Date.now());
        // The stderr line shows a projection only when it is fully usable;
        // the JSONL record keeps whatever was computed.
        const projectionForOutput =
          projection.window_samples >= WARM_UP_MIN_SAMPLES &&
          projection.min_to_100 !== null
            ? projection
            : null;

        const record = formatJsonlRecord({
          ts,
          trigger: w.trigger,
          snapshot: w.snapshot,
          projection: projectionForOutput || projection,
        });

        if (!isQuiet()) {
          process.stderr.write(formatStderrLine({
            ts,
            trigger: w.trigger,
            snapshot: w.snapshot,
            projection: projectionForOutput,
          }) + "\n");
        }

        await appendJsonl(record);
        w.emitted = true;
      }
    } catch (err) {
      debug(`onStreamEvent unexpected: ${err?.message ?? err}`);
    }
  },
};