claude-code-cache-fix 3.1.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,385 @@
1
+ // overage-warning — emit a one-time warning per Q5h-window threshold
2
+ // crossing when Anthropic's response headers indicate the user is
3
+ // approaching or has crossed the overage threshold.
4
+ //
5
+ // Advisory only. No request mutation. Two outputs:
6
+ // 1. stderr line prefixed `[overage-warning]` for proxy journals/logs
7
+ // 2. structured JSON record appended to `~/.claude/overage-warnings.jsonl`
8
+ //
9
+ // Activation: `enabled: true` in extensions.json (this extension is
10
+ // always loaded), gated at runtime by `CACHE_FIX_OVERAGE_WARNING=1`.
11
+ // Matches the prefix-diff pattern (env-var-only opt-in).
12
+ //
13
+ // See `docs/directives/proxy-overage-cost-warning.md` for the full design.
14
+
15
+ import { appendFile, mkdir } from "node:fs/promises";
16
+ import { join, dirname } from "node:path";
17
+ import { homedir } from "node:os";
18
+
19
+ import { WEIGHTED_TOKEN_COST_USD_COARSE } from "../rates.mjs";
20
+
21
+ // Env-gated runtime flags read on each call. Reading at module load would
22
+ // freeze the values and make per-test isolation impossible. The check is
23
+ // cheap (one process.env lookup per invocation when disabled).
24
/**
 * Report whether the overage-warning extension is switched on.
 *
 * The environment is consulted on every call (not cached), so a change to
 * `CACHE_FIX_OVERAGE_WARNING` takes effect immediately.
 *
 * @returns {boolean} true only when the variable is exactly the string "1".
 */
function isEnabled() {
  const { CACHE_FIX_OVERAGE_WARNING: flag } = process.env;
  return flag === "1";
}
27
/**
 * Report whether the quiet flag is set.
 *
 * Read from the environment on every call.
 *
 * @returns {boolean} true only when `CACHE_FIX_OVERAGE_WARNING_QUIET` is
 *   exactly the string "1".
 */
function isQuiet() {
  const quietFlag = process.env.CACHE_FIX_OVERAGE_WARNING_QUIET;
  return quietFlag === "1";
}
30
/**
 * Report whether debug logging is requested.
 *
 * Read from the environment on every call.
 *
 * @returns {boolean} true only when `CACHE_FIX_DEBUG` is exactly the
 *   string "1".
 */
function isDebug() {
  const { CACHE_FIX_DEBUG: debugFlag } = process.env;
  return debugFlag === "1";
}
33
+
34
/**
 * Write a diagnostic line to stderr, but only when debug mode is on.
 *
 * @param {*} msg - Value interpolated into the line after the
 *   `[overage-warning] DEBUG: ` prefix.
 */
function debug(msg) {
  if (!isDebug()) return;
  process.stderr.write(`[overage-warning] DEBUG: ${msg}\n`);
}
37
+
38
+ // --- Module-scope state ---
39
+ //
40
+ // Sliding window of (timestamp, q5h_util, input_tokens, cache_creation_tokens,
41
+ // cache_read_tokens, output_tokens) samples. Used to compute burn rate.
42
+ //
43
+ // Cross-response dedup: per Q5h window (keyed by q5h_resets_at), the set
44
+ // of thresholds we've already warned at. Window expires when q5h_resets_at
45
+ // changes (new window = new dedup state).
46
+
47
// Sample retention limits: maximum sample age (15 minutes, in ms) and the
// maximum number of samples kept in the window.
const WINDOW_MS = 15 * 60_000;
const WINDOW_MAX_SAMPLES = 60;

// Minimum number of fresh samples before a projection is attempted.
const WARM_UP_MIN_SAMPLES = 3;

// Sample buffer; entries look like
// { t, q5h, input, cache_creation, cache_read, output }.
const _window = [];

// Dedup bookkeeping: the reset timestamp the set currently belongs to, and
// the keys already warned about for that reset timestamp.
let _dedupWindowResetsAt = 0;
let _dedupThresholds = new Set();
54
+
55
/**
 * Return all module-scope state to its initial values: empty the sample
 * buffer in place (the array identity is preserved), install a fresh dedup
 * set, and zero the remembered reset timestamp.
 */
function resetState() {
  _window.splice(0, _window.length);
  _dedupThresholds = new Set();
  _dedupWindowResetsAt = 0;
}
60
+
61
+ // --- Pure functions (test seam) ---
62
+
63
/**
 * Decide from response headers whether this response qualifies for an
 * overage warning, and extract the values needed to emit one.
 *
 * Headers are read by direct property lookup on `headers`. A response is
 * eligible when the unified status (falling back to the 5h status) is
 * "allowed_warning" or "throttled" AND the 7d surpassed-threshold header
 * parses to a finite number.
 *
 * @param {object} headers - Map of header name to header value.
 * @returns {{eligible: false} | {eligible: true, trigger: object, snapshot: object, raw: object}}
 *   `trigger` carries status, surpassed_threshold, overage_status (defaults
 *   to "unknown") and upgrade_paths (comma-split, trimmed, empties dropped).
 *   `snapshot` carries rounded utilization percentages (or null) and the 5h
 *   reset value (0 when absent or unparsable). `raw` carries the unrounded
 *   5h utilization and the same reset value.
 */
export function parseTriggerFromHeaders(headers) {
  const notEligible = () => ({ eligible: false });
  if (!headers || typeof headers !== "object") return notEligible();

  // Missing/falsy header values collapse to the empty string.
  const readText = (name) => headers[name] || "";
  const readFloat = (name) => {
    const text = readText(name);
    const parsed = text ? parseFloat(text) : NaN;
    return Number.isFinite(parsed) ? parsed : null;
  };
  const readInt = (name) => {
    const text = readText(name);
    const parsed = text ? parseInt(text, 10) : NaN;
    return Number.isFinite(parsed) ? parsed : 0;
  };
  const asPercent = (fraction) =>
    fraction === null ? null : Math.round(fraction * 100);

  // Gate first: nothing else matters unless both trigger conditions hold.
  const status =
    readText("anthropic-ratelimit-unified-status") ||
    readText("anthropic-ratelimit-unified-5h-status");
  const surpassed = readFloat("anthropic-ratelimit-unified-7d-surpassed-threshold");
  const warned = status === "allowed_warning" || status === "throttled";
  if (!warned || surpassed === null) return notEligible();

  const overage_status =
    readText("anthropic-ratelimit-unified-overage-status") || "unknown";
  const q5h_util = readFloat("anthropic-ratelimit-unified-5h-utilization");
  const q7d_util = readFloat("anthropic-ratelimit-unified-7d-utilization");
  const q5h_resets_at = readInt("anthropic-ratelimit-unified-5h-reset");

  const upgrade_paths = [];
  const rawPaths = readText("anthropic-ratelimit-unified-upgrade-paths");
  if (rawPaths) {
    for (const part of rawPaths.split(",")) {
      const name = part.trim();
      if (name) upgrade_paths.push(name);
    }
  }

  return {
    eligible: true,
    trigger: {
      status,
      surpassed_threshold: surpassed,
      overage_status,
      upgrade_paths,
    },
    snapshot: {
      q5h_pct: asPercent(q5h_util),
      q7d_pct: asPercent(q7d_util),
      q5h_resets_at,
    },
    raw: {
      q5h_util,
      q5h_resets_at,
    },
  };
}
118
+
119
/**
 * Build the dedup key for a threshold within a given reset timestamp.
 *
 * @param {*} threshold - Surpassed-threshold value.
 * @param {*} q5h_resets_at - Reset value identifying the current window.
 * @returns {string} Both values stringified and joined as
 *   `<threshold>@<q5h_resets_at>`.
 */
export function dedupKey(threshold, q5h_resets_at) {
  return "".concat(threshold, "@", q5h_resets_at);
}
122
+
123
/**
 * Compute a burn-rate projection from utilization/token samples.
 *
 * Samples older than WINDOW_MS relative to `now` are ignored. The first and
 * last remaining samples are treated as the oldest and newest, so `samples`
 * is expected to be in chronological order.
 *
 * Result fields:
 *  - `min_to_100`: rounded minutes until utilization reaches 1.0 at the
 *    observed rate (never negative); null unless utilization is rising.
 *  - `tokens_per_min`: rounded sum of all token fields divided by the window
 *    length in minutes.
 *  - `cost_per_hr_usd_coarse`: tokens/min * 60 * WEIGHTED_TOKEN_COST_USD_COARSE,
 *    rounded to 2 decimals; null unless utilization is rising.
 *  - `window_samples`: number of fresh samples considered.
 *  - `window_minutes`: window length in minutes, rounded to 1 decimal.
 *
 * When fewer than WARM_UP_MIN_SAMPLES fresh samples exist, or the fresh
 * samples span no positive time, the three projection fields are null and
 * `window_minutes` is 0. When utilization is merely non-increasing, only
 * `min_to_100` and `cost_per_hr_usd_coarse` are null; `tokens_per_min` and
 * the window fields are still reported.
 *
 * @param {Array<object>} samples - Entries with `t`, `q5h` and optional
 *   `input`, `cache_creation`, `cache_read`, `output` token counts.
 * @param {number} [now=Date.now()] - Reference time in ms.
 * @returns {object} Projection record as described above.
 */
export function computeProjection(samples, now = Date.now()) {
  // Drop expired samples (the caller may already have; this is defensive).
  const fresh = samples.filter(({ t }) => now - t <= WINDOW_MS);

  // Shared shape for both "cannot project" exits.
  const unavailable = () => ({
    min_to_100: null,
    tokens_per_min: null,
    cost_per_hr_usd_coarse: null,
    window_samples: fresh.length,
    window_minutes: 0,
  });

  if (fresh.length < WARM_UP_MIN_SAMPLES) return unavailable();

  const [oldest] = fresh;
  const newest = fresh[fresh.length - 1];
  const windowMin = (newest.t - oldest.t) / 60_000;
  if (windowMin <= 0) return unavailable();

  const utilPerMin = (newest.q5h - oldest.q5h) / windowMin;
  const rising = utilPerMin > 0;

  // Every sample contributes all four of its token counts (missing => 0).
  let totalTokens = 0;
  for (const s of fresh) {
    totalTokens =
      totalTokens + (s.input || 0) + (s.cache_creation || 0) + (s.cache_read || 0) + (s.output || 0);
  }
  const tokens_per_min = totalTokens / windowMin;

  return {
    min_to_100: rising
      ? Math.max(0, Math.round((1 - newest.q5h) / utilPerMin))
      : null,
    tokens_per_min: Math.round(tokens_per_min),
    cost_per_hr_usd_coarse: rising
      ? Number((tokens_per_min * 60 * WEIGHTED_TOKEN_COST_USD_COARSE).toFixed(2))
      : null,
    window_samples: fresh.length,
    window_minutes: Number(windowMin.toFixed(1)),
  };
}
185
+
186
/**
 * Render the human-readable stderr warning line.
 *
 * A snapshot percentage that is null/undefined (the utilization header was
 * absent — eligibility does not require it) is rendered as `n/a` instead of
 * the misleading `null%`.
 *
 * @param {object} args
 * @param {string} args.ts - Timestamp text placed after the prefix.
 * @param {object} args.trigger - Needs `surpassed_threshold` and
 *   `upgrade_paths` (array of strings).
 * @param {object} args.snapshot - Needs `q5h_pct` and `q7d_pct` (number or null).
 * @param {object|null} args.projection - When it has non-null `min_to_100`
 *   and `cost_per_hr_usd_coarse`, the projected form is emitted; otherwise
 *   the "projection unavailable" form is emitted.
 * @returns {string} A single line without trailing newline.
 */
export function formatStderrLine({ ts, trigger, snapshot, projection }) {
  const pct = (value) =>
    value === null || value === undefined ? "n/a" : `${value}%`;
  const upgrade = trigger.upgrade_paths.length
    ? trigger.upgrade_paths.join(", ")
    : "(none)";
  const head = `[overage-warning] ${ts} Q5h=${pct(snapshot.q5h_pct)} Q7d=${pct(snapshot.q7d_pct)} (surpassed ${trigger.surpassed_threshold})`;
  if (projection && projection.min_to_100 !== null && projection.cost_per_hr_usd_coarse !== null) {
    return `${head} — projected 100% in ~${projection.min_to_100} min, estimated continued burn ≈ $${projection.cost_per_hr_usd_coarse.toFixed(2)}/hr at API rates (coarse). Upgrade paths: ${upgrade}.`;
  }
  return `${head} — projection unavailable (warming up). Upgrade paths: ${upgrade}.`;
}
196
+
197
/**
 * Build the structured record written to the JSONL log.
 *
 * Only the known fields of `trigger` and `snapshot` are copied; anything
 * else on those objects is left out. A falsy `projection` is replaced by an
 * all-null placeholder with zero window counters; a truthy one is passed
 * through by reference.
 *
 * @param {object} args
 * @param {string} args.ts - Timestamp for the record.
 * @param {object} args.trigger - status / surpassed_threshold /
 *   overage_status / upgrade_paths.
 * @param {object} args.snapshot - q5h_pct / q7d_pct / q5h_resets_at.
 * @param {object|null} [args.projection] - Projection record, if any.
 * @returns {object} Plain object ready for JSON serialization.
 */
export function formatJsonlRecord({ ts, trigger, snapshot, projection }) {
  const { status, surpassed_threshold, overage_status, upgrade_paths } = trigger;
  const { q5h_pct, q7d_pct, q5h_resets_at } = snapshot;

  const projectionOut = projection
    ? projection
    : {
        min_to_100: null,
        tokens_per_min: null,
        cost_per_hr_usd_coarse: null,
        window_samples: 0,
        window_minutes: 0,
      };

  return {
    ts,
    trigger: { status, surpassed_threshold, overage_status, upgrade_paths },
    snapshot: { q5h_pct, q7d_pct, q5h_resets_at },
    projection: projectionOut,
  };
}
220
+
221
+ // --- Window management ---
222
+
223
/**
 * Append a sample to `state.window` and trim the buffer in place.
 *
 * After the push, leading samples whose `t` is older than
 * `sample.t - WINDOW_MS` are removed, then further leading samples are
 * removed until at most WINDOW_MAX_SAMPLES remain.
 *
 * @param {{window: Array<object>}} state - Holder of the sample array; the
 *   array is mutated, never replaced.
 * @param {object} sample - New sample; its `t` defines the age cutoff.
 */
export function recordSample(state, sample) {
  const samples = state.window;
  samples.push(sample);

  // Count the leading run of samples that have aged out.
  const cutoff = sample.t - WINDOW_MS;
  let stale = 0;
  while (stale < samples.length && samples[stale].t < cutoff) stale += 1;

  // Whatever still exceeds the cap after ageing is also dropped from the front.
  const overflow = samples.length - stale - WINDOW_MAX_SAMPLES;
  const dropCount = stale + Math.max(0, overflow);
  if (dropCount > 0) samples.splice(0, dropCount);
}
229
+
230
+ // --- Dedup helpers operating on module state ---
231
+
232
/**
 * Test-and-set for warning dedup, backed by module-scope state.
 *
 * If `q5h_resets_at` differs from the remembered value, the remembered
 * value is updated and the set of seen keys starts over. The key for this
 * (threshold, reset) pair is then recorded if it was not already present.
 *
 * @param {*} threshold - Surpassed-threshold value.
 * @param {*} q5h_resets_at - Reset value identifying the current window.
 * @returns {boolean} true the first time a key is seen (caller may emit),
 *   false on repeats.
 */
function checkAndMarkDedup(threshold, q5h_resets_at) {
  const sameWindow = q5h_resets_at === _dedupWindowResetsAt;
  if (!sameWindow) {
    _dedupThresholds = new Set();
    _dedupWindowResetsAt = q5h_resets_at;
  }

  const key = dedupKey(threshold, q5h_resets_at);
  const firstTime = !_dedupThresholds.has(key);
  if (firstTime) _dedupThresholds.add(key);
  return firstTime;
}
243
+
244
+ // --- I/O ---
245
+
246
/**
 * Append one record as a JSON line to `overage-warnings.jsonl`.
 *
 * Target directory, in priority order: the `dir` argument, the
 * `CACHE_FIX_OVERAGE_WARNING_DIR` environment variable, then `.claude`
 * under the user's home directory. The directory is created (recursively)
 * if needed.
 *
 * @param {*} record - Value serialized with JSON.stringify.
 * @param {string} [dir] - Explicit output directory.
 * @returns {Promise<void>} Settles once the line is appended; rejects on
 *   filesystem or serialization errors.
 */
async function appendJsonl(record, dir) {
  let outDir = dir;
  if (!outDir) outDir = process.env.CACHE_FIX_OVERAGE_WARNING_DIR;
  if (!outDir) outDir = join(homedir(), ".claude");

  const outPath = join(outDir, "overage-warnings.jsonl");
  await mkdir(outDir, { recursive: true });

  const line = `${JSON.stringify(record)}\n`;
  await appendFile(outPath, line);
}
252
+
253
/**
 * Test helper: append a record to the JSONL file inside a caller-chosen
 * directory. Passing `dir` explicitly sidesteps the env-var lookup, so
 * tests need not share (and race on) process-wide environment state.
 *
 * @param {*} record - Value to serialize as one JSON line.
 * @param {string} dir - Directory that receives `overage-warnings.jsonl`.
 * @returns {Promise<void>}
 */
export async function writeRecord(record, dir) {
  const pending = appendJsonl(record, dir);
  await pending;
}
258
+
259
/**
 * Test helper: clear the in-memory state by delegating to `resetState`,
 * so each test can start from a known state.
 *
 * @returns {void}
 */
export function _resetForTest() {
  resetState();
}
263
+
264
+ // --- Extension contract ---
265
+
266
/**
 * Extension contract (default export).
 *
 * Both hooks are no-ops unless `CACHE_FIX_OVERAGE_WARNING=1`, and both
 * swallow unexpected errors (reported only via the debug log) so the
 * warning path stays advisory.
 *
 * Per-response scratch data lives on `ctx.meta`:
 *  - `_overageQuota`   { q5h_util } captured from the response headers
 *  - `_overageWarning` eligibility latch plus trigger/snapshot data
 *  - `_overageSample`  the sample object this response pushed to the window
 */
export default {
  name: "overage-warning",
  description:
    "Emit one-time warning per Q5h-window threshold crossing when overage headers indicate trouble",
  enabled: true,
  order: 610,

  /**
   * Capture quota utilization and warning eligibility from the response
   * headers onto `ctx.meta`.
   *
   * @param {object} ctx - Hook context; `ctx.headers` is read by property
   *   lookup and `ctx.meta` is created if missing.
   * @returns {Promise<void>}
   */
  async onResponseStart(ctx) {
    if (!isEnabled()) return;
    if (!ctx || !ctx.headers) return;

    try {
      ctx.meta = ctx.meta || {};

      // Always capture quota state if the headers carry it, regardless of
      // whether THIS response's status crosses a warning threshold. Future
      // responses need warm samples to project from.
      const q5hRaw = ctx.headers["anthropic-ratelimit-unified-5h-utilization"];
      const q5hUtil = q5hRaw ? parseFloat(q5hRaw) : null;
      if (q5hUtil !== null && Number.isFinite(q5hUtil)) {
        ctx.meta._overageQuota = { q5h_util: q5hUtil };
      }

      // Trigger eligibility latch — only set when this response is the one
      // that crossed a threshold. Keeps emission gate separate from sampling.
      const result = parseTriggerFromHeaders(ctx.headers);
      if (!result.eligible) return;
      ctx.meta._overageWarning = {
        eligible: true,
        emitted: false,
        trigger: result.trigger,
        snapshot: result.snapshot,
        raw: result.raw,
      };
    } catch (err) {
      debug(`onResponseStart unexpected: ${err?.message ?? err}`);
    }
  },

  /**
   * Collect a burn-rate sample on `message_start`, update it on
   * `message_delta`, and emit the warning (stderr line and JSONL record)
   * at most once per response and once per dedup key.
   *
   * @param {object} ctx - Hook context; `ctx.event` is the stream event and
   *   `ctx.meta` carries the state set by `onResponseStart`.
   * @returns {Promise<void>}
   */
  async onStreamEvent(ctx) {
    if (!isEnabled()) return;
    if (!ctx || !ctx.event) return;

    try {
      // Sample collection — happens on every response that has a quota
      // reading, regardless of whether this response is the one that emits.
      if (ctx.event.type === "message_start" && ctx.event.message?.usage) {
        const u = ctx.event.message.usage;
        const q5hUtil = ctx.meta?._overageQuota?.q5h_util;
        if (q5hUtil !== undefined && q5hUtil !== null) {
          const sample = {
            t: Date.now(),
            q5h: q5hUtil,
            input: u.input_tokens || 0,
            cache_creation: u.cache_creation_input_tokens || 0,
            cache_read: u.cache_read_input_tokens || 0,
            output: 0,
          };
          recordSample({ window: _window }, sample);
          // Hand the response its own sample reference. message_delta updates
          // THIS sample only — never the window's last sample, which could
          // belong to a different response under interleaving.
          ctx.meta._overageSample = sample;
        }
      }

      if (ctx.event.type === "message_delta") {
        // Update THIS response's sample with output tokens. The sample
        // reference is response-local (set by message_start), so a response
        // that never sampled cannot leak output tokens into another response.
        //
        // message_delta usage counts are cumulative totals for the message,
        // not increments, so the latest value replaces the stored one.
        // Adding would double-count whenever a response carries more than
        // one message_delta event.
        const ownSample = ctx.meta?._overageSample;
        const cumulativeOutput = ctx.event.usage?.output_tokens;
        if (ownSample && cumulativeOutput) {
          ownSample.output = cumulativeOutput;
        }

        // Emission gate.
        const w = ctx.meta?._overageWarning;
        if (!w || !w.eligible || w.emitted) return;

        const allowed = checkAndMarkDedup(
          w.trigger.surpassed_threshold,
          w.snapshot.q5h_resets_at,
        );
        if (!allowed) {
          // Already warned for this key; latch so later deltas skip the check.
          w.emitted = true;
          return;
        }

        const ts = new Date().toISOString();
        const projection = computeProjection(_window, Date.now());
        // The stderr line only shows a projection once warm-up is complete
        // and a time-to-100% exists; the JSONL record always carries the
        // computed projection object.
        const projectionForOutput =
          projection.window_samples >= WARM_UP_MIN_SAMPLES &&
          projection.min_to_100 !== null
            ? projection
            : null;

        const record = formatJsonlRecord({
          ts,
          trigger: w.trigger,
          snapshot: w.snapshot,
          projection: projectionForOutput || projection,
        });

        if (!isQuiet()) {
          process.stderr.write(formatStderrLine({
            ts,
            trigger: w.trigger,
            snapshot: w.snapshot,
            projection: projectionForOutput,
          }) + "\n");
        }

        await appendJsonl(record);
        w.emitted = true;
      }
    } catch (err) {
      debug(`onStreamEvent unexpected: ${err?.message ?? err}`);
    }
  },
};