claude-code-cache-fix 3.1.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -20
- package/bin/claude-via-proxy.mjs +57 -0
- package/bin/install-service.mjs +476 -0
- package/package.json +3 -2
- package/proxy/extensions/overage-warning.mjs +385 -0
- package/proxy/extensions/upstream-change-detection.mjs +533 -0
- package/proxy/extensions/usage-log.mjs +252 -23
- package/proxy/extensions.json +1 -0
- package/proxy/rates.mjs +16 -0
- package/templates/cache-fix-proxy-healthcheck.service.template +7 -0
- package/templates/cache-fix-proxy-healthcheck.timer.template +14 -0
- package/templates/cache-fix-proxy.service.template +17 -0
- package/templates/com.cnighswonger.cache-fix-proxy.plist.template +33 -0
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
// overage-warning — emit a one-time warning per Q5h-window threshold
|
|
2
|
+
// crossing when Anthropic's response headers indicate the user is
|
|
3
|
+
// approaching or has crossed the overage threshold.
|
|
4
|
+
//
|
|
5
|
+
// Advisory only. No request mutation. Two outputs:
|
|
6
|
+
// 1. stderr line prefixed `[overage-warning]` for proxy journals/logs
|
|
7
|
+
// 2. structured JSON record appended to `~/.claude/overage-warnings.jsonl`
|
|
8
|
+
//
|
|
9
|
+
// Activation: `enabled: true` in extensions.json (this extension is
|
|
10
|
+
// always loaded), gated at runtime by `CACHE_FIX_OVERAGE_WARNING=1`.
|
|
11
|
+
// Matches the prefix-diff pattern (env-var-only opt-in).
|
|
12
|
+
//
|
|
13
|
+
// See `docs/directives/proxy-overage-cost-warning.md` for the full design.
|
|
14
|
+
|
|
15
|
+
import { appendFile, mkdir } from "node:fs/promises";
|
|
16
|
+
import { join, dirname } from "node:path";
|
|
17
|
+
import { homedir } from "node:os";
|
|
18
|
+
|
|
19
|
+
import { WEIGHTED_TOKEN_COST_USD_COARSE } from "../rates.mjs";
|
|
20
|
+
|
|
21
|
+
// Env-gated runtime flags read on each call. Reading at module load would
|
|
22
|
+
// freeze the values and make per-test isolation impossible. The check is
|
|
23
|
+
// cheap (one process.env lookup per invocation when disabled).
|
|
24
|
+
// Runtime opt-in gate. The environment is consulted on every call (never
// cached), and only the exact string "1" turns the extension on.
function isEnabled() {
  const flag = process.env.CACHE_FIX_OVERAGE_WARNING;
  return flag === "1";
}
|
|
27
|
+
// True when CACHE_FIX_OVERAGE_WARNING_QUIET is exactly "1". The value is
// looked up on each call rather than captured once.
function isQuiet() {
  const flag = process.env.CACHE_FIX_OVERAGE_WARNING_QUIET;
  return flag === "1";
}
|
|
30
|
+
// True when CACHE_FIX_DEBUG is exactly "1". The value is looked up on each
// call rather than captured once.
function isDebug() {
  const flag = process.env.CACHE_FIX_DEBUG;
  return flag === "1";
}
|
|
33
|
+
|
|
34
|
+
// Write a diagnostic line to stderr, but only while debug mode is on.
// The message is emitted with the `[overage-warning] DEBUG: ` prefix and a
// trailing newline.
function debug(msg) {
  if (!isDebug()) return;
  process.stderr.write(`[overage-warning] DEBUG: ${msg}\n`);
}
|
|
37
|
+
|
|
38
|
+
// --- Module-scope state ---
|
|
39
|
+
//
|
|
40
|
+
// Sliding window of (timestamp, q5h_util, input_tokens, cache_creation_tokens,
|
|
41
|
+
// cache_read_tokens, output_tokens) samples. Used to compute burn rate.
|
|
42
|
+
//
|
|
43
|
+
// Cross-response dedup: per Q5h window (keyed by q5h_resets_at), the set
|
|
44
|
+
// of thresholds we've already warned at. Window expires when q5h_resets_at
|
|
45
|
+
// changes (new window = new dedup state).
|
|
46
|
+
|
|
47
|
+
// Age limit for burn-rate samples (15 minutes, in milliseconds). Used by
// recordSample to evict stale entries and by computeProjection to filter.
const WINDOW_MS = 15 * 60_000;
// Hard cap on the number of retained samples, enforced by recordSample.
const WINDOW_MAX_SAMPLES = 60;
// computeProjection returns null projection fields below this sample count.
const WARM_UP_MIN_SAMPLES = 3;

// Sample buffer; entries look like
// { t, q5h, input, cache_creation, cache_read, output }.
const _window = [];
// q5h_resets_at value the current dedup set belongs to.
let _dedupWindowResetsAt = 0;
// Dedup keys already warned about for that q5h_resets_at value.
let _dedupThresholds = new Set();
|
|
54
|
+
|
|
55
|
+
// Return all module-scope state to its initial values: empty the sample
// buffer in place (the array identity is kept), and start over with a fresh
// dedup set keyed to reset timestamp 0.
function resetState() {
  _window.splice(0, _window.length);
  _dedupThresholds = new Set();
  _dedupWindowResetsAt = 0;
}
|
|
60
|
+
|
|
61
|
+
// --- Pure functions (test seam) ---
|
|
62
|
+
|
|
63
|
+
// Inspect a plain header map and decide whether it describes a warning-worthy
// quota state.
//
// Returns `{ eligible: false }` when `headers` is not an object, when the
// unified status (falling back to the 5h status) is neither "allowed_warning"
// nor "throttled", or when the surpassed-threshold header is missing or not a
// finite number. Otherwise returns `{ eligible: true, trigger, snapshot, raw }`:
//   trigger  — status, surpassed_threshold, overage_status ("unknown" when
//              absent), upgrade_paths (comma-split, trimmed, empties dropped)
//   snapshot — q5h_pct / q7d_pct (utilization * 100, rounded; null when the
//              header is absent or unparseable) and q5h_resets_at (0 when
//              absent or unparseable)
//   raw      — the unrounded q5h_util plus q5h_resets_at
export function parseTriggerFromHeaders(headers) {
  if (typeof headers !== "object" || headers === null) {
    return { eligible: false };
  }

  // Missing or falsy header values all collapse to the empty string.
  const text = (name) => headers[name] || "";
  // parseFloat/parseInt of "" is NaN, so absent headers take the fallback.
  const floatOrNull = (name) => {
    const parsed = parseFloat(text(name));
    return Number.isFinite(parsed) ? parsed : null;
  };
  const intOrZero = (name) => {
    const parsed = parseInt(text(name), 10);
    return Number.isFinite(parsed) ? parsed : 0;
  };
  const toPct = (fraction) => (fraction === null ? null : Math.round(fraction * 100));

  const status =
    text("anthropic-ratelimit-unified-status") ||
    text("anthropic-ratelimit-unified-5h-status");
  if (status !== "allowed_warning" && status !== "throttled") {
    return { eligible: false };
  }

  const surpassed_threshold = floatOrNull("anthropic-ratelimit-unified-7d-surpassed-threshold");
  if (surpassed_threshold === null) {
    return { eligible: false };
  }

  const upgrade_paths = [];
  const rawPaths = text("anthropic-ratelimit-unified-upgrade-paths");
  if (rawPaths) {
    for (const piece of rawPaths.split(",")) {
      const trimmed = piece.trim();
      if (trimmed) upgrade_paths.push(trimmed);
    }
  }

  const overage_status = text("anthropic-ratelimit-unified-overage-status") || "unknown";
  const q5h_util = floatOrNull("anthropic-ratelimit-unified-5h-utilization");
  const q7d_util = floatOrNull("anthropic-ratelimit-unified-7d-utilization");
  const q5h_resets_at = intOrZero("anthropic-ratelimit-unified-5h-reset");

  return {
    eligible: true,
    trigger: { status, surpassed_threshold, overage_status, upgrade_paths },
    snapshot: {
      q5h_pct: toPct(q5h_util),
      q7d_pct: toPct(q7d_util),
      q5h_resets_at,
    },
    raw: { q5h_util, q5h_resets_at },
  };
}
|
|
118
|
+
|
|
119
|
+
// Build the dedup-set key for a threshold within one Q5h window, in the form
// `<threshold>@<q5h_resets_at>`.
export function dedupKey(threshold, q5h_resets_at) {
  const windowId = q5h_resets_at;
  return `${threshold}@${windowId}`;
}
|
|
122
|
+
|
|
123
|
+
// Derive a burn-rate projection from utilization samples.
//
// `samples` is read oldest-first: index 0 of the surviving entries is taken
// as the oldest and the last entry as the newest. Entries older than
// WINDOW_MS relative to `now` are ignored.
//
// Result shape:
//   { min_to_100, tokens_per_min, cost_per_hr_usd_coarse, window_samples,
//     window_minutes }
//
// - Fewer than WARM_UP_MIN_SAMPLES surviving samples, or a non-positive time
//   span between oldest and newest: the three projection fields are null and
//   window_minutes is 0.
// - Utilization not increasing across the span: min_to_100 and
//   cost_per_hr_usd_coarse are null, while tokens_per_min is still reported.
export function computeProjection(samples, now = Date.now()) {
  const fresh = [];
  for (const entry of samples) {
    if (now - entry.t <= WINDOW_MS) fresh.push(entry);
  }
  const window_samples = fresh.length;

  const unavailable = () => ({
    min_to_100: null,
    tokens_per_min: null,
    cost_per_hr_usd_coarse: null,
    window_samples,
    window_minutes: 0,
  });

  if (window_samples < WARM_UP_MIN_SAMPLES) return unavailable();

  const first = fresh[0];
  const last = fresh[window_samples - 1];
  const spanMinutes = (last.t - first.t) / 60_000;
  if (spanMinutes <= 0) return unavailable();

  // Utilization gained per minute over the span; only a positive slope
  // yields a time-to-100% estimate and a cost figure.
  const slope = (last.q5h - first.q5h) / spanMinutes;
  const rising = slope > 0;

  // Every token category of every surviving sample counts toward the rate.
  let tokenSum = 0;
  for (const entry of fresh) {
    tokenSum =
      tokenSum +
      (entry.input || 0) +
      (entry.cache_creation || 0) +
      (entry.cache_read || 0) +
      (entry.output || 0);
  }
  const tokensPerMinute = tokenSum / spanMinutes;

  return {
    min_to_100: rising ? Math.max(0, Math.round((1 - last.q5h) / slope)) : null,
    tokens_per_min: Math.round(tokensPerMinute),
    cost_per_hr_usd_coarse: rising
      ? Number((tokensPerMinute * 60 * WEIGHTED_TOKEN_COST_USD_COARSE).toFixed(2))
      : null,
    window_samples,
    window_minutes: Number(spanMinutes.toFixed(1)),
  };
}
|
|
185
|
+
|
|
186
|
+
// Render the single human-readable warning line written to stderr.
//
// Parameters (one object):
//   ts         — timestamp string placed after the `[overage-warning]` prefix
//   trigger    — needs `upgrade_paths` (array) and `surpassed_threshold`
//   snapshot   — needs `q5h_pct` and `q7d_pct`; either may be null when the
//                matching utilization header was absent
//   projection — optional; when it carries finite `min_to_100` and
//                `cost_per_hr_usd_coarse` the line includes the estimate,
//                otherwise it reports the projection as unavailable
//
// Returns the line without a trailing newline.
export function formatStderrLine({ ts, trigger, snapshot, projection }) {
  // A missing utilization reading used to print as "null%"; show "n/a".
  const pct = (value) => (value === null || value === undefined ? "n/a" : `${value}%`);

  const upgrade = trigger.upgrade_paths.length
    ? trigger.upgrade_paths.join(", ")
    : "(none)";
  const head = `[overage-warning] ${ts} Q5h=${pct(snapshot.q5h_pct)} Q7d=${pct(snapshot.q7d_pct)} (surpassed ${trigger.surpassed_threshold})`;

  // Require real numbers so a partially-filled projection cannot throw on
  // `.toFixed` or print "~undefined min".
  const hasEstimate =
    Boolean(projection) &&
    Number.isFinite(projection.min_to_100) &&
    Number.isFinite(projection.cost_per_hr_usd_coarse);

  if (hasEstimate) {
    return `${head} — projected 100% in ~${projection.min_to_100} min, estimated continued burn ≈ $${projection.cost_per_hr_usd_coarse.toFixed(2)}/hr at API rates (coarse). Upgrade paths: ${upgrade}.`;
  }
  return `${head} — projection unavailable (warming up). Upgrade paths: ${upgrade}.`;
}
|
|
196
|
+
|
|
197
|
+
// Assemble the structured record that gets serialized to the JSONL log.
//
// Only the known fields of `trigger` and `snapshot` are copied, in a fixed
// key order. `projection` is passed through as-is when truthy; otherwise an
// all-null placeholder with zero sample/minute counts takes its place.
export function formatJsonlRecord({ ts, trigger, snapshot, projection }) {
  const { status, surpassed_threshold, overage_status, upgrade_paths } = trigger;
  const { q5h_pct, q7d_pct, q5h_resets_at } = snapshot;

  const projectionOut = projection
    ? projection
    : {
        min_to_100: null,
        tokens_per_min: null,
        cost_per_hr_usd_coarse: null,
        window_samples: 0,
        window_minutes: 0,
      };

  return {
    ts,
    trigger: { status, surpassed_threshold, overage_status, upgrade_paths },
    snapshot: { q5h_pct, q7d_pct, q5h_resets_at },
    projection: projectionOut,
  };
}
|
|
220
|
+
|
|
221
|
+
// --- Window management ---
|
|
222
|
+
|
|
223
|
+
// Append `sample` to `state.window` (mutated in place) and trim the buffer:
// first drop leading entries whose timestamp is more than WINDOW_MS older
// than the new sample, then drop from the front until at most
// WINDOW_MAX_SAMPLES remain. Trimming only ever removes from the front and
// stops at the first entry that is still inside the window.
export function recordSample(state, sample) {
  const buffer = state.window;
  buffer.push(sample);

  const cutoff = sample.t - WINDOW_MS;
  let staleCount = 0;
  while (staleCount < buffer.length && buffer[staleCount].t < cutoff) {
    staleCount += 1;
  }
  if (staleCount > 0) buffer.splice(0, staleCount);

  const overflow = buffer.length - WINDOW_MAX_SAMPLES;
  if (overflow > 0) buffer.splice(0, overflow);
}
|
|
229
|
+
|
|
230
|
+
// --- Dedup helpers operating on module state ---
|
|
231
|
+
|
|
232
|
+
// Report whether a warning for `threshold` is still due in the Q5h window
// identified by `q5h_resets_at`, and record it as issued if so.
//
// Seeing a `q5h_resets_at` different from the remembered one discards the
// previous dedup set. Returns true the first time a (threshold, window) pair
// is seen and false on every repeat.
function checkAndMarkDedup(threshold, q5h_resets_at) {
  const windowChanged = q5h_resets_at !== _dedupWindowResetsAt;
  if (windowChanged) {
    _dedupThresholds = new Set();
    _dedupWindowResetsAt = q5h_resets_at;
  }

  const key = dedupKey(threshold, q5h_resets_at);
  const firstSighting = !_dedupThresholds.has(key);
  if (firstSighting) {
    _dedupThresholds.add(key);
  }
  return firstSighting;
}
|
|
243
|
+
|
|
244
|
+
// --- I/O ---
|
|
245
|
+
|
|
246
|
+
// Append `record` as one JSON line to `overage-warnings.jsonl`.
//
// Target directory, in order of precedence: the `dir` argument, the
// CACHE_FIX_OVERAGE_WARNING_DIR environment variable, then `.claude` under
// the user's home directory. The directory is created (recursively) before
// the write. Filesystem errors propagate to the caller.
async function appendJsonl(record, dir) {
  let targetDir = dir;
  if (!targetDir) {
    targetDir = process.env.CACHE_FIX_OVERAGE_WARNING_DIR || join(homedir(), ".claude");
  }

  await mkdir(targetDir, { recursive: true });

  const line = `${JSON.stringify(record)}\n`;
  await appendFile(join(targetDir, "overage-warnings.jsonl"), line);
}
|
|
252
|
+
|
|
253
|
+
// Test helper: append `record` to the JSONL log inside `dir`. Delegates to
// appendJsonl; when `dir` is supplied the environment is not consulted for
// the target directory, so tests do not race on a shared env. Resolves once
// the write has finished.
export async function writeRecord(record, dir) {
  const pendingWrite = appendJsonl(record, dir);
  await pendingWrite;
}
|
|
258
|
+
|
|
259
|
+
// Test helper: clear all in-memory state by delegating to resetState, so
// each test starts from a known baseline.
export function _resetForTest() {
  resetState();
}
|
|
263
|
+
|
|
264
|
+
// --- Extension contract ---
|
|
265
|
+
|
|
266
|
+
// Extension object. Both hooks are no-ops unless CACHE_FIX_OVERAGE_WARNING=1,
// and both catch their own errors (reported through debug()).
export default {
  name: "overage-warning",
  description:
    "Emit one-time warning per Q5h-window threshold crossing when overage headers indicate trouble",
  enabled: true,
  order: 610,

  // Header hook. Stashes two things on ctx.meta for onStreamEvent:
  //   _overageQuota   — the 5h utilization reading, whenever the header parses
  //   _overageWarning — trigger/snapshot details, only when the headers
  //                     qualify for a warning (see parseTriggerFromHeaders)
  async onResponseStart(ctx) {
    if (!isEnabled()) return;
    if (!ctx || !ctx.headers) return;

    try {
      ctx.meta = ctx.meta || {};

      // Capture quota state on every response that carries it, whether or
      // not this response qualifies for a warning: later projections need
      // warm samples to work from.
      const q5hRaw = ctx.headers["anthropic-ratelimit-unified-5h-utilization"];
      const q5hUtil = q5hRaw ? parseFloat(q5hRaw) : null;
      if (q5hUtil !== null && Number.isFinite(q5hUtil)) {
        ctx.meta._overageQuota = { q5h_util: q5hUtil };
      }

      // Eligibility latch, kept separate from sampling above.
      const result = parseTriggerFromHeaders(ctx.headers);
      if (!result.eligible) return;
      ctx.meta._overageWarning = {
        eligible: true,
        emitted: false,
        trigger: result.trigger,
        snapshot: result.snapshot,
        raw: result.raw,
      };
    } catch (err) {
      debug(`onResponseStart unexpected: ${err?.message ?? err}`);
    }
  },

  // Stream hook.
  //   message_start — records a sample (input-side token counts plus the
  //                   stashed utilization) and keeps a reference to it on
  //                   ctx.meta._overageSample.
  //   message_delta — updates that sample's output token count, then emits
  //                   the warning (stderr unless quiet, plus the JSONL record)
  //                   if this response is eligible and the threshold has not
  //                   already been warned about in the current Q5h window.
  async onStreamEvent(ctx) {
    if (!isEnabled()) return;
    if (!ctx || !ctx.event) return;

    try {
      // Sample collection happens on every response with a quota reading,
      // independent of whether this response is the one that emits.
      if (ctx.event.type === "message_start" && ctx.event.message?.usage) {
        const u = ctx.event.message.usage;
        const q5hUtil = ctx.meta?._overageQuota?.q5h_util;
        if (q5hUtil !== undefined && q5hUtil !== null) {
          const sample = {
            t: Date.now(),
            q5h: q5hUtil,
            input: u.input_tokens || 0,
            cache_creation: u.cache_creation_input_tokens || 0,
            cache_read: u.cache_read_input_tokens || 0,
            output: 0,
          };
          recordSample({ window: _window }, sample);
          // Hand the response its own sample reference. message_delta updates
          // THIS sample only, never the window's last sample, which could
          // belong to a different response under interleaving.
          ctx.meta._overageSample = sample;
        }
      }

      if (ctx.event.type === "message_delta") {
        // The sample reference is response-local (set by message_start), so a
        // response that never sampled cannot leak output tokens elsewhere.
        const ownSample = ctx.meta?._overageSample;
        const outputTokens = ctx.event.usage?.output_tokens;
        if (ownSample && outputTokens) {
          // Usage counts on message_delta are cumulative for the message, so
          // take the latest value; adding them up would over-count whenever a
          // stream carries more than one message_delta.
          ownSample.output = outputTokens;
        }

        // Emission gate.
        const w = ctx.meta?._overageWarning;
        if (!w || !w.eligible || w.emitted) return;

        const allowed = checkAndMarkDedup(
          w.trigger.surpassed_threshold,
          w.snapshot.q5h_resets_at,
        );
        if (!allowed) {
          // Already warned for this threshold in this window; latch so later
          // deltas of the same response skip the dedup lookup.
          w.emitted = true;
          return;
        }

        const ts = new Date().toISOString();
        const projection = computeProjection(_window, Date.now());
        // The stderr line only gets a projection when an estimate exists; the
        // JSONL record always carries the computed projection object.
        const projectionForOutput =
          projection.window_samples >= WARM_UP_MIN_SAMPLES &&
          projection.min_to_100 !== null
            ? projection
            : null;

        const record = formatJsonlRecord({
          ts,
          trigger: w.trigger,
          snapshot: w.snapshot,
          projection: projectionForOutput || projection,
        });

        if (!isQuiet()) {
          process.stderr.write(formatStderrLine({
            ts,
            trigger: w.trigger,
            snapshot: w.snapshot,
            projection: projectionForOutput,
          }) + "\n");
        }

        await appendJsonl(record);
        w.emitted = true;
      }
    } catch (err) {
      debug(`onStreamEvent unexpected: ${err?.message ?? err}`);
    }
  },
};
|