claude-code-cache-fix 3.1.1 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -0
- package/package.json +1 -1
- package/proxy/extensions/image-strip.mjs +106 -9
- package/proxy/extensions/overage-warning.mjs +385 -0
- package/proxy/extensions/upstream-change-detection.mjs +533 -0
- package/proxy/extensions/usage-log.mjs +252 -23
- package/proxy/extensions.json +1 -0
- package/proxy/image-dimensions.mjs +120 -0
- package/proxy/rates.mjs +16 -0
package/README.md
CHANGED
|
@@ -86,6 +86,43 @@ nohup cache-fix-proxy server > /tmp/cache-fix-proxy.log 2>&1 &
|
|
|
86
86
|
echo 'export ANTHROPIC_BASE_URL=http://127.0.0.1:9801' >> ~/.bashrc
|
|
87
87
|
```
|
|
88
88
|
|
|
89
|
+
### Docker
|
|
90
|
+
|
|
91
|
+
A multi-arch (amd64, arm64) container image is published to GitHub Container Registry on every release tag.
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
docker run -d --name cache-fix-proxy \
|
|
95
|
+
--restart=always \
|
|
96
|
+
-p 9801:9801 \
|
|
97
|
+
ghcr.io/cnighswonger/claude-code-cache-fix:latest
|
|
98
|
+
|
|
99
|
+
# Then in your shell:
|
|
100
|
+
export ANTHROPIC_BASE_URL=http://127.0.0.1:9801
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Use `--restart=always` instead of the systemd healthcheck companion — Docker handles auto-recovery natively. Mount nothing; the container is stateless. Override the default port with `-e CACHE_FIX_PROXY_PORT=...`. Override the upstream (e.g. to chain through llm-relay) with `-e CACHE_FIX_PROXY_UPSTREAM=http://host.docker.internal:8080`. The image runs as the unprivileged `node` user (uid 1000) and exposes a `HEALTHCHECK` Docker can use for liveness.
|
|
104
|
+
|
|
105
|
+
For corporate environments behind an SSL-inspecting proxy, mount your CA bundle and set the env vars:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
docker run -d --name cache-fix-proxy --restart=always -p 9801:9801 \
|
|
109
|
+
-e HTTPS_PROXY=http://proxy.corp.example:8080 \
|
|
110
|
+
-e CACHE_FIX_PROXY_CA_FILE=/etc/ssl/corp-ca.pem \
|
|
111
|
+
-v /path/to/zscaler-root.pem:/etc/ssl/corp-ca.pem:ro \
|
|
112
|
+
ghcr.io/cnighswonger/claude-code-cache-fix:latest
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Image tags: `latest`, `3`, `3.2`, `3.2.1` (semver-ladder, so `3` always points to the newest 3.x). `latest` always tracks the newest tagged release.
|
|
116
|
+
|
|
117
|
+
**Linux note:** the `host.docker.internal` hostname used for chaining to an upstream on the host resolves automatically on Docker Desktop (macOS / Windows). On plain Linux Docker Engine you usually need `--add-host=host.docker.internal:host-gateway` so the name resolves to the host bridge. Without it, the container's name lookup fails and the proxy can't reach the upstream service running on the host. Example chaining the cache-fix proxy through `llm-relay` running on the host:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
docker run -d --name cache-fix-proxy --restart=always -p 9801:9801 \
|
|
121
|
+
--add-host=host.docker.internal:host-gateway \
|
|
122
|
+
-e CACHE_FIX_PROXY_UPSTREAM=http://host.docker.internal:8080 \
|
|
123
|
+
ghcr.io/cnighswonger/claude-code-cache-fix:latest
|
|
124
|
+
```
|
|
125
|
+
|
|
89
126
|
### Health check
|
|
90
127
|
|
|
91
128
|
```bash
|
|
@@ -297,6 +334,27 @@ export CACHE_FIX_IMAGE_KEEP_LAST=3
|
|
|
297
334
|
|
|
298
335
|
Keeps images in the last 3 user messages, replaces older ones with a text placeholder. Only targets `tool_result` blocks — user-pasted images are never touched.
|
|
299
336
|
|
|
337
|
+
### Oversized-image guard
|
|
338
|
+
|
|
339
|
+
```bash
|
|
340
|
+
export CACHE_FIX_IMAGE_MAX_DIM=2000
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
The Anthropic API enforces TWO image-related limits on multi-image requests, and the same error message can fire for either:
|
|
344
|
+
|
|
345
|
+
> `"An image in the conversation exceeds the dimension limit for many-image requests (2000px). Start a new session with fewer images."`
|
|
346
|
+
|
|
347
|
+
Each limit corresponds to a pressure axis, and each axis has its own setting:
|
|
348
|
+
|
|
349
|
+
| Pressure | Variable | What it does |
|
|
350
|
+
|---|---|---|
|
|
351
|
+
| **Too many images in conversation** | `CACHE_FIX_IMAGE_KEEP_LAST=N` | Strips `tool_result` images from older user messages, keeping only those in the last N user messages. |
|
|
352
|
+
| **Any single image too large** | `CACHE_FIX_IMAGE_MAX_DIM=2000` | Replaces images exceeding the dimension limit with a forensic placeholder noting the original dimensions. Covers both user-message direct images and tool_result-nested images. |
|
|
353
|
+
|
|
354
|
+
The two compose: with both set, `KEEP_LAST` runs first (drops the count), then `MAX_DIM` runs on what remains (caps the size of the kept ones). Common triggers for the dimension axis: hi-res manuscript scans, retina screenshots, photos at full resolution.
|
|
355
|
+
|
|
356
|
+
Pure-JS PNG and JPEG header parsing — no native deps. Other formats (GIF, WebP, AVIF, BMP) pass through unchanged regardless of dimension. Fail-open: images whose dimensions can't be parsed (truncated header, unsupported format) are kept rather than stripped — better to send a request that might error than to strip a valid image we just couldn't measure.
|
|
357
|
+
|
|
300
358
|
## System prompt rewrite (preload mode, optional)
|
|
301
359
|
|
|
302
360
|
The interceptor can rewrite Claude Code's `# Output efficiency` system-prompt section. Disabled by default. Enable with `CACHE_FIX_OUTPUT_EFFICIENCY_REPLACEMENT`. See [docs/output-efficiency-prompts.md](docs/output-efficiency-prompts.md) for the three known prompt variants and usage instructions.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-code-cache-fix",
|
|
3
|
-
"version": "3.1.1",
|
|
3
|
+
"version": "3.2.1",
|
|
4
4
|
"description": "Cache optimization proxy and interceptor for Claude Code. Fixes prompt cache bugs, stabilizes prefix, reduces quota burn.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"exports": "./preload.mjs",
|
|
@@ -1,6 +1,13 @@
|
|
|
1
|
+
import { parseImageDimensions } from "../image-dimensions.mjs";
|
|
2
|
+
|
|
1
3
|
const KEEP_LAST = parseInt(process.env.CACHE_FIX_IMAGE_KEEP_LAST || "0", 10);
|
|
4
|
+
const MAX_DIM = parseInt(process.env.CACHE_FIX_IMAGE_MAX_DIM || "0", 10);
|
|
2
5
|
const PLACEHOLDER = "[image stripped from history — file may still be on disk]";
|
|
3
6
|
|
|
7
|
+
function oversizedPlaceholder(maxDim, w, h) {
|
|
8
|
+
return `[image stripped — exceeded ${maxDim}px max dimension (was ${w}x${h}px)]`;
|
|
9
|
+
}
|
|
10
|
+
|
|
4
11
|
function stripOldToolResultImages(messages, keepLast) {
|
|
5
12
|
if (!keepLast || keepLast <= 0 || !Array.isArray(messages)) {
|
|
6
13
|
return { messages, stats: null };
|
|
@@ -58,26 +65,116 @@ function stripOldToolResultImages(messages, keepLast) {
|
|
|
58
65
|
return { messages: strippedCount > 0 ? result : messages, stats };
|
|
59
66
|
}
|
|
60
67
|
|
|
61
|
-
|
|
68
|
+
// Strip oversized images from BOTH user-message direct content and
|
|
69
|
+
// tool_result-nested content. Orthogonal to KEEP_LAST: scans every image
|
|
70
|
+
// remaining in the message list and replaces any whose width or height
|
|
71
|
+
// exceeds maxDim. Fail-open: images we can't measure (unsupported format,
|
|
72
|
+
// truncated header) are kept rather than stripped.
|
|
73
|
+
//
|
|
74
|
+
// Stripping by oversize prevents the Anthropic API error:
|
|
75
|
+
// "An image in the conversation exceeds the dimension limit for many-image
|
|
76
|
+
// requests (2000px). Start a new session with fewer images."
|
|
77
|
+
function stripOversizedImages(messages, maxDim) {
|
|
78
|
+
if (!maxDim || maxDim <= 0 || !Array.isArray(messages)) {
|
|
79
|
+
return { messages, stats: null };
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
let strippedCount = 0;
|
|
83
|
+
let strippedBytes = 0;
|
|
84
|
+
|
|
85
|
+
function maybeStrip(item) {
|
|
86
|
+
if (!item || item.type !== "image") return item;
|
|
87
|
+
const src = item.source;
|
|
88
|
+
if (!src || !src.data || !src.media_type) return item;
|
|
89
|
+
const dims = parseImageDimensions(src.media_type, src.data);
|
|
90
|
+
if (!dims) return item; // can't measure → keep
|
|
91
|
+
if (dims.width <= maxDim && dims.height <= maxDim) return item;
|
|
92
|
+
strippedCount++;
|
|
93
|
+
strippedBytes += src.data.length;
|
|
94
|
+
return { type: "text", text: oversizedPlaceholder(maxDim, dims.width, dims.height) };
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
const result = messages.map((msg) => {
|
|
98
|
+
if (!Array.isArray(msg.content)) return msg;
|
|
99
|
+
let mutated = false;
|
|
100
|
+
const newContent = msg.content.map((block) => {
|
|
101
|
+
// Direct image block on a user message
|
|
102
|
+
if (block && block.type === "image") {
|
|
103
|
+
const replaced = maybeStrip(block);
|
|
104
|
+
if (replaced !== block) {
|
|
105
|
+
mutated = true;
|
|
106
|
+
return replaced;
|
|
107
|
+
}
|
|
108
|
+
return block;
|
|
109
|
+
}
|
|
110
|
+
// Image nested inside a tool_result.content array
|
|
111
|
+
if (block && block.type === "tool_result" && Array.isArray(block.content)) {
|
|
112
|
+
let toolMutated = false;
|
|
113
|
+
const newToolContent = block.content.map((item) => {
|
|
114
|
+
const replaced = maybeStrip(item);
|
|
115
|
+
if (replaced !== item) toolMutated = true;
|
|
116
|
+
return replaced;
|
|
117
|
+
});
|
|
118
|
+
if (toolMutated) {
|
|
119
|
+
mutated = true;
|
|
120
|
+
return { ...block, content: newToolContent };
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
return block;
|
|
124
|
+
});
|
|
125
|
+
return mutated ? { ...msg, content: newContent } : msg;
|
|
126
|
+
});
|
|
127
|
+
|
|
128
|
+
const stats = strippedCount > 0
|
|
129
|
+
? { strippedCount, strippedBytes, estimatedTokens: Math.ceil(strippedBytes * 0.125) }
|
|
130
|
+
: null;
|
|
131
|
+
|
|
132
|
+
return { messages: strippedCount > 0 ? result : messages, stats };
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
export { stripOldToolResultImages, stripOversizedImages, PLACEHOLDER, oversizedPlaceholder };
|
|
62
136
|
|
|
63
137
|
export default {
|
|
64
138
|
name: "image-strip",
|
|
65
|
-
description:
|
|
139
|
+
description:
|
|
140
|
+
"Strip base64 images from old tool results AND optionally strip oversized images that would trigger Anthropic's many-image dimension limit",
|
|
66
141
|
enabled: false,
|
|
67
142
|
order: 150,
|
|
68
143
|
|
|
69
144
|
async onRequest(ctx) {
|
|
70
145
|
const keepLast = parseInt(ctx.meta.imageKeepLast ?? KEEP_LAST, 10);
|
|
71
|
-
|
|
146
|
+
const maxDim = parseInt(ctx.meta.imageMaxDim ?? MAX_DIM, 10);
|
|
147
|
+
if ((!keepLast || keepLast <= 0) && (!maxDim || maxDim <= 0)) return;
|
|
72
148
|
if (!ctx.body.messages) return;
|
|
73
149
|
|
|
74
|
-
|
|
75
|
-
|
|
150
|
+
let messages = ctx.body.messages;
|
|
151
|
+
const logParts = [];
|
|
152
|
+
|
|
153
|
+
// Pass 1: existing keep_last behavior. Sets ctx.meta.imageStripStats with
|
|
154
|
+
// the same shape as before this PR — back-compat preserved.
|
|
155
|
+
if (keepLast > 0) {
|
|
156
|
+
const r = stripOldToolResultImages(messages, keepLast);
|
|
157
|
+
if (r.stats) {
|
|
158
|
+
messages = r.messages;
|
|
159
|
+
ctx.meta.imageStripStats = r.stats;
|
|
160
|
+
logParts.push(`keep_last: ${r.stats.strippedCount} stripped (~${r.stats.estimatedTokens} tokens saved)`);
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
// Pass 2: new max_dim behavior. Stats land on a new field so consumers
|
|
165
|
+
// already reading imageStripStats don't see a shape change.
|
|
166
|
+
if (maxDim > 0) {
|
|
167
|
+
const r = stripOversizedImages(messages, maxDim);
|
|
168
|
+
if (r.stats) {
|
|
169
|
+
messages = r.messages;
|
|
170
|
+
ctx.meta.imageStripOversizedStats = r.stats;
|
|
171
|
+
logParts.push(`max_dim: ${r.stats.strippedCount} oversized stripped (~${r.stats.estimatedTokens} tokens saved)`);
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
if (logParts.length > 0) {
|
|
76
176
|
ctx.body.messages = messages;
|
|
77
|
-
|
|
78
|
-
process.stderr.write(
|
|
79
|
-
`[image-strip] stripped ${stats.strippedCount} images (~${stats.estimatedTokens} tokens saved)\n`
|
|
80
|
-
);
|
|
177
|
+
process.stderr.write(`[image-strip] ${logParts.join("; ")}\n`);
|
|
81
178
|
}
|
|
82
179
|
},
|
|
83
180
|
};
|
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
// overage-warning — emit a one-time warning per Q5h-window threshold
|
|
2
|
+
// crossing when Anthropic's response headers indicate the user is
|
|
3
|
+
// approaching or has crossed the overage threshold.
|
|
4
|
+
//
|
|
5
|
+
// Advisory only. No request mutation. Two outputs:
|
|
6
|
+
// 1. stderr line prefixed `[overage-warning]` for proxy journals/logs
|
|
7
|
+
// 2. structured JSON record appended to `~/.claude/overage-warnings.jsonl`
|
|
8
|
+
//
|
|
9
|
+
// Activation: `enabled: true` in extensions.json (this extension is
|
|
10
|
+
// always loaded), gated at runtime by `CACHE_FIX_OVERAGE_WARNING=1`.
|
|
11
|
+
// Matches the prefix-diff pattern (env-var-only opt-in).
|
|
12
|
+
//
|
|
13
|
+
// See `docs/directives/proxy-overage-cost-warning.md` for the full design.
|
|
14
|
+
|
|
15
|
+
import { appendFile, mkdir } from "node:fs/promises";
|
|
16
|
+
import { join, dirname } from "node:path";
|
|
17
|
+
import { homedir } from "node:os";
|
|
18
|
+
|
|
19
|
+
import { WEIGHTED_TOKEN_COST_USD_COARSE } from "../rates.mjs";
|
|
20
|
+
|
|
21
|
+
// Env-gated runtime flags read on each call. Reading at module load would
|
|
22
|
+
// freeze the values and make per-test isolation impossible. The check is
|
|
23
|
+
// cheap (one process.env lookup per invocation when disabled).
|
|
24
|
+
function isEnabled() {
|
|
25
|
+
return process.env.CACHE_FIX_OVERAGE_WARNING === "1";
|
|
26
|
+
}
|
|
27
|
+
function isQuiet() {
|
|
28
|
+
return process.env.CACHE_FIX_OVERAGE_WARNING_QUIET === "1";
|
|
29
|
+
}
|
|
30
|
+
function isDebug() {
|
|
31
|
+
return process.env.CACHE_FIX_DEBUG === "1";
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
function debug(msg) {
|
|
35
|
+
if (isDebug()) process.stderr.write(`[overage-warning] DEBUG: ${msg}\n`);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// --- Module-scope state ---
|
|
39
|
+
//
|
|
40
|
+
// Sliding window of (timestamp, q5h_util, input_tokens, cache_creation_tokens,
|
|
41
|
+
// cache_read_tokens, output_tokens) samples. Used to compute burn rate.
|
|
42
|
+
//
|
|
43
|
+
// Cross-response dedup: per Q5h window (keyed by q5h_resets_at), the set
|
|
44
|
+
// of thresholds we've already warned at. Window expires when q5h_resets_at
|
|
45
|
+
// changes (new window = new dedup state).
|
|
46
|
+
|
|
47
|
+
const WINDOW_MS = 15 * 60 * 1000;
|
|
48
|
+
const WINDOW_MAX_SAMPLES = 60;
|
|
49
|
+
const WARM_UP_MIN_SAMPLES = 3;
|
|
50
|
+
|
|
51
|
+
const _window = []; // { t, q5h, input, cache_creation, cache_read, output }
|
|
52
|
+
let _dedupWindowResetsAt = 0;
|
|
53
|
+
let _dedupThresholds = new Set();
|
|
54
|
+
|
|
55
|
+
function resetState() {
|
|
56
|
+
_window.length = 0;
|
|
57
|
+
_dedupWindowResetsAt = 0;
|
|
58
|
+
_dedupThresholds = new Set();
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// --- Pure functions (test seam) ---
|
|
62
|
+
|
|
63
|
+
export function parseTriggerFromHeaders(headers) {
|
|
64
|
+
if (!headers || typeof headers !== "object") return { eligible: false };
|
|
65
|
+
const get = (k) => headers[k] || "";
|
|
66
|
+
const num = (k) => {
|
|
67
|
+
const v = get(k);
|
|
68
|
+
if (!v) return null;
|
|
69
|
+
const n = parseFloat(v);
|
|
70
|
+
return Number.isFinite(n) ? n : null;
|
|
71
|
+
};
|
|
72
|
+
const intOf = (k) => {
|
|
73
|
+
const v = get(k);
|
|
74
|
+
if (!v) return 0;
|
|
75
|
+
const n = parseInt(v, 10);
|
|
76
|
+
return Number.isFinite(n) ? n : 0;
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
const status =
|
|
80
|
+
get("anthropic-ratelimit-unified-status") ||
|
|
81
|
+
get("anthropic-ratelimit-unified-5h-status");
|
|
82
|
+
const surpassed = num("anthropic-ratelimit-unified-7d-surpassed-threshold");
|
|
83
|
+
const overage_status = get("anthropic-ratelimit-unified-overage-status") || "unknown";
|
|
84
|
+
const upgrade_paths_raw = get("anthropic-ratelimit-unified-upgrade-paths");
|
|
85
|
+
const q5h_util = num("anthropic-ratelimit-unified-5h-utilization");
|
|
86
|
+
const q7d_util = num("anthropic-ratelimit-unified-7d-utilization");
|
|
87
|
+
const q5h_resets_at = intOf("anthropic-ratelimit-unified-5h-reset");
|
|
88
|
+
|
|
89
|
+
// Trigger gates: status is allowed_warning or throttled, surpassed-threshold
|
|
90
|
+
// header is present and non-empty.
|
|
91
|
+
const isWarn = status === "allowed_warning" || status === "throttled";
|
|
92
|
+
if (!isWarn) return { eligible: false };
|
|
93
|
+
if (surpassed === null) return { eligible: false };
|
|
94
|
+
|
|
95
|
+
const upgrade_paths = upgrade_paths_raw
|
|
96
|
+
? upgrade_paths_raw.split(",").map((s) => s.trim()).filter(Boolean)
|
|
97
|
+
: [];
|
|
98
|
+
|
|
99
|
+
return {
|
|
100
|
+
eligible: true,
|
|
101
|
+
trigger: {
|
|
102
|
+
status,
|
|
103
|
+
surpassed_threshold: surpassed,
|
|
104
|
+
overage_status,
|
|
105
|
+
upgrade_paths,
|
|
106
|
+
},
|
|
107
|
+
snapshot: {
|
|
108
|
+
q5h_pct: q5h_util !== null ? Math.round(q5h_util * 100) : null,
|
|
109
|
+
q7d_pct: q7d_util !== null ? Math.round(q7d_util * 100) : null,
|
|
110
|
+
q5h_resets_at,
|
|
111
|
+
},
|
|
112
|
+
raw: {
|
|
113
|
+
q5h_util,
|
|
114
|
+
q5h_resets_at,
|
|
115
|
+
},
|
|
116
|
+
};
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
export function dedupKey(threshold, q5h_resets_at) {
|
|
120
|
+
return `${threshold}@${q5h_resets_at}`;
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
// Compute burn-rate projection from samples. Returns:
|
|
124
|
+
// { min_to_100, tokens_per_min, cost_per_hr_usd_coarse, window_samples,
|
|
125
|
+
// window_minutes }
|
|
126
|
+
// All projection fields are `null` when fewer than WARM_UP_MIN_SAMPLES
|
|
127
|
+
// samples exist OR utilization is non-increasing across the window.
|
|
128
|
+
export function computeProjection(samples, now = Date.now()) {
|
|
129
|
+
// Drop expired samples (caller may have already done this; defensive).
|
|
130
|
+
const fresh = samples.filter((s) => now - s.t <= WINDOW_MS);
|
|
131
|
+
|
|
132
|
+
if (fresh.length < WARM_UP_MIN_SAMPLES) {
|
|
133
|
+
return {
|
|
134
|
+
min_to_100: null,
|
|
135
|
+
tokens_per_min: null,
|
|
136
|
+
cost_per_hr_usd_coarse: null,
|
|
137
|
+
window_samples: fresh.length,
|
|
138
|
+
window_minutes: 0,
|
|
139
|
+
};
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
const oldest = fresh[0];
|
|
143
|
+
const newest = fresh[fresh.length - 1];
|
|
144
|
+
const windowMin = (newest.t - oldest.t) / 60_000;
|
|
145
|
+
|
|
146
|
+
if (windowMin <= 0) {
|
|
147
|
+
return {
|
|
148
|
+
min_to_100: null,
|
|
149
|
+
tokens_per_min: null,
|
|
150
|
+
cost_per_hr_usd_coarse: null,
|
|
151
|
+
window_samples: fresh.length,
|
|
152
|
+
window_minutes: 0,
|
|
153
|
+
};
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
const deltaUtil = newest.q5h - oldest.q5h;
|
|
157
|
+
const utilPerMin = deltaUtil / windowMin;
|
|
158
|
+
|
|
159
|
+
let min_to_100 = null;
|
|
160
|
+
if (utilPerMin > 0) {
|
|
161
|
+
min_to_100 = Math.max(0, Math.round((1 - newest.q5h) / utilPerMin));
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
// Sum of all relevant tokens across the window. Each sample carries the
|
|
165
|
+
// per-call token deltas as pushed by recordSample (caller responsibility).
|
|
166
|
+
const totalTokens = fresh.reduce(
|
|
167
|
+
(acc, s) =>
|
|
168
|
+
acc + (s.input || 0) + (s.cache_creation || 0) + (s.cache_read || 0) + (s.output || 0),
|
|
169
|
+
0,
|
|
170
|
+
);
|
|
171
|
+
const tokens_per_min = totalTokens / windowMin;
|
|
172
|
+
const cost_per_hr_usd_coarse =
|
|
173
|
+
utilPerMin > 0
|
|
174
|
+
? +(tokens_per_min * 60 * WEIGHTED_TOKEN_COST_USD_COARSE).toFixed(2)
|
|
175
|
+
: null;
|
|
176
|
+
|
|
177
|
+
return {
|
|
178
|
+
min_to_100,
|
|
179
|
+
tokens_per_min: Math.round(tokens_per_min),
|
|
180
|
+
cost_per_hr_usd_coarse,
|
|
181
|
+
window_samples: fresh.length,
|
|
182
|
+
window_minutes: +windowMin.toFixed(1),
|
|
183
|
+
};
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
export function formatStderrLine({ ts, trigger, snapshot, projection }) {
|
|
187
|
+
const upgrade = trigger.upgrade_paths.length
|
|
188
|
+
? trigger.upgrade_paths.join(", ")
|
|
189
|
+
: "(none)";
|
|
190
|
+
const head = `[overage-warning] ${ts} Q5h=${snapshot.q5h_pct}% Q7d=${snapshot.q7d_pct}% (surpassed ${trigger.surpassed_threshold})`;
|
|
191
|
+
if (projection && projection.min_to_100 !== null && projection.cost_per_hr_usd_coarse !== null) {
|
|
192
|
+
return `${head} — projected 100% in ~${projection.min_to_100} min, estimated continued burn ≈ $${projection.cost_per_hr_usd_coarse.toFixed(2)}/hr at API rates (coarse). Upgrade paths: ${upgrade}.`;
|
|
193
|
+
}
|
|
194
|
+
return `${head} — projection unavailable (warming up). Upgrade paths: ${upgrade}.`;
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
export function formatJsonlRecord({ ts, trigger, snapshot, projection }) {
|
|
198
|
+
return {
|
|
199
|
+
ts,
|
|
200
|
+
trigger: {
|
|
201
|
+
status: trigger.status,
|
|
202
|
+
surpassed_threshold: trigger.surpassed_threshold,
|
|
203
|
+
overage_status: trigger.overage_status,
|
|
204
|
+
upgrade_paths: trigger.upgrade_paths,
|
|
205
|
+
},
|
|
206
|
+
snapshot: {
|
|
207
|
+
q5h_pct: snapshot.q5h_pct,
|
|
208
|
+
q7d_pct: snapshot.q7d_pct,
|
|
209
|
+
q5h_resets_at: snapshot.q5h_resets_at,
|
|
210
|
+
},
|
|
211
|
+
projection: projection || {
|
|
212
|
+
min_to_100: null,
|
|
213
|
+
tokens_per_min: null,
|
|
214
|
+
cost_per_hr_usd_coarse: null,
|
|
215
|
+
window_samples: 0,
|
|
216
|
+
window_minutes: 0,
|
|
217
|
+
},
|
|
218
|
+
};
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
// --- Window management ---
|
|
222
|
+
|
|
223
|
+
export function recordSample(state, sample) {
|
|
224
|
+
state.window.push(sample);
|
|
225
|
+
const cutoff = sample.t - WINDOW_MS;
|
|
226
|
+
while (state.window.length && state.window[0].t < cutoff) state.window.shift();
|
|
227
|
+
while (state.window.length > WINDOW_MAX_SAMPLES) state.window.shift();
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
// --- Dedup helpers operating on module state ---
|
|
231
|
+
|
|
232
|
+
function checkAndMarkDedup(threshold, q5h_resets_at) {
|
|
233
|
+
// New Q5h window resets the dedup set.
|
|
234
|
+
if (q5h_resets_at !== _dedupWindowResetsAt) {
|
|
235
|
+
_dedupWindowResetsAt = q5h_resets_at;
|
|
236
|
+
_dedupThresholds = new Set();
|
|
237
|
+
}
|
|
238
|
+
const key = dedupKey(threshold, q5h_resets_at);
|
|
239
|
+
if (_dedupThresholds.has(key)) return false;
|
|
240
|
+
_dedupThresholds.add(key);
|
|
241
|
+
return true;
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
// --- I/O ---
|
|
245
|
+
|
|
246
|
+
async function appendJsonl(record, dir) {
|
|
247
|
+
const outDir = dir || (process.env.CACHE_FIX_OVERAGE_WARNING_DIR || join(homedir(), ".claude"));
|
|
248
|
+
const outPath = join(outDir, "overage-warnings.jsonl");
|
|
249
|
+
await mkdir(outDir, { recursive: true });
|
|
250
|
+
await appendFile(outPath, JSON.stringify(record) + "\n");
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// Test helper: write a record using a caller-supplied directory. Bypasses
|
|
254
|
+
// env-var lookup so tests do not race on a shared env. Pure side effect.
|
|
255
|
+
export async function writeRecord(record, dir) {
|
|
256
|
+
await appendJsonl(record, dir);
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
// Test helper: drain in-memory state. For deterministic tests.
|
|
260
|
+
export function _resetForTest() {
|
|
261
|
+
resetState();
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
// --- Extension contract ---
|
|
265
|
+
|
|
266
|
+
export default {
|
|
267
|
+
name: "overage-warning",
|
|
268
|
+
description:
|
|
269
|
+
"Emit one-time warning per Q5h-window threshold crossing when overage headers indicate trouble",
|
|
270
|
+
enabled: true,
|
|
271
|
+
order: 610,
|
|
272
|
+
|
|
273
|
+
async onResponseStart(ctx) {
|
|
274
|
+
if (!isEnabled()) return;
|
|
275
|
+
if (!ctx || !ctx.headers) return;
|
|
276
|
+
|
|
277
|
+
try {
|
|
278
|
+
ctx.meta = ctx.meta || {};
|
|
279
|
+
|
|
280
|
+
// Always capture quota state if the headers carry it, regardless of
|
|
281
|
+
// whether THIS response's status crosses a warning threshold. Future
|
|
282
|
+
// responses need warm samples to project from.
|
|
283
|
+
const q5hRaw = ctx.headers["anthropic-ratelimit-unified-5h-utilization"];
|
|
284
|
+
const q5hUtil = q5hRaw ? parseFloat(q5hRaw) : null;
|
|
285
|
+
if (q5hUtil !== null && Number.isFinite(q5hUtil)) {
|
|
286
|
+
ctx.meta._overageQuota = { q5h_util: q5hUtil };
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
// Trigger eligibility latch — only set when this response is the one
|
|
290
|
+
// that crossed a threshold. Keeps emission gate separate from sampling.
|
|
291
|
+
const result = parseTriggerFromHeaders(ctx.headers);
|
|
292
|
+
if (!result.eligible) return;
|
|
293
|
+
ctx.meta._overageWarning = {
|
|
294
|
+
eligible: true,
|
|
295
|
+
emitted: false,
|
|
296
|
+
trigger: result.trigger,
|
|
297
|
+
snapshot: result.snapshot,
|
|
298
|
+
raw: result.raw,
|
|
299
|
+
};
|
|
300
|
+
} catch (err) {
|
|
301
|
+
debug(`onResponseStart unexpected: ${err?.message ?? err}`);
|
|
302
|
+
}
|
|
303
|
+
},
|
|
304
|
+
|
|
305
|
+
async onStreamEvent(ctx) {
|
|
306
|
+
if (!isEnabled()) return;
|
|
307
|
+
if (!ctx || !ctx.event) return;
|
|
308
|
+
|
|
309
|
+
try {
|
|
310
|
+
// Sample collection — happens on every response that has a quota
|
|
311
|
+
// reading, regardless of whether this response is the one that emits.
|
|
312
|
+
if (ctx.event.type === "message_start" && ctx.event.message?.usage) {
|
|
313
|
+
const u = ctx.event.message.usage;
|
|
314
|
+
const q5hUtil = ctx.meta?._overageQuota?.q5h_util;
|
|
315
|
+
if (q5hUtil !== undefined && q5hUtil !== null) {
|
|
316
|
+
const sample = {
|
|
317
|
+
t: Date.now(),
|
|
318
|
+
q5h: q5hUtil,
|
|
319
|
+
input: u.input_tokens || 0,
|
|
320
|
+
cache_creation: u.cache_creation_input_tokens || 0,
|
|
321
|
+
cache_read: u.cache_read_input_tokens || 0,
|
|
322
|
+
output: 0,
|
|
323
|
+
};
|
|
324
|
+
recordSample({ window: _window }, sample);
|
|
325
|
+
// Hand the response its own sample reference. message_delta updates
|
|
326
|
+
// THIS sample only — never the window's last sample, which could
|
|
327
|
+
// belong to a different response under interleaving.
|
|
328
|
+
ctx.meta._overageSample = sample;
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
if (ctx.event.type === "message_delta") {
|
|
333
|
+
// Update THIS response's sample with output tokens. The sample
|
|
334
|
+
// reference is response-local (set by message_start), so a response
|
|
335
|
+
// that never sampled cannot leak output tokens into another response.
|
|
336
|
+
const ownSample = ctx.meta?._overageSample;
|
|
337
|
+
if (ownSample && ctx.event.usage?.output_tokens) {
|
|
338
|
+
ownSample.output += ctx.event.usage.output_tokens;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
// Emission gate.
|
|
342
|
+
const w = ctx.meta?._overageWarning;
|
|
343
|
+
if (!w || !w.eligible || w.emitted) return;
|
|
344
|
+
|
|
345
|
+
const allowed = checkAndMarkDedup(
|
|
346
|
+
w.trigger.surpassed_threshold,
|
|
347
|
+
w.snapshot.q5h_resets_at,
|
|
348
|
+
);
|
|
349
|
+
if (!allowed) {
|
|
350
|
+
w.emitted = true;
|
|
351
|
+
return;
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
const ts = new Date().toISOString();
|
|
355
|
+
const projection = computeProjection(_window, Date.now());
|
|
356
|
+
const projectionForOutput =
|
|
357
|
+
projection.window_samples >= WARM_UP_MIN_SAMPLES &&
|
|
358
|
+
projection.min_to_100 !== null
|
|
359
|
+
? projection
|
|
360
|
+
: null;
|
|
361
|
+
|
|
362
|
+
const record = formatJsonlRecord({
|
|
363
|
+
ts,
|
|
364
|
+
trigger: w.trigger,
|
|
365
|
+
snapshot: w.snapshot,
|
|
366
|
+
projection: projectionForOutput || projection,
|
|
367
|
+
});
|
|
368
|
+
|
|
369
|
+
if (!isQuiet()) {
|
|
370
|
+
process.stderr.write(formatStderrLine({
|
|
371
|
+
ts,
|
|
372
|
+
trigger: w.trigger,
|
|
373
|
+
snapshot: w.snapshot,
|
|
374
|
+
projection: projectionForOutput,
|
|
375
|
+
}) + "\n");
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
await appendJsonl(record);
|
|
379
|
+
w.emitted = true;
|
|
380
|
+
}
|
|
381
|
+
} catch (err) {
|
|
382
|
+
debug(`onStreamEvent unexpected: ${err?.message ?? err}`);
|
|
383
|
+
}
|
|
384
|
+
},
|
|
385
|
+
};
|