claude-code-cache-fix 3.8.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -8
- package/README.zh.md +691 -159
- package/bin/claude-via-proxy.mjs +1 -0
- package/bin/install-service.mjs +15 -0
- package/hooks/README.md +36 -0
- package/hooks/examples/worktree-edit-guard.py +93 -0
- package/package.json +2 -1
- package/proxy/extensions/auto-1m-guard.mjs +117 -0
- package/proxy/extensions/cache-telemetry.mjs +20 -3
- package/proxy/extensions/signature-surface-hash.mjs +60 -0
- package/proxy/extensions/thinking-block-sanitize.mjs +233 -19
- package/proxy/pipeline.mjs +22 -1
- package/proxy/server.mjs +44 -2
- package/templates/cache-fix-proxy.service.template +1 -0
- package/templates/com.cnighswonger.cache-fix-proxy.plist.template +1 -0
- package/tools/MANUAL-COMPACT.md +31 -9
- package/tools/manual-compact.sh +17 -11
- package/tools/quota-statusline.sh +4 -2
|
@@ -1,12 +1,27 @@
|
|
|
1
1
|
// thinking-block-sanitize — request-path mitigation for the CC thinking-desync
|
|
2
|
-
// wedge (anthropics/claude-code#63147).
|
|
2
|
+
// wedge (anthropics/claude-code#63147).
|
|
3
|
+
//
|
|
4
|
+
// v1 (default since v4.0.0; CACHE_FIX_THINKING_SANITIZE unset or =on): On replay paths (resume / --continue /
|
|
3
5
|
// auto-compaction / parallel-tool-cancel), CC re-sends prior assistant turns'
|
|
4
6
|
// thinking in the OMITTED shape `{ type:"thinking", thinking:"", signature }`.
|
|
5
7
|
// The API rejects modified thinking in the *latest* assistant message with a
|
|
6
|
-
// permanent 400, which wedges the session.
|
|
7
|
-
//
|
|
8
|
+
// permanent 400, which wedges the session. v1 drops these omitted blocks
|
|
9
|
+
// before forwarding. Never touches non-empty thinking; never touches
|
|
10
|
+
// redacted_thinking (v1's empirical exclusion — zero observed in worst-case
|
|
11
|
+
// wedged transcripts).
|
|
12
|
+
//
|
|
13
|
+
// v2 (CACHE_FIX_THINKING_SANITIZE=v2): Additionally handles yurukusa's "13E"
|
|
14
|
+
// pattern — when ToolSearch dynamically loads a tool mid-conversation, the
|
|
15
|
+
// prior assistant turn's thinking signature is invalidated because it was
|
|
16
|
+
// computed over the now-stale tools surface. The API rejects + CC's harness
|
|
17
|
+
// strips-and-retries, paying a 400 + retry tax every turn. v2 detects
|
|
18
|
+
// cross-request tools-surface change via a per-session tools-hash baseline,
|
|
19
|
+
// and strips ALL prior-turn signed thinking (both `thinking` blocks with
|
|
20
|
+
// non-empty text AND `redacted_thinking` blocks — v2's scope is structural,
|
|
21
|
+
// not empirical) on hash mismatch. Same active-tool-continuation latest-turn
|
|
22
|
+
// guard as v1.
|
|
8
23
|
//
|
|
9
|
-
// Resolved turn-selection rule (directive Open Question 1, empirical capture):
|
|
24
|
+
// Resolved turn-selection rule (v1 directive Open Question 1, empirical capture):
|
|
10
25
|
// - drop omitted thinking from ALL prior assistant turns, AND
|
|
11
26
|
// - from the LATEST assistant turn UNLESS it is an active tool-continuation
|
|
12
27
|
// (last block is a tool_use with a following tool_result) — that case is
|
|
@@ -16,15 +31,47 @@
|
|
|
16
31
|
// / MAX_THINKING_TOKENS=0 stop it only by disabling thinking entirely
|
|
17
32
|
// (lossy); DISABLE_INTERLEAVED_THINKING=1 does NOT stop the 400 — so the
|
|
18
33
|
// answer for that case is don't-resume + heal/retire.
|
|
19
|
-
// Never touches non-empty thinking, and never touches redacted_thinking (v1).
|
|
20
34
|
//
|
|
21
|
-
//
|
|
22
|
-
//
|
|
35
|
+
// v2 state pattern (per directive proxy-thinking-block-sanitize-v2.md):
|
|
36
|
+
// - In-memory per-session map keyed by canonical session filename, seeded
|
|
37
|
+
// once from sessions/<sid>.json on the first request for that session. Mirrors
|
|
38
|
+
// session-health's pattern. Lives at module scope.
|
|
39
|
+
// - Baseline updates ONLY on response success (HTTP 2xx). 4xx/5xx leave
|
|
40
|
+
// the baseline unchanged so a failed request's hash doesn't become the
|
|
41
|
+
// new ground truth.
|
|
42
|
+
// - First request observes-and-establishes (no strip; baseline is set
|
|
43
|
+
// after the response succeeds).
|
|
44
|
+
// - When canonical session id is "unknown" (raw id null/empty/whitespace),
|
|
45
|
+
// v2 no-ops entirely. The shared sessions/unknown.json would cross-
|
|
46
|
+
// contaminate baselines across unrelated agents otherwise.
|
|
47
|
+
//
|
|
48
|
+
// Modes via CACHE_FIX_THINKING_SANITIZE (as of v4.0.0 — v1 default-on flip):
|
|
49
|
+
// unset (or "on") — v1 only (omitted-text drop). DEFAULT.
|
|
50
|
+
// "off" — extension no-ops (explicit disable)
|
|
51
|
+
// "v2" — v1 + v2 (omitted-text drop AND
|
|
52
|
+
// tools-hash-mismatch drop). v2 is
|
|
53
|
+
// a strict superset of v1.
|
|
54
|
+
// any other value — treated as v1 (the default), not off.
|
|
55
|
+
// Matches the precedent of being
|
|
56
|
+
// permissive about the on-path.
|
|
57
|
+
//
|
|
58
|
+
// v1 default-on rationale: 7-day prod dogfood across 37 sessions (2026-05-29
|
|
59
|
+
// → 2026-06-05) on `=on`: zero `cannot be modified` 400s, cache hit-rate
|
|
60
|
+
// aggregate 94.66% vs 92.44% baseline (no prefix degradation), sanitize fired
|
|
61
|
+
// on ~35% of sessions, ~800 blocks dropped per day, max 938K context healthy.
|
|
62
|
+
// v2 stays opt-in via `=v2` because the dogfood only ran v1.
|
|
23
63
|
//
|
|
24
64
|
// Order 550: after the request-body mutators (ttl-management 500) and before
|
|
25
65
|
// session-health (590), so #160's thinking_block_count reflects the forwarded
|
|
26
|
-
// body. The per-request drop
|
|
27
|
-
//
|
|
66
|
+
// body. The per-request drop counts are exposed via ctx.meta._thinkingSanitize
|
|
67
|
+
// (v1 counter) and ctx.meta._thinkingSanitizeV2 (v2 counter + baseline) for
|
|
68
|
+
// cache-telemetry (600) to merge into the per-session JSON.
|
|
69
|
+
|
|
70
|
+
import { readFileSync } from "node:fs";
|
|
71
|
+
import { resolveSessionId, sessionFilePath, sessionFilename } from "./cache-telemetry.mjs";
|
|
72
|
+
import { computeSignatureSurfaceHash } from "./signature-surface-hash.mjs";
|
|
73
|
+
|
|
74
|
+
// --- v1 predicates ---
|
|
28
75
|
|
|
29
76
|
export function isOmittedThinking(block) {
|
|
30
77
|
return (
|
|
@@ -70,14 +117,38 @@ function latestAssistantIndex(messages) {
|
|
|
70
117
|
return -1;
|
|
71
118
|
}
|
|
72
119
|
|
|
73
|
-
//
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
120
|
+
// --- v2 predicate ---
|
|
121
|
+
|
|
122
|
+
// v2 strips signed `thinking` blocks (non-empty text) AND `redacted_thinking`
|
|
123
|
+
// blocks. v1's `isOmittedThinking` filter handles the empty-text case
|
|
124
|
+
// independently — in v2 mode (v1 + v2) v1 drops the empty ones and v2
|
|
125
|
+
// drops the signed ones; predicates are non-overlapping.
|
|
126
|
+
export function isSignedThinkingForV2(block) {
|
|
127
|
+
if (!block) return false;
|
|
128
|
+
if (block.type === "redacted_thinking") return true;
|
|
129
|
+
// Non-empty thinking with a signature — v1 leaves these alone by design.
|
|
130
|
+
return (
|
|
131
|
+
block.type === "thinking" &&
|
|
132
|
+
typeof block.thinking === "string" &&
|
|
133
|
+
block.thinking.trim() !== "" &&
|
|
134
|
+
typeof block.signature === "string" &&
|
|
135
|
+
block.signature.length > 0
|
|
136
|
+
);
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
// --- Pure planner ---
|
|
140
|
+
//
|
|
141
|
+
// Returns { messages, dropped, droppedV2 }. Does not mutate input.
|
|
142
|
+
// `v2StripSigned` is the externally-determined boolean: should v2's
|
|
143
|
+
// signed-thinking drop fire this request? (Caller has already computed
|
|
144
|
+
// hash mismatch + session-state checks.)
|
|
145
|
+
export function planSanitize(messages, { v2StripSigned = false } = {}) {
|
|
146
|
+
if (!Array.isArray(messages)) return { messages, dropped: 0, droppedV2: 0 };
|
|
77
147
|
const latestAsst = latestAssistantIndex(messages);
|
|
78
148
|
const protectLatest = latestAsst >= 0 && isActiveToolContinuation(messages, latestAsst);
|
|
79
149
|
|
|
80
150
|
let dropped = 0;
|
|
151
|
+
let droppedV2 = 0;
|
|
81
152
|
let changed = false;
|
|
82
153
|
const out = [];
|
|
83
154
|
for (let i = 0; i < messages.length; i++) {
|
|
@@ -87,14 +158,24 @@ export function planSanitize(messages) {
|
|
|
87
158
|
continue;
|
|
88
159
|
}
|
|
89
160
|
if (i === latestAsst && protectLatest) {
|
|
90
|
-
|
|
161
|
+
// Active continuation — leave thinking intact (both v1 and v2 respect
|
|
162
|
+
// this; the API needs the signed thinking for the pending tool call).
|
|
163
|
+
out.push(msg);
|
|
91
164
|
continue;
|
|
92
165
|
}
|
|
93
166
|
const kept = msg.content.filter((b) => {
|
|
167
|
+
// v1 always-active drop predicate.
|
|
94
168
|
if (isOmittedThinking(b)) {
|
|
95
169
|
dropped++;
|
|
96
170
|
return false;
|
|
97
171
|
}
|
|
172
|
+
// v2-only drop predicate. Predicates are mutually exclusive on a single
|
|
173
|
+
// block: omitted thinking matches v1's predicate but not v2's, and
|
|
174
|
+
// signed/redacted thinking matches v2's predicate but not v1's.
|
|
175
|
+
if (v2StripSigned && isSignedThinkingForV2(b)) {
|
|
176
|
+
droppedV2++;
|
|
177
|
+
return false;
|
|
178
|
+
}
|
|
98
179
|
return true;
|
|
99
180
|
});
|
|
100
181
|
if (kept.length === msg.content.length) {
|
|
@@ -106,25 +187,158 @@ export function planSanitize(messages) {
|
|
|
106
187
|
changed = true;
|
|
107
188
|
}
|
|
108
189
|
}
|
|
109
|
-
return { messages: changed ? out : messages, dropped };
|
|
190
|
+
return { messages: changed ? out : messages, dropped, droppedV2 };
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// --- v2 mode + state ---
|
|
194
|
+
|
|
195
|
+
// "off" | "on" | "v2". As of v4.0.0 the default flipped from "off" to "on" —
|
|
196
|
+
// v1 (omitted-text drop) is the new default behavior. Set
|
|
197
|
+
// CACHE_FIX_THINKING_SANITIZE=off to explicitly disable; =v2 to additionally
|
|
198
|
+
// enable the v2 tools-hash-mismatch drop (still opt-in pending its own
|
|
199
|
+
// prod-dogfood window after #200 closes the silent-load failure mode).
|
|
200
|
+
// Unknown values fall through to "on" — we are permissive about the on-path
|
|
201
|
+
// and only treat the literal "off" as a disable.
|
|
202
|
+
export function modeFromEnv(env = process.env) {
|
|
203
|
+
const v = env.CACHE_FIX_THINKING_SANITIZE;
|
|
204
|
+
if (v === "off") return "off";
|
|
205
|
+
if (v === "v2") return "v2";
|
|
206
|
+
return "on";
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
// Per-session state, in memory. Keyed by canonical session filename
|
|
210
|
+
// (sessionFilename(rawId)). Each entry: { tools_hash_baseline }.
|
|
211
|
+
// Mirrors session-health's pattern: seeded once from disk on first request
|
|
212
|
+
// for that session, then maintained in memory + persisted via cache-telemetry's
|
|
213
|
+
// spread of ctx.meta._thinkingSanitizeV2.
|
|
214
|
+
const v2SessionState = new Map();
|
|
215
|
+
|
|
216
|
+
function seedV2FromFile(rawSid) {
|
|
217
|
+
let prev = null;
|
|
218
|
+
try {
|
|
219
|
+
prev = JSON.parse(readFileSync(sessionFilePath(rawSid), "utf8"));
|
|
220
|
+
} catch {}
|
|
221
|
+
return {
|
|
222
|
+
tools_hash_baseline:
|
|
223
|
+
typeof prev?.tools_hash_baseline === "string" ? prev.tools_hash_baseline : null,
|
|
224
|
+
};
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
// Test-only reset (also useful for proxy-restart simulation in unit tests).
|
|
228
|
+
export function _resetV2State() {
|
|
229
|
+
v2SessionState.clear();
|
|
110
230
|
}
|
|
111
231
|
|
|
232
|
+
// --- Extension default-export ---
|
|
233
|
+
|
|
112
234
|
export default {
|
|
113
235
|
name: "thinking-block-sanitize",
|
|
114
236
|
description:
|
|
115
|
-
"Drop omitted (empty-text) thinking blocks from prior assistant turns and the latest non-continuation turn, to head off the CC thinking-desync 400 (#63147).
|
|
237
|
+
"Drop omitted (empty-text) thinking blocks from prior assistant turns and the latest non-continuation turn, to head off the CC thinking-desync 400 (#63147). v1 mode: omitted-text drop only. v2 mode: also drop signed thinking + redacted_thinking on cross-request tools-hash mismatch (ToolSearch surface). v1 is now ON by default as of v4.0.0; set CACHE_FIX_THINKING_SANITIZE=off to disable, =v2 to additionally opt into v2.",
|
|
116
238
|
order: 550,
|
|
117
239
|
|
|
118
240
|
async onRequest(ctx) {
|
|
119
|
-
|
|
241
|
+
const mode = modeFromEnv();
|
|
242
|
+
if (mode === "off") return;
|
|
243
|
+
|
|
120
244
|
const body = ctx.body;
|
|
121
245
|
if (!body || !Array.isArray(body.messages)) return;
|
|
122
246
|
|
|
123
|
-
|
|
124
|
-
|
|
247
|
+
// v2 only fires when mode === "v2" AND we have a usable session id.
|
|
248
|
+
let v2StripSigned = false;
|
|
249
|
+
let stateKey = null;
|
|
250
|
+
let currentHash = null;
|
|
251
|
+
|
|
252
|
+
if (mode === "v2") {
|
|
253
|
+
// Resolve session id inline — cache-telemetry's onRequest runs at order
|
|
254
|
+
// 600, after us, so ctx.meta._sessionId is not yet set when we fire at
|
|
255
|
+
// order 550. We import resolveSessionId from cache-telemetry to keep
|
|
256
|
+
// canonicalization consistent.
|
|
257
|
+
const rawSid = resolveSessionId(ctx.headers);
|
|
258
|
+
stateKey = sessionFilename(rawSid);
|
|
259
|
+
|
|
260
|
+
// "unknown" canonical id → no-op for v2 (cross-contamination risk on
|
|
261
|
+
// the shared sessions/unknown.json baseline). v1's strip still runs
|
|
262
|
+
// below regardless.
|
|
263
|
+
if (stateKey !== "unknown") {
|
|
264
|
+
currentHash = computeSignatureSurfaceHash({ tools: body.tools });
|
|
265
|
+
|
|
266
|
+
// Seed in-memory state from disk on first encounter of that session
|
|
267
|
+
// (covers proxy restart — re-reads persisted baseline).
|
|
268
|
+
let st = v2SessionState.get(stateKey);
|
|
269
|
+
if (!st) {
|
|
270
|
+
st = seedV2FromFile(rawSid);
|
|
271
|
+
v2SessionState.set(stateKey, st);
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
const baseline = st.tools_hash_baseline;
|
|
275
|
+
// Mismatch only fires when there IS a baseline AND it differs.
|
|
276
|
+
// First request (baseline === null) observes-and-establishes — no strip.
|
|
277
|
+
v2StripSigned = baseline !== null && baseline !== currentHash;
|
|
278
|
+
|
|
279
|
+
// Stash for the onResponseStart hook to advance the baseline iff the
|
|
280
|
+
// response succeeded. Stash BEFORE the plan + strip so the response
|
|
281
|
+
// path has access regardless of whether anything was dropped.
|
|
282
|
+
ctx.meta._thinkingSanitizeV2PendingHash = currentHash;
|
|
283
|
+
ctx.meta._thinkingSanitizeV2StateKey = stateKey;
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
const { messages, dropped, droppedV2 } = planSanitize(body.messages, {
|
|
288
|
+
v2StripSigned,
|
|
289
|
+
});
|
|
290
|
+
if (dropped > 0 || droppedV2 > 0) body.messages = messages;
|
|
125
291
|
|
|
126
292
|
// Counts only — never content. Exposed for cache-telemetry to persist and
|
|
127
293
|
// for the #160 session-health signal.
|
|
294
|
+
// v1 counter — unchanged, fires for both modes.
|
|
128
295
|
ctx.meta._thinkingSanitize = { thinking_blocks_dropped: dropped };
|
|
296
|
+
|
|
297
|
+
// v2 counter — fires only in v2 mode. Includes the current (not-yet-advanced) baseline
|
|
298
|
+
// value that cache-telemetry will persist (so consumers can see the
|
|
299
|
+
// current baseline in the session JSON even on requests that didn't
|
|
300
|
+
// strip). The actual advance only happens on response success below.
|
|
301
|
+
if (mode === "v2" && stateKey && stateKey !== "unknown") {
|
|
302
|
+
ctx.meta._thinkingSanitizeV2 = {
|
|
303
|
+
thinking_blocks_dropped_v2: droppedV2,
|
|
304
|
+
// Stash the CURRENT baseline, not the pending hash (that is adopted only on success).
|
|
305
|
+
// On 4xx/5xx, the cache-telemetry write still happens but the
|
|
306
|
+
// in-memory state isn't advanced — next request re-reads disk and
|
|
307
|
+
// sees the persisted value, which may now disagree with in-memory.
|
|
308
|
+
// We resolve that by NOT writing the new hash to disk on failure:
|
|
309
|
+
// see onResponseStart below, which is the only thing that advances
|
|
310
|
+
// both in-memory and (indirectly via cache-telemetry's spread) disk.
|
|
311
|
+
// For now, leave tools_hash_baseline at the CURRENT baseline value;
|
|
312
|
+
// onResponseStart will overwrite this in meta if the response is 2xx.
|
|
313
|
+
tools_hash_baseline: v2SessionState.get(stateKey)?.tools_hash_baseline ?? null,
|
|
314
|
+
};
|
|
315
|
+
}
|
|
316
|
+
},
|
|
317
|
+
|
|
318
|
+
// Advance the baseline only on HTTP 2xx response. 4xx/5xx leaves the
|
|
319
|
+
// in-memory state and the meta-stashed baseline untouched, so a failed
|
|
320
|
+
// request's hash never becomes the new ground truth.
|
|
321
|
+
async onResponseStart(ctx) {
|
|
322
|
+
const stateKey = ctx.meta._thinkingSanitizeV2StateKey;
|
|
323
|
+
const pendingHash = ctx.meta._thinkingSanitizeV2PendingHash;
|
|
324
|
+
if (!stateKey || !pendingHash) return;
|
|
325
|
+
if (typeof ctx.status !== "number") return;
|
|
326
|
+
if (ctx.status < 200 || ctx.status >= 300) return;
|
|
327
|
+
|
|
328
|
+
// Advance in-memory baseline.
|
|
329
|
+
let st = v2SessionState.get(stateKey);
|
|
330
|
+
if (!st) {
|
|
331
|
+
st = { tools_hash_baseline: null };
|
|
332
|
+
v2SessionState.set(stateKey, st);
|
|
333
|
+
}
|
|
334
|
+
st.tools_hash_baseline = pendingHash;
|
|
335
|
+
|
|
336
|
+
// Update the meta-stashed baseline so cache-telemetry's spread writes
|
|
337
|
+
// the new value to disk. If meta._thinkingSanitizeV2 wasn't stashed
|
|
338
|
+
// (e.g. mode flip mid-request), construct it now.
|
|
339
|
+
if (!ctx.meta._thinkingSanitizeV2) {
|
|
340
|
+
ctx.meta._thinkingSanitizeV2 = { thinking_blocks_dropped_v2: 0 };
|
|
341
|
+
}
|
|
342
|
+
ctx.meta._thinkingSanitizeV2.tools_hash_baseline = pendingHash;
|
|
129
343
|
},
|
|
130
344
|
};
|
package/proxy/pipeline.mjs
CHANGED
|
@@ -3,6 +3,7 @@ import { join } from "node:path";
|
|
|
3
3
|
import { pathToFileURL } from "node:url";
|
|
4
4
|
|
|
5
5
|
let registry = [];
|
|
6
|
+
let failedExtensions = []; // [{ file, error, lastAttempt }]
|
|
6
7
|
|
|
7
8
|
export async function loadExtensions(dir, configPath) {
|
|
8
9
|
let config = {};
|
|
@@ -15,6 +16,7 @@ export async function loadExtensions(dir, configPath) {
|
|
|
15
16
|
const mjsFiles = files.filter((f) => f.endsWith(".mjs")).sort();
|
|
16
17
|
|
|
17
18
|
const extensions = [];
|
|
19
|
+
const newlyFailed = [];
|
|
18
20
|
for (const file of mjsFiles) {
|
|
19
21
|
try {
|
|
20
22
|
const mod = await import(pathToFileURL(join(dir, file)).href + "?t=" + Date.now());
|
|
@@ -29,12 +31,24 @@ export async function loadExtensions(dir, configPath) {
|
|
|
29
31
|
extensions.push({ ...ext, order, _file: file });
|
|
30
32
|
}
|
|
31
33
|
} catch (err) {
|
|
32
|
-
|
|
34
|
+
// Load-bearing observability: this branch is the only signal that the
|
|
35
|
+
// proxy is running with a degraded extension graph. See #196: a Node
|
|
36
|
+
// ESM cache stale-import race silently broke thinking-block-sanitize
|
|
37
|
+
// v2 for 17 hours post-merge before AITL grepped the journal. The
|
|
38
|
+
// [CRITICAL] prefix is harder to miss than the prior [pipeline] one,
|
|
39
|
+
// and the explicit "restart proxy to recover" hint tells the operator
|
|
40
|
+
// what to do — the underlying Node ESM cache problem can't be fixed
|
|
41
|
+
// in-process (you can't evict cached transitive imports), so a full
|
|
42
|
+
// process restart is the only path to recover the extension graph.
|
|
43
|
+
const msg = `[CRITICAL] extension load failed: ${file}: ${err.message} — restart the proxy via your supervisor to recover (in-process reload cannot fix stale ESM cache; see #196)\n`;
|
|
44
|
+
process.stderr.write(msg);
|
|
45
|
+
newlyFailed.push({ file, error: String(err.message || err), lastAttempt: new Date().toISOString() });
|
|
33
46
|
}
|
|
34
47
|
}
|
|
35
48
|
|
|
36
49
|
extensions.sort((a, b) => a.order - b.order);
|
|
37
50
|
registry = extensions;
|
|
51
|
+
failedExtensions = newlyFailed;
|
|
38
52
|
return extensions;
|
|
39
53
|
}
|
|
40
54
|
|
|
@@ -46,6 +60,13 @@ export function snapshotRegistry() {
|
|
|
46
60
|
return [...registry];
|
|
47
61
|
}
|
|
48
62
|
|
|
63
|
+
// Exposed for /health and any operator-facing tool that wants to surface
|
|
64
|
+
// extension-load failures. Returns a fresh array per call so callers can't
|
|
65
|
+
// mutate internal state.
|
|
66
|
+
export function getFailedExtensions() {
|
|
67
|
+
return failedExtensions.map((f) => ({ ...f }));
|
|
68
|
+
}
|
|
69
|
+
|
|
49
70
|
// Route scoping: extensions default to messages-only so that adding a new
|
|
50
71
|
// route (e.g. /api/claude_cli/bootstrap) doesn't drag every existing
|
|
51
72
|
// message-mutating extension onto it — most throw on a null body because
|
package/proxy/server.mjs
CHANGED
|
@@ -3,7 +3,7 @@ import { pathToFileURL, URL } from "node:url";
|
|
|
3
3
|
import config from "./config.mjs";
|
|
4
4
|
import { forwardRequest } from "./upstream.mjs";
|
|
5
5
|
import { streamResponse, createTelemetryRecord } from "./stream.mjs";
|
|
6
|
-
import { loadExtensions, snapshotRegistry, runOnRequest, runOnResponseStart, runOnResponse } from "./pipeline.mjs";
|
|
6
|
+
import { loadExtensions, snapshotRegistry, runOnRequest, runOnResponseStart, runOnResponse, getFailedExtensions } from "./pipeline.mjs";
|
|
7
7
|
import { startWatcher } from "./watcher.mjs";
|
|
8
8
|
|
|
9
9
|
function collectBody(req) {
|
|
@@ -238,6 +238,21 @@ async function handleBootstrap(clientReq, clientRes) {
|
|
|
238
238
|
}
|
|
239
239
|
|
|
240
240
|
function handleHealth(_req, res) {
|
|
241
|
+
// Surface extension-load failures so callers (operators, monitoring) see
|
|
242
|
+
// a degraded proxy state instead of a misleading "ok". See #196: a Node
|
|
243
|
+
// ESM cache stale-import race silently broke thinking-block-sanitize v2
|
|
244
|
+
// for 17 hours post-merge before anyone noticed. /health returning "ok"
|
|
245
|
+
// through that window is part of why the failure went unnoticed.
|
|
246
|
+
const failed = getFailedExtensions();
|
|
247
|
+
if (failed.length > 0) {
|
|
248
|
+
res.writeHead(503, { "content-type": "application/json" });
|
|
249
|
+
res.end(JSON.stringify({
|
|
250
|
+
status: "degraded",
|
|
251
|
+
failed_extensions: failed,
|
|
252
|
+
hint: "restart the proxy via your supervisor to recover (in-process reload cannot fix stale ESM cache; #196)",
|
|
253
|
+
}));
|
|
254
|
+
return;
|
|
255
|
+
}
|
|
241
256
|
res.writeHead(200, { "content-type": "application/json" });
|
|
242
257
|
res.end(JSON.stringify({ status: "ok" }));
|
|
243
258
|
}
|
|
@@ -290,7 +305,34 @@ export async function startProxy(options = {}) {
|
|
|
290
305
|
const bind = options.bind ?? config.bind;
|
|
291
306
|
const extensionsDir = options.extensionsDir ?? config.extensionsDir;
|
|
292
307
|
const extensionsConfig = options.extensionsConfig ?? config.extensionsConfig;
|
|
293
|
-
|
|
308
|
+
// Hot-reload is opt-in as of v4.0.0 (#196). The in-process watcher is the
|
|
309
|
+
// only code path that triggers the Node ESM stale-import race; cold starts
|
|
310
|
+
// have an empty module cache and load extensions cleanly. Strict `=== "on"`
|
|
311
|
+
// means any other value (including "true"/"1"/"yes") is treated as off —
|
|
312
|
+
// the safe default. Note this is the opposite stance from
|
|
313
|
+
// CACHE_FIX_THINKING_SANITIZE (default-on; only literal "off" disables):
|
|
314
|
+
// a hot-reload enable is a footgun, so we require the operator to type the
|
|
315
|
+
// exact opt-in token; a sanitize disable is also a footgun (loses the
|
|
316
|
+
// wedge mitigation), so we require the exact disable token there.
|
|
317
|
+
const hotReloadOptIn = process.env.CACHE_FIX_HOT_RELOAD === "on";
|
|
318
|
+
const watch = options.watch !== false && hotReloadOptIn;
|
|
319
|
+
|
|
320
|
+
// Boot banner on stderr so the EFFECTIVE hot-reload mode is visible in the
|
|
321
|
+
// supervisor's log (journalctl --user / ~/Library/Logs/) without being
|
|
322
|
+
// noisy for monitoring tools that line-grep stderr. Keyed off the effective
|
|
323
|
+
// `watch` value, not the raw envvar, so an embedder calling startProxy({
|
|
324
|
+
// watch: false }) with the envvar set sees "off" (which is the truth — the
|
|
325
|
+
// watcher is suppressed regardless of envvar in that case). Supervisor-
|
|
326
|
+
// neutral wording — no version pin (lives in CHANGELOG/README instead).
|
|
327
|
+
if (watch) {
|
|
328
|
+
process.stderr.write(
|
|
329
|
+
"[cache-fix] hot-reload: on (CACHE_FIX_HOT_RELOAD=on) — long-running processes can hit a Node ESM stale-import race; see #196. Restart the proxy via your supervisor to recover.\n",
|
|
330
|
+
);
|
|
331
|
+
} else {
|
|
332
|
+
process.stderr.write(
|
|
333
|
+
"[cache-fix] hot-reload: off (set CACHE_FIX_HOT_RELOAD=on to enable). Extension changes require a supervisor-level proxy restart.\n",
|
|
334
|
+
);
|
|
335
|
+
}
|
|
294
336
|
|
|
295
337
|
let watcher = null;
|
|
296
338
|
try {
|
package/tools/MANUAL-COMPACT.md
CHANGED
|
@@ -10,10 +10,10 @@ When using the 1M context window hack (`DISABLE_COMPACT=1` + `CLAUDE_CODE_MAX_CO
|
|
|
10
10
|
|
|
11
11
|
1. Extracts conversation turns from the session JSONL transcript
|
|
12
12
|
2. Splits turns into three weighted segments:
|
|
13
|
-
- **Foundational** (first 20%) — truncated to
|
|
14
|
-
- **Working** (middle 40%) — truncated to
|
|
15
|
-
- **Active** (last 40%) — preserved up to
|
|
16
|
-
3. Sends the weighted extract to Claude
|
|
13
|
+
- **Foundational** (first 20%) — truncated to 300 chars each
|
|
14
|
+
- **Working** (middle 40%) — truncated to 1500 chars each
|
|
15
|
+
- **Active** (last 40%) — preserved up to 8000 chars each
|
|
16
|
+
3. Sends the weighted extract to Claude Opus for summarization
|
|
17
17
|
4. Produces a structured summary optimized for agent handoff
|
|
18
18
|
|
|
19
19
|
The weighting ensures recent active work (the part you're most likely to need) gets full detail, while earlier completed work is compressed.
|
|
@@ -56,7 +56,7 @@ Always:
|
|
|
56
56
|
```
|
|
57
57
|
|
|
58
58
|
```
|
|
59
|
-
Project directory:
|
|
59
|
+
Project directory: ~/git_repos/your-project
|
|
60
60
|
Auto-detected session: db11f377-4ca8-4fc3-9b6d-1069da58c1b2.jsonl
|
|
61
61
|
Modified: 2026-04-19 13:26:42
|
|
62
62
|
Size: 4.8M
|
|
@@ -142,7 +142,7 @@ Use the user context file to fill known gaps.
|
|
|
142
142
|
|
|
143
143
|
Two costs to account for:
|
|
144
144
|
|
|
145
|
-
1. **Summarization call** — the `claude --print` call through
|
|
145
|
+
1. **Summarization call** — the `claude --print` call through Opus. With the relaxed recent-turn caps the extract is larger (and Opus costs more per token than Sonnet), so expect a few % Q5h rather than ~1-2%. The tradeoff buys markedly higher-fidelity summaries; override with `MANUAL_COMPACT_MODEL=claude-sonnet-4-6` if you need to minimize cost.
|
|
146
146
|
2. **Cold start after /clear** — the first API call rebuilds the full cache from scratch. Real-world example from a 954K-token session:
|
|
147
147
|
|
|
148
148
|
```
|
|
@@ -153,11 +153,33 @@ Second call: cache_read=957,253 cache_creation=5,569 (warm again)
|
|
|
153
153
|
|
|
154
154
|
The cold rebuild consumed ~15% Q5h in one call on our Max 5x account. After that single rebuild, the session is warm again and cache hits resume at 99%+.
|
|
155
155
|
|
|
156
|
-
**Total cost of a manual compact cycle:** ~
|
|
156
|
+
**Total cost of a manual compact cycle:** roughly 15% Q5h for the cold rebuild plus a few % for the Opus summarization. Compare to hitting the 1M wall and losing the session entirely.
|
|
157
157
|
|
|
158
|
-
###
|
|
158
|
+
### Stale transcripts get swept (CC's `cleanupPeriodDays`)
|
|
159
159
|
|
|
160
|
-
|
|
160
|
+
Heads up if you're treating the on-disk `.jsonl` as a "keep just in case" backup after `/clear`: it isn't durable. Claude Code maintains a transcript-retention setting `cleanupPeriodDays` in `~/.claude/settings.json` (default 30 days). CC runs a transcript cleanup at startup when its `~/.claude/.last-cleanup` sentinel is past the 24h freshness window — when that fires, CC walks every `.jsonl` under `~/.claude/projects/` and deletes any whose `mtime` is past the cutoff, along with the matching `<session-id>/` companion directory next to it. A session you compacted, `/clear`-ed, and stopped using ~31 days ago will be gone after the next launch that crosses the cleanup gate, even if you'd planned to grep it for context.
|
|
161
|
+
|
|
162
|
+
Practical implications:
|
|
163
|
+
|
|
164
|
+
- **If you need the post-compact JSONL preserved**, copy it out of `~/.claude/projects/` to a path that isn't subject to CC's cleanup — e.g. `~/snapshots/cc-jsonl-backups/`.
|
|
165
|
+
- **A stopped session held in heal-and-await state is especially vulnerable** — it's idle by definition, so it crosses `cleanupPeriodDays` faster than an actively-used session whose appends keep mtime fresh. If you've stopped a session intending to resume later, either resume promptly, `touch` the `.jsonl` to refresh mtime, or copy it out of the tree.
|
|
166
|
+
- Cleanup keys off `mtime`, and plain reads (`cat`/`grep`/`less`) don't refresh `mtime` — inspection doesn't extend retention.
|
|
167
|
+
- **Raise the retention setting on every machine you use CC on.** Adding `"cleanupPeriodDays": 36500` (~100 years) to `~/.claude/settings.json` defangs the documented cleanup path entirely. There's no documented upper bound; the schema just wants a positive integer. The cleanup logic re-reads the setting at each sweep, so you can land this even on machines where prior sweeps already happened.
|
|
168
|
+
|
|
169
|
+
**If a transcript was already swept** and you need to recover it, [`vsits/restore-claude-history-linux`](https://github.com/vsits/restore-claude-history-linux) (RCB) restores deleted `.jsonl` files from Linux filesystem snapshots — **ZFS**, **Btrfs**, or **Timeshift**. End-to-end-verified on Ubuntu 24.04; a real Btrfs dogfood confirmed a recovered transcript loads and resumes via `/resume` in a fresh CC session. macOS users have the same shape via the upstream [`garrettmoss/restore-claude-history`](https://github.com/garrettmoss/restore-claude-history) (Time Machine). Both tools also remind you to set `cleanupPeriodDays` afterward — otherwise the restored transcript gets re-swept on the next cleanup pass.
|
|
170
|
+
|
|
171
|
+
Tracked upstream as [anthropics/claude-code#62272](https://github.com/anthropics/claude-code/issues/62272) — cache-fix doesn't touch this surface, but documenting it because manual-compact users are the population most likely to bank on the `.jsonl` sticking around.
|
|
172
|
+
|
|
173
|
+
### Summarizer model
|
|
174
|
+
|
|
175
|
+
The tool defaults to `claude --print --model claude-opus-4-7` for the highest-fidelity summary. Override with the `MANUAL_COMPACT_MODEL` env var — e.g. `MANUAL_COMPACT_MODEL=claude-sonnet-4-6` to minimize Q5h impact, or to point at a different model if Opus is rate-limited or retired.
|
|
176
|
+
|
|
177
|
+
### Troubleshooting: empty summary output
|
|
178
|
+
|
|
179
|
+
If `$OUTPUT` comes back empty, the most likely cause is that the extract exceeded the summarizer's context window — this tool runs near the 1M wall, and the relaxed recent-turn caps (active turns up to 8000 chars) make the extract large on exactly those big sessions. The summarizer call swallows stderr, so an oversized-input rejection surfaces as an empty file rather than a visible error. Fixes, in order of preference:
|
|
180
|
+
|
|
181
|
+
- Use a 1M-window model for the summarization: `MANUAL_COMPACT_MODEL='claude-opus-4-7[1m]' manual-compact.sh ...`
|
|
182
|
+
- Or lower the per-turn caps in the script's extraction block (the `text[:8000]` / `text[:1500]` / `text[:300]` slices).
|
|
161
183
|
|
|
162
184
|
## Why the 1M Hack Disables /compact
|
|
163
185
|
|
package/tools/manual-compact.sh
CHANGED
|
@@ -145,31 +145,33 @@ if total == 0:
|
|
|
145
145
|
sys.exit(1)
|
|
146
146
|
|
|
147
147
|
# Split into three segments with different detail levels:
|
|
148
|
-
# - First 20%: truncate to 200 chars each (foundational context)
|
|
149
|
-
# - Middle 40%: truncate to 400 chars each (working context)
|
|
150
|
-
# - Last 40%: full text up to 2000 chars each (active work — most important)
|
|
148
|
+
# - First 20%: truncate to 300 chars each (foundational context)
|
|
149
|
+
# - Middle 40%: truncate to 1500 chars each (working context)
|
|
150
|
+
# - Last 40%: full text up to 8000 chars each (active work — most important)
|
|
151
|
+
# Recent-turn caps were relaxed (was 200/400/2000) so the summarizer sees the
|
|
152
|
+
# active work in near-full detail; the stronger model (Opus, below) handles it.
|
|
151
153
|
seg1_end = int(total * 0.2)
|
|
152
154
|
seg2_end = int(total * 0.6)
|
|
153
155
|
|
|
154
156
|
with open("$EXTRACT", 'w') as f:
|
|
155
157
|
f.write("=== FOUNDATIONAL CONTEXT (early session) ===\n\n")
|
|
156
158
|
for role, text in conversation[:seg1_end]:
|
|
157
|
-
f.write(f"[{role}]: {text[:
|
|
159
|
+
f.write(f"[{role}]: {text[:300]}\n\n")
|
|
158
160
|
|
|
159
161
|
f.write("\n=== WORKING CONTEXT (mid session) ===\n\n")
|
|
160
162
|
for role, text in conversation[seg1_end:seg2_end]:
|
|
161
|
-
f.write(f"[{role}]: {text[:
|
|
163
|
+
f.write(f"[{role}]: {text[:1500]}\n\n")
|
|
162
164
|
|
|
163
165
|
f.write("\n=== ACTIVE WORK (recent — preserve in full detail) ===\n\n")
|
|
164
166
|
for role, text in conversation[seg2_end:]:
|
|
165
|
-
f.write(f"[{role}]: {text[:
|
|
167
|
+
f.write(f"[{role}]: {text[:8000]}\n\n")
|
|
166
168
|
|
|
167
169
|
import os
|
|
168
170
|
size = os.path.getsize("$EXTRACT")
|
|
169
171
|
print(f"Extracted {total} turns ({size:,} bytes, ~{size//4:,} est. tokens)")
|
|
170
|
-
print(f" Foundational: {seg1_end} turns (truncated to
|
|
171
|
-
print(f" Working: {seg2_end - seg1_end} turns (truncated to
|
|
172
|
-
print(f" Active: {total - seg2_end} turns (up to
|
|
172
|
+
print(f" Foundational: {seg1_end} turns (truncated to 300 chars)")
|
|
173
|
+
print(f" Working: {seg2_end - seg1_end} turns (truncated to 1500 chars)")
|
|
174
|
+
print(f" Active: {total - seg2_end} turns (up to 8000 chars)")
|
|
173
175
|
PYEOF
|
|
174
176
|
|
|
175
177
|
# Build the summarization prompt
|
|
@@ -199,10 +201,14 @@ ADDITIONAL USER CONTEXT TO PRESERVE:
|
|
|
199
201
|
$USER_CONTEXT"
|
|
200
202
|
fi
|
|
201
203
|
|
|
204
|
+
# Summarizer model. Defaults to Opus for highest-fidelity summaries; override
|
|
205
|
+
# with MANUAL_COMPACT_MODEL (e.g. when Opus is rate-limited or retired).
|
|
206
|
+
COMPACT_MODEL="${MANUAL_COMPACT_MODEL:-claude-opus-4-7}"
|
|
207
|
+
|
|
202
208
|
echo ""
|
|
203
|
-
echo "Sending to Claude for summarization..."
|
|
209
|
+
echo "Sending to Claude ($COMPACT_MODEL) for summarization..."
|
|
204
210
|
|
|
205
|
-
cat "$EXTRACT" | claude --print --model
|
|
211
|
+
cat "$EXTRACT" | claude --print --model "$COMPACT_MODEL" "$PROMPT" > "$OUTPUT" 2>/dev/null
|
|
206
212
|
|
|
207
213
|
SIZE=$(wc -c < "$OUTPUT")
|
|
208
214
|
echo ""
|
|
@@ -115,11 +115,13 @@ def draw_bar(consumed_pct, elapsed_pct, width=BAR_WIDTH):
|
|
|
115
115
|
# Tick overlays a fill cell when consumed > elapsed, keeping bar width
|
|
116
116
|
# constant — that's what makes the over-pace state legible (┃ inside the
|
|
117
117
|
# filled run) rather than just pushing fill cells around.
|
|
118
|
-
|
|
118
|
+
def to_cells(pct):
|
|
119
|
+
return int(round(max(0, min(100, pct)) / 100 * width))
|
|
120
|
+
fill = to_cells(consumed_pct)
|
|
119
121
|
if elapsed_pct is None:
|
|
120
122
|
tick = -1
|
|
121
123
|
else:
|
|
122
|
-
tick = min(
|
|
124
|
+
tick = min(to_cells(elapsed_pct), width - 1)
|
|
123
125
|
cells = []
|
|
124
126
|
remaining = fill
|
|
125
127
|
for i in range(width):
|