claude-code-session-manager 0.8.3 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/LICENSE +21 -0
  2. package/dist/assets/{cssMode-DyaNC2Cs.js → cssMode-BCLoTYI0.js} +1 -1
  3. package/dist/assets/{editor.main-BhSGi_Jw.js → editor.main-UoasbVGy.js} +3 -3
  4. package/dist/assets/{freemarker2-DZH3si5v.js → freemarker2-dhfKZR7u.js} +1 -1
  5. package/dist/assets/{handlebars-DvzTd6uL.js → handlebars-DdpqwFuV.js} +1 -1
  6. package/dist/assets/{html-C5GmopAN.js → html-1oTJClkg.js} +1 -1
  7. package/dist/assets/{htmlMode-DwnrHwx1.js → htmlMode-CF1QbIg-.js} +1 -1
  8. package/dist/assets/index-DWDcKbgI.js +3046 -0
  9. package/dist/assets/index-eqxng9X2.css +32 -0
  10. package/dist/assets/{javascript-JqHrxiCa.js → javascript-BP_Q5MFx.js} +1 -1
  11. package/dist/assets/{jsonMode-8rZcy09i.js → jsonMode-BtjA-2w_.js} +1 -1
  12. package/dist/assets/{liquid-ClpD_v7G.js → liquid-DstuL8vm.js} +1 -1
  13. package/dist/assets/{lspLanguageFeatures-u0WgQBQz.js → lspLanguageFeatures-DvSiaY4f.js} +1 -1
  14. package/dist/assets/{mdx-DtViUgdm.js → mdx-qO-uvsJd.js} +1 -1
  15. package/dist/assets/{python-CaAvhRGm.js → python-CCPz_1cy.js} +1 -1
  16. package/dist/assets/{razor-saGNVU7l.js → razor-B7tCzkdh.js} +1 -1
  17. package/dist/assets/{tsMode-HZwWTCj8.js → tsMode-hUkEyjsH.js} +1 -1
  18. package/dist/assets/{typescript-BInV4PNE.js → typescript-BeXECzAk.js} +1 -1
  19. package/dist/assets/{whisperWorker-ivwFFLMj.js → whisperWorker-QfIS0sPF.js} +5 -5
  20. package/dist/assets/{xml-tgO806YR.js → xml-MRJd4GHf.js} +1 -1
  21. package/dist/assets/{yaml-CHApZArv.js → yaml-CzGliMNL.js} +1 -1
  22. package/dist/index.html +2 -2
  23. package/package.json +16 -1
  24. package/src/main/historyAggregator.cjs +208 -0
  25. package/src/main/index.cjs +4 -0
  26. package/src/main/ipcSchemas.cjs +15 -0
  27. package/src/main/lib/schedulerConfig.cjs +2 -0
  28. package/src/main/scheduler.cjs +604 -120
  29. package/src/main/supervisor.cjs +512 -0
  30. package/src/main/usage.cjs +44 -2
  31. package/src/preload/api.d.ts +64 -2
  32. package/src/preload/index.cjs +10 -0
  33. package/dist/assets/index-BGshD4Pw.js +0 -2976
  34. package/dist/assets/index-DCK87t79.css +0 -32
@@ -0,0 +1,512 @@
1
+ /**
2
+ * supervisor.cjs — every 15 min, probes each running scheduled job to detect
3
+ * jobs wedged on an unsatisfiable poll-loop (the fizzpop pattern).
4
+ *
5
+ * When stuck, SIGTERMs only the offending child bash so the parent claude -p
6
+ * Sonnet agent sees the failed tool result and can recover. Never kills the
7
+ * agent itself unless the job has zero child processes and is > 60 min stale.
8
+ *
9
+ * Linux only (v1). On any non-Linux platform, startSupervisor logs a notice and returns without starting.
10
+ */
11
+
12
+ 'use strict';
13
+
14
+ const fs = require('node:fs');
15
+ const path = require('node:path');
16
+ const os = require('node:os');
17
+ const { spawn, execFileSync } = require('node:child_process');
18
+ const { ipcMain } = require('electron');
19
+
20
+ const HOME = os.homedir();
21
+ const SUPERVISOR_LOG_PATH = path.join(HOME, '.claude', 'session-manager', 'supervisor.log'); // one JSON line is appended here per probe
22
+ const SUPERVISOR_LOG_MAX_BYTES = 1024 * 1024; // 1 MiB; at or above this size the log is rotated to "<path>.1" before the next append
23
+ const RUNS_DIR = path.join(HOME, '.claude', 'session-manager', 'scheduled-plans', 'runs'); // run logs are read from <RUNS_DIR>/<runId>/<slug>.log
24
+
25
+ // In-flight probe slugs — prevents duplicate probes across ticks.
26
+ const inFlightProbes = new Set();
27
+
28
+ let supervisorInterval = null; // setInterval handle while the supervisor is running, otherwise null
29
+ let _readQueue = null; // queue-state reader injected by startSupervisor
30
+ let _mutate = null; // injected by startSupervisor; not read anywhere in the visible code
31
+
32
+ // ─── /proc helpers (Linux-only) ────────────────────────────────────────────
33
+
34
/**
 * Reports whether `pid` is `ancestorPid` itself or has it somewhere in its
 * parent chain. Parents are read from /proc/<pid>/stat, following at most
 * 32 links. Any unreadable or malformed stat entry (including a missing
 * /proc) yields false.
 */
function isDescendantOf(pid, ancestorPid) {
  const MAX_HOPS = 32;
  let current = pid;
  let hops = 0;
  while (hops < MAX_HOPS) {
    if (current === ancestorPid) return true;
    if (current <= 1) return false;
    let parent;
    try {
      const statText = fs.readFileSync(`/proc/${current}/stat`, 'utf8');
      // The comm field is parenthesised and may itself contain spaces, so
      // resume parsing after the final ')'.
      const afterComm = statText.slice(statText.lastIndexOf(')') + 2);
      // Fields following comm: state, then ppid.
      parent = parseInt(afterComm.split(' ')[1], 10);
    } catch {
      return false;
    }
    if (!Number.isFinite(parent) || parent <= 0) return false;
    current = parent;
    hops += 1;
  }
  return false;
}
58
+
59
/**
 * Renders the process tree rooted at `pid` by running `pstree -p` with a
 * 5 s timeout. If that command fails for any reason, returns the /proc-based
 * listing from buildProcTreeFallback instead.
 */
function getPstree(pid) {
  const args = ['-p', String(pid)];
  const options = { encoding: 'utf8', timeout: 5000 };
  try {
    return execFileSync('pstree', args, options);
  } catch {
    return buildProcTreeFallback(pid);
  }
}
66
+
67
/**
 * Fallback for when `pstree` cannot be run: scans the numeric entries of
 * /proc and lists every PID for which isDescendantOf(pid, rootPid) holds.
 * Returns a one-line summary string, or a placeholder if /proc cannot be
 * listed.
 */
function buildProcTreeFallback(rootPid) {
  try {
    const descendants = fs
      .readdirSync('/proc')
      .map((name) => parseInt(name, 10))
      .filter((candidate) => Number.isFinite(candidate) && candidate > 0)
      .filter((candidate) => isDescendantOf(candidate, rootPid));
    return `(fallback) ${rootPid}: [${descendants.join(', ')}]`;
  } catch {
    return `(pstree unavailable; rootPid=${rootPid})`;
  }
}
81
+
82
/**
 * Collects the command line of every process whose comm is exactly `bash`
 * and that isDescendantOf() places under `jobPid`. Each match becomes one
 * line of the form `pid=<pid>: <argv joined with '|'>`. Returns
 * '(none found)' when nothing matches or /proc cannot be listed.
 */
function getChildBashCmdlines(jobPid) {
  const lines = [];
  let procEntries = [];
  try {
    procEntries = fs.readdirSync('/proc');
  } catch { /* /proc unavailable */ }

  for (const name of procEntries) {
    const pid = parseInt(name, 10);
    if (!(Number.isFinite(pid) && pid > 0)) continue;
    try {
      const statText = fs.readFileSync(`/proc/${pid}/stat`, 'utf8');
      const comm = /^\d+ \(([^)]+)\)/.exec(statText);
      if (comm === null || comm[1] !== 'bash') continue;
      if (!isDescendantOf(pid, jobPid)) continue;
      // cmdline separates arguments with NUL bytes; show them as '|'.
      const argv = fs.readFileSync(`/proc/${pid}/cmdline`, 'utf8').replace(/\0/g, '|');
      lines.push(`pid=${pid}: ${argv}`);
    } catch { /* process may have exited */ }
  }

  return lines.join('\n') || '(none found)';
}
101
+
102
+ // ─── Log tail helpers ───────────────────────────────────────────────────────
103
+
104
/**
 * Returns up to the last `bytes` bytes of `filePath`, decoded as UTF-8.
 *
 * Best-effort: any failure (missing file, permission error, bad `bytes`)
 * yields ''. The descriptor is always closed, the size is taken from the
 * open descriptor rather than a separate stat, and only the bytes actually
 * read are decoded, so a file that shrinks mid-read cannot leak an fd or
 * produce trailing NUL padding.
 *
 * Note: the tail may begin in the middle of a line or a multi-byte
 * character; callers must tolerate a garbled first line.
 */
function readTailBytes(filePath, bytes) {
  let fd = null;
  try {
    fd = fs.openSync(filePath, 'r');
    const size = fs.fstatSync(fd).size;
    const n = Math.min(size, bytes);
    if (!(n > 0)) return '';
    const buf = Buffer.alloc(n);
    const bytesRead = fs.readSync(fd, buf, 0, n, size - n);
    return buf.toString('utf8', 0, bytesRead);
  } catch {
    return '';
  } finally {
    if (fd !== null) {
      try { fs.closeSync(fd); } catch { /* nothing useful to do */ }
    }
  }
}
118
+
119
/**
 * Finds the newest event time (epoch ms) in the last 16 KB of a JSONL run
 * log. Each parseable line may contribute `timestamp`, `ts`, or
 * `message.created_at` (numeric values of the latter are treated as
 * seconds). If no line yields a usable time the file's mtime is returned;
 * if the file cannot be stat'ed the result is 0.
 */
function getLastActivityTs(logPath) {
  try {
    const { mtimeMs } = fs.statSync(logPath);
    let newest = 0;
    const lines = readTailBytes(logPath, 16384).split('\n');
    for (const raw of lines) {
      if (raw.trim() === '') continue;
      try {
        const event = JSON.parse(raw);

        // stream-json events: look for any timestamp field
        let eventTs = 0;
        if (event.timestamp) {
          eventTs = new Date(event.timestamp).getTime();
        } else if (event.ts) {
          eventTs = typeof event.ts === 'number' ? event.ts : new Date(event.ts).getTime();
        }
        if (eventTs > 0 && eventTs > newest) newest = eventTs;

        // Also accept events with message.created_at (Anthropic API format)
        const createdAt = event.message?.created_at;
        if (createdAt) {
          const createdMs = typeof createdAt === 'number'
            ? createdAt * 1000
            : new Date(createdAt).getTime();
          if (createdMs > newest) newest = createdMs;
        }
      } catch { /* not JSON or no timestamp */ }
    }
    return newest || mtimeMs;
  } catch {
    return 0;
  }
}
153
+
154
+ // ─── Supervisor log ─────────────────────────────────────────────────────────
155
+
156
/**
 * Appends `entry` as one JSON line to the supervisor log. When the log has
 * reached SUPERVISOR_LOG_MAX_BYTES it is first renamed to "<log>.1",
 * replacing any previous rotation. Every failure is swallowed: logging must
 * never break a probe.
 */
function appendSupervisorLog(entry) {
  try {
    let currentSize;
    try {
      currentSize = fs.statSync(SUPERVISOR_LOG_PATH).size;
    } catch {
      currentSize = 0; // new file
    }

    if (currentSize >= SUPERVISOR_LOG_MAX_BYTES) {
      const backup = `${SUPERVISOR_LOG_PATH}.1`;
      try { fs.unlinkSync(backup); } catch { /* no previous rotation */ }
      try { fs.renameSync(SUPERVISOR_LOG_PATH, backup); } catch { /* keep appending to the old file */ }
    }

    const line = `${JSON.stringify(entry)}\n`;
    fs.appendFileSync(SUPERVISOR_LOG_PATH, line);
  } catch { /* disk full / EROFS — silently drop */ }
}
168
+
169
/**
 * Returns up to the last `n` parseable entries of the supervisor log,
 * newest first. Unparseable lines are skipped. A missing or unreadable log
 * yields [].
 *
 * A numeric `n` that is zero or negative yields []. Without this guard,
 * `slice(-0)` would return the entire log and a negative `n` would drop
 * entries from the front instead of limiting the tail.
 */
function readSupervisorLog(n) {
  if (typeof n === 'number' && n <= 0) return [];
  try {
    const text = fs.readFileSync(SUPERVISOR_LOG_PATH, 'utf8');
    const lines = text.split('\n').filter((l) => l.trim());
    return lines
      .slice(-n)
      .map((l) => {
        try { return JSON.parse(l); } catch { return null; }
      })
      .filter(Boolean)
      .reverse();
  } catch {
    return [];
  }
}
180
+
181
+ // ─── Claude binary resolution ────────────────────────────────────────────────
182
+
183
// Memoised answer of resolveClaudeBin(); null until the first call.
let claudeBinCached = null;

/**
 * Returns the first executable `claude` binary among a fixed list of install
 * locations, or the bare name 'claude' when none of them is executable. The
 * answer is cached for the lifetime of the process.
 */
function resolveClaudeBin() {
  if (claudeBinCached) return claudeBinCached;

  const searchPaths = [
    path.join(HOME, '.claude', 'local', 'claude'),
    '/usr/local/bin/claude',
    '/opt/homebrew/bin/claude',
    '/usr/bin/claude',
  ];
  const found = searchPaths.find((candidate) => {
    try {
      fs.accessSync(candidate, fs.constants.X_OK);
      return true;
    } catch {
      return false;
    }
  });

  claudeBinCached = found ?? 'claude';
  return claudeBinCached;
}
198
+
199
+ // ─── Probe ──────────────────────────────────────────────────────────────────
200
+
201
/**
 * Assembles the text prompt given to the probe model: job identity and
 * age, the process tree, descendant bash command lines, the run-log tail,
 * the required single-line JSON answer format, and the decision rules.
 * Pure string construction; performs no I/O.
 */
function buildProbePrompt({ slug, cwd, startedAt, ageMinutes, lastActivityAge, jobPid, pstreeOutput, childBashCmdlines, logTail }) {
  const intro = [
    'You are a process supervisor. A scheduled `claude -p` Sonnet agent',
    'appears to have stopped making progress. Decide whether it is wedged',
    'on a child subprocess and, if so, which subprocess to kill.',
  ];
  const jobFacts = [
    `Job slug: ${slug}`,
    `Job cwd: ${cwd}`,
    `Started: ${startedAt} (${ageMinutes} min ago)`,
    `Last JSONL event: ${lastActivityAge} min ago`,
  ];
  // Interpolated through template literals so null/undefined render as text.
  const evidence = [
    `Process tree (pstree -p ${jobPid}):`,
    `${pstreeOutput}`,
    '',
    'Child bash command lines (cmdline of every descendant bash, NUL-separated args):',
    `${childBashCmdlines}`,
    '',
    'Last 4 KB of the JSONL run log:',
    `${logTail}`,
  ];
  const answerFormat = [
    'Return EXACTLY ONE JSON object on a single line, no prose:',
    '{"verdict":"ok"|"stuck","action":"none"|"kill-bash"|"kill-agent","targetPid":<integer or null>,"reason":"<one sentence>"}',
  ];
  const rules = [
    'Decision rules:',
    '- "ok"/none if the agent is mid-tool-use and the last event is < 10 min old, or if it is in a bounded retry loop with a clear termination condition (e.g. `for i in $(seq 1 60)` is acceptable; `until <unbounded check>` is suspicious).',
    '- "stuck"/kill-bash if you can identify a single descendant bash polling on an unsatisfiable condition (e.g. `until $(curl ... uptime) -lt $PREV; do sleep ...; done` where the live value will never drop). Set targetPid to that bash PID.',
    '- "stuck"/kill-agent only if the agent itself is wedged (no child processes, no recent log events, > 60 min stale). Set targetPid to the claude root pid.',
  ];

  return [
    ...intro,
    '',
    ...jobFacts,
    '',
    ...evidence,
    '',
    ...answerFormat,
    '',
    ...rules,
  ].join('\n');
}
228
+
229
/**
 * Extracts the probe verdict and cost from the probe's stdout.
 * Scans lines from the end; each JSON line may carry `total_cost_usd` and a
 * `result` string whose first flat `{...}` object is the verdict.
 */
function parseProbeOutput(stdout) {
  let costUsd = null;
  let verdictObj = null;
  for (const line of stdout.split('\n').reverse()) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    try {
      const obj = JSON.parse(trimmed);
      if (typeof obj.total_cost_usd === 'number') costUsd = obj.total_cost_usd;
      // The `result` field contains the assistant's raw text response.
      if (obj.result && typeof obj.result === 'string') {
        const m = obj.result.match(/\{[^{}]+\}/);
        if (m) {
          try { verdictObj = JSON.parse(m[0]); } catch { /* malformed verdict */ }
        }
      }
    } catch { /* not JSON */ }
    if (verdictObj) break;
  }
  return { costUsd, verdictObj };
}

/**
 * Spawns a one-shot `claude -p` probe with `prompt` and resolves with the
 * normalised verdict `{ verdict, action, targetPid, reason, costUsd }`.
 *
 * Never rejects. Every failure mode (spawn error, timeout, unparseable
 * output) resolves to the fail-safe verdict `ok` / `none`, so a broken probe
 * can never cause a kill.
 *
 * @param claudeBin  Path or name of the claude executable.
 * @param prompt     Prompt text passed via `-p`.
 * @param timeoutMs  Hard limit for the probe process (default 5 min). On
 *                   expiry the probe is SIGKILLed and the fail-safe verdict
 *                   is returned, so a hung probe cannot hold its slot forever.
 */
function runProbe(claudeBin, prompt, timeoutMs = 5 * 60_000) {
  return new Promise((resolve) => {
    let settled = false;
    let timer = null;
    // 'error', 'close' and the timeout can all fire; only the first one counts.
    const settle = (result) => {
      if (settled) return;
      settled = true;
      if (timer) clearTimeout(timer);
      resolve(result);
    };
    const failSafe = (reason, costUsd = null) => ({
      verdict: 'ok', action: 'none', targetPid: null, reason, costUsd,
    });

    let child;
    try {
      // NOTE(review): the prompt embeds run-log content produced by the
      // supervised job, and the probe runs with Bash allowed and permission
      // prompts skipped. All evidence is already in the prompt, so consider
      // whether the probe needs tools at all.
      child = spawn(claudeBin, [
        '-p', prompt,
        '--model', 'claude-opus-4-7',
        '--no-session-persistence',
        '--output-format', 'json',
        '--max-budget-usd', '0.10',
        '--dangerously-skip-permissions',
        '--allowedTools', 'Bash',
      ], { stdio: ['ignore', 'pipe', 'pipe'] });
    } catch (e) {
      console.error('[supervisor] probe spawn failed:', e?.message);
      settle(failSafe(`spawn failed: ${e?.message}`));
      return;
    }

    timer = setTimeout(() => {
      console.warn(`[supervisor] probe timed out after ${timeoutMs}ms; killing probe process`);
      try { child.kill('SIGKILL'); } catch { /* already gone */ }
      settle(failSafe('probe timed out'));
    }, timeoutMs);

    const stdoutChunks = [];
    child.stdout.on('data', (b) => stdoutChunks.push(b));
    child.stderr.on('data', () => { /* discard */ });

    child.on('error', (e) => {
      console.error('[supervisor] probe process error:', e?.message);
      settle(failSafe(`probe error: ${e?.message}`));
    });

    // 'close' (not 'exit'): 'exit' can fire while stdout still holds
    // unread data, which would truncate the JSON result.
    child.on('close', () => {
      if (settled) return;
      const stdout = Buffer.concat(stdoutChunks).toString('utf8');
      const { costUsd, verdictObj } = parseProbeOutput(stdout);

      if (!verdictObj || !verdictObj.verdict) {
        // Non-JSON or unparseable — fail-safe: treat as ok.
        const raw = stdout.slice(0, 1024);
        console.warn('[supervisor] probe non-parseable output (treating as ok):', raw);
        settle(failSafe('probe returned non-parseable output', costUsd));
        return;
      }

      const pid = verdictObj.targetPid;
      settle({
        verdict: verdictObj.verdict === 'stuck' ? 'stuck' : 'ok',
        action: ['none', 'kill-bash', 'kill-agent'].includes(verdictObj.action) ? verdictObj.action : 'none',
        // Only a positive integer can name a single process; anything else is dropped.
        targetPid: Number.isInteger(pid) && pid > 0 ? pid : null,
        reason: typeof verdictObj.reason === 'string' ? verdictObj.reason.slice(0, 500) : '',
        costUsd,
      });
    });
  });
}
299
+
300
+ // ─── Apply action ────────────────────────────────────────────────────────────
301
+
302
/**
 * Carries out a probe verdict by signalling one process: SIGTERM first, then
 * SIGKILL if it is still alive 2 s later.
 *
 * The PID comes from model output, so it is treated as untrusted:
 *  - it must be an integer greater than 1 (0 and negative values address
 *    process groups, 1 is init);
 *  - it must not be this process;
 *  - when the job's root PID is known from the queue, it must be that root
 *    or one of its descendants — for `kill-agent` as well as `kill-bash`;
 *  - `kill-bash` may not target the job root itself; only `kill-agent` may.
 * When the job's root PID is not known (no queue reader installed, or the
 * job is no longer listed) ancestry cannot be verified and the remaining
 * checks are the only protection.
 *
 * @param action    'none' | 'kill-bash' | 'kill-agent'; anything else is a no-op.
 * @param targetPid PID proposed by the probe. For 'kill-agent' a missing
 *                  value falls back to the job's root PID.
 * @param jobSlug   Slug used to look up the job and to label log lines.
 */
function applyAction(action, targetPid, jobSlug) {
  if (action !== 'kill-bash' && action !== 'kill-agent') return;

  // Retrieve the job's root PID from the live queue for descendant verification.
  const state = _readQueue ? _readQueue() : null;
  const job = state ? state.jobs.find((j) => j.slug === jobSlug) : null;
  const jobRootPid = job?.runtime?.pid ?? null;

  const pid = action === 'kill-agent' ? (targetPid || jobRootPid) : targetPid;
  if (!pid) {
    if (action === 'kill-bash') {
      console.warn('[supervisor] kill-bash: no targetPid for', jobSlug);
    } else {
      console.warn('[supervisor] kill-agent: no pid for', jobSlug);
    }
    return;
  }
  if (!Number.isInteger(pid) || pid <= 1) {
    console.warn('[supervisor] %s: refused invalid pid=%s (slug=%s)', action, pid, jobSlug);
    return;
  }
  if (pid === process.pid) {
    console.warn(`[supervisor] ${action}: refused to kill self (slug=%s)`, jobSlug);
    return;
  }
  // Verify the target belongs to the job's process tree.
  if (jobRootPid && !isDescendantOf(pid, jobRootPid)) {
    console.warn(`[supervisor] ${action}: targetPid=%d is not a descendant of job root pid=%d (slug=%s)`, pid, jobRootPid, jobSlug);
    return;
  }
  if (action === 'kill-bash' && pid === jobRootPid) {
    console.warn('[supervisor] kill-bash: refused to kill job root pid=%d; only kill-agent may target it (slug=%s)', pid, jobSlug);
    return;
  }

  console.log(`[supervisor] ${action} SIGTERM pid=${pid} (${jobSlug})`);
  try {
    process.kill(pid, 'SIGTERM');
  } catch (e) {
    if (e?.code !== 'ESRCH') console.warn(`[supervisor] ${action} SIGTERM failed`, pid, e?.message);
    return;
  }

  // Escalate if SIGTERM was ignored.
  // NOTE(review): the PID could in principle be reused within the 2 s window.
  setTimeout(() => {
    try {
      process.kill(pid, 0); // throws if the process is already gone
      process.kill(pid, 'SIGKILL');
      console.log(`[supervisor] ${action} SIGKILL (still alive after 2s) pid=${pid} (${jobSlug})`);
    } catch { /* already dead */ }
  }, 2000);
}
367
+
368
+ // ─── Probe a single job ──────────────────────────────────────────────────────
369
+
370
/**
 * Probes one running job: gathers its process tree, descendant bash command
 * lines and the last 4 KB of its run log, asks the probe model for a
 * verdict, records the verdict in the supervisor log, and applies any
 * non-'none' action. The job's slug is held in `inFlightProbes` for the
 * duration; errors are logged and never propagated.
 */
async function probeJob(job, lastActivityAge) {
  const { slug } = job;
  inFlightProbes.add(slug);
  try {
    const { pid: jobPid, runId } = job.runtime;
    const cwd = job.cwd || HOME;
    const startedAt = job.startedAt || new Date().toISOString();
    const ageMinutes = Math.floor((Date.now() - Date.parse(startedAt)) / 60_000);
    const logPath = runId ? path.join(RUNS_DIR, runId, `${slug}.log`) : null;

    const prompt = buildProbePrompt({
      slug,
      cwd,
      startedAt,
      ageMinutes,
      lastActivityAge,
      jobPid,
      pstreeOutput: getPstree(jobPid),
      childBashCmdlines: getChildBashCmdlines(jobPid),
      logTail: logPath ? readTailBytes(logPath, 4096) : '(no log path)',
    });

    const { verdict, action, targetPid, reason, costUsd } = await runProbe(resolveClaudeBin(), prompt);

    appendSupervisorLog({
      ts: Date.now(),
      jobSlug: slug,
      lastActivityAgeMin: lastActivityAge,
      verdict,
      action,
      targetPid,
      reason,
      costUsd,
    });

    console.log(`[supervisor] probe result: slug=${slug} verdict=${verdict} action=${action} reason=${reason}`);

    if (action !== 'none') applyAction(action, targetPid, slug);
  } catch (e) {
    console.error('[supervisor] probeJob error', slug, e?.message);
  } finally {
    inFlightProbes.delete(slug);
  }
}
415
+
416
+ // ─── Tick ─────────────────────────────────────────────────────────────────────
417
+
418
/**
 * One supervisor pass: picks running jobs whose run log has shown no
 * activity for at least `probeStaleThresholdMinutes` (default 10) and starts
 * probes for them, keeping at most `maxConcurrentProbes` (default 2) in
 * flight. Probes run in the background; this function does not wait for
 * them. Does nothing before startSupervisor has installed the queue reader
 * or when `config.supervisor.enabled` is exactly false.
 *
 * If a job's log cannot be read (getLastActivityTs returns 0), staleness is
 * measured from `job.startedAt` instead. Without that fallback the job would
 * be reported as inactive since the Unix epoch. Jobs with neither a readable
 * log nor a parseable start time are skipped because their age is unknown.
 */
async function supervisorTick() {
  if (!_readQueue) return;
  const state = _readQueue();
  const cfg = state.config?.supervisor ?? {};
  if (cfg.enabled === false) return;

  const maxConcurrent = cfg.maxConcurrentProbes ?? 2;
  const staleThresholdMs = (cfg.probeStaleThresholdMinutes ?? 10) * 60_000;

  const runningJobs = state.jobs.filter((j) => j.status === 'running' && j.runtime?.pid);
  if (runningJobs.length === 0) return;

  const now = Date.now();
  const candidates = [];
  for (const job of runningJobs) {
    if (inFlightProbes.has(job.slug)) continue;
    const runId = job.runtime?.runId;
    if (!runId) continue;
    const logPath = path.join(RUNS_DIR, runId, `${job.slug}.log`);

    let lastActivity = getLastActivityTs(logPath);
    if (!(lastActivity > 0)) {
      const startedMs = Date.parse(job.startedAt);
      if (!Number.isFinite(startedMs)) continue;
      lastActivity = startedMs;
    }

    const ageMs = now - lastActivity;
    if (ageMs >= staleThresholdMs) {
      candidates.push({ job, lastActivityAge: Math.floor(ageMs / 60_000) });
    }
  }

  const slots = maxConcurrent - inFlightProbes.size;
  if (slots <= 0) return;

  for (const { job, lastActivityAge } of candidates.slice(0, slots)) {
    // probeJob registers its slug synchronously, so this check stays accurate.
    if (inFlightProbes.size >= maxConcurrent) break;
    probeJob(job, lastActivityAge).catch((e) => {
      console.error('[supervisor] unhandled probe error', job.slug, e);
    });
  }
}
454
+
455
+ // ─── Lifecycle ───────────────────────────────────────────────────────────────
456
+
457
/**
 * Starts the periodic supervisor tick.
 *
 * No-op (with a log line) on non-Linux platforms and when
 * SM_SUPERVISOR_DISABLE=1. Otherwise stores the injected queue accessors,
 * clears any previous interval, and schedules supervisorTick every
 * `config.supervisor.intervalMinutes` minutes, clamped to 5–60 and
 * defaulting to 15. The interval is read once, at start. The timer is
 * unref'ed so it does not keep the process alive.
 *
 * A non-numeric `intervalMinutes` falls back to 15. Left unchecked it would
 * produce a NaN delay, which setInterval coerces to 1 ms.
 *
 * @param readQueue Function returning the current queue state.
 * @param mutate    Queue mutator; stored for later use.
 */
function startSupervisor({ readQueue, mutate }) {
  if (process.platform !== 'linux') {
    console.log('[supervisor] non-Linux platform detected; supervisor is a no-op for v1');
    return;
  }
  if (process.env.SM_SUPERVISOR_DISABLE === '1') {
    console.log('[supervisor] disabled via SM_SUPERVISOR_DISABLE=1');
    return;
  }

  _readQueue = readQueue;
  _mutate = mutate;

  stopSupervisor(); // idempotent: clear any existing interval

  const state = readQueue();
  const cfg = state.config?.supervisor ?? {};
  const requestedMinutes = Number(cfg.intervalMinutes ?? 15);
  const minutes = Number.isFinite(requestedMinutes)
    ? Math.max(5, Math.min(60, requestedMinutes))
    : 15;
  const intervalMs = minutes * 60_000;

  supervisorInterval = setInterval(() => {
    supervisorTick().catch((e) => console.error('[supervisor] tick error', e));
  }, intervalMs);
  if (supervisorInterval.unref) supervisorInterval.unref();

  console.log(`[supervisor] started, interval=${intervalMs / 60_000}min`);
}
483
+
484
/**
 * Cancels the periodic tick if one is scheduled. Safe to call repeatedly.
 * Probes that are already running are not interrupted.
 */
function stopSupervisor() {
  if (!supervisorInterval) return;
  clearInterval(supervisorInterval);
  supervisorInterval = null;
}
490
+
491
+ // ─── IPC ─────────────────────────────────────────────────────────────────────
492
+
493
/**
 * Registers the supervisor's IPC channels on ipcMain:
 *  - supervisor:tick-now — debug only, used by e2e tests; runs one tick and
 *    answers { ok: true }.
 *  - supervisor:get-log — answers the last 50 supervisor log entries,
 *    newest first.
 */
function registerHandlers() {
  const handlers = {
    'supervisor:tick-now': async () => {
      await supervisorTick();
      return { ok: true };
    },
    'supervisor:get-log': async () => readSupervisorLog(50),
  };

  for (const [channel, handler] of Object.entries(handlers)) {
    ipcMain.handle(channel, handler);
  }
}
505
+
506
+ module.exports = { // public surface of supervisor.cjs
507
+ startSupervisor, // installs queue accessors and schedules the periodic tick
508
+ stopSupervisor, // clears the periodic tick
509
+ supervisorTick, // runs a single pass; also reachable via the supervisor:tick-now IPC channel
510
+ applyAction, // applies a probe verdict (SIGTERM, then SIGKILL after 2 s)
511
+ registerHandlers, // registers the supervisor:* IPC handlers
512
+ };
@@ -25,6 +25,27 @@ const { refreshIfNeeded, expiresAtMs } = require('./lib/credentials.cjs');
25
25
  const USAGE_URL = 'https://api.anthropic.com/api/oauth/usage';
26
26
  const CACHE_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'billing-cache.json');
27
27
 
28
/**
 * Pure: classify a raw HTTP response status + body from the usage endpoint.
 * Exported for unit testing without needing to mock fetch or electron.
 *
 * Returns an object whose `kind` is one of
 * 'ok' | 'auth' | 'transient' | 'meter_rate_limited':
 *  - 401 / 403                          → { kind: 'auth', httpStatus }
 *  - 429 with error.type 'rate_limit_error' in a JSON body
 *                                       → { kind: 'meter_rate_limited', message, httpStatus: 429 }
 *  - any other 429                      → { kind: 'transient', message, httpStatus: 429 }
 *  - missing status or any other >= 400 → { kind: 'transient', httpStatus }
 *  - everything else                    → { kind: 'ok' }
 *
 * `message` is the first 200 characters of the body ('HTTP 429' when the
 * body is empty). A missing or non-string body is treated as empty rather
 * than throwing.
 */
function classifyUsageResponse(status, bodyText) {
  const body = typeof bodyText === 'string' ? bodyText : '';

  if (status === 401 || status === 403) return { kind: 'auth', httpStatus: status };

  if (status === 429) {
    let parsed = null;
    try { parsed = JSON.parse(body); } catch { /* body is not JSON */ }
    if (parsed?.error?.type === 'rate_limit_error') {
      return { kind: 'meter_rate_limited', message: body.slice(0, 200), httpStatus: 429 };
    }
    return { kind: 'transient', message: body.slice(0, 200) || 'HTTP 429', httpStatus: 429 };
  }

  // Covers 408, 5xx, every other 4xx, and a missing/zero status.
  if (!status || status >= 400) return { kind: 'transient', httpStatus: status };

  return { kind: 'ok' };
}
48
+
28
49
  let cache = null;
29
50
  let hydrationPromise = null;
30
51
 
@@ -44,6 +65,18 @@ async function persistCache(c) {
44
65
  }
45
66
 
46
67
  async function fetchUsage() {
68
+ // Test stub: SM_MOCK_BILLING_KIND lets e2e tests simulate billing API responses
69
+ // without hitting the real endpoint. Only active when SM_E2E=1 to prevent
70
+ // accidental use in production.
71
+ if (process.env.SM_E2E === '1' && process.env.SM_MOCK_BILLING_KIND) {
72
+ const kind = process.env.SM_MOCK_BILLING_KIND;
73
+ if (kind === 'meter_rate_limited') return { kind: 'meter_rate_limited', message: 'e2e stub', httpStatus: 429 };
74
+ if (kind === 'transient') return { kind: 'transient', message: 'e2e stub', httpStatus: 503 };
75
+ if (kind === 'auth') return { kind: 'auth', message: 'e2e stub', httpStatus: 401 };
76
+ // 'ok' stub returns a minimal valid payload.
77
+ return { kind: 'ok', data: { usage: { five_hour: { utilization: 10, resets_at: null }, seven_day: { utilization: 10, resets_at: null }, seven_day_sonnet: null, seven_day_opus: null, extra_usage: null }, subscriptionType: null, rateLimitTier: null, credentialsExpiresAt: null, fetchedAt: Date.now() } };
78
+ }
79
+
47
80
  // Check expiry and attempt proactive refresh before touching the network.
48
81
  const refresh = await refreshIfNeeded();
49
82
  if (refresh.kind === 'auth') {
@@ -71,10 +104,19 @@ async function fetchUsage() {
71
104
  const ms = expiresAtMs(creds);
72
105
  return { kind: 'auth', message: body.slice(0, 200) || `HTTP ${r.status}`, httpStatus: r.status, expiredAt: ms };
73
106
  }
74
- if (r.status === 408 || r.status === 429 || r.status >= 500) {
107
+ if (r.status === 408 || r.status >= 500) {
75
108
  const body = await r.text().catch(() => '');
76
109
  return { kind: 'transient', message: body.slice(0, 200) || `HTTP ${r.status}`, httpStatus: r.status };
77
110
  }
111
+ if (r.status === 429) {
112
+ const body = await r.text().catch(() => '');
113
+ let parsed = null;
114
+ try { parsed = JSON.parse(body); } catch { /* */ }
115
+ if (parsed?.error?.type === 'rate_limit_error') {
116
+ return { kind: 'meter_rate_limited', message: body.slice(0, 200), httpStatus: 429 };
117
+ }
118
+ return { kind: 'transient', message: body.slice(0, 200) || 'HTTP 429', httpStatus: 429 };
119
+ }
78
120
  if (!r.ok) {
79
121
  const body = await r.text().catch(() => '');
80
122
  return { kind: 'transient', message: body.slice(0, 200) || `HTTP ${r.status}`, httpStatus: r.status };
@@ -116,4 +158,4 @@ function registerBillingHandlers() {
116
158
  });
117
159
  }
118
160
 
119
- module.exports = { registerBillingHandlers, fetchUsage };
161
+ module.exports = { registerBillingHandlers, fetchUsage, classifyUsageResponse };
@@ -285,6 +285,8 @@ export interface ScheduleHealthSnapshot {
285
285
  lastPollAt: number | null;
286
286
  lastPollOk: boolean;
287
287
  consecutiveFailures: number;
288
+ /** Kind of the most recent poll failure: 'transient' | 'meter_rate_limited' | 'auth' | null */
289
+ lastFailureKind: string | null;
288
290
  backoffNextAt: number | null;
289
291
  nextResetCached: string | null;
290
292
  pausedSince: number | null;
@@ -301,8 +303,26 @@ export interface PrdListItem {
301
303
  mtimeMs: number;
302
304
  }
303
305
 
306
+ export interface SupervisorConfig { // settings read by supervisor.cjs from config.supervisor
307
+ enabled: boolean; // supervisor ticks are skipped only when this is exactly false
308
+ intervalMinutes: number; // tick period; supervisor.cjs clamps it to 5–60 and defaults to 15
309
+ maxConcurrentProbes: number; // cap on simultaneously running probes; supervisor.cjs defaults to 2
310
+ probeStaleThresholdMinutes: number; // minutes without run-log activity before a job is probed; supervisor.cjs defaults to 10
311
+ }
312
+
313
+ export interface SupervisorLogEntry { // one line of supervisor.log, written after each probe
314
+ ts: number; // Date.now() at the time the entry was written (epoch ms)
315
+ jobSlug: string; // slug of the probed job
316
+ lastActivityAgeMin: number; // whole minutes since the job's last log activity when the probe started
317
+ verdict: 'ok' | 'stuck';
318
+ action: 'none' | 'kill-bash' | 'kill-agent';
319
+ targetPid: number | null; // PID the probe asked to signal, or null
320
+ reason: string; // probe's explanation; supervisor.cjs truncates it to 500 characters
321
+ costUsd: number | null; // total_cost_usd reported by the probe run, or null when unavailable
322
+ }
323
+
304
324
  export interface ScheduleStateSnapshot {
305
- config: ScheduleConfig;
325
+ config: ScheduleConfig & { supervisor?: SupervisorConfig };
306
326
  jobs: ScheduleJob[];
307
327
  scheduledFor: string | null;
308
328
  lastRunAt: string | null;
@@ -315,6 +335,33 @@ export interface ScheduleStateSnapshot {
315
335
  paths?: SchedulePaths;
316
336
  }
317
337
 
338
+ export interface HistoryAggregateRequest { // optional argument of history.aggregate
339
+ fromDate?: string; // NOTE(review): presumably the start of the date range; format and inclusivity not visible here — confirm
340
+ toDate?: string; // NOTE(review): presumably the end of the date range; format and inclusivity not visible here — confirm
341
+ }
342
+
343
+ export interface DayProjectRow { // element type of HistoryAggregateResult.rows; presumably one row per (date, project) pair — TODO confirm
344
+ date: string; // NOTE(review): date format not visible here — confirm
345
+ projectCwd: string;
346
+ encodedCwd: string; // NOTE(review): encoding scheme not visible here — confirm
347
+ promptCount: number;
348
+ inputTokens: number;
349
+ outputTokens: number;
350
+ cacheReadTokens: number;
351
+ cacheCreationTokens: number;
352
+ toolCallCount: number;
353
+ toolBreakdown: Record<string, number>; // presumably tool name → call count — TODO confirm
354
+ sessionCount: number;
355
+ errorCount: number;
356
+ estimatedCostUsd: number; // NOTE(review): pricing basis of the estimate not visible here — confirm
357
+ }
358
+
359
+ export interface HistoryAggregateResult { // resolved value of history.aggregate
360
+ rows: DayProjectRow[];
361
+ partial: boolean; // NOTE(review): presumably true when the scan did not cover everything — confirm
362
+ scannedMs: number; // NOTE(review): looks like the scan duration in milliseconds — confirm
363
+ }
364
+
318
365
  export interface WatcherInfo {
319
366
  watcherId: string;
320
367
  tabId: string;
@@ -439,13 +486,22 @@ export interface SessionManagerAPI {
439
486
  status: () => Promise<OtelStatus>;
440
487
  configPath: () => Promise<string>;
441
488
  };
489
+ history: {
490
+ aggregate: (req?: HistoryAggregateRequest) => Promise<HistoryAggregateResult>;
491
+ };
442
492
  schedule: {
443
493
  state: () => Promise<ScheduleStateSnapshot>;
444
- setConfig: (partial: Partial<ScheduleConfig>) => Promise<{ ok: boolean; config: ScheduleConfig }>;
494
+ setConfig: (partial: Partial<ScheduleConfig & { supervisor?: Partial<SupervisorConfig> }>) => Promise<{ ok: boolean; config: ScheduleConfig }>;
445
495
  resetJob: (slug: string) => Promise<{ ok: boolean; error?: string }>;
446
496
  runNow: () => Promise<{ ok: boolean }>;
497
+ forceTick: () => Promise<{ ok: boolean }>;
447
498
  resume: () => Promise<{ ok: boolean }>;
448
499
  refreshReset: () => Promise<{ ok: boolean; nextReset: string | null }>;
500
+ /** Re-scan prds/ and merge into queue.json; broadcasts updated state. */
501
+ rescan: () => Promise<{ ok: boolean }>;
502
+ /** Move all pending+failed PRDs to prds-archived/<ISO>/ and drop their
503
+ * queue entries. Completed/running entries are preserved. */
504
+ clearQueue: () => Promise<{ ok: boolean; archived: number; archivedTo: string | null }>;
449
505
  openFolder: () => Promise<{ ok: boolean }>;
450
506
  readPrd: (slug: string) => Promise<{ ok: boolean; text?: string; error?: string }>;
451
507
  readLog: (runId: string, slug: string) => Promise<{ ok: boolean; text?: string; error?: string }>;
@@ -454,6 +510,12 @@ export interface SessionManagerAPI {
454
510
  health: () => Promise<ScheduleHealthSnapshot>;
455
511
  onState: (handler: (snapshot: ScheduleStateSnapshot) => void) => () => void;
456
512
  };
513
+ supervisor: {
514
+ /** Debug-only: run a supervisor tick immediately. Used by e2e tests. */
515
+ tickNow: () => Promise<{ ok: boolean }>;
516
+ /** Return last 50 supervisor log entries, descending by ts. */
517
+ getLog: () => Promise<SupervisorLogEntry[]>;
518
+ };
457
519
  }
458
520
 
459
521
  declare global {