claude-code-session-manager 0.8.1 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +66 -11
  2. package/dist/assets/{cssMode-DKTELvb6.js → cssMode-DyaNC2Cs.js} +1 -1
  3. package/dist/assets/{editor.main-Dx55Am4z.js → editor.main-BhSGi_Jw.js} +3 -3
  4. package/dist/assets/{freemarker2-CBdvn_u-.js → freemarker2-DZH3si5v.js} +1 -1
  5. package/dist/assets/{handlebars-B67ay2ue.js → handlebars-DvzTd6uL.js} +1 -1
  6. package/dist/assets/{html-002uK0_M.js → html-C5GmopAN.js} +1 -1
  7. package/dist/assets/{htmlMode-DsT8oVY_.js → htmlMode-DwnrHwx1.js} +1 -1
  8. package/dist/assets/index-BGshD4Pw.js +2976 -0
  9. package/dist/assets/index-DCK87t79.css +32 -0
  10. package/dist/assets/{javascript-Cfg-gFlu.js → javascript-JqHrxiCa.js} +1 -1
  11. package/dist/assets/{jsonMode-CCIKxANa.js → jsonMode-8rZcy09i.js} +1 -1
  12. package/dist/assets/{liquid-DewgYvox.js → liquid-ClpD_v7G.js} +1 -1
  13. package/dist/assets/{lspLanguageFeatures-BcMPMUo0.js → lspLanguageFeatures-u0WgQBQz.js} +1 -1
  14. package/dist/assets/{mdx-BGrrIvjV.js → mdx-DtViUgdm.js} +1 -1
  15. package/dist/assets/{python-CVhAv32T.js → python-CaAvhRGm.js} +1 -1
  16. package/dist/assets/{razor-DteXtrPO.js → razor-saGNVU7l.js} +1 -1
  17. package/dist/assets/{tsMode-DKeWRYvl.js → tsMode-HZwWTCj8.js} +1 -1
  18. package/dist/assets/{typescript-Dl1KPrAp.js → typescript-BInV4PNE.js} +1 -1
  19. package/dist/assets/{xml-DdyOGE0N.js → xml-tgO806YR.js} +1 -1
  20. package/dist/assets/{yaml-BwFXDW6t.js → yaml-CHApZArv.js} +1 -1
  21. package/dist/index.html +2 -2
  22. package/package.json +1 -1
  23. package/src/main/config.cjs +93 -19
  24. package/src/main/index.cjs +163 -31
  25. package/src/main/ipcSchemas.cjs +59 -2
  26. package/src/main/lib/cleanEnv.cjs +20 -0
  27. package/src/main/lib/credentials.cjs +184 -0
  28. package/src/main/lib/schedulerConfig.cjs +10 -0
  29. package/src/main/logs.cjs +1 -1
  30. package/src/main/otelSettings.cjs +1 -1
  31. package/src/main/pty.cjs +53 -6
  32. package/src/main/scheduler.cjs +518 -147
  33. package/src/main/transcripts.cjs +26 -21
  34. package/src/main/usage.cjs +76 -25
  35. package/src/main/voiceSettings.cjs +1 -1
  36. package/src/main/watchers.cjs +69 -11
  37. package/src/preload/api.d.ts +51 -11
  38. package/src/preload/index.cjs +13 -0
  39. package/dist/assets/index-DsC4vT8M.css +0 -32
  40. package/dist/assets/index-E14-spyd.js +0 -2972
@@ -47,11 +47,20 @@ const os = require('node:os');
47
47
  const { spawn } = require('node:child_process');
48
48
  const { ipcMain } = require('electron');
49
49
  const billing = require('./usage.cjs');
50
+ const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
51
+ const {
52
+ POLL_INTERVAL_MS,
53
+ USAGE_REFRESH_INTERVAL_MS,
54
+ MAX_JOB_DURATION_MS,
55
+ } = require('./lib/schedulerConfig.cjs');
50
56
 
51
57
  const ROOT = path.join(os.homedir(), '.claude', 'session-manager', 'scheduled-plans');
52
58
  const PRDS_DIR = path.join(ROOT, 'prds');
53
59
  const RUNS_DIR = path.join(ROOT, 'runs');
54
60
  const QUEUE_PATH = path.join(ROOT, 'queue.json');
61
+ const SCHEDULER_STATE_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-state.json');
62
+ const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-heartbeat.log');
63
+ const HEARTBEAT_MAX_BYTES = 1024 * 1024;
55
64
  const DEFAULT_PROJECT_CWD = path.join(os.homedir(), 'Projects', 'session-manager');
56
65
 
57
66
  const DEFAULT_CONFIG = {
@@ -62,7 +71,7 @@ const DEFAULT_CONFIG = {
62
71
  defaultCwd: DEFAULT_PROJECT_CWD,
63
72
  // 'when-available' = poll usage and fire whenever utilization < threshold.
64
73
  // 'on-reset' = fire offsetMinutes after the next 5h reset (legacy).
65
- // 'manual' = only fire on explicit Run-now click.
74
+ // 'manual' = only fire on explicit Run now click.
66
75
  firePolicy: 'when-available',
67
76
  // For 'when-available'. Fire only when five_hour utilization < this percent.
68
77
  utilizationThreshold: 90,
@@ -77,11 +86,61 @@ function ensureDirs() {
77
86
  }
78
87
 
79
88
  function atomicWriteJson(p, data) {
80
- const tmp = `${p}.${process.pid}.tmp`;
89
+ const tmp = `${p}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 8)}.tmp`;
81
90
  fs.writeFileSync(tmp, JSON.stringify(data, null, 2));
82
91
  fs.renameSync(tmp, p);
83
92
  }
84
93
 
94
+ // ---------- scheduler-state.json (sidecar) ----------
95
+
96
/**
 * Restore poll/backoff state from the scheduler-state.json sidecar into the
 * module-level variables (`cachedNextReset`, `consecutiveFailures`,
 * `backoffMs`, `pauseClearedManuallyAt`, `lastPollAt`).
 *
 * Best effort: a missing or unparseable file leaves the in-memory defaults
 * untouched. Each field is validated on its own so that one bad value in a
 * hand-edited or corrupted file cannot leak into scheduler state:
 *   - `lastObservedReset` must be a string that `Date.parse` accepts
 *     (`cachedNextReset` is expected to be a bare ISO string or null);
 *   - numeric fields must be finite and non-negative.
 * A read error other than ENOENT is logged rather than silently dropped.
 */
function loadSchedulerState() {
  let saved;
  try {
    saved = JSON.parse(fs.readFileSync(SCHEDULER_STATE_PATH, 'utf8'));
  } catch (e) {
    // ENOENT is the normal first-boot case; anything else deserves a log line.
    if (e?.code !== 'ENOENT') {
      console.warn('[scheduler] ignoring unreadable scheduler state', e?.message);
    }
    return;
  }
  // JSON.parse can legally yield null / a primitive / an array.
  if (saved === null || typeof saved !== 'object') return;

  const isNonNegativeNumber = (v) => typeof v === 'number' && Number.isFinite(v) && v >= 0;

  const reset = saved.lastObservedReset;
  if (typeof reset === 'string' && !Number.isNaN(Date.parse(reset))) cachedNextReset = reset;
  if (isNonNegativeNumber(saved.consecutiveFailures)) consecutiveFailures = saved.consecutiveFailures;
  if (isNonNegativeNumber(saved.backoffMs)) backoffMs = saved.backoffMs;
  if (isNonNegativeNumber(saved.pauseClearedManuallyAt)) pauseClearedManuallyAt = saved.pauseClearedManuallyAt;
  if (isNonNegativeNumber(saved.lastPollAt)) lastPollAt = saved.lastPollAt;
}
107
+
108
/**
 * Snapshot the in-memory poll/backoff state to the scheduler-state.json
 * sidecar. Never throws: a failed write is reported with console.warn only.
 *
 * Note: `lastResetObservedAt` is stamped with the time of this write whenever
 * a reset is cached, and `pausedReason` / `pausedSince` are always written as
 * null by this function.
 */
function persistSchedulerState() {
  const snapshot = {
    version: 1,
    lastObservedReset: cachedNextReset,
    lastResetObservedAt: cachedNextReset ? Date.now() : null,
    lastPollAt,
    consecutiveFailures,
    backoffMs,
    pausedReason: null,
    pausedSince: null,
    pauseClearedManuallyAt,
  };
  try {
    atomicWriteJson(SCHEDULER_STATE_PATH, snapshot);
  } catch (e) {
    console.warn('[scheduler] failed to persist scheduler state', e?.message);
  }
}
125
+
126
+ // ---------- heartbeat log ----------
127
+
128
/**
 * Append `entry` as one JSON line to the heartbeat log.
 *
 * When the current file is already at or above HEARTBEAT_MAX_BYTES it is
 * first rotated to `<path>.1`, replacing any previous rotation. Rotation
 * errors are ignored; any other failure is reported with console.warn and
 * never propagates to the caller.
 *
 * @param {*} entry - JSON-serialisable heartbeat record.
 */
function appendHeartbeat(entry) {
  const rotatedPath = HEARTBEAT_PATH + '.1';

  // Size of the live log, or 0 when it does not exist yet.
  const currentSize = () => {
    try {
      return fs.statSync(HEARTBEAT_PATH).size;
    } catch {
      return 0;
    }
  };

  // Keep exactly one previous generation; both steps are best effort.
  const rotate = () => {
    try { fs.unlinkSync(rotatedPath); } catch { /* no previous rotation */ }
    try { fs.renameSync(HEARTBEAT_PATH, rotatedPath); } catch { /* keep appending to the live file */ }
  };

  try {
    const line = `${JSON.stringify(entry)}\n`;
    if (currentSize() >= HEARTBEAT_MAX_BYTES) rotate();
    fs.appendFileSync(HEARTBEAT_PATH, line);
  } catch (e) {
    console.warn('[scheduler] heartbeat write failed', e?.message);
  }
}
143
+
85
144
  function readQueue() {
86
145
  try {
87
146
  const raw = fs.readFileSync(QUEUE_PATH, 'utf8');
@@ -103,6 +162,25 @@ function writeQueue(state) {
103
162
  atomicWriteJson(QUEUE_PATH, state);
104
163
  }
105
164
 
165
+ // ---------- serialized mutation queue ----------
166
+
167
// Every read-modify-write of queue.json is funnelled through mutate(), which
// runs the callbacks strictly one after another. That way job completions
// arriving together in a parallel wave cannot overwrite each other's status
// updates. mutateTail never rejects — a failed mutation is absorbed before it
// becomes the new tail — so one throwing callback cannot stall the chain.
let mutateTail = Promise.resolve();

/**
 * Run `fn(state)` against a freshly read queue and write the (mutated) state
 * back afterwards.
 *
 * @param {(state: object) => any} fn - May be sync or async; mutates `state`
 *   in place. If it throws/rejects, nothing is written.
 * @returns {Promise<any>} Settles with whatever `fn` returned or threw.
 */
function mutate(fn) {
  const runOnce = async () => {
    const state = readQueue();
    const result = await fn(state);
    writeQueue(state);
    return result;
  };
  const pending = mutateTail.then(runOnce);
  const ignore = () => {};
  mutateTail = pending.then(ignore, ignore); // keep chain alive on errors
  return pending;
}
183
+
106
184
  // ---------- PRD parsing ----------
107
185
 
108
186
  /**
@@ -221,32 +299,41 @@ function reconcile(state) {
221
299
 
222
300
  // ---------- next-reset detection ----------
223
301
 
224
- let cachedNextReset = null;
302
+ let cachedNextReset = null; // bare ISO string or null
225
303
  let cachedUtilization = null; // five_hour utilization %, 0–100, or null if unknown
226
304
 
305
/**
 * Fetch current usage from the billing module and refresh the cached
 * five-hour window values.
 *
 * On an `ok` result `cachedNextReset` is overwritten (null when the response
 * carries no `resets_at`), while `cachedUtilization` keeps its previous value
 * when the response carries no `utilization`.
 *
 * @returns {Promise<string|null>} The new `cachedNextReset`.
 * @throws {Error} When the fetch result kind is not `ok`, or when
 *   `billing.fetchUsage()` itself rejects — callers handle it.
 */
async function refreshNextReset() {
  const result = await billing.fetchUsage();
  if (result.kind !== 'ok') {
    throw new Error(`usage fetch failed (${result.kind}): ${result.message ?? ''}`);
  }
  const fiveHour = result.data?.usage?.five_hour;
  cachedNextReset = fiveHour?.resets_at ?? null;
  cachedUtilization = fiveHour?.utilization ?? cachedUtilization;
  return cachedNextReset;
}
237
313
 
238
314
  function getNextResetCached() {
239
315
  return cachedNextReset;
240
316
  }
241
317
 
318
+ // ---------- health / poll state ----------
319
+
320
+ let bootedAt = Date.now();
321
+ let lastPollAt = null;
322
+ let lastPollOk = false;
323
+ let consecutiveFailures = 0;
324
+ let backoffMs = 0;
325
+ let backoffNextAt = null;
326
+ let firstFailureAt = null;
327
+ let pauseClearedManuallyAt = null;
328
+
242
329
  // ---------- timer ----------
243
330
 
244
331
  let mainWindow = null;
245
332
  let fireTimer = null;
246
333
  let resumeTimer = null;
247
- let pollTimer = null;
334
+ let pollLoopTimer = null;
248
335
  let rescheduleInterval = null;
249
- let initialPollTimeout = null;
336
+ let heartbeatInterval = null;
250
337
  let isExecuting = false;
251
338
  let cancelToken = { cancelled: false };
252
339
  let claudeBinPathCached = null;
@@ -289,65 +376,83 @@ function computeFireAt(state, nextResetIso) {
289
376
 
290
377
  async function rescheduleTimer() {
291
378
  clearFireTimer();
292
- const state = readQueue();
293
- reconcile(state);
294
- const nextResetIso = await refreshNextReset();
295
- const fireAt = computeFireAt(state, nextResetIso);
296
- if (!fireAt) {
297
- state.scheduledFor = null;
298
- writeQueue(state);
299
- broadcast();
300
- return;
379
+ // Wrap in try/catch — on failure use the cached value so the on-reset
380
+ // timer can still be armed from the last known reset.
381
+ let nextResetIso;
382
+ try {
383
+ nextResetIso = await refreshNextReset();
384
+ } catch {
385
+ nextResetIso = cachedNextReset;
301
386
  }
302
-
303
- state.scheduledFor = new Date(fireAt).toISOString();
304
- writeQueue(state);
387
+ const fireAt = await mutate((state) => {
388
+ reconcile(state);
389
+ const fa = computeFireAt(state, nextResetIso);
390
+ state.scheduledFor = fa ? new Date(fa).toISOString() : null;
391
+ return fa;
392
+ });
305
393
  broadcast();
394
+ if (!fireAt) return;
306
395
 
307
396
  const delay = Math.max(1000, fireAt - Date.now());
308
- // setTimeout caps at int32 ms (~24.8 days) — well above our 5h horizon, so
309
- // a single timer is fine. If reset_at is wildly in the future we'd still
310
- // re-anchor on the next billing refresh.
311
397
  fireTimer = setTimeout(() => { runDueJobs().catch(() => {}); }, delay);
312
- console.log(`[scheduler] next fire in ${Math.round(delay / 1000)}s @ ${state.scheduledFor}`);
398
+ console.log(`[scheduler] next fire in ${Math.round(delay / 1000)}s @ ${new Date(fireAt).toISOString()}`);
313
399
  }
314
400
 
315
401
  // ---------- pause / resume ----------
316
402
 
317
/**
 * Put the queue into a paused state and, when a resume time is known, arm a
 * timer that clears the pause and re-runs due jobs.
 *
 * Behaviour:
 *   - Suppressed entirely (for any reason) while the manual-override cooldown
 *     is active, i.e. within 5 minutes of `pauseClearedManuallyAt`.
 *   - A 'network' pause without an explicit `resumeAtIso` auto-resumes after
 *     30 minutes.
 *   - Re-pausing for the same reason keeps the original `since`; `resumeAt`
 *     is only replaced when a new one is supplied.
 *   - The resume timer is armed from the resume time actually stored in the
 *     queue, so a repeat call without `resumeAtIso` re-arms the existing
 *     deadline instead of cancelling it for good.
 *   - An unparseable resume time is never armed: `setTimeout` would coerce a
 *     NaN delay to ~1ms and resume the queue immediately.
 *
 * @param {string} reason - Pause reason stored in `paused.reason`.
 * @param {string|null|undefined} resumeAtIso - ISO timestamp to resume at.
 * @returns {Promise<void>}
 */
async function setPaused(reason, resumeAtIso) {
  // Honor manual-override cooldown: if the user cleared a pause within the
  // last 5 minutes, do not re-engage an automatic pause.
  if (pauseClearedManuallyAt && Date.now() - pauseClearedManuallyAt < 300_000) {
    console.log(`[scheduler] setPaused(${reason}) suppressed by manual override cooldown`);
    return;
  }

  // For 'network' with no explicit resumeAt, auto-resume after 30 minutes.
  let effectiveResumeAt = resumeAtIso;
  if (reason === 'network' && !resumeAtIso) {
    effectiveResumeAt = new Date(Date.now() + 30 * 60_000).toISOString();
  }

  const storedResumeAt = await mutate((s) => {
    if (s.paused && s.paused.reason === reason) {
      if (effectiveResumeAt) s.paused.resumeAt = effectiveResumeAt;
    } else {
      s.paused = { reason, since: new Date().toISOString(), resumeAt: effectiveResumeAt || null };
    }
    return s.paused.resumeAt;
  });
  broadcast();
  cancelToken.cancelled = true;
  if (resumeTimer) { clearTimeout(resumeTimer); resumeTimer = null; }
  if (!storedResumeAt) return;

  const resumeAtMs = new Date(storedResumeAt).getTime();
  if (!Number.isFinite(resumeAtMs)) {
    console.warn(`[scheduler] paused (${reason}); resumeAt is not a valid date (${storedResumeAt})`);
    return;
  }

  // Resume 30s after the reset to give the auth/billing endpoint time to flip.
  const delay = Math.max(30_000, resumeAtMs - Date.now() + 30_000);
  if (delay > 0x7fffffff) {
    console.warn(`[scheduler] paused (${reason}); resumeAt too far for setTimeout (${delay}ms)`);
    return;
  }
  resumeTimer = setTimeout(async () => {
    // Timer callbacks have no caller to report to, so every failure is
    // handled here rather than surfacing as an unhandled rejection.
    try {
      await clearPause('resume-timer');
      await runDueJobs();
    } catch (e) {
      console.error('[scheduler] auto-resume failed', e);
    }
  }, delay);
  console.log(`[scheduler] paused (${reason}); auto-resume in ${Math.round(delay / 1000)}s`);
}
341
441
 
342
/**
 * Clear the queue's paused state (if any) and cancel a pending resume timer.
 *
 * The manual-override cooldown (`pauseClearedManuallyAt`) is stamped only
 * when a user-initiated source ('manual' or 'run-now') actually cleared an
 * existing pause. Stamping it when nothing was paused would make an ordinary
 * Run-now suppress the next automatic pause (e.g. a rate-limit pause) for
 * five minutes even though the user never overrode anything.
 *
 * @param {string} [source] - Who is clearing the pause; defaults to 'manual',
 *   both for logging and for the cooldown decision.
 * @returns {Promise<void>}
 */
async function clearPause(source) {
  const origin = source || 'manual';
  if (resumeTimer) { clearTimeout(resumeTimer); resumeTimer = null; }

  const wasPaused = await mutate((s) => {
    if (!s.paused) return false;
    console.log(`[scheduler] clearPause (${origin})`);
    s.paused = null;
    return true;
  });
  if (!wasPaused) return;

  // Track manual clears for the auto-pause cooldown.
  if (origin === 'manual' || origin === 'run-now') {
    pauseClearedManuallyAt = Date.now();
    persistSchedulerState();
  }
  broadcast();
}
352
457
 
353
458
  /** Mutate a job in place to "pending" with cleared run metadata. */
@@ -358,6 +463,7 @@ function resetJobFields(job, errorMsg) {
358
463
  job.finishedAt = null;
359
464
  job.exitCode = null;
360
465
  job.error = errorMsg ?? null;
466
+ delete job.runtime;
361
467
  }
362
468
 
363
469
  /** Scan the tail of a job's log for the canonical rate-limit signal. We look
@@ -408,15 +514,34 @@ function pickRunDir() {
408
514
  return { runId: ts, dir };
409
515
  }
410
516
 
411
- async function executeJob(job, runDir, defaultCwd) {
517
+ /**
518
+ * Execute a single PRD job. Writes stdout/stderr to a log file and a meta
519
+ * JSON sidecar. Accepts an optional onPid(pid) callback called synchronously
520
+ * after spawn so callers can persist the pid before the job finishes.
521
+ */
522
+ async function executeJob(job, runDir, defaultCwd, onPid) {
412
523
  const logPath = path.join(runDir, `${job.slug}.log`);
413
524
  const metaPath = path.join(runDir, `${job.slug}.meta.json`);
414
525
  const cwd = job.cwd || defaultCwd;
415
526
  const startedAt = Date.now();
416
527
 
417
528
  const fd = fs.openSync(logPath, 'a');
529
+ let fdClosed = false;
530
+ const closeFd = () => { if (fdClosed) return; fdClosed = true; fs.closeSync(fd); };
531
+
418
532
  fs.writeSync(fd, `[scheduler] starting ${job.slug} at ${new Date().toISOString()}\n[scheduler] cwd=${cwd}\n\n`);
419
533
 
534
+ // Dead-cwd guard: verify the target directory exists and is traversable
535
+ // before handing it to the child process.
536
+ try { fs.accessSync(cwd, fs.constants.X_OK); }
537
+ catch {
538
+ const errMsg = `cwd no longer exists: ${cwd}`;
539
+ fs.writeSync(fd, `[scheduler] ${errMsg}\n`);
540
+ closeFd();
541
+ atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: errMsg, startedAt, finishedAt: Date.now(), durationMs: 0 });
542
+ return { exitCode: -1, durationMs: 0, error: errMsg };
543
+ }
544
+
420
545
  // Read full PRD body fresh from disk (queue stored only the preview).
421
546
  let prompt;
422
547
  try {
@@ -424,12 +549,16 @@ async function executeJob(job, runDir, defaultCwd) {
424
549
  prompt = parsed.body;
425
550
  } catch (e) {
426
551
  fs.writeSync(fd, `[scheduler] failed to read PRD: ${e?.message}\n`);
427
- fs.closeSync(fd);
552
+ closeFd();
428
553
  return { exitCode: -1, durationMs: 0, error: e?.message };
429
554
  }
430
555
 
431
556
  return await new Promise((resolve) => {
432
557
  const claudeBin = resolveClaudeBin();
558
+ // Strip Claude Code env and secrets that leak in when session-manager is
559
+ // launched from a `claude` shell. CLAUDE_EFFORT=xhigh forces Opus and
560
+ // overrides `--model sonnet`, so scheduled jobs burn Opus credits silently.
561
+ const childEnv = cleanChildEnv();
433
562
  const child = spawn(claudeBin, [
434
563
  '-p', prompt,
435
564
  '--model', 'sonnet',
@@ -438,24 +567,36 @@ async function executeJob(job, runDir, defaultCwd) {
438
567
  '--verbose',
439
568
  ], {
440
569
  cwd,
441
- env: process.env,
570
+ env: childEnv,
442
571
  stdio: ['ignore', fd, fd],
443
572
  });
444
573
 
445
574
  fs.writeSync(fd, `[scheduler] spawned pid=${child.pid}\n\n`);
446
575
 
576
+ // Fire-and-forget pid persistence — best effort.
577
+ if (onPid) onPid(child.pid).catch(() => {});
578
+
579
+ // Kill the child if it runs past the maximum allowed duration.
580
+ const watchdog = setTimeout(() => {
581
+ fs.writeSync(fd, `\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
582
+ try { child.kill('SIGKILL'); } catch { /* already dead */ }
583
+ }, MAX_JOB_DURATION_MS);
584
+ if (watchdog.unref) watchdog.unref();
585
+
447
586
  child.on('error', (err) => {
587
+ clearTimeout(watchdog);
448
588
  const durationMs = Date.now() - startedAt;
449
- fs.writeSync(fd, `\n[scheduler] spawn error: ${err.message}\n`);
450
- fs.closeSync(fd);
589
+ fs.writeSync(fd, `\n[scheduler] error: ${err.message}\n`);
590
+ closeFd();
451
591
  atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: err.message, startedAt, finishedAt: Date.now(), durationMs });
452
592
  resolve({ exitCode: -1, durationMs, error: err.message });
453
593
  });
454
594
 
455
595
  child.on('exit', (code) => {
596
+ clearTimeout(watchdog);
456
597
  const durationMs = Date.now() - startedAt;
457
598
  fs.writeSync(fd, `\n[scheduler] exit code=${code} duration=${Math.round(durationMs / 1000)}s\n`);
458
- fs.closeSync(fd);
599
+ closeFd();
459
600
  const rateLimited = code !== 0 && detectRateLimitInLog(logPath);
460
601
  atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: code, rateLimited, startedAt, finishedAt: Date.now(), durationMs });
461
602
  resolve({ exitCode: code, durationMs, rateLimited });
@@ -479,7 +620,6 @@ async function runDueJobs() {
479
620
  return;
480
621
  }
481
622
  const { runId, dir: runDir } = pickRunDir();
482
- state.lastRunAt = new Date().toISOString();
483
623
 
484
624
  // Group by parallelGroup, ascending. Each group runs serially after the
485
625
  // previous group completes.
@@ -491,7 +631,7 @@ async function runDueJobs() {
491
631
  }
492
632
  const groupKeys = Array.from(groups.keys()).sort((a, b) => a - b);
493
633
 
494
- writeQueue(state);
634
+ await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
495
635
  broadcast();
496
636
 
497
637
  for (const gk of groupKeys) {
@@ -503,41 +643,59 @@ async function runDueJobs() {
503
643
  const inFlight = new Set();
504
644
 
505
645
  const launch = (job) => {
506
- const s = readQueue();
507
- const idx = s.jobs.findIndex((x) => x.slug === job.slug);
508
- if (idx >= 0) {
509
- s.jobs[idx].status = 'running';
510
- s.jobs[idx].runId = runId;
511
- s.jobs[idx].startedAt = new Date().toISOString();
512
- writeQueue(s);
513
- broadcast();
514
- }
515
- const promise = executeJob(job, runDir, state.config.defaultCwd).then(async (res) => {
516
- // Rate-limit OR pause-already-set means we treat this job as
517
- // unfinished — bounce it back to 'pending' so the next run
518
- // (after token reset) picks it up.
519
- if (res.rateLimited) {
520
- const resetIso = await refreshNextReset();
521
- setPaused('rate_limit', resetIso);
522
- }
523
- const sn = readQueue();
524
- const i2 = sn.jobs.findIndex((x) => x.slug === job.slug);
525
- if (i2 >= 0) {
526
- const treatAsPending = res.rateLimited || (sn.paused && sn.paused.reason === 'rate_limit');
527
- if (treatAsPending) {
528
- resetJobFields(sn.jobs[i2], res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
529
- } else {
530
- sn.jobs[i2].status = res.exitCode === 0 ? 'completed' : 'failed';
531
- sn.jobs[i2].finishedAt = new Date().toISOString();
532
- sn.jobs[i2].exitCode = res.exitCode;
533
- sn.jobs[i2].error = res.error || null;
646
+ const promise = (async () => {
647
+ try {
648
+ // Mark job running.
649
+ await mutate((s) => {
650
+ const idx = s.jobs.findIndex((x) => x.slug === job.slug);
651
+ if (idx >= 0) {
652
+ s.jobs[idx].status = 'running';
653
+ s.jobs[idx].runId = runId;
654
+ s.jobs[idx].startedAt = new Date().toISOString();
655
+ }
656
+ });
657
+ broadcast();
658
+
659
+ // Execute — onPid persists the child PID into the running state.
660
+ const res = await executeJob(job, runDir, state.config.defaultCwd, async (pid) => {
661
+ await mutate((s) => {
662
+ const idx = s.jobs.findIndex((x) => x.slug === job.slug);
663
+ if (idx >= 0) {
664
+ s.jobs[idx].runtime = { pid, runId, startedAt: s.jobs[idx].startedAt };
665
+ }
666
+ });
667
+ });
668
+
669
+ // Rate-limit: pause before writing terminal status so the status
670
+ // mutate below can read the pause state.
671
+ if (res.rateLimited) {
672
+ const resetIso = await refreshNextReset().catch(() => cachedNextReset);
673
+ await setPaused('rate_limit', resetIso);
534
674
  }
535
- writeQueue(sn);
675
+
676
+ // Write terminal status; strip runtime regardless of outcome.
677
+ await mutate((s) => {
678
+ const i2 = s.jobs.findIndex((x) => x.slug === job.slug);
679
+ if (i2 >= 0) {
680
+ const treatAsPending = res.rateLimited || (s.paused && s.paused.reason === 'rate_limit');
681
+ if (treatAsPending) {
682
+ resetJobFields(s.jobs[i2], res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
683
+ } else {
684
+ s.jobs[i2].status = res.exitCode === 0 ? 'completed' : 'failed';
685
+ s.jobs[i2].finishedAt = new Date().toISOString();
686
+ s.jobs[i2].exitCode = res.exitCode;
687
+ s.jobs[i2].error = res.error || null;
688
+ delete s.jobs[i2].runtime;
689
+ }
690
+ }
691
+ });
536
692
  broadcast();
693
+ } catch (e) {
694
+ console.error('[scheduler] launch error', job.slug, e);
537
695
  }
538
- inFlight.delete(promise);
539
- });
696
+ })();
540
697
  inFlight.add(promise);
698
+ promise.then(() => inFlight.delete(promise), () => inFlight.delete(promise));
541
699
  };
542
700
 
543
701
  // Prime up to cap
@@ -557,45 +715,97 @@ async function runDueJobs() {
557
715
  isExecuting = false;
558
716
  // No longer auto-disable after a run. The firePolicy now governs whether
559
717
  // the next batch fires automatically. Just clear the one-shot scheduledFor.
560
- const s = readQueue();
561
- s.scheduledFor = null;
562
- writeQueue(s);
718
+ await mutate((s) => { s.scheduledFor = null; });
563
719
  broadcast();
564
720
  }
565
721
  }
566
722
 
567
- // ---------- when-available poll loop ----------
723
+ // ---------- when-available launch logic ----------
568
724
 
569
- async function pollWhenAvailable() {
725
/**
 * Fire the pending jobs when the 'when-available' policy allows it.
 *
 * Does nothing unless all of these hold: the fire policy is
 * 'when-available', the queue is not paused, no run is executing, at least
 * one job is pending, and a utilization reading is cached. With utilization
 * at or above the configured threshold it only broadcasts; below it, it
 * starts runDueJobs() without waiting for it to finish.
 *
 * @param {object} state - Queue state snapshot to evaluate.
 * @returns {Promise<void>}
 */
async function maybeLaunchWhenAvailable(state) {
  const { firePolicy, utilizationThreshold } = state.config;
  if (firePolicy !== 'when-available' || state.paused || isExecuting) return;

  const pendingCount = state.jobs.filter((j) => j.status === 'pending').length;
  if (pendingCount === 0) return;

  // `== null` covers both null and undefined, same as the two explicit checks.
  if (cachedUtilization == null) return;

  if (cachedUtilization >= utilizationThreshold) {
    broadcast();
    return;
  }

  console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pendingCount} pending — firing`);
  runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
}
739
+
740
+ // ---------- poll loop with exponential backoff ----------
741
+
742
/**
 * One iteration of the self-rescheduling usage poll.
 *
 * Success: refresh the cached reset/utilization, reset all failure/backoff
 * counters, clear a 'network' or 'reset_failure' pause that has recovered,
 * then let maybeLaunchWhenAvailable() decide whether to fire. The queue is
 * re-read after a pause was cleared so the launch decision sees the cleared
 * pause instead of the stale pre-clear snapshot (which would postpone the
 * launch by a full poll interval).
 *
 * Failure: 'auth' pauses the queue without a resume time; any other failure
 * kind doubles the backoff (30s up to 480s) and, after more than 30 minutes
 * of consecutive failures, sets a 'network' pause. An unexpected exception is
 * logged and treated like a transient failure.
 *
 * Always re-arms itself via `pollLoopTimer`, using the current backoff or
 * POLL_INTERVAL_MS when there is none.
 *
 * @returns {Promise<void>}
 */
async function pollLoop() {
  // Bookkeeping shared by every failure path.
  const noteFailure = () => {
    lastPollAt = Date.now();
    lastPollOk = false;
    consecutiveFailures++;
    if (!firstFailureAt) firstFailureAt = Date.now();
  };
  const bumpBackoff = () => {
    backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
  };

  try {
    const r = await billing.fetchUsage();

    if (r.kind === 'ok') {
      cachedNextReset = r.data?.usage?.five_hour?.resets_at ?? cachedNextReset;
      cachedUtilization = r.data?.usage?.five_hour?.utilization ?? cachedUtilization;
      consecutiveFailures = 0;
      backoffMs = 0;
      backoffNextAt = null;
      firstFailureAt = null;
      lastPollAt = Date.now();
      lastPollOk = true;
      persistSchedulerState();

      let cur = readQueue();
      const pausedReason = cur.paused?.reason;
      if (pausedReason === 'network') {
        // A 'network' pause is resolved by any good reading.
        await clearPause('network-recovered');
        cur = readQueue();
      } else if (pausedReason === 'reset_failure' && cachedNextReset) {
        // 'reset_failure' is resolved once a reset value is known again.
        await clearPause('reset-recovered');
        cur = readQueue();
      }

      await maybeLaunchWhenAvailable(cur);
      broadcast();
    } else {
      noteFailure();

      if (r.kind === 'auth') {
        console.error(`[scheduler] auth failure (HTTP ${r.httpStatus}): ${r.message}`);
        await setPaused('auth', null);
      } else {
        // transient or config — apply exponential backoff.
        bumpBackoff();
        const totalFailureMs = Date.now() - firstFailureAt;
        console.log(`[scheduler] transient failure #${consecutiveFailures}: ${r.kind} ${r.message ?? ''}; retry in ${backoffMs / 1000}s`);

        // After 30 minutes of consecutive failures, set 'network' pause.
        if (totalFailureMs > 30 * 60_000) {
          const cur2 = readQueue();
          if (!cur2.paused || cur2.paused.reason === 'network') {
            await setPaused('network', null);
          }
        }
      }

      backoffNextAt = Date.now() + backoffMs;
      persistSchedulerState();
    }
  } catch (e) {
    // Unexpected error (e.g., IPC transport failure) — log it, then back off
    // exactly like a transient failure.
    console.error('[scheduler] poll error', e);
    noteFailure();
    bumpBackoff();
    backoffNextAt = Date.now() + backoffMs;
    persistSchedulerState();
  } finally {
    const delay = backoffMs || POLL_INTERVAL_MS;
    pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, delay);
    if (pollLoopTimer.unref) pollLoopTimer.unref();
  }
}
601
811
 
@@ -620,44 +830,85 @@ function registerScheduleHandlers() {
620
830
  };
621
831
  });
622
832
 
623
- ipcMain.handle('schedule:set-config', async (_e, partial) => {
833
// Health snapshot for the renderer: poll/backoff counters held in memory plus
// pause and running-job details read from the queue file. Only jobs that are
// 'running' and carry a `runtime` record are listed; timestamps are returned
// as epoch milliseconds.
ipcMain.handle('schedule:health', async () => {
  const state = readQueue();
  const { paused } = state;

  const runningJobs = state.jobs
    .filter((j) => j.status === 'running' && j.runtime)
    .map((j) => ({
      slug: j.slug,
      startedAt: j.startedAt ? Date.parse(j.startedAt) : 0,
      pid: j.runtime.pid ?? 0,
    }));

  return {
    bootedAt,
    lastPollAt,
    lastPollOk,
    consecutiveFailures,
    backoffNextAt,
    nextResetCached: cachedNextReset,
    pausedSince: paused ? Date.parse(paused.since) : null,
    pauseReason: paused?.reason ?? null,
    runningJobs,
  };
});
857
+
858
// Merge a validated partial config into the stored config, then re-arm the
// fire timer. Input that fails schema validation is rejected with
// `{ ok: false, error }` and nothing is written.
ipcMain.handle('schedule:set-config', async (_e, partial) => {
  const { schemas } = require('./ipcSchemas.cjs');

  let validated;
  try {
    validated = schemas.setConfigSchema.parse(partial || {});
  } catch (err) {
    return { ok: false, error: err?.message ?? 'invalid config' };
  }

  const config = await mutate((queue) => {
    queue.config = { ...queue.config, ...validated };
    return queue.config;
  });

  await rescheduleTimer();
  return { ok: true, config };
});
636
873
 
637
// Reset one job back to pending. The slug is schema-validated and must
// additionally resolve to a path inside PRDS_DIR before the queue is touched.
ipcMain.handle('schedule:reset-job', async (_e, payload) => {
  const { schemas } = require('./ipcSchemas.cjs');
  const invalidSlug = { ok: false, error: 'invalid slug' };

  let slug;
  try {
    ({ slug } = schemas.scheduleSlug.parse(payload));
  } catch {
    return invalidSlug;
  }

  // Containment check after path.join.
  const prdPath = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
  if (!prdPath.startsWith(PRDS_DIR + path.sep)) return invalidSlug;

  const found = await mutate((queue) => {
    const target = queue.jobs.find((j) => j.slug === slug);
    if (!target) return false;
    resetJobFields(target);
    return true;
  });
  if (!found) return { ok: false, error: 'not found' };

  broadcast();
  return { ok: true };
});
646
897
 
647
898
  ipcMain.handle('schedule:run-now', async () => {
648
899
  // Manual run-now overrides any auto-pause. Clear it first.
649
- clearPause('run-now');
900
+ await clearPause('run-now');
650
901
  runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
651
902
  return { ok: true };
652
903
  });
653
904
 
654
905
  ipcMain.handle('schedule:resume', async () => {
655
- clearPause('manual');
906
+ await clearPause('manual');
656
907
  return { ok: true };
657
908
  });
658
909
 
659
910
  ipcMain.handle('schedule:refresh-reset', async () => {
660
- const at = await refreshNextReset();
911
+ const at = await refreshNextReset().catch(() => cachedNextReset);
661
912
  await rescheduleTimer();
662
913
  return { ok: true, nextReset: at };
663
914
  });
@@ -668,39 +919,123 @@ function registerScheduleHandlers() {
668
919
  return { ok: true };
669
920
  });
670
921
 
671
- ipcMain.handle('schedule:read-prd', async (_e, { slug }) => {
922
+ ipcMain.handle('schedule:read-prd', async (_e, payload) => {
923
+ const { schemas: s } = require('./ipcSchemas.cjs');
924
+ let slug;
672
925
  try {
673
- const text = await fsp.readFile(path.join(PRDS_DIR, `${slug}.md`), 'utf8');
926
+ ({ slug } = s.scheduleSlug.parse(payload));
927
+ } catch {
928
+ return { ok: false, error: 'invalid slug' };
929
+ }
930
+ const filePath = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
931
+ if (!filePath.startsWith(PRDS_DIR + path.sep)) {
932
+ return { ok: false, error: 'invalid slug' };
933
+ }
934
+ try {
935
+ const text = await fsp.readFile(filePath, 'utf8');
674
936
  return { ok: true, text };
675
937
  } catch (e) {
676
938
  return { ok: false, error: e?.message };
677
939
  }
678
940
  });
679
941
 
680
- ipcMain.handle('schedule:read-log', async (_e, { runId, slug }) => {
942
+ ipcMain.handle('schedule:read-log', async (_e, payload) => {
943
+ const { schemas: s } = require('./ipcSchemas.cjs');
944
+ let slug, runId;
945
+ try {
946
+ ({ slug, runId } = s.scheduleReadLog.parse(payload));
947
+ } catch {
948
+ return { ok: false, error: 'invalid slug or runId' };
949
+ }
950
+ const logPath = path.resolve(path.join(RUNS_DIR, runId, `${slug}.log`));
951
+ if (!logPath.startsWith(RUNS_DIR + path.sep)) {
952
+ return { ok: false, error: 'invalid slug or runId' };
953
+ }
681
954
  try {
682
- const p = path.join(RUNS_DIR, runId, `${slug}.log`);
683
- const text = await fsp.readFile(p, 'utf8');
955
+ const text = await fsp.readFile(logPath, 'utf8');
684
956
  return { ok: true, text };
685
957
  } catch (e) {
686
958
  return { ok: false, error: e?.message };
687
959
  }
688
960
  });
961
+
962
+ const PRD_WRITE_MAX_BYTES = 256 * 1024;
963
+ const SLUG_RE = /^[A-Za-z0-9._-]{1,128}$/;
964
+
965
+ ipcMain.handle('schedule:write-prd', async (_e, { slug, body }) => {
966
+ if (!SLUG_RE.test(slug)) throw new Error(`invalid slug: ${slug}`);
967
+ if (typeof body !== 'string') throw new Error('body must be string');
968
+ if (Buffer.byteLength(body, 'utf8') > PRD_WRITE_MAX_BYTES) throw new Error('body too large');
969
+ const file = path.join(PRDS_DIR, `${slug}.md`);
970
+ const resolved = path.resolve(file);
971
+ if (!resolved.startsWith(PRDS_DIR + path.sep)) throw new Error('path escape');
972
+ const tmp = `${resolved}.${process.pid}.${Date.now()}.tmp`;
973
+ await fsp.writeFile(tmp, body, { encoding: 'utf8', mode: 0o644 });
974
+ await fsp.rename(tmp, resolved);
975
+ const stat = await fsp.stat(resolved);
976
+ return { ok: true, bytesWritten: stat.size };
977
+ });
978
+
979
+ ipcMain.handle('schedule:list-prds', async () => {
980
+ ensureDirs();
981
+ let entries;
982
+ try {
983
+ entries = await fsp.readdir(PRDS_DIR);
984
+ } catch {
985
+ return [];
986
+ }
987
+ const out = [];
988
+ for (const name of entries) {
989
+ if (!name.endsWith('.md') || name.startsWith('.')) continue;
990
+ const filePath = path.join(PRDS_DIR, name);
991
+ try {
992
+ const parsed = parsePrd(filePath);
993
+ const stat = await fsp.stat(filePath);
994
+ out.push({
995
+ slug: parsed.slug,
996
+ parallelGroup: parsed.parallelGroup,
997
+ title: parsed.title,
998
+ cwd: parsed.cwd || '',
999
+ estimateMinutes: parsed.estimateMinutes,
1000
+ mtimeMs: stat.mtimeMs,
1001
+ });
1002
+ } catch (e) {
1003
+ console.warn('[scheduler] list-prds: skipping unparseable', name, e?.message);
1004
+ }
1005
+ }
1006
+ out.sort((a, b) => a.slug.localeCompare(b.slug, undefined, { numeric: true }));
1007
+ return out;
1008
+ });
689
1009
  }
690
1010
 
691
1011
  async function init() {
692
1012
  ensureDirs();
693
- // Ensure queue.json exists with defaults so the renderer can read it.
694
- if (!fs.existsSync(QUEUE_PATH)) writeQueue({ config: { ...DEFAULT_CONFIG }, jobs: [], scheduledFor: null, lastRunAt: null, paused: null });
1013
+
1014
+ // Hydrate cached state from the sidecar before any scheduling decisions.
1015
+ loadSchedulerState();
1016
+ bootedAt = Date.now();
1017
+
1018
+ // Boot reconciliation: mark any job that was 'running' when the app died as
1019
+ // 'failed'. mutate() creates queue.json from defaults if it doesn't exist.
1020
+ await mutate((state) => {
1021
+ for (const j of state.jobs) {
1022
+ if (j.status === 'running') {
1023
+ j.status = 'failed';
1024
+ j.error = 'orphaned: app restarted while running';
1025
+ j.finishedAt = new Date().toISOString();
1026
+ delete j.runtime;
1027
+ }
1028
+ }
1029
+ });
695
1030
 
696
1031
  // If we boot up while paused with a resumeAt in the past, clear it. This
697
1032
  // happens when the app was closed across the reset window.
698
1033
  const boot = readQueue();
699
1034
  if (boot.paused && boot.paused.resumeAt && new Date(boot.paused.resumeAt).getTime() <= Date.now()) {
700
- clearPause('boot-elapsed');
1035
+ await clearPause('boot-elapsed');
701
1036
  } else if (boot.paused && boot.paused.resumeAt) {
702
1037
  // Re-arm the resume timer (lost across restart).
703
- setPaused(boot.paused.reason, boot.paused.resumeAt);
1038
+ await setPaused(boot.paused.reason, boot.paused.resumeAt);
704
1039
  }
705
1040
 
706
1041
  await rescheduleTimer();
@@ -708,13 +1043,49 @@ async function init() {
708
1043
  // resets early or the auth token rotates. Tracked so re-init doesn't leak.
709
1044
  if (rescheduleInterval) clearInterval(rescheduleInterval);
710
1045
  rescheduleInterval = setInterval(() => { rescheduleTimer().catch(() => {}); }, 10 * 60_000);
711
- // when-available poll loop. Tick every 2 minutes; the function itself is
712
- // a no-op when policy != 'when-available' or queue is empty/paused.
713
- if (pollTimer) clearInterval(pollTimer);
714
- pollTimer = setInterval(() => { pollWhenAvailable().catch(() => {}); }, 2 * 60_000);
715
- // First tick fires after a short delay so billing is warmed up.
716
- if (initialPollTimeout) clearTimeout(initialPollTimeout);
717
- initialPollTimeout = setTimeout(() => { pollWhenAvailable().catch(() => {}); }, 15_000);
1046
+
1047
+ // Self-rescheduling poll loop with exponential backoff. Replaces the
1048
+ // old fixed-interval pollTimer + initialPollTimeout.
1049
+ if (pollLoopTimer) clearTimeout(pollLoopTimer);
1050
+ // First tick fires after the standard warmup delay so billing is ready.
1051
+ pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
1052
+ if (pollLoopTimer.unref) pollLoopTimer.unref();
1053
+
1054
+ // Heartbeat: once per minute, log queue state for 24h visibility.
1055
+ if (heartbeatInterval) clearInterval(heartbeatInterval);
1056
+ heartbeatInterval = setInterval(() => {
1057
+ const s = readQueue();
1058
+ const counts = { pending: 0, running: 0, completed: 0, failed: 0 };
1059
+ for (const j of s.jobs) counts[j.status] = (counts[j.status] || 0) + 1;
1060
+ appendHeartbeat({
1061
+ ts: Date.now(),
1062
+ counts,
1063
+ paused: s.paused ? { reason: s.paused.reason, resumeAt: s.paused.resumeAt } : null,
1064
+ nextReset: cachedNextReset,
1065
+ utilization: cachedUtilization,
1066
+ consecutiveFailures,
1067
+ });
1068
+ }, 60_000);
1069
+ if (heartbeatInterval.unref) heartbeatInterval.unref();
1070
+
1071
+ // Wake-from-sleep: immediately re-poll and re-evaluate the queue.
1072
+ try {
1073
+ const { powerMonitor } = require('electron');
1074
+ powerMonitor.on('resume', () => {
1075
+ console.log('[scheduler] system resumed; re-polling and re-evaluating queue');
1076
+ if (pollLoopTimer) { clearTimeout(pollLoopTimer); pollLoopTimer = null; }
1077
+ backoffMs = 0;
1078
+ backoffNextAt = null;
1079
+ // Clear any paused-but-resumeAt-elapsed state immediately.
1080
+ const wakeState = readQueue();
1081
+ if (wakeState.paused?.resumeAt && new Date(wakeState.paused.resumeAt).getTime() <= Date.now()) {
1082
+ clearPause('boot-elapsed').then(() => { runDueJobs().catch(() => {}); }).catch(() => {});
1083
+ }
1084
+ pollLoop().catch(() => {});
1085
+ });
1086
+ } catch (e) {
1087
+ console.warn('[scheduler] powerMonitor unavailable', e?.message);
1088
+ }
718
1089
  }
719
1090
 
720
1091
  module.exports = { registerScheduleHandlers, attachWindow, init, ROOT, PRDS_DIR };