claude-code-session-manager 0.8.0 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/dist/assets/{cssMode-CDGOCAW5.js → cssMode-30PYohIN.js} +1 -1
  2. package/dist/assets/{editor.main-zj3Myqhk.js → editor.main-CZ_l_CSt.js} +3 -3
  3. package/dist/assets/{freemarker2-Dh6-Vi35.js → freemarker2-DA5xODSz.js} +1 -1
  4. package/dist/assets/{handlebars-KgXZ3LUu.js → handlebars-BgJKogMf.js} +1 -1
  5. package/dist/assets/{html-D12q2PkL.js → html-D3DAPwAR.js} +1 -1
  6. package/dist/assets/{htmlMode-CCaSY5vs.js → htmlMode-mS5mzFjU.js} +1 -1
  7. package/dist/assets/index-Bs-mHiD-.js +2976 -0
  8. package/dist/assets/index-DCK87t79.css +32 -0
  9. package/dist/assets/{javascript-ClkFzW_a.js → javascript-CJ-Uxk_I.js} +1 -1
  10. package/dist/assets/{jsonMode-BV7azwkW.js → jsonMode-DbcDRati.js} +1 -1
  11. package/dist/assets/{liquid-C9Id9V-K.js → liquid-I4DHwPR_.js} +1 -1
  12. package/dist/assets/{lspLanguageFeatures-COD69jzF.js → lspLanguageFeatures-BntDl6Xn.js} +1 -1
  13. package/dist/assets/{mdx-D8YvWtiq.js → mdx-DWI58irx.js} +1 -1
  14. package/dist/assets/{python-YtsitwE4.js → python-DPx3c0QA.js} +1 -1
  15. package/dist/assets/{razor-T48OHh5u.js → razor-BcxFqE_H.js} +1 -1
  16. package/dist/assets/{tsMode-DHTMR4b8.js → tsMode-CGTi49DJ.js} +1 -1
  17. package/dist/assets/{typescript-Ckq032Ud.js → typescript-CE9RqBjC.js} +1 -1
  18. package/dist/assets/{xml-B7dYWEXB.js → xml-DsrLAWcV.js} +1 -1
  19. package/dist/assets/{yaml-DUufEgrd.js → yaml-CA8rRsQI.js} +1 -1
  20. package/dist/index.html +2 -2
  21. package/package.json +1 -1
  22. package/src/main/config.cjs +93 -19
  23. package/src/main/index.cjs +163 -31
  24. package/src/main/ipcSchemas.cjs +59 -2
  25. package/src/main/lib/cleanEnv.cjs +20 -0
  26. package/src/main/lib/credentials.cjs +184 -0
  27. package/src/main/lib/schedulerConfig.cjs +10 -0
  28. package/src/main/logs.cjs +1 -1
  29. package/src/main/otelSettings.cjs +1 -1
  30. package/src/main/pty.cjs +53 -6
  31. package/src/main/scheduler.cjs +521 -148
  32. package/src/main/transcripts.cjs +26 -21
  33. package/src/main/usage.cjs +76 -25
  34. package/src/main/voiceSettings.cjs +1 -1
  35. package/src/main/watchers.cjs +69 -11
  36. package/src/preload/api.d.ts +53 -11
  37. package/src/preload/index.cjs +13 -0
  38. package/dist/assets/index-Dejlz0I1.js +0 -2972
  39. package/dist/assets/index-DsC4vT8M.css +0 -32
@@ -24,7 +24,8 @@
24
24
  * frees in that group.
25
25
  *
26
26
  * Execution:
27
- * - `claude -p "<PRD body>" --dangerously-skip-permissions` per PRD.
27
+ * - `claude -p "<PRD body>" --model sonnet --dangerously-skip-permissions`
28
+ * per PRD. Backlog runs on Sonnet; interactive sessions stay on Opus.
28
29
  * - Stdout/stderr → runs/<ts>/<slug>.log; meta json gets exit + duration.
29
30
  * - PRD frontmatter `cwd` → child cwd. Default: PROJECT_CWD const below.
30
31
  *
@@ -46,11 +47,20 @@ const os = require('node:os');
46
47
  const { spawn } = require('node:child_process');
47
48
  const { ipcMain } = require('electron');
48
49
  const billing = require('./usage.cjs');
50
+ const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
51
+ const {
52
+ POLL_INTERVAL_MS,
53
+ USAGE_REFRESH_INTERVAL_MS,
54
+ MAX_JOB_DURATION_MS,
55
+ } = require('./lib/schedulerConfig.cjs');
49
56
 
50
57
  const ROOT = path.join(os.homedir(), '.claude', 'session-manager', 'scheduled-plans');
51
58
  const PRDS_DIR = path.join(ROOT, 'prds');
52
59
  const RUNS_DIR = path.join(ROOT, 'runs');
53
60
  const QUEUE_PATH = path.join(ROOT, 'queue.json');
61
+ const SCHEDULER_STATE_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-state.json');
62
+ const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-heartbeat.log');
63
+ const HEARTBEAT_MAX_BYTES = 1024 * 1024;
54
64
  const DEFAULT_PROJECT_CWD = path.join(os.homedir(), 'Projects', 'session-manager');
55
65
 
56
66
  const DEFAULT_CONFIG = {
@@ -61,7 +71,7 @@ const DEFAULT_CONFIG = {
61
71
  defaultCwd: DEFAULT_PROJECT_CWD,
62
72
  // 'when-available' = poll usage and fire whenever utilization < threshold.
63
73
  // 'on-reset' = fire offsetMinutes after the next 5h reset (legacy).
64
- // 'manual' = only fire on explicit Run-now click.
74
+ // 'manual' = only fire on explicit Run now click.
65
75
  firePolicy: 'when-available',
66
76
  // For 'when-available'. Fire only when five_hour utilization < this percent.
67
77
  utilizationThreshold: 90,
@@ -76,11 +86,61 @@ function ensureDirs() {
76
86
  }
77
87
 
78
88
  function atomicWriteJson(p, data) {
79
- const tmp = `${p}.${process.pid}.tmp`;
89
+ const tmp = `${p}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 8)}.tmp`;
80
90
  fs.writeFileSync(tmp, JSON.stringify(data, null, 2));
81
91
  fs.renameSync(tmp, p);
82
92
  }
83
93
 
94
+ // ---------- scheduler-state.json (sidecar) ----------
95
+
96
+ function loadSchedulerState() {
97
+ try {
98
+ const raw = fs.readFileSync(SCHEDULER_STATE_PATH, 'utf8');
99
+ const s = JSON.parse(raw);
100
+ if (s.lastObservedReset) cachedNextReset = s.lastObservedReset;
101
+ if (typeof s.consecutiveFailures === 'number') consecutiveFailures = s.consecutiveFailures;
102
+ if (typeof s.backoffMs === 'number') backoffMs = s.backoffMs;
103
+ if (typeof s.pauseClearedManuallyAt === 'number') pauseClearedManuallyAt = s.pauseClearedManuallyAt;
104
+ if (typeof s.lastPollAt === 'number') lastPollAt = s.lastPollAt;
105
+ } catch { /* first boot or corrupt — start fresh */ }
106
+ }
107
+
108
+ function persistSchedulerState() {
109
+ try {
110
+ atomicWriteJson(SCHEDULER_STATE_PATH, {
111
+ version: 1,
112
+ lastObservedReset: cachedNextReset,
113
+ lastResetObservedAt: cachedNextReset ? Date.now() : null,
114
+ lastPollAt,
115
+ consecutiveFailures,
116
+ backoffMs,
117
+ pausedReason: null,
118
+ pausedSince: null,
119
+ pauseClearedManuallyAt,
120
+ });
121
+ } catch (e) {
122
+ console.warn('[scheduler] failed to persist scheduler state', e?.message);
123
+ }
124
+ }
125
+
126
+ // ---------- heartbeat log ----------
127
+
128
+ function appendHeartbeat(entry) {
129
+ try {
130
+ const line = JSON.stringify(entry) + '\n';
131
+ let size = 0;
132
+ try { size = fs.statSync(HEARTBEAT_PATH).size; } catch { /* new file */ }
133
+ if (size >= HEARTBEAT_MAX_BYTES) {
134
+ const rotated = HEARTBEAT_PATH + '.1';
135
+ try { fs.unlinkSync(rotated); } catch { /* */ }
136
+ try { fs.renameSync(HEARTBEAT_PATH, rotated); } catch { /* */ }
137
+ }
138
+ fs.appendFileSync(HEARTBEAT_PATH, line);
139
+ } catch (e) {
140
+ console.warn('[scheduler] heartbeat write failed', e?.message);
141
+ }
142
+ }
143
+
84
144
  function readQueue() {
85
145
  try {
86
146
  const raw = fs.readFileSync(QUEUE_PATH, 'utf8');
@@ -102,6 +162,25 @@ function writeQueue(state) {
102
162
  atomicWriteJson(QUEUE_PATH, state);
103
163
  }
104
164
 
165
+ // ---------- serialized mutation queue ----------
166
+
167
+ // All read-modify-write operations on queue.json go through mutate() so
168
+ // concurrent job completions in a parallel wave cannot lose each other's
169
+ // status updates. mutateTail is always a resolved promise even when the
170
+ // preceding mutate threw, so the chain never deadlocks.
171
+ let mutateTail = Promise.resolve();
172
+
173
+ function mutate(fn) {
174
+ const next = mutateTail.then(async () => {
175
+ const state = readQueue();
176
+ const ret = await fn(state);
177
+ writeQueue(state);
178
+ return ret;
179
+ });
180
+ mutateTail = next.catch(() => {}); // keep chain alive on errors
181
+ return next;
182
+ }
183
+
105
184
  // ---------- PRD parsing ----------
106
185
 
107
186
  /**
@@ -220,32 +299,41 @@ function reconcile(state) {
220
299
 
221
300
  // ---------- next-reset detection ----------
222
301
 
223
- let cachedNextReset = null;
302
+ let cachedNextReset = null; // bare ISO string or null
224
303
  let cachedUtilization = null; // five_hour utilization %, 0–100, or null if unknown
225
304
 
305
+ /** Fetches latest usage from billing API. Throws on any error — callers handle it. */
226
306
  async function refreshNextReset() {
227
- try {
228
- const r = await billing.fetchUsage();
229
- cachedNextReset = r?.usage?.five_hour?.resets_at ?? null;
230
- cachedUtilization = r?.usage?.five_hour?.utilization ?? cachedUtilization;
231
- return cachedNextReset;
232
- } catch {
233
- return cachedNextReset;
234
- }
307
+ const r = await billing.fetchUsage();
308
+ if (r.kind !== 'ok') throw new Error(`usage fetch failed (${r.kind}): ${r.message ?? ''}`);
309
+ cachedNextReset = r.data?.usage?.five_hour?.resets_at ?? null;
310
+ cachedUtilization = r.data?.usage?.five_hour?.utilization ?? cachedUtilization;
311
+ return cachedNextReset;
235
312
  }
236
313
 
237
314
  function getNextResetCached() {
238
315
  return cachedNextReset;
239
316
  }
240
317
 
318
+ // ---------- health / poll state ----------
319
+
320
+ let bootedAt = Date.now();
321
+ let lastPollAt = null;
322
+ let lastPollOk = false;
323
+ let consecutiveFailures = 0;
324
+ let backoffMs = 0;
325
+ let backoffNextAt = null;
326
+ let firstFailureAt = null;
327
+ let pauseClearedManuallyAt = null;
328
+
241
329
  // ---------- timer ----------
242
330
 
243
331
  let mainWindow = null;
244
332
  let fireTimer = null;
245
333
  let resumeTimer = null;
246
- let pollTimer = null;
334
+ let pollLoopTimer = null;
247
335
  let rescheduleInterval = null;
248
- let initialPollTimeout = null;
336
+ let heartbeatInterval = null;
249
337
  let isExecuting = false;
250
338
  let cancelToken = { cancelled: false };
251
339
  let claudeBinPathCached = null;
@@ -288,65 +376,83 @@ function computeFireAt(state, nextResetIso) {
288
376
 
289
377
  async function rescheduleTimer() {
290
378
  clearFireTimer();
291
- const state = readQueue();
292
- reconcile(state);
293
- const nextResetIso = await refreshNextReset();
294
- const fireAt = computeFireAt(state, nextResetIso);
295
- if (!fireAt) {
296
- state.scheduledFor = null;
297
- writeQueue(state);
298
- broadcast();
299
- return;
379
+ // Wrap in try/catch — on failure use the cached value so the on-reset
380
+ // timer can still be armed from the last known reset.
381
+ let nextResetIso;
382
+ try {
383
+ nextResetIso = await refreshNextReset();
384
+ } catch {
385
+ nextResetIso = cachedNextReset;
300
386
  }
301
-
302
- state.scheduledFor = new Date(fireAt).toISOString();
303
- writeQueue(state);
387
+ const fireAt = await mutate((state) => {
388
+ reconcile(state);
389
+ const fa = computeFireAt(state, nextResetIso);
390
+ state.scheduledFor = fa ? new Date(fa).toISOString() : null;
391
+ return fa;
392
+ });
304
393
  broadcast();
394
+ if (!fireAt) return;
305
395
 
306
396
  const delay = Math.max(1000, fireAt - Date.now());
307
- // setTimeout caps at int32 ms (~24.8 days) — well above our 5h horizon, so
308
- // a single timer is fine. If reset_at is wildly in the future we'd still
309
- // re-anchor on the next billing refresh.
310
397
  fireTimer = setTimeout(() => { runDueJobs().catch(() => {}); }, delay);
311
- console.log(`[scheduler] next fire in ${Math.round(delay / 1000)}s @ ${state.scheduledFor}`);
398
+ console.log(`[scheduler] next fire in ${Math.round(delay / 1000)}s @ ${new Date(fireAt).toISOString()}`);
312
399
  }
313
400
 
314
401
  // ---------- pause / resume ----------
315
402
 
316
- function setPaused(reason, resumeAtIso) {
317
- const s = readQueue();
318
- if (s.paused && s.paused.reason === reason) {
319
- if (resumeAtIso) s.paused.resumeAt = resumeAtIso;
320
- } else {
321
- s.paused = { reason, since: new Date().toISOString(), resumeAt: resumeAtIso || null };
403
+ async function setPaused(reason, resumeAtIso) {
404
+ // Honor manual-override cooldown: if the user cleared a pause within the
405
+ // last 5 minutes, suppress auto-pause re-engagement on the same condition.
406
+ if (pauseClearedManuallyAt && Date.now() - pauseClearedManuallyAt < 300_000) {
407
+ console.log(`[scheduler] setPaused(${reason}) suppressed by manual override cooldown`);
408
+ return;
409
+ }
410
+
411
+ // For 'network' with no explicit resumeAt, auto-resume after 30 minutes.
412
+ let effectiveResumeAt = resumeAtIso;
413
+ if (reason === 'network' && !resumeAtIso) {
414
+ effectiveResumeAt = new Date(Date.now() + 30 * 60_000).toISOString();
322
415
  }
323
- writeQueue(s);
416
+
417
+ await mutate((s) => {
418
+ if (s.paused && s.paused.reason === reason) {
419
+ if (effectiveResumeAt) s.paused.resumeAt = effectiveResumeAt;
420
+ } else {
421
+ s.paused = { reason, since: new Date().toISOString(), resumeAt: effectiveResumeAt || null };
422
+ }
423
+ });
324
424
  broadcast();
325
425
  cancelToken.cancelled = true;
326
426
  if (resumeTimer) { clearTimeout(resumeTimer); resumeTimer = null; }
327
- if (!resumeAtIso) return;
427
+ if (!effectiveResumeAt) return;
428
+
328
429
  // Resume 30s after the reset to give the auth/billing endpoint time to flip.
329
- const delay = Math.max(30_000, new Date(resumeAtIso).getTime() - Date.now() + 30_000);
430
+ const delay = Math.max(30_000, new Date(effectiveResumeAt).getTime() - Date.now() + 30_000);
330
431
  if (delay > 0x7fffffff) {
331
432
  console.warn(`[scheduler] paused (${reason}); resumeAt too far for setTimeout (${delay}ms)`);
332
433
  return;
333
434
  }
334
- resumeTimer = setTimeout(() => {
335
- clearPause('resume-timer');
435
+ resumeTimer = setTimeout(async () => {
436
+ await clearPause('resume-timer');
336
437
  runDueJobs().catch(() => {});
337
438
  }, delay);
338
- console.log(`[scheduler] paused (${reason}); auto-resume in ${Math.round(delay/1000)}s`);
439
+ console.log(`[scheduler] paused (${reason}); auto-resume in ${Math.round(delay / 1000)}s`);
339
440
  }
340
441
 
341
- function clearPause(source) {
442
+ async function clearPause(source) {
342
443
  if (resumeTimer) { clearTimeout(resumeTimer); resumeTimer = null; }
343
- const s = readQueue();
344
- if (s.paused) {
444
+ const wasPaused = await mutate((s) => {
445
+ if (!s.paused) return false;
345
446
  console.log(`[scheduler] clearPause (${source || 'manual'})`);
346
447
  s.paused = null;
347
- writeQueue(s);
348
- broadcast();
448
+ return true;
449
+ });
450
+ // Track manual clears for the auto-pause cooldown.
451
+ if (source === 'manual' || source === 'run-now') {
452
+ pauseClearedManuallyAt = Date.now();
453
+ persistSchedulerState();
349
454
  }
455
+ if (wasPaused) broadcast();
350
456
  }
351
457
 
352
458
  /** Mutate a job in place to "pending" with cleared run metadata. */
@@ -357,6 +463,7 @@ function resetJobFields(job, errorMsg) {
357
463
  job.finishedAt = null;
358
464
  job.exitCode = null;
359
465
  job.error = errorMsg ?? null;
466
+ delete job.runtime;
360
467
  }
361
468
 
362
469
  /** Scan the tail of a job's log for the canonical rate-limit signal. We look
@@ -407,15 +514,34 @@ function pickRunDir() {
407
514
  return { runId: ts, dir };
408
515
  }
409
516
 
410
- async function executeJob(job, runDir, defaultCwd) {
517
+ /**
518
+ * Execute a single PRD job. Writes stdout/stderr to a log file and a meta
519
+ * JSON sidecar. Accepts an optional onPid(pid) callback called synchronously
520
+ * after spawn so callers can persist the pid before the job finishes.
521
+ */
522
+ async function executeJob(job, runDir, defaultCwd, onPid) {
411
523
  const logPath = path.join(runDir, `${job.slug}.log`);
412
524
  const metaPath = path.join(runDir, `${job.slug}.meta.json`);
413
525
  const cwd = job.cwd || defaultCwd;
414
526
  const startedAt = Date.now();
415
527
 
416
528
  const fd = fs.openSync(logPath, 'a');
529
+ let fdClosed = false;
530
+ const closeFd = () => { if (fdClosed) return; fdClosed = true; fs.closeSync(fd); };
531
+
417
532
  fs.writeSync(fd, `[scheduler] starting ${job.slug} at ${new Date().toISOString()}\n[scheduler] cwd=${cwd}\n\n`);
418
533
 
534
+ // Dead-cwd guard: verify the target directory exists and is traversable
535
+ // before handing it to the child process.
536
+ try { fs.accessSync(cwd, fs.constants.X_OK); }
537
+ catch {
538
+ const errMsg = `cwd no longer exists: ${cwd}`;
539
+ fs.writeSync(fd, `[scheduler] ${errMsg}\n`);
540
+ closeFd();
541
+ atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: errMsg, startedAt, finishedAt: Date.now(), durationMs: 0 });
542
+ return { exitCode: -1, durationMs: 0, error: errMsg };
543
+ }
544
+
419
545
  // Read full PRD body fresh from disk (queue stored only the preview).
420
546
  let prompt;
421
547
  try {
@@ -423,37 +549,54 @@ async function executeJob(job, runDir, defaultCwd) {
423
549
  prompt = parsed.body;
424
550
  } catch (e) {
425
551
  fs.writeSync(fd, `[scheduler] failed to read PRD: ${e?.message}\n`);
426
- fs.closeSync(fd);
552
+ closeFd();
427
553
  return { exitCode: -1, durationMs: 0, error: e?.message };
428
554
  }
429
555
 
430
556
  return await new Promise((resolve) => {
431
557
  const claudeBin = resolveClaudeBin();
558
+ // Strip Claude Code env and secrets that leak in when session-manager is
559
+ // launched from a `claude` shell. CLAUDE_EFFORT=xhigh forces Opus and
560
+ // overrides `--model sonnet`, so scheduled jobs burn Opus credits silently.
561
+ const childEnv = cleanChildEnv();
432
562
  const child = spawn(claudeBin, [
433
563
  '-p', prompt,
564
+ '--model', 'sonnet',
434
565
  '--dangerously-skip-permissions',
435
566
  '--output-format', 'stream-json',
436
567
  '--verbose',
437
568
  ], {
438
569
  cwd,
439
- env: process.env,
570
+ env: childEnv,
440
571
  stdio: ['ignore', fd, fd],
441
572
  });
442
573
 
443
574
  fs.writeSync(fd, `[scheduler] spawned pid=${child.pid}\n\n`);
444
575
 
576
+ // Fire-and-forget pid persistence — best effort.
577
+ if (onPid) onPid(child.pid).catch(() => {});
578
+
579
+ // Kill the child if it runs past the maximum allowed duration.
580
+ const watchdog = setTimeout(() => {
581
+ fs.writeSync(fd, `\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
582
+ try { child.kill('SIGKILL'); } catch { /* already dead */ }
583
+ }, MAX_JOB_DURATION_MS);
584
+ if (watchdog.unref) watchdog.unref();
585
+
445
586
  child.on('error', (err) => {
587
+ clearTimeout(watchdog);
446
588
  const durationMs = Date.now() - startedAt;
447
- fs.writeSync(fd, `\n[scheduler] spawn error: ${err.message}\n`);
448
- fs.closeSync(fd);
589
+ fs.writeSync(fd, `\n[scheduler] error: ${err.message}\n`);
590
+ closeFd();
449
591
  atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: err.message, startedAt, finishedAt: Date.now(), durationMs });
450
592
  resolve({ exitCode: -1, durationMs, error: err.message });
451
593
  });
452
594
 
453
595
  child.on('exit', (code) => {
596
+ clearTimeout(watchdog);
454
597
  const durationMs = Date.now() - startedAt;
455
598
  fs.writeSync(fd, `\n[scheduler] exit code=${code} duration=${Math.round(durationMs / 1000)}s\n`);
456
- fs.closeSync(fd);
599
+ closeFd();
457
600
  const rateLimited = code !== 0 && detectRateLimitInLog(logPath);
458
601
  atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: code, rateLimited, startedAt, finishedAt: Date.now(), durationMs });
459
602
  resolve({ exitCode: code, durationMs, rateLimited });
@@ -477,7 +620,6 @@ async function runDueJobs() {
477
620
  return;
478
621
  }
479
622
  const { runId, dir: runDir } = pickRunDir();
480
- state.lastRunAt = new Date().toISOString();
481
623
 
482
624
  // Group by parallelGroup, ascending. Each group runs serially after the
483
625
  // previous group completes.
@@ -489,7 +631,7 @@ async function runDueJobs() {
489
631
  }
490
632
  const groupKeys = Array.from(groups.keys()).sort((a, b) => a - b);
491
633
 
492
- writeQueue(state);
634
+ await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
493
635
  broadcast();
494
636
 
495
637
  for (const gk of groupKeys) {
@@ -501,41 +643,59 @@ async function runDueJobs() {
501
643
  const inFlight = new Set();
502
644
 
503
645
  const launch = (job) => {
504
- const s = readQueue();
505
- const idx = s.jobs.findIndex((x) => x.slug === job.slug);
506
- if (idx >= 0) {
507
- s.jobs[idx].status = 'running';
508
- s.jobs[idx].runId = runId;
509
- s.jobs[idx].startedAt = new Date().toISOString();
510
- writeQueue(s);
511
- broadcast();
512
- }
513
- const promise = executeJob(job, runDir, state.config.defaultCwd).then(async (res) => {
514
- // Rate-limit OR pause-already-set means we treat this job as
515
- // unfinished — bounce it back to 'pending' so the next run
516
- // (after token reset) picks it up.
517
- if (res.rateLimited) {
518
- const resetIso = await refreshNextReset();
519
- setPaused('rate_limit', resetIso);
520
- }
521
- const sn = readQueue();
522
- const i2 = sn.jobs.findIndex((x) => x.slug === job.slug);
523
- if (i2 >= 0) {
524
- const treatAsPending = res.rateLimited || (sn.paused && sn.paused.reason === 'rate_limit');
525
- if (treatAsPending) {
526
- resetJobFields(sn.jobs[i2], res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
527
- } else {
528
- sn.jobs[i2].status = res.exitCode === 0 ? 'completed' : 'failed';
529
- sn.jobs[i2].finishedAt = new Date().toISOString();
530
- sn.jobs[i2].exitCode = res.exitCode;
531
- sn.jobs[i2].error = res.error || null;
646
+ const promise = (async () => {
647
+ try {
648
+ // Mark job running.
649
+ await mutate((s) => {
650
+ const idx = s.jobs.findIndex((x) => x.slug === job.slug);
651
+ if (idx >= 0) {
652
+ s.jobs[idx].status = 'running';
653
+ s.jobs[idx].runId = runId;
654
+ s.jobs[idx].startedAt = new Date().toISOString();
655
+ }
656
+ });
657
+ broadcast();
658
+
659
+ // Execute — onPid persists the child PID into the running state.
660
+ const res = await executeJob(job, runDir, state.config.defaultCwd, async (pid) => {
661
+ await mutate((s) => {
662
+ const idx = s.jobs.findIndex((x) => x.slug === job.slug);
663
+ if (idx >= 0) {
664
+ s.jobs[idx].runtime = { pid, runId, startedAt: s.jobs[idx].startedAt };
665
+ }
666
+ });
667
+ });
668
+
669
+ // Rate-limit: pause before writing terminal status so the status
670
+ // mutate below can read the pause state.
671
+ if (res.rateLimited) {
672
+ const resetIso = await refreshNextReset().catch(() => cachedNextReset);
673
+ await setPaused('rate_limit', resetIso);
532
674
  }
533
- writeQueue(sn);
675
+
676
+ // Write terminal status; strip runtime regardless of outcome.
677
+ await mutate((s) => {
678
+ const i2 = s.jobs.findIndex((x) => x.slug === job.slug);
679
+ if (i2 >= 0) {
680
+ const treatAsPending = res.rateLimited || (s.paused && s.paused.reason === 'rate_limit');
681
+ if (treatAsPending) {
682
+ resetJobFields(s.jobs[i2], res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
683
+ } else {
684
+ s.jobs[i2].status = res.exitCode === 0 ? 'completed' : 'failed';
685
+ s.jobs[i2].finishedAt = new Date().toISOString();
686
+ s.jobs[i2].exitCode = res.exitCode;
687
+ s.jobs[i2].error = res.error || null;
688
+ delete s.jobs[i2].runtime;
689
+ }
690
+ }
691
+ });
534
692
  broadcast();
693
+ } catch (e) {
694
+ console.error('[scheduler] launch error', job.slug, e);
535
695
  }
536
- inFlight.delete(promise);
537
- });
696
+ })();
538
697
  inFlight.add(promise);
698
+ promise.then(() => inFlight.delete(promise), () => inFlight.delete(promise));
539
699
  };
540
700
 
541
701
  // Prime up to cap
@@ -555,45 +715,97 @@ async function runDueJobs() {
555
715
  isExecuting = false;
556
716
  // No longer auto-disable after a run. The firePolicy now governs whether
557
717
  // the next batch fires automatically. Just clear the one-shot scheduledFor.
558
- const s = readQueue();
559
- s.scheduledFor = null;
560
- writeQueue(s);
718
+ await mutate((s) => { s.scheduledFor = null; });
561
719
  broadcast();
562
720
  }
563
721
  }
564
722
 
565
- // ---------- when-available poll loop ----------
723
+ // ---------- when-available launch logic ----------
566
724
 
567
- async function pollWhenAvailable() {
725
+ async function maybeLaunchWhenAvailable(state) {
726
+ if (state.config.firePolicy !== 'when-available') return;
727
+ if (state.paused) return;
728
+ if (isExecuting) return;
729
+ const pending = state.jobs.filter((j) => j.status === 'pending');
730
+ if (pending.length === 0) return;
731
+ if (cachedUtilization === null || cachedUtilization === undefined) return;
732
+ if (cachedUtilization >= state.config.utilizationThreshold) {
733
+ broadcast();
734
+ return;
735
+ }
736
+ console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pending.length} pending — firing`);
737
+ runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
738
+ }
739
+
740
+ // ---------- poll loop with exponential backoff ----------
741
+
742
+ async function pollLoop() {
568
743
  try {
569
- const state = readQueue();
570
- if (state.config.firePolicy !== 'when-available') return;
571
- if (state.paused) return;
572
- if (isExecuting) return;
573
- const pending = state.jobs.filter((j) => j.status === 'pending');
574
- if (pending.length === 0) return;
744
+ const r = await billing.fetchUsage();
575
745
 
576
- // Refresh utilization. If the call fails, don't fire blindly — wait
577
- // for the next tick.
578
- let util;
579
- try {
580
- const r = await billing.fetchUsage();
581
- util = r?.usage?.five_hour?.utilization ?? null;
582
- cachedUtilization = util;
583
- cachedNextReset = { at: r?.usage?.five_hour?.resets_at ?? cachedNextReset.at, fetchedAt: Date.now() };
584
- } catch {
585
- return;
586
- }
587
- if (util === null || util === undefined) return;
588
- if (util >= state.config.utilizationThreshold) {
589
- // Tokens too high — broadcast so the UI shows current util but don't fire.
746
+ if (r.kind === 'ok') {
747
+ cachedNextReset = r.data?.usage?.five_hour?.resets_at ?? cachedNextReset;
748
+ cachedUtilization = r.data?.usage?.five_hour?.utilization ?? cachedUtilization;
749
+ consecutiveFailures = 0;
750
+ backoffMs = 0;
751
+ backoffNextAt = null;
752
+ firstFailureAt = null;
753
+ lastPollAt = Date.now();
754
+ lastPollOk = true;
755
+ persistSchedulerState();
756
+
757
+ // If a 'network' pause resolved, clear it now that we have a good reading.
758
+ const cur = readQueue();
759
+ if (cur.paused?.reason === 'network') {
760
+ await clearPause('network-recovered');
761
+ }
762
+ // If 'reset_failure' was set and we now have a valid reset, clear it.
763
+ if (cur.paused?.reason === 'reset_failure' && cachedNextReset) {
764
+ await clearPause('reset-recovered');
765
+ }
766
+
767
+ await maybeLaunchWhenAvailable(cur);
590
768
  broadcast();
591
- return;
769
+ } else {
770
+ lastPollAt = Date.now();
771
+ lastPollOk = false;
772
+ consecutiveFailures++;
773
+ if (!firstFailureAt) firstFailureAt = Date.now();
774
+
775
+ if (r.kind === 'auth') {
776
+ console.error(`[scheduler] auth failure (HTTP ${r.httpStatus}): ${r.message}`);
777
+ await setPaused('auth', null);
778
+ } else {
779
+ // transient or config — apply exponential backoff.
780
+ backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
781
+ const totalFailureMs = Date.now() - firstFailureAt;
782
+ console.log(`[scheduler] transient failure #${consecutiveFailures}: ${r.kind} ${r.message ?? ''}; retry in ${backoffMs / 1000}s`);
783
+
784
+ // After 30 minutes of consecutive failures, set 'network' pause.
785
+ if (totalFailureMs > 30 * 60_000) {
786
+ const cur2 = readQueue();
787
+ if (!cur2.paused || cur2.paused.reason === 'network') {
788
+ await setPaused('network', null);
789
+ }
790
+ }
791
+ }
792
+
793
+ backoffNextAt = Date.now() + backoffMs;
794
+ persistSchedulerState();
592
795
  }
593
- console.log(`[scheduler] when-available: util=${util}%, ${pending.length} pending — firing`);
594
- runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
595
796
  } catch (e) {
596
- console.error('[scheduler] poll error', e);
797
+ // Unexpected error (e.g., IPC transport failure)
798
+ lastPollAt = Date.now();
799
+ lastPollOk = false;
800
+ consecutiveFailures++;
801
+ if (!firstFailureAt) firstFailureAt = Date.now();
802
+ backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
803
+ backoffNextAt = Date.now() + backoffMs;
804
+ persistSchedulerState();
805
+ } finally {
806
+ const delay = backoffMs || POLL_INTERVAL_MS;
807
+ pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, delay);
808
+ if (pollLoopTimer.unref) pollLoopTimer.unref();
597
809
  }
598
810
  }
599
811
 
@@ -618,44 +830,85 @@ function registerScheduleHandlers() {
618
830
  };
619
831
  });
620
832
 
621
- ipcMain.handle('schedule:set-config', async (_e, partial) => {
833
+ ipcMain.handle('schedule:health', async () => {
622
834
  const state = readQueue();
623
- state.config = { ...state.config, ...(partial || {}) };
624
- if (typeof state.config.concurrencyCap === 'number') {
625
- state.config.concurrencyCap = Math.max(1, Math.min(20, Math.floor(state.config.concurrencyCap)));
835
+ const runningJobs = [];
836
+ for (const j of state.jobs) {
837
+ if (j.status === 'running' && j.runtime) {
838
+ runningJobs.push({
839
+ slug: j.slug,
840
+ startedAt: j.startedAt ? Date.parse(j.startedAt) : 0,
841
+ pid: j.runtime.pid ?? 0,
842
+ });
843
+ }
626
844
  }
627
- if (typeof state.config.offsetMinutes === 'number') {
628
- state.config.offsetMinutes = Math.max(0, Math.min(180, Math.floor(state.config.offsetMinutes)));
845
+ return {
846
+ bootedAt,
847
+ lastPollAt,
848
+ lastPollOk,
849
+ consecutiveFailures,
850
+ backoffNextAt,
851
+ nextResetCached: cachedNextReset,
852
+ pausedSince: state.paused ? Date.parse(state.paused.since) : null,
853
+ pauseReason: state.paused?.reason ?? null,
854
+ runningJobs,
855
+ };
856
+ });
857
+
858
+ ipcMain.handle('schedule:set-config', async (_e, partial) => {
859
+ const { schemas: s } = require('./ipcSchemas.cjs');
860
+ let validated;
861
+ try {
862
+ validated = s.setConfigSchema.parse(partial || {});
863
+ } catch (e) {
864
+ return { ok: false, error: e?.message ?? 'invalid config' };
629
865
  }
630
- writeQueue(state);
866
+ const config = await mutate((state) => {
867
+ state.config = { ...state.config, ...validated };
868
+ return state.config;
869
+ });
631
870
  await rescheduleTimer();
632
- return { ok: true, config: state.config };
871
+ return { ok: true, config };
633
872
  });
634
873
 
635
- ipcMain.handle('schedule:reset-job', async (_e, { slug }) => {
636
- const state = readQueue();
637
- const idx = state.jobs.findIndex((j) => j.slug === slug);
638
- if (idx < 0) return { ok: false, error: 'not found' };
639
- resetJobFields(state.jobs[idx]);
640
- writeQueue(state);
874
+ ipcMain.handle('schedule:reset-job', async (_e, payload) => {
875
+ const { schemas: s } = require('./ipcSchemas.cjs');
876
+ let slug;
877
+ try {
878
+ ({ slug } = s.scheduleSlug.parse(payload));
879
+ } catch (e) {
880
+ return { ok: false, error: 'invalid slug' };
881
+ }
882
+ // Containment check after path.join.
883
+ const resolved = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
884
+ if (!resolved.startsWith(PRDS_DIR + path.sep)) {
885
+ return { ok: false, error: 'invalid slug' };
886
+ }
887
+ const found = await mutate((state) => {
888
+ const idx = state.jobs.findIndex((j) => j.slug === slug);
889
+ if (idx < 0) return false;
890
+ resetJobFields(state.jobs[idx]);
891
+ return true;
892
+ });
893
+ if (!found) return { ok: false, error: 'not found' };
641
894
  broadcast();
642
895
  return { ok: true };
643
896
  });
644
897
 
645
898
  ipcMain.handle('schedule:run-now', async () => {
646
899
  // Manual run-now overrides any auto-pause. Clear it first.
647
- clearPause('run-now');
900
+ await clearPause('run-now');
648
901
  runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
649
902
  return { ok: true };
650
903
  });
651
904
 
652
905
  ipcMain.handle('schedule:resume', async () => {
653
- clearPause('manual');
906
+ await clearPause('manual');
654
907
  return { ok: true };
655
908
  });
656
909
 
657
910
  ipcMain.handle('schedule:refresh-reset', async () => {
658
- const at = await refreshNextReset();
911
+ const at = await refreshNextReset().catch(() => cachedNextReset);
659
912
  await rescheduleTimer();
660
913
  return { ok: true, nextReset: at };
661
914
  });
@@ -666,39 +919,123 @@ function registerScheduleHandlers() {
666
919
  return { ok: true };
667
920
  });
668
921
 
669
/**
 * IPC `schedule:read-prd`: return the text of `<slug>.md` from PRDS_DIR.
 *
 * @param {unknown} payload - expected to carry a `slug` field; validated with
 *   the `scheduleSlug` schema.
 * @returns {Promise<{ok: true, text: string} | {ok: false, error: string}>}
 *   `'invalid slug'` when validation or the containment check fails; otherwise
 *   the read error's message when the file cannot be read.
 */
ipcMain.handle('schedule:read-prd', async (_e, payload) => {
  const { schemas } = require('./ipcSchemas.cjs');

  let slug;
  try {
    slug = schemas.scheduleSlug.parse(payload).slug;
  } catch {
    return { ok: false, error: 'invalid slug' };
  }

  // Resolve first, then verify the result still lives under PRDS_DIR.
  const prdPath = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
  const insidePrds = prdPath.startsWith(PRDS_DIR + path.sep);
  if (!insidePrds) {
    return { ok: false, error: 'invalid slug' };
  }

  let text;
  try {
    text = await fsp.readFile(prdPath, 'utf8');
  } catch (e) {
    return { ok: false, error: e?.message };
  }
  return { ok: true, text };
});
677
941
 
678
/**
 * IPC `schedule:read-log`: return the text of `<runId>/<slug>.log` from RUNS_DIR.
 *
 * @param {unknown} payload - expected to carry `slug` and `runId`; validated
 *   with the `scheduleReadLog` schema.
 * @returns {Promise<{ok: true, text: string} | {ok: false, error: string}>}
 *   `'invalid slug or runId'` when validation or the containment check fails;
 *   otherwise the read error's message when the file cannot be read.
 */
ipcMain.handle('schedule:read-log', async (_e, payload) => {
  const { schemas } = require('./ipcSchemas.cjs');

  let slug;
  let runId;
  try {
    ({ slug, runId } = schemas.scheduleReadLog.parse(payload));
  } catch {
    return { ok: false, error: 'invalid slug or runId' };
  }

  // Neither component may steer the resolved path outside RUNS_DIR.
  const candidate = path.resolve(path.join(RUNS_DIR, runId, `${slug}.log`));
  const insideRuns = candidate.startsWith(RUNS_DIR + path.sep);
  if (!insideRuns) {
    return { ok: false, error: 'invalid slug or runId' };
  }

  let text;
  try {
    text = await fsp.readFile(candidate, 'utf8');
  } catch (e) {
    return { ok: false, error: e?.message };
  }
  return { ok: true, text };
});
961
+
962
// Upper bound on the UTF-8 size of a PRD body accepted over IPC.
const PRD_WRITE_MAX_BYTES = 256 * 1024;
// Slugs become file names, so only a conservative character set is accepted.
const SLUG_RE = /^[A-Za-z0-9._-]{1,128}$/;

/**
 * IPC `schedule:write-prd`: write `body` to `<slug>.md` inside PRDS_DIR.
 *
 * The body goes to a temp file next to the target and is renamed into place,
 * so readers never see a half-written PRD. The temp file is removed if either
 * step fails.
 *
 * @param {{slug: string, body: string}} payload
 * @returns {Promise<{ok: true, bytesWritten: number}>} UTF-8 byte length of the
 *   written body.
 * @throws {Error} `invalid slug: …`, `body must be string`, `body too large`,
 *   or `path escape` on bad input; file-system errors propagate unchanged.
 */
ipcMain.handle('schedule:write-prd', async (_e, payload) => {
  // Tolerate a missing payload so the caller gets a validation error below
  // instead of a destructuring TypeError.
  const { slug, body } = payload ?? {};

  // The typeof guard matters: RegExp#test stringifies its argument, so an
  // absent slug would otherwise match as "undefined" and write undefined.md.
  if (typeof slug !== 'string' || !SLUG_RE.test(slug)) {
    throw new Error(`invalid slug: ${String(slug)}`);
  }
  if (typeof body !== 'string') throw new Error('body must be string');

  const byteLength = Buffer.byteLength(body, 'utf8');
  if (byteLength > PRD_WRITE_MAX_BYTES) throw new Error('body too large');

  // Containment check after path.join, matching the read handlers.
  const resolved = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
  if (!resolved.startsWith(PRDS_DIR + path.sep)) throw new Error('path escape');

  // Same precaution list-prds takes before touching PRDS_DIR.
  ensureDirs();

  const tmp = `${resolved}.${process.pid}.${Date.now()}.tmp`;
  try {
    await fsp.writeFile(tmp, body, { encoding: 'utf8', mode: 0o644 });
    await fsp.rename(tmp, resolved);
  } catch (err) {
    // Best-effort cleanup so failed writes do not litter PRDS_DIR; the
    // original error is what the caller needs to see.
    await fsp.unlink(tmp).catch(() => {});
    throw err;
  }

  // The byte length is already known; a stat() after the rename would only
  // add a syscall and a window for a concurrent writer to change the answer.
  return { ok: true, bytesWritten: byteLength };
});
978
+
979
/**
 * IPC `schedule:list-prds`: summarize every non-hidden `.md` file in PRDS_DIR.
 *
 * Each file is run through parsePrd() and stat(); a file for which either step
 * fails is logged and left out. The result is sorted by slug using a
 * numeric-aware comparison.
 *
 * @returns {Promise<Array<{slug: *, parallelGroup: *, title: *, cwd: *,
 *   estimateMinutes: *, mtimeMs: number}>>} empty when PRDS_DIR cannot be read.
 */
ipcMain.handle('schedule:list-prds', async () => {
  ensureDirs();

  let names;
  try {
    names = await fsp.readdir(PRDS_DIR);
  } catch {
    return [];
  }

  // Keep only visible markdown files.
  const prdNames = names.filter((name) => name.endsWith('.md') && !name.startsWith('.'));

  const summaries = [];
  for (const name of prdNames) {
    const prdPath = path.join(PRDS_DIR, name);
    try {
      const prd = parsePrd(prdPath);
      const { mtimeMs } = await fsp.stat(prdPath);
      const summary = {
        slug: prd.slug,
        parallelGroup: prd.parallelGroup,
        title: prd.title,
        cwd: prd.cwd || '',
        estimateMinutes: prd.estimateMinutes,
        mtimeMs,
      };
      summaries.push(summary);
    } catch (e) {
      // One bad file must not hide the rest of the list.
      console.warn('[scheduler] list-prds: skipping unparseable', name, e?.message);
    }
  }

  const bySlug = (left, right) => left.slug.localeCompare(right.slug, undefined, { numeric: true });
  summaries.sort(bySlug);
  return summaries;
});
687
1009
  }
688
1010
 
689
1011
  async function init() {
690
1012
  ensureDirs();
691
- // Ensure queue.json exists with defaults so the renderer can read it.
692
- if (!fs.existsSync(QUEUE_PATH)) writeQueue({ config: { ...DEFAULT_CONFIG }, jobs: [], scheduledFor: null, lastRunAt: null, paused: null });
1013
+
1014
+ // Hydrate cached state from the sidecar before any scheduling decisions.
1015
+ loadSchedulerState();
1016
+ bootedAt = Date.now();
1017
+
1018
+ // Boot reconciliation: mark any job that was 'running' when the app died as
1019
+ // 'failed'. mutate() creates queue.json from defaults if it doesn't exist.
1020
+ await mutate((state) => {
1021
+ for (const j of state.jobs) {
1022
+ if (j.status === 'running') {
1023
+ j.status = 'failed';
1024
+ j.error = 'orphaned: app restarted while running';
1025
+ j.finishedAt = new Date().toISOString();
1026
+ delete j.runtime;
1027
+ }
1028
+ }
1029
+ });
693
1030
 
694
1031
  // If we boot up while paused with a resumeAt in the past, clear it. This
695
1032
  // happens when the app was closed across the reset window.
696
1033
  const boot = readQueue();
697
1034
  if (boot.paused && boot.paused.resumeAt && new Date(boot.paused.resumeAt).getTime() <= Date.now()) {
698
- clearPause('boot-elapsed');
1035
+ await clearPause('boot-elapsed');
699
1036
  } else if (boot.paused && boot.paused.resumeAt) {
700
1037
  // Re-arm the resume timer (lost across restart).
701
- setPaused(boot.paused.reason, boot.paused.resumeAt);
1038
+ await setPaused(boot.paused.reason, boot.paused.resumeAt);
702
1039
  }
703
1040
 
704
1041
  await rescheduleTimer();
@@ -706,13 +1043,49 @@ async function init() {
706
1043
  // resets early or the auth token rotates. Tracked so re-init doesn't leak.
707
1044
  if (rescheduleInterval) clearInterval(rescheduleInterval);
708
1045
  rescheduleInterval = setInterval(() => { rescheduleTimer().catch(() => {}); }, 10 * 60_000);
709
- // when-available poll loop. Tick every 2 minutes; the function itself is
710
- // a no-op when policy != 'when-available' or queue is empty/paused.
711
- if (pollTimer) clearInterval(pollTimer);
712
- pollTimer = setInterval(() => { pollWhenAvailable().catch(() => {}); }, 2 * 60_000);
713
- // First tick fires after a short delay so billing is warmed up.
714
- if (initialPollTimeout) clearTimeout(initialPollTimeout);
715
- initialPollTimeout = setTimeout(() => { pollWhenAvailable().catch(() => {}); }, 15_000);
1046
+
1047
+ // Self-rescheduling poll loop with exponential backoff. Replaces the
1048
+ // old fixed-interval pollTimer + initialPollTimeout.
1049
+ if (pollLoopTimer) clearTimeout(pollLoopTimer);
1050
+ // First tick fires after the standard warmup delay so billing is ready.
1051
+ pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
1052
+ if (pollLoopTimer.unref) pollLoopTimer.unref();
1053
+
1054
+ // Heartbeat: once per minute, log queue state for 24h visibility.
1055
+ if (heartbeatInterval) clearInterval(heartbeatInterval);
1056
+ heartbeatInterval = setInterval(() => {
1057
+ const s = readQueue();
1058
+ const counts = { pending: 0, running: 0, completed: 0, failed: 0 };
1059
+ for (const j of s.jobs) counts[j.status] = (counts[j.status] || 0) + 1;
1060
+ appendHeartbeat({
1061
+ ts: Date.now(),
1062
+ counts,
1063
+ paused: s.paused ? { reason: s.paused.reason, resumeAt: s.paused.resumeAt } : null,
1064
+ nextReset: cachedNextReset,
1065
+ utilization: cachedUtilization,
1066
+ consecutiveFailures,
1067
+ });
1068
+ }, 60_000);
1069
+ if (heartbeatInterval.unref) heartbeatInterval.unref();
1070
+
1071
+ // Wake-from-sleep: immediately re-poll and re-evaluate the queue.
1072
+ try {
1073
+ const { powerMonitor } = require('electron');
1074
+ powerMonitor.on('resume', () => {
1075
+ console.log('[scheduler] system resumed; re-polling and re-evaluating queue');
1076
+ if (pollLoopTimer) { clearTimeout(pollLoopTimer); pollLoopTimer = null; }
1077
+ backoffMs = 0;
1078
+ backoffNextAt = null;
1079
+ // Clear any paused-but-resumeAt-elapsed state immediately.
1080
+ const wakeState = readQueue();
1081
+ if (wakeState.paused?.resumeAt && new Date(wakeState.paused.resumeAt).getTime() <= Date.now()) {
1082
+ clearPause('boot-elapsed').then(() => { runDueJobs().catch(() => {}); }).catch(() => {});
1083
+ }
1084
+ pollLoop().catch(() => {});
1085
+ });
1086
+ } catch (e) {
1087
+ console.warn('[scheduler] powerMonitor unavailable', e?.message);
1088
+ }
716
1089
  }
717
1090
 
718
1091
  module.exports = { registerScheduleHandlers, attachWindow, init, ROOT, PRDS_DIR };