claude-code-session-manager 0.8.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +95 -65
  2. package/dist/assets/{cssMode-DBg6nxUL.js → cssMode-DWlBzlpW.js} +1 -1
  3. package/dist/assets/{freemarker2-CyjUGY3f.js → freemarker2-Cgg83m-Z.js} +1 -1
  4. package/dist/assets/{handlebars-lhtCWqlB.js → handlebars-C4r4LOI9.js} +1 -1
  5. package/dist/assets/{html-egptHwbZ.js → html-DaxRI5sW.js} +1 -1
  6. package/dist/assets/htmlMode-Bu_8jtXo.js +1 -0
  7. package/dist/assets/{index-DjeqNwqn.js → index-C_tgFedf.js} +1115 -1081
  8. package/dist/assets/{index-DnLtSCQS.css → index-Dj3Db4OA.css} +1 -1
  9. package/dist/assets/{javascript-tZbiID3O.js → javascript-D5Ztx-Ej.js} +1 -1
  10. package/dist/assets/{jsonMode-BGtPN-L-.js → jsonMode-tfsgezVc.js} +1 -1
  11. package/dist/assets/{liquid-DvTeXhev.js → liquid-F2cD9OL0.js} +1 -1
  12. package/dist/assets/{lspLanguageFeatures-D9xoxVlV.js → lspLanguageFeatures-Bz_Eih8F.js} +2 -2
  13. package/dist/assets/{mdx-BQ3Ja4wM.js → mdx-BPlD1clX.js} +1 -1
  14. package/dist/assets/{ort-wasm-simd-threaded.asyncify-CtKKja6V.wasm → ort-wasm-simd-threaded.asyncify-DMmc6YqF.wasm} +0 -0
  15. package/dist/assets/{python-C71RWXaP.js → python-B4gUOWNI.js} +1 -1
  16. package/dist/assets/{razor-w__Mkyns.js → razor-B6pMxVp1.js} +1 -1
  17. package/dist/assets/{tsMode-DOQLQDB3.js → tsMode-C9nq6cHi.js} +1 -1
  18. package/dist/assets/{typescript-DEiub2Jt.js → typescript-Do5Vtwxu.js} +1 -1
  19. package/dist/assets/{whisperWorker-QfIS0sPF.js → whisperWorker-CcsPqZUS.js} +19 -19
  20. package/dist/assets/{xml-RXkLQscS.js → xml-C0mTbVRp.js} +1 -1
  21. package/dist/assets/{yaml-C8HIpJku.js → yaml-D3sePJfA.js} +1 -1
  22. package/dist/index.html +2 -2
  23. package/package.json +18 -10
  24. package/screenshots/.gitkeep +0 -0
  25. package/screenshots/README-screenshots.md +13 -0
  26. package/src/main/config.cjs +47 -9
  27. package/src/main/historyAggregator.cjs +10 -5
  28. package/src/main/index.cjs +85 -14
  29. package/src/main/ipcSchemas.cjs +165 -3
  30. package/src/main/lib/claudeBin.cjs +39 -0
  31. package/src/main/lib/encodeCwd.cjs +19 -0
  32. package/src/main/lib/fileTail.cjs +35 -0
  33. package/src/main/lib/insideHome.cjs +38 -0
  34. package/src/main/lib/prdFrontmatter.cjs +51 -0
  35. package/src/main/lib/sendToRenderer.cjs +21 -0
  36. package/src/main/memoryTool.cjs +203 -0
  37. package/src/main/otelSettings.cjs +2 -7
  38. package/src/main/pluginInstall.cjs +129 -0
  39. package/src/main/pty.cjs +13 -29
  40. package/src/main/queueOps.cjs +404 -0
  41. package/src/main/scheduler/prdParser.cjs +135 -0
  42. package/src/main/scheduler.cjs +291 -250
  43. package/src/main/sessionsStore.cjs +2 -6
  44. package/src/main/supervisor.cjs +3 -35
  45. package/src/main/teams.cjs +95 -0
  46. package/src/main/transcripts.cjs +5 -7
  47. package/src/main/usage.cjs +8 -0
  48. package/src/main/voiceHotkey.cjs +13 -9
  49. package/src/main/voiceSettings.cjs +2 -9
  50. package/src/main/voiceWizard.cjs +4 -11
  51. package/src/main/watchers.cjs +18 -42
  52. package/src/preload/api.d.ts +153 -1
  53. package/src/preload/index.cjs +29 -0
  54. package/dist/assets/htmlMode-tPDeHGOB.js +0 -1
@@ -45,10 +45,17 @@ const fsp = require('node:fs/promises');
45
45
  const path = require('node:path');
46
46
  const os = require('node:os');
47
47
  const { spawn } = require('node:child_process');
48
+ const { randomUUID } = require('node:crypto');
48
49
  const { ipcMain } = require('electron');
49
50
  const billing = require('./usage.cjs');
50
51
  const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
51
52
  const supervisor = require('./supervisor.cjs');
53
+ const { resolveClaudeBin } = require('./lib/claudeBin.cjs');
54
+ const { readTail } = require('./lib/fileTail.cjs');
55
+ const { sendIfAlive } = require('./lib/sendToRenderer.cjs');
56
+ const prdParser = require('./scheduler/prdParser.cjs');
57
+ const logs = require('./logs.cjs');
58
+ const { schemas } = require('./ipcSchemas.cjs');
52
59
  const {
53
60
  POLL_INTERVAL_MS,
54
61
  USAGE_REFRESH_INTERVAL_MS,
@@ -95,8 +102,6 @@ const ENV_CAP = process.env.SM_SCHEDULER_MAX_CONCURRENCY
95
102
  : null;
96
103
 
97
104
  const DEFAULT_CONFIG = {
98
- // Legacy on/off retained for backwards compat; v0.5+ uses firePolicy.
99
- enabled: false,
100
105
  offsetMinutes: 15,
101
106
  concurrencyCap: ENV_CAP ?? 4,
102
107
  defaultCwd: DEFAULT_PROJECT_CWD,
@@ -117,16 +122,31 @@ const DEFAULT_CONFIG = {
117
122
 
118
123
  // ---------- fs helpers ----------
119
124
 
125
+ /**
126
+ * Resolve PRDS_DIR/<slug>.md and enforce path containment. Returns the
127
+ * absolute path on success, null on slug-escape attempts. The zod schema
128
+ * for slugs already blocks `..` because the SLUG_RE excludes `/`, but
129
+ * defense-in-depth: a second containment check after path.resolve costs
130
+ * nothing and catches future regex laxity.
131
+ */
132
+ function safeSlugPath(slug) {
133
+ const resolved = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
134
+ if (!resolved.startsWith(PRDS_DIR + path.sep)) return null;
135
+ return resolved;
136
+ }
137
+
120
138
  function ensureDirs() {
121
139
  fs.mkdirSync(PRDS_DIR, { recursive: true });
122
140
  fs.mkdirSync(RUNS_DIR, { recursive: true });
123
141
  }
124
142
 
125
- function atomicWriteJson(p, data) {
126
- const tmp = `${p}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 8)}.tmp`;
127
- fs.writeFileSync(tmp, JSON.stringify(data, null, 2));
128
- fs.renameSync(tmp, p);
129
- }
143
+ // Atomic JSON write helpers delegate to config.cjs's shared implementation.
144
+ // Sync variant is required for the executeJob exit handler (Promise resolver
145
+ // callback that must flush meta.json before resolving) — replacing with async
146
+ // would deadlock the exit path.
147
+ const config = require('./config.cjs');
148
+ const atomicWriteJsonSync = (p, data) => config.writeJsonSync(p, data);
149
+ const atomicWriteJson = (p, data) => config.writeJson(p, data);
130
150
 
131
151
  // ---------- scheduler-state.json (sidecar) ----------
132
152
 
@@ -143,8 +163,12 @@ function loadSchedulerState() {
143
163
  }
144
164
 
145
165
  function persistSchedulerState() {
166
+ // Sync write: called from many sync hot paths (clearPause, pollLoop catch
167
+ // block) and the sidecar is tiny (<1 KB). Converting to async here would
168
+ // require threading awaits through pause/resume bookkeeping for negligible
169
+ // benefit — the file is well under one page.
146
170
  try {
147
- atomicWriteJson(SCHEDULER_STATE_PATH, {
171
+ atomicWriteJsonSync(SCHEDULER_STATE_PATH, {
148
172
  version: 1,
149
173
  lastObservedReset: cachedNextReset,
150
174
  lastResetObservedAt: cachedNextReset ? Date.now() : null,
@@ -178,7 +202,10 @@ function appendHeartbeat(entry) {
178
202
  }
179
203
  }
180
204
 
181
- function readQueue() {
205
+ // Sync queue read — passed to the supervisor module (which calls it from
206
+ // supervisorTick / applyAction with no await) and the heartbeat interval.
207
+ // IPC handlers and mutate() use readQueue (async) below.
208
+ function readQueueSync() {
182
209
  try {
183
210
  const raw = fs.readFileSync(QUEUE_PATH, 'utf8');
184
211
  const data = JSON.parse(raw);
@@ -194,9 +221,28 @@ function readQueue() {
194
221
  }
195
222
  }
196
223
 
197
- function writeQueue(state) {
224
+ // Async queue read — used on all IPC hot paths. Reading queue.json sync was
225
+ // blocking the main thread inside ipcMain.handle callbacks; awaiting fsp.readFile
226
+ // hands control back to the renderer while the kernel paginates the file.
227
+ async function readQueue() {
228
+ try {
229
+ const raw = await fsp.readFile(QUEUE_PATH, 'utf8');
230
+ const data = JSON.parse(raw);
231
+ return {
232
+ config: { ...DEFAULT_CONFIG, ...(data.config || {}) },
233
+ jobs: Array.isArray(data.jobs) ? data.jobs : [],
234
+ scheduledFor: data.scheduledFor ?? null,
235
+ lastRunAt: data.lastRunAt ?? null,
236
+ paused: data.paused ?? null,
237
+ };
238
+ } catch {
239
+ return { config: { ...DEFAULT_CONFIG }, jobs: [], scheduledFor: null, lastRunAt: null, paused: null };
240
+ }
241
+ }
242
+
243
+ async function writeQueue(state) {
198
244
  ensureDirs();
199
- atomicWriteJson(QUEUE_PATH, state);
245
+ await atomicWriteJson(QUEUE_PATH, state);
200
246
  }
201
247
 
202
248
  // ---------- serialized mutation queue ----------
@@ -209,9 +255,9 @@ let mutateTail = Promise.resolve();
209
255
 
210
256
  function mutate(fn) {
211
257
  const next = mutateTail.then(async () => {
212
- const state = readQueue();
258
+ const state = await readQueue();
213
259
  const ret = await fn(state);
214
- writeQueue(state);
260
+ await writeQueue(state);
215
261
  return ret;
216
262
  });
217
263
  mutateTail = next.catch(() => {}); // keep chain alive on errors
@@ -225,55 +271,13 @@ function mutate(fn) {
225
271
  * yaml dep; the schema is small (title, cwd, estimateMinutes, parallelGroup)
226
272
  * and the format is documented in the user-facing README.
227
273
  */
228
- function parsePrd(filePath) {
229
- const text = fs.readFileSync(filePath, 'utf8');
230
- const meta = { title: null, cwd: null, estimateMinutes: null, parallelGroup: null };
231
- let body = text;
232
-
233
- if (text.startsWith('---\n')) {
234
- const end = text.indexOf('\n---', 4);
235
- if (end !== -1) {
236
- const fm = text.slice(4, end);
237
- body = text.slice(end + 4).replace(/^\n/, '');
238
- for (const line of fm.split('\n')) {
239
- const m = line.match(/^([a-zA-Z]+):\s*(.+?)\s*$/);
240
- if (!m) continue;
241
- const k = m[1];
242
- let v = m[2];
243
- if ((v.startsWith('"') && v.endsWith('"')) || (v.startsWith("'") && v.endsWith("'"))) {
244
- v = v.slice(1, -1);
245
- }
246
- if (k === 'title') meta.title = v;
247
- else if (k === 'cwd') meta.cwd = v;
248
- else if (k === 'estimateMinutes') meta.estimateMinutes = Number(v) || null;
249
- else if (k === 'parallelGroup') meta.parallelGroup = Number(v) || null;
250
- }
251
- }
252
- }
253
-
254
- const base = path.basename(filePath, '.md');
255
- const groupFromName = (() => {
256
- const m = base.match(/^(\d+)-/);
257
- return m ? Number(m[1]) : null;
258
- })();
259
-
260
- return {
261
- slug: base,
262
- path: filePath,
263
- title: meta.title || base,
264
- cwd: meta.cwd || null,
265
- estimateMinutes: meta.estimateMinutes,
266
- parallelGroup: meta.parallelGroup ?? groupFromName ?? 99,
267
- body: body.trim(),
268
- };
269
- }
270
-
271
- function listPrdFiles() {
274
+ // PRD parsing + dir-mtime cache live in scheduler/prdParser.cjs. Local wrappers
275
+ // preserve the existing call shape (callers don't need to thread PRDS_DIR).
276
+ const parsePrdRaw = prdParser.parsePrdRaw;
277
+ const parsePrd = prdParser.parsePrd;
278
+ async function listPrdFiles() {
272
279
  ensureDirs();
273
- return fs.readdirSync(PRDS_DIR)
274
- .filter((f) => f.endsWith('.md') && !f.startsWith('.'))
275
- .map((f) => path.join(PRDS_DIR, f))
276
- .sort();
280
+ return prdParser.listPrdFiles(PRDS_DIR);
277
281
  }
278
282
 
279
283
  // ---------- queue reconciliation ----------
@@ -286,12 +290,14 @@ function listPrdFiles() {
286
290
  * Status is preserved: pending stays pending, completed stays completed.
287
291
  * Newly-discovered PRDs land as `pending`.
288
292
  */
289
- function reconcile(state) {
290
- const files = listPrdFiles();
293
+ async function reconcile(state) {
294
+ const files = await listPrdFiles();
291
295
  const onDisk = new Map();
292
296
  for (const f of files) {
293
297
  try {
294
- const p = parsePrd(f);
298
+ // Per-file await: parsing is mtime-cached so steady-state hits zero
299
+ // disk reads; on cold cache the awaits keep the main thread responsive.
300
+ const p = await parsePrd(f);
295
301
  onDisk.set(p.slug, p);
296
302
  } catch (e) {
297
303
  console.warn('[scheduler] failed to parse', f, e?.message);
@@ -377,16 +383,17 @@ let heartbeatInterval = null;
377
383
  // double-spawn when runDueJobs() is called while jobs are in flight.
378
384
  const runningSet = new Set();
379
385
  let cancelToken = { cancelled: false };
380
- let claudeBinPathCached = null;
381
386
 
382
387
  function attachWindow(w) { mainWindow = w; }
383
388
 
384
- function broadcast() {
385
- if (!mainWindow || mainWindow.isDestroyed()) return;
386
- const state = readQueue();
387
- reconcile(state);
388
- writeQueue(state);
389
- mainWindow.webContents.send('schedule:state', {
389
+ /**
390
+ * Build the snapshot payload consumed by both the `schedule:state` IPC
391
+ * handler and the `schedule:state` broadcast event. The IPC return adds a
392
+ * `paths` map (renderer uses it for "open folder" actions); broadcast omits
393
+ * it because subscribers don't need to re-derive paths on every tick.
394
+ */
395
+ function buildScheduleStatePayload(state, { withPaths = false } = {}) {
396
+ const payload = {
390
397
  config: state.config,
391
398
  jobs: state.jobs,
392
399
  scheduledFor: state.scheduledFor,
@@ -394,7 +401,19 @@ function broadcast() {
394
401
  nextReset: getNextResetCached(),
395
402
  paused: state.paused,
396
403
  utilization: cachedUtilization,
397
- });
404
+ };
405
+ if (withPaths) {
406
+ payload.paths = { root: ROOT, prds: PRDS_DIR, runs: RUNS_DIR, queue: QUEUE_PATH };
407
+ }
408
+ return payload;
409
+ }
410
+
411
+ async function broadcast() {
412
+ if (!mainWindow || mainWindow.isDestroyed()) return;
413
+ const state = await readQueue();
414
+ await reconcile(state);
415
+ await writeQueue(state);
416
+ sendIfAlive(mainWindow, 'schedule:state', buildScheduleStatePayload(state));
398
417
  }
399
418
 
400
419
  function clearFireTimer() {
@@ -425,13 +444,13 @@ async function rescheduleTimer() {
425
444
  } catch {
426
445
  nextResetIso = cachedNextReset;
427
446
  }
428
- const fireAt = await mutate((state) => {
429
- reconcile(state);
447
+ const fireAt = await mutate(async (state) => {
448
+ await reconcile(state);
430
449
  const fa = computeFireAt(state, nextResetIso);
431
450
  state.scheduledFor = fa ? new Date(fa).toISOString() : null;
432
451
  return fa;
433
452
  });
434
- broadcast();
453
+ await broadcast();
435
454
  if (!fireAt) return;
436
455
 
437
456
  const delay = Math.max(1000, fireAt - Date.now());
@@ -462,7 +481,7 @@ async function setPaused(reason, resumeAtIso) {
462
481
  s.paused = { reason, since: new Date().toISOString(), resumeAt: effectiveResumeAt || null };
463
482
  }
464
483
  });
465
- broadcast();
484
+ await broadcast();
466
485
  cancelToken.cancelled = true;
467
486
  if (resumeTimer) { clearTimeout(resumeTimer); resumeTimer = null; }
468
487
  if (!effectiveResumeAt) return;
@@ -491,9 +510,18 @@ async function clearPause(source) {
491
510
  // Track manual clears for the auto-pause cooldown.
492
511
  if (source === 'manual' || source === 'run-now') {
493
512
  pauseClearedManuallyAt = Date.now();
513
+ // The user has just affirmed the queue should run — clear the failure
514
+ // counters so the renderer doesn't keep nagging about stale poll fails.
515
+ // The next poll will set them again if the condition still applies.
516
+ consecutiveFailures = 0;
517
+ backoffMs = 0;
518
+ backoffNextAt = null;
519
+ firstFailureAt = null;
520
+ firstNon429FailureAt = null;
521
+ lastFailureKind = null;
494
522
  persistSchedulerState();
495
523
  }
496
- if (wasPaused) broadcast();
524
+ if (wasPaused) await broadcast();
497
525
  }
498
526
 
499
527
  /** Mutate a job in place to "pending" with cleared run metadata. */
@@ -528,24 +556,6 @@ function detectRateLimitInLog(logPath) {
528
556
  }
529
557
  }
530
558
 
531
- // ---------- claude binary ----------
532
-
533
- function resolveClaudeBin() {
534
- if (claudeBinPathCached) return claudeBinPathCached;
535
- const candidates = [
536
- path.join(os.homedir(), '.claude', 'local', 'claude'),
537
- '/usr/local/bin/claude',
538
- '/opt/homebrew/bin/claude',
539
- '/usr/bin/claude',
540
- ];
541
- for (const c of candidates) {
542
- try { fs.accessSync(c, fs.constants.X_OK); claudeBinPathCached = c; return c; } catch { /* */ }
543
- }
544
- // Last resort: rely on PATH lookup at spawn time.
545
- claudeBinPathCached = 'claude';
546
- return claudeBinPathCached;
547
- }
548
-
549
559
  // ---------- execution ----------
550
560
 
551
561
  function pickRunDir() {
@@ -565,31 +575,43 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
565
575
  const metaPath = path.join(runDir, `${job.slug}.meta.json`);
566
576
  const cwd = job.cwd || defaultCwd;
567
577
  const startedAt = Date.now();
578
+ const sessionId = randomUUID();
568
579
 
569
580
  const fd = fs.openSync(logPath, 'a');
570
581
  let fdClosed = false;
571
- const closeFd = () => { if (fdClosed) return; fdClosed = true; fs.closeSync(fd); };
582
+ const closeFd = () => { if (fdClosed) return; fdClosed = true; try { fs.closeSync(fd); } catch { /* */ } };
583
+ // safeLog: no-op once the fd is closed, never throws on the watchdog timer
584
+ // path. Pre-fix, a post-result/idle watchdog firing AFTER closeFd would
585
+ // throw EBADF and crash the host. Every fs.writeSync(fd, …) below goes
586
+ // through this helper.
587
+ const safeLog = (msg) => {
588
+ if (fdClosed) return;
589
+ try { fs.writeSync(fd, msg); } catch { /* fd vanished mid-write */ }
590
+ };
572
591
 
573
- fs.writeSync(fd, `[scheduler] starting ${job.slug} at ${new Date().toISOString()}\n[scheduler] cwd=${cwd}\n\n`);
592
+ safeLog(`[scheduler] starting ${job.slug} at ${new Date().toISOString()}\n[scheduler] cwd=${cwd}\n\n`);
574
593
 
575
594
  // Dead-cwd guard: verify the target directory exists and is traversable
576
595
  // before handing it to the child process.
577
596
  try { fs.accessSync(cwd, fs.constants.X_OK); }
578
597
  catch {
579
598
  const errMsg = `cwd no longer exists: ${cwd}`;
580
- fs.writeSync(fd, `[scheduler] ${errMsg}\n`);
599
+ safeLog(`[scheduler] ${errMsg}\n`);
581
600
  closeFd();
582
- atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: errMsg, startedAt, finishedAt: Date.now(), durationMs: 0 });
583
- return { exitCode: -1, durationMs: 0, error: errMsg };
601
+ // Sync write: this is an early-exit error path inside an async function,
602
+ // so we could await, but using the sync variant keeps the error path
603
+ // ordering identical to the spawn-failed branch below (also sync).
604
+ atomicWriteJsonSync(metaPath, { slug: job.slug, cwd, sessionId, exitCode: -1, error: errMsg, startedAt, finishedAt: Date.now(), durationMs: 0 });
605
+ return { exitCode: -1, durationMs: 0, error: errMsg, sessionId };
584
606
  }
585
607
 
586
608
  // Read full PRD body fresh from disk (queue stored only the preview).
587
609
  let prompt;
588
610
  try {
589
- const parsed = parsePrd(path.join(PRDS_DIR, `${job.slug}.md`));
611
+ const parsed = await parsePrd(path.join(PRDS_DIR, `${job.slug}.md`));
590
612
  prompt = parsed.body;
591
613
  } catch (e) {
592
- fs.writeSync(fd, `[scheduler] failed to read PRD: ${e?.message}\n`);
614
+ safeLog(`[scheduler] failed to read PRD: ${e?.message}\n`);
593
615
  closeFd();
594
616
  return { exitCode: -1, durationMs: 0, error: e?.message };
595
617
  }
@@ -600,28 +622,46 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
600
622
  // launched from a `claude` shell. CLAUDE_EFFORT=xhigh forces Opus and
601
623
  // overrides `--model sonnet`, so scheduled jobs burn Opus credits silently.
602
624
  const childEnv = cleanChildEnv();
603
- const child = spawn(claudeBin, [
604
- '-p', prompt,
605
- '--model', 'sonnet',
606
- '--dangerously-skip-permissions',
607
- '--output-format', 'stream-json',
608
- '--verbose',
609
- ], {
610
- cwd,
611
- env: childEnv,
612
- stdio: ['ignore', fd, fd],
613
- // detached:true puts the child in its own process group so we can kill
614
- // the entire descendant tree (including any stray background bashes the
615
- // agent spawned) with `process.kill(-pid)`. Without this, child.kill()
616
- // only kills the immediate `claude` process, leaving orphaned subprocs
617
- // that keep the parent alive (the 2026-05-10 cellar-publish hang).
618
- detached: true,
619
- });
625
+ // Guard against synchronous spawn failures (EAGAIN, ENOMEM on fork).
626
+ // Without this, the throw bubbles out of the Promise executor and the
627
+ // outer await rejects — but the open fd is leaked.
628
+ let child;
629
+ try {
630
+ child = spawn(claudeBin, [
631
+ '-p', prompt,
632
+ '--model', 'sonnet',
633
+ '--dangerously-skip-permissions',
634
+ '--output-format', 'stream-json',
635
+ '--verbose',
636
+ '--session-id', sessionId,
637
+ ], {
638
+ cwd,
639
+ env: childEnv,
640
+ stdio: ['ignore', fd, fd],
641
+ // detached:true puts the child in its own process group so we can kill
642
+ // the entire descendant tree (including any stray background bashes the
643
+ // agent spawned) with `process.kill(-pid)`. Without this, child.kill()
644
+ // only kills the immediate `claude` process, leaving orphaned subprocs
645
+ // that keep the parent alive (the 2026-05-10 cellar-publish hang).
646
+ detached: true,
647
+ });
648
+ } catch (e) {
649
+ const errMsg = `spawn failed: ${e?.message ?? String(e)}`;
650
+ safeLog(`[scheduler] ${errMsg}\n`);
651
+ closeFd();
652
+ const durationMs = Date.now() - startedAt;
653
+ // Sync write: inside the Promise executor, before resolve(). Awaiting
654
+ // here would require restructuring the executor; the meta file is tiny
655
+ // and this is an error path, not the IPC hot path.
656
+ atomicWriteJsonSync(metaPath, { slug: job.slug, cwd, sessionId, exitCode: -1, error: errMsg, startedAt, finishedAt: Date.now(), durationMs });
657
+ resolve({ exitCode: -1, durationMs, error: errMsg, sessionId });
658
+ return;
659
+ }
620
660
 
621
- fs.writeSync(fd, `[scheduler] spawned pid=${child.pid} (process group)\n\n`);
661
+ safeLog(`[scheduler] spawned pid=${child.pid} sessionId=${sessionId} (process group)\n\n`);
622
662
 
623
663
  // Fire-and-forget pid persistence — best effort.
624
- if (onPid) onPid(child.pid).catch(() => {});
664
+ if (onPid) onPid(child.pid, sessionId, cwd).catch(() => {});
625
665
 
626
666
  // Track whether the agent has emitted a `result` event in its JSONL stream.
627
667
  // null until seen; then one of "success" | "error_max_turns" | ... per the
@@ -657,15 +697,15 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
657
697
  const m = buf.toString('utf8').match(/\{"type":"result","subtype":"([a-z_]+)"/);
658
698
  if (!m) return;
659
699
  agentResultSubtype = m[1];
660
- fs.writeSync(fd, `\n[scheduler] result event detected (subtype=${agentResultSubtype}); ` +
700
+ safeLog(`\n[scheduler] result event detected (subtype=${agentResultSubtype}); ` +
661
701
  `starting ${Math.round(POST_RESULT_GRACE_MS/1000)}s exit-grace timer\n`);
662
702
  clearInterval(resultTailer);
663
703
  postResultTimer = setTimeout(() => {
664
- fs.writeSync(fd, `\n[scheduler] post-result grace expired (${Math.round(POST_RESULT_GRACE_MS/1000)}s); ` +
704
+ safeLog(`\n[scheduler] post-result grace expired (${Math.round(POST_RESULT_GRACE_MS/1000)}s); ` +
665
705
  `child still alive — SIGTERM process group\n`);
666
706
  killTree('SIGTERM');
667
707
  postResultKillTimer = setTimeout(() => {
668
- fs.writeSync(fd, `\n[scheduler] still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
708
+ safeLog(`\n[scheduler] still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
669
709
  killTree('SIGKILL');
670
710
  }, POST_RESULT_KILL_MS);
671
711
  if (postResultKillTimer.unref) postResultKillTimer.unref();
@@ -677,7 +717,7 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
677
717
 
678
718
  // Kill the child if it runs past the maximum allowed duration.
679
719
  const watchdog = setTimeout(() => {
680
- fs.writeSync(fd, `\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
720
+ safeLog(`\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
681
721
  killTree('SIGKILL');
682
722
  }, MAX_JOB_DURATION_MS);
683
723
  if (watchdog.unref) watchdog.unref();
@@ -691,12 +731,12 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
691
731
  const stat = fs.statSync(logPath);
692
732
  const idleMs = Date.now() - stat.mtimeMs;
693
733
  if (idleMs > IDLE_OUTPUT_KILL_MS) {
694
- fs.writeSync(fd, `\n[scheduler] idle-output watchdog: log mtime stalled ` +
734
+ safeLog(`\n[scheduler] idle-output watchdog: log mtime stalled ` +
695
735
  `${Math.round(idleMs/1000)}s (> ${Math.round(IDLE_OUTPUT_KILL_MS/1000)}s threshold) — SIGTERM process group\n`);
696
736
  clearInterval(idleChecker);
697
737
  killTree('SIGTERM');
698
738
  idleKillTimer = setTimeout(() => {
699
- fs.writeSync(fd, `\n[scheduler] idle watchdog: still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
739
+ safeLog(`\n[scheduler] idle watchdog: still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
700
740
  killTree('SIGKILL');
701
741
  }, POST_RESULT_KILL_MS);
702
742
  if (idleKillTimer.unref) idleKillTimer.unref();
@@ -717,10 +757,11 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
717
757
  child.on('error', (err) => {
718
758
  clearAllTimers();
719
759
  const durationMs = Date.now() - startedAt;
720
- fs.writeSync(fd, `\n[scheduler] error: ${err.message}\n`);
760
+ safeLog(`\n[scheduler] error: ${err.message}\n`);
721
761
  closeFd();
722
- atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: err.message, startedAt, finishedAt: Date.now(), durationMs });
723
- resolve({ exitCode: -1, durationMs, error: err.message });
762
+ // Sync write: child event handler must flush meta before resolve().
763
+ atomicWriteJsonSync(metaPath, { slug: job.slug, cwd, sessionId, exitCode: -1, error: err.message, startedAt, finishedAt: Date.now(), durationMs });
764
+ resolve({ exitCode: -1, durationMs, error: err.message, sessionId });
724
765
  });
725
766
 
726
767
  child.on('exit', (code, signal) => {
@@ -737,19 +778,21 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
737
778
  const mappedToSuccess = agentResultSubtype === 'success' && killedBySignal;
738
779
  if (mappedToSuccess) {
739
780
  effectiveCode = 0;
740
- fs.writeSync(fd, `\n[scheduler] mapping exit code=${code} signal=${signal} → 0 ` +
781
+ safeLog(`\n[scheduler] mapping exit code=${code} signal=${signal} → 0 ` +
741
782
  `(result=success was emitted before kill)\n`);
742
783
  }
743
- fs.writeSync(fd, `\n[scheduler] exit code=${effectiveCode} (raw code=${code} signal=${signal}) ` +
784
+ safeLog(`\n[scheduler] exit code=${effectiveCode} (raw code=${code} signal=${signal}) ` +
744
785
  `duration=${Math.round(durationMs / 1000)}s\n`);
745
786
  closeFd();
746
787
  const rateLimited = effectiveCode !== 0 && detectRateLimitInLog(logPath);
747
- atomicWriteJson(metaPath, {
748
- slug: job.slug, cwd, exitCode: effectiveCode, rateLimited,
788
+ // Sync write: child 'exit' handler must flush meta before resolve()
789
+ // so the spawnJob mutate() that follows sees the persisted exit code.
790
+ atomicWriteJsonSync(metaPath, {
791
+ slug: job.slug, cwd, sessionId, exitCode: effectiveCode, rateLimited,
749
792
  startedAt, finishedAt: Date.now(), durationMs,
750
793
  agentResultSubtype, mappedFromSignal: mappedToSuccess ? signal || `code=${code}` : null,
751
794
  });
752
- resolve({ exitCode: effectiveCode, durationMs, rateLimited });
795
+ resolve({ exitCode: effectiveCode, durationMs, rateLimited, sessionId });
753
796
  });
754
797
  });
755
798
  }
@@ -859,23 +902,6 @@ function isFixPlanSlug(slug) {
859
902
  return /^\d+-fix-/.test(slug);
860
903
  }
861
904
 
862
- /**
863
- * Read the last `bytes` of a file as utf8. Returns '' on error.
864
- */
865
- function readTail(filePath, bytes) {
866
- try {
867
- const stat = fs.statSync(filePath);
868
- const n = Math.min(stat.size, bytes);
869
- const fd = fs.openSync(filePath, 'r');
870
- const buf = Buffer.alloc(n);
871
- fs.readSync(fd, buf, 0, n, stat.size - n);
872
- fs.closeSync(fd);
873
- return buf.toString('utf8');
874
- } catch {
875
- return '';
876
- }
877
- }
878
-
879
905
  /**
880
906
  * Spawn an Opus investigation session for a failed job. The investigator's job
881
907
  * is to read the failure log + original PRD, identify the root cause, and write
@@ -897,7 +923,7 @@ async function spawnInvestigation(failedJob, runDir) {
897
923
 
898
924
  let originalBody = '';
899
925
  try {
900
- originalBody = parsePrd(path.join(PRDS_DIR, `${failedJob.slug}.md`)).body;
926
+ originalBody = (await parsePrd(path.join(PRDS_DIR, `${failedJob.slug}.md`))).body;
901
927
  } catch {
902
928
  originalBody = failedJob.bodyPreview || '(original PRD missing from disk)';
903
929
  }
@@ -914,7 +940,16 @@ async function spawnInvestigation(failedJob, runDir) {
914
940
  return;
915
941
  }
916
942
 
917
- const cwd = failedJob.cwd || DEFAULT_PROJECT_CWD;
943
+ // cwd fallback: if the failed job's cwd is missing on disk, the investigator
944
+ // child would itself fail to spawn (ENOENT). Fall back to DEFAULT_PROJECT_CWD
945
+ // so the investigation can still write a fix plan that updates the cwd or
946
+ // re-creates the missing project directory.
947
+ let cwd = failedJob.cwd || DEFAULT_PROJECT_CWD;
948
+ try { fs.accessSync(cwd, fs.constants.X_OK); }
949
+ catch {
950
+ console.warn(`[scheduler] investigation cwd missing (${cwd}); falling back to ${DEFAULT_PROJECT_CWD}`);
951
+ cwd = DEFAULT_PROJECT_CWD;
952
+ }
918
953
  const prompt = `You are investigating a failed scheduled job in the session-manager queue. Your ONLY job is to write a fix-plan PRD file. Do NOT attempt the fix yourself.
919
954
 
920
955
  # Failed job
@@ -960,26 +995,37 @@ ${logTail}
960
995
  DO NOT attempt the fix. ONLY write the file. When the file exists, exit immediately.`;
961
996
 
962
997
  const fd = fs.openSync(investigationLogPath, 'a');
963
- fs.writeSync(fd, `[scheduler] investigation starting for ${failedJob.slug} at ${new Date().toISOString()}\n[scheduler] target fix PRD: ${fixPath}\n\n`);
998
+ const sessionId = randomUUID();
999
+ try {
1000
+ fs.writeSync(fd, `[scheduler] investigation starting for ${failedJob.slug} at ${new Date().toISOString()}\n[scheduler] target fix PRD: ${fixPath}\n[scheduler] sessionId=${sessionId}\n\n`);
1001
+ } catch { /* */ }
964
1002
 
965
1003
  const claudeBin = resolveClaudeBin();
966
1004
  const childEnv = cleanChildEnv();
967
- const child = spawn(claudeBin, [
968
- '-p', prompt,
969
- '--model', 'opus',
970
- '--dangerously-skip-permissions',
971
- '--output-format', 'stream-json',
972
- '--verbose',
973
- ], {
974
- cwd,
975
- env: childEnv,
976
- stdio: ['ignore', fd, fd],
977
- });
1005
+ let child;
1006
+ try {
1007
+ child = spawn(claudeBin, [
1008
+ '-p', prompt,
1009
+ '--model', 'opus',
1010
+ '--dangerously-skip-permissions',
1011
+ '--output-format', 'stream-json',
1012
+ '--verbose',
1013
+ '--session-id', sessionId,
1014
+ ], {
1015
+ cwd,
1016
+ env: childEnv,
1017
+ stdio: ['ignore', fd, fd],
1018
+ });
1019
+ } catch (e) {
1020
+ try { fs.writeSync(fd, `\n[scheduler] investigation spawn failed: ${e?.message ?? e}\n`); } catch { /* */ }
1021
+ try { fs.closeSync(fd); } catch { /* */ }
1022
+ return;
1023
+ }
978
1024
 
979
- fs.writeSync(fd, `[scheduler] investigation pid=${child.pid}\n\n`);
1025
+ try { fs.writeSync(fd, `[scheduler] investigation pid=${child.pid}\n\n`); } catch { /* */ }
980
1026
 
981
1027
  const watchdog = setTimeout(() => {
982
- fs.writeSync(fd, `\n[scheduler] investigation watchdog SIGKILL after ${MAX_INVESTIGATION_DURATION_MS}ms\n`);
1028
+ try { fs.writeSync(fd, `\n[scheduler] investigation watchdog SIGKILL after ${MAX_INVESTIGATION_DURATION_MS}ms\n`); } catch { /* */ }
983
1029
  try { child.kill('SIGKILL'); } catch { /* already dead */ }
984
1030
  }, MAX_INVESTIGATION_DURATION_MS);
985
1031
  if (watchdog.unref) watchdog.unref();
@@ -1015,15 +1061,17 @@ async function spawnJob(job, runId, runDir, defaultCwd) {
1015
1061
  s.jobs[idx].startedAt = new Date().toISOString();
1016
1062
  }
1017
1063
  });
1018
- broadcast();
1064
+ await broadcast();
1019
1065
 
1020
- const res = await executeJob(job, runDir, defaultCwd, async (pid) => {
1066
+ const res = await executeJob(job, runDir, defaultCwd, async (pid, sessionId, cwd) => {
1021
1067
  await mutate((s) => {
1022
1068
  const idx = s.jobs.findIndex((x) => x.slug === job.slug);
1023
1069
  if (idx >= 0) {
1024
- s.jobs[idx].runtime = { pid, runId, startedAt: s.jobs[idx].startedAt };
1070
+ s.jobs[idx].sessionId = sessionId;
1071
+ s.jobs[idx].runtime = { pid, runId, startedAt: s.jobs[idx].startedAt, sessionId, cwd };
1025
1072
  }
1026
1073
  });
1074
+ await broadcast();
1027
1075
  });
1028
1076
 
1029
1077
  if (res.rateLimited) {
@@ -1052,7 +1100,7 @@ async function spawnJob(job, runId, runDir, defaultCwd) {
1052
1100
  }
1053
1101
  }
1054
1102
  });
1055
- broadcast();
1103
+ await broadcast();
1056
1104
 
1057
1105
  if (actuallyFailed && failedJobSnapshot) {
1058
1106
  spawnInvestigation(failedJobSnapshot, runDir).catch((e) => {
@@ -1075,20 +1123,20 @@ let tickTail = Promise.resolve();
1075
1123
 
1076
1124
  function tickQueue() {
1077
1125
  const next = tickTail.then(async () => {
1078
- const state = readQueue();
1126
+ const state = await readQueue();
1079
1127
  if (state.paused) {
1080
1128
  console.log('[scheduler] tickQueue skipped: paused');
1081
1129
  return;
1082
1130
  }
1083
1131
  if (cancelToken.cancelled) return;
1084
1132
 
1085
- reconcile(state);
1133
+ await reconcile(state);
1086
1134
  const cap = ENV_CAP ?? state.config.concurrencyCap;
1087
1135
  const batch = pickNextBatch(state.jobs, runningSet, cap);
1088
1136
  if (batch.length === 0) return;
1089
1137
 
1090
1138
  await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
1091
- broadcast();
1139
+ await broadcast();
1092
1140
 
1093
1141
  const { runId, dir: runDir } = pickRunDir();
1094
1142
  for (const job of batch) {
@@ -1102,7 +1150,7 @@ function tickQueue() {
1102
1150
  }
1103
1151
 
1104
1152
  async function runDueJobs() {
1105
- const state = readQueue();
1153
+ const state = await readQueue();
1106
1154
  if (state.paused) {
1107
1155
  console.log('[scheduler] runDueJobs skipped: paused');
1108
1156
  return;
@@ -1111,7 +1159,7 @@ async function runDueJobs() {
1111
1159
  await tickQueue();
1112
1160
  // Clear the one-shot scheduledFor without waiting for jobs to settle.
1113
1161
  await mutate((s) => { s.scheduledFor = null; });
1114
- broadcast();
1162
+ await broadcast();
1115
1163
  }
1116
1164
 
1117
1165
  // ---------- when-available launch logic ----------
@@ -1123,7 +1171,7 @@ async function maybeLaunchWhenAvailable(state) {
1123
1171
  if (pending.length === 0) return;
1124
1172
  if (cachedUtilization === null || cachedUtilization === undefined) return;
1125
1173
  if (cachedUtilization >= state.config.utilizationThreshold) {
1126
- broadcast();
1174
+ await broadcast();
1127
1175
  return;
1128
1176
  }
1129
1177
  console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pending.length} pending, ${runningSet.size} running — ticking`);
@@ -1150,7 +1198,7 @@ async function pollLoop() {
1150
1198
  persistSchedulerState();
1151
1199
 
1152
1200
  // If a 'network' pause resolved, clear it now that we have a good reading.
1153
- const cur = readQueue();
1201
+ const cur = await readQueue();
1154
1202
  if (cur.paused?.reason === 'network') {
1155
1203
  await clearPause('network-recovered');
1156
1204
  }
@@ -1160,7 +1208,7 @@ async function pollLoop() {
1160
1208
  }
1161
1209
 
1162
1210
  await maybeLaunchWhenAvailable(cur);
1163
- broadcast();
1211
+ await broadcast();
1164
1212
  } else if (r.kind === 'meter_rate_limited') {
1165
1213
  // Billing meter is itself being rate-limited. Treat as "utilization unknown but safe":
1166
1214
  // fire available jobs anyway at utilization=0 rather than pausing the queue.
@@ -1171,9 +1219,9 @@ async function pollLoop() {
1171
1219
  // Don't update firstNon429FailureAt — 429s don't count toward the 30-min network-pause threshold.
1172
1220
  cachedUtilization = 0; // assume safe; fire any pending work
1173
1221
  console.log(`[scheduler] billing meter rate-limited (HTTP 429) — firing on heuristic (failure #${consecutiveFailures})`);
1174
- const cur = readQueue();
1222
+ const cur = await readQueue();
1175
1223
  await maybeLaunchWhenAvailable(cur);
1176
- broadcast();
1224
+ await broadcast();
1177
1225
  } else {
1178
1226
  lastPollAt = Date.now();
1179
1227
  lastPollOk = false;
@@ -1194,7 +1242,7 @@ async function pollLoop() {
1194
1242
 
1195
1243
  // After 30 minutes of consecutive non-429 failures, set 'network' pause.
1196
1244
  if (totalNon429FailureMs > 30 * 60_000) {
1197
- const cur2 = readQueue();
1245
+ const cur2 = await readQueue();
1198
1246
  if (!cur2.paused || cur2.paused.reason === 'network') {
1199
1247
  await setPaused('network', null);
1200
1248
  }
@@ -1229,23 +1277,14 @@ function registerScheduleHandlers() {
1229
1277
  supervisor.registerHandlers();
1230
1278
 
1231
1279
  ipcMain.handle('schedule:state', async () => {
1232
- const state = readQueue();
1233
- reconcile(state);
1234
- writeQueue(state);
1235
- return {
1236
- config: state.config,
1237
- jobs: state.jobs,
1238
- scheduledFor: state.scheduledFor,
1239
- lastRunAt: state.lastRunAt,
1240
- nextReset: getNextResetCached(),
1241
- paused: state.paused,
1242
- utilization: cachedUtilization,
1243
- paths: { root: ROOT, prds: PRDS_DIR, runs: RUNS_DIR, queue: QUEUE_PATH },
1244
- };
1280
+ const state = await readQueue();
1281
+ await reconcile(state);
1282
+ await writeQueue(state);
1283
+ return buildScheduleStatePayload(state, { withPaths: true });
1245
1284
  });
1246
1285
 
1247
1286
  ipcMain.handle('schedule:health', async () => {
1248
- const state = readQueue();
1287
+ const state = await readQueue();
1249
1288
  const runningJobs = [];
1250
1289
  for (const j of state.jobs) {
1251
1290
  if (j.status === 'running' && j.runtime) {
@@ -1274,15 +1313,14 @@ function registerScheduleHandlers() {
1274
1313
  // Bypass the billing-poll gate entirely — fire pending jobs immediately regardless of meter state.
1275
1314
  // Clears any existing pause first (same semantics as run-now).
1276
1315
  await clearPause('run-now');
1277
- runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error (force-tick)', e));
1316
+ runDueJobs().catch((e) => logs.writeLine({ level: 'error', scope: 'scheduler', message: 'runDueJobs error (force-tick)', meta: { error: e?.message } }));
1278
1317
  return { ok: true };
1279
1318
  });
1280
1319
 
1281
1320
  ipcMain.handle('schedule:set-config', async (_e, partial) => {
1282
- const { schemas: s } = require('./ipcSchemas.cjs');
1283
1321
  let validated;
1284
1322
  try {
1285
- validated = s.setConfigSchema.parse(partial || {});
1323
+ validated = schemas.setConfigSchema.parse(partial || {});
1286
1324
  } catch (e) {
1287
1325
  return { ok: false, error: e?.message ?? 'invalid config' };
1288
1326
  }
@@ -1299,18 +1337,13 @@ function registerScheduleHandlers() {
1299
1337
  });
1300
1338
 
1301
1339
  ipcMain.handle('schedule:reset-job', async (_e, payload) => {
1302
- const { schemas: s } = require('./ipcSchemas.cjs');
1303
1340
  let slug;
1304
1341
  try {
1305
- ({ slug } = s.scheduleSlug.parse(payload));
1342
+ ({ slug } = schemas.scheduleSlug.parse(payload));
1306
1343
  } catch (e) {
1307
1344
  return { ok: false, error: 'invalid slug' };
1308
1345
  }
1309
- // Containment check after path.join.
1310
- const resolved = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
1311
- if (!resolved.startsWith(PRDS_DIR + path.sep)) {
1312
- return { ok: false, error: 'invalid slug' };
1313
- }
1346
+ if (!safeSlugPath(slug)) return { ok: false, error: 'invalid slug' };
1314
1347
  const found = await mutate((state) => {
1315
1348
  const idx = state.jobs.findIndex((j) => j.slug === slug);
1316
1349
  if (idx < 0) return false;
@@ -1318,14 +1351,14 @@ function registerScheduleHandlers() {
1318
1351
  return true;
1319
1352
  });
1320
1353
  if (!found) return { ok: false, error: 'not found' };
1321
- broadcast();
1354
+ await broadcast();
1322
1355
  return { ok: true };
1323
1356
  });
1324
1357
 
1325
1358
  ipcMain.handle('schedule:run-now', async () => {
1326
1359
  // Manual run-now overrides any auto-pause. Clear it first.
1327
1360
  await clearPause('run-now');
1328
- runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
1361
+ runDueJobs().catch((e) => logs.writeLine({ level: 'error', scope: 'scheduler', message: 'runDueJobs error (run-now)', meta: { error: e?.message } }));
1329
1362
  return { ok: true };
1330
1363
  });
1331
1364
 
@@ -1344,11 +1377,11 @@ function registerScheduleHandlers() {
1344
1377
  // handler already reconciles on read, but this gives the renderer an
1345
1378
  // explicit refresh path that also broadcasts so all views update.
1346
1379
  ipcMain.handle('schedule:rescan', async () => {
1347
- await mutate((state) => {
1348
- reconcile(state);
1380
+ await mutate(async (state) => {
1381
+ await reconcile(state);
1349
1382
  return null;
1350
1383
  });
1351
- broadcast();
1384
+ await broadcast();
1352
1385
  return { ok: true };
1353
1386
  });
1354
1387
 
@@ -1360,12 +1393,12 @@ function registerScheduleHandlers() {
1360
1393
  ensureDirs();
1361
1394
  const ts = new Date().toISOString().replace(/[:.]/g, '-');
1362
1395
  const archiveDir = path.join(PRDS_ARCHIVE_DIR, ts);
1363
- const state = readQueue();
1396
+ const state = await readQueue();
1364
1397
  const victims = state.jobs.filter((j) => j.status === 'pending' || j.status === 'failed');
1365
1398
  if (victims.length === 0) {
1366
1399
  return { ok: true, archived: 0, archivedTo: null };
1367
1400
  }
1368
- fs.mkdirSync(archiveDir, { recursive: true });
1401
+ await fsp.mkdir(archiveDir, { recursive: true });
1369
1402
  let archived = 0;
1370
1403
  for (const job of victims) {
1371
1404
  const src = path.resolve(path.join(PRDS_DIR, `${job.slug}.md`));
@@ -1378,17 +1411,17 @@ function registerScheduleHandlers() {
1378
1411
  // ENOENT: the .md is already gone (reconcile would drop it on next
1379
1412
  // read anyway). Either way, fall through and remove from queue.
1380
1413
  if (e?.code !== 'ENOENT') {
1381
- console.warn('[scheduler] clear-queue: rename failed', job.slug, e?.message);
1414
+ logs.writeLine({ level: 'warn', scope: 'scheduler', message: 'clear-queue: rename failed', meta: { slug: job.slug, error: e?.message } });
1382
1415
  }
1383
1416
  }
1384
1417
  }
1385
- await mutate((s) => {
1418
+ await mutate(async (s) => {
1386
1419
  const victimSlugs = new Set(victims.map((j) => j.slug));
1387
1420
  s.jobs = s.jobs.filter((j) => !victimSlugs.has(j.slug));
1388
- reconcile(s);
1421
+ await reconcile(s);
1389
1422
  return null;
1390
1423
  });
1391
- broadcast();
1424
+ await broadcast();
1392
1425
  return { ok: true, archived, archivedTo: archiveDir };
1393
1426
  });
1394
1427
 
@@ -1399,17 +1432,14 @@ function registerScheduleHandlers() {
1399
1432
  });
1400
1433
 
1401
1434
  ipcMain.handle('schedule:read-prd', async (_e, payload) => {
1402
- const { schemas: s } = require('./ipcSchemas.cjs');
1403
1435
  let slug;
1404
1436
  try {
1405
- ({ slug } = s.scheduleSlug.parse(payload));
1437
+ ({ slug } = schemas.scheduleSlug.parse(payload));
1406
1438
  } catch {
1407
1439
  return { ok: false, error: 'invalid slug' };
1408
1440
  }
1409
- const filePath = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
1410
- if (!filePath.startsWith(PRDS_DIR + path.sep)) {
1411
- return { ok: false, error: 'invalid slug' };
1412
- }
1441
+ const filePath = safeSlugPath(slug);
1442
+ if (!filePath) return { ok: false, error: 'invalid slug' };
1413
1443
  try {
1414
1444
  const text = await fsp.readFile(filePath, 'utf8');
1415
1445
  return { ok: true, text };
@@ -1419,13 +1449,14 @@ function registerScheduleHandlers() {
1419
1449
  });
1420
1450
 
1421
1451
  ipcMain.handle('schedule:read-log', async (_e, payload) => {
1422
- const { schemas: s } = require('./ipcSchemas.cjs');
1423
1452
  let slug, runId;
1424
1453
  try {
1425
- ({ slug, runId } = s.scheduleReadLog.parse(payload));
1454
+ ({ slug, runId } = schemas.scheduleReadLog.parse(payload));
1426
1455
  } catch {
1427
1456
  return { ok: false, error: 'invalid slug or runId' };
1428
1457
  }
1458
+ // Defense-in-depth: re-check containment after path.resolve even though
1459
+ // SLUG_RE / RUN_ID_RE already forbid path separators.
1429
1460
  const logPath = path.resolve(path.join(RUNS_DIR, runId, `${slug}.log`));
1430
1461
  if (!logPath.startsWith(RUNS_DIR + path.sep)) {
1431
1462
  return { ok: false, error: 'invalid slug or runId' };
@@ -1438,21 +1469,23 @@ function registerScheduleHandlers() {
1438
1469
  }
1439
1470
  });
1440
1471
 
1441
- const PRD_WRITE_MAX_BYTES = 256 * 1024;
1442
- const SLUG_RE = /^[A-Za-z0-9._-]{1,128}$/;
1443
-
1444
- ipcMain.handle('schedule:write-prd', async (_e, { slug, body }) => {
1445
- if (!SLUG_RE.test(slug)) throw new Error(`invalid slug: ${slug}`);
1446
- if (typeof body !== 'string') throw new Error('body must be string');
1447
- if (Buffer.byteLength(body, 'utf8') > PRD_WRITE_MAX_BYTES) throw new Error('body too large');
1448
- const file = path.join(PRDS_DIR, `${slug}.md`);
1449
- const resolved = path.resolve(file);
1450
- if (!resolved.startsWith(PRDS_DIR + path.sep)) throw new Error('path escape');
1451
- const tmp = `${resolved}.${process.pid}.${Date.now()}.tmp`;
1452
- await fsp.writeFile(tmp, body, { encoding: 'utf8', mode: 0o644 });
1453
- await fsp.rename(tmp, resolved);
1454
- const stat = await fsp.stat(resolved);
1455
- return { ok: true, bytesWritten: stat.size };
1472
+ ipcMain.handle('schedule:write-prd', async (_e, payload) => {
1473
+ let parsed;
1474
+ try { parsed = schemas.scheduleWritePrd.parse(payload); }
1475
+ catch (e) { return { ok: false, error: e?.message ?? 'invalid payload' }; }
1476
+ const resolved = safeSlugPath(parsed.slug);
1477
+ if (!resolved) return { ok: false, error: 'invalid slug' };
1478
+ try {
1479
+ await config.writeTextAtomic(resolved, parsed.body);
1480
+ } catch (e) {
1481
+ return { ok: false, error: e?.message ?? 'write failed' };
1482
+ }
1483
+ try {
1484
+ const stat = await fsp.stat(resolved);
1485
+ return { ok: true, bytesWritten: stat.size };
1486
+ } catch (e) {
1487
+ return { ok: false, error: e?.message ?? 'stat failed' };
1488
+ }
1456
1489
  });
1457
1490
 
1458
1491
  ipcMain.handle('schedule:list-prds', async () => {
@@ -1460,7 +1493,8 @@ function registerScheduleHandlers() {
1460
1493
  let entries;
1461
1494
  try {
1462
1495
  entries = await fsp.readdir(PRDS_DIR);
1463
- } catch {
1496
+ } catch (e) {
1497
+ logs.writeLine({ level: 'warn', scope: 'scheduler', message: 'list-prds: readdir failed', meta: { error: e?.message } });
1464
1498
  return [];
1465
1499
  }
1466
1500
  const out = [];
@@ -1468,7 +1502,7 @@ function registerScheduleHandlers() {
1468
1502
  if (!name.endsWith('.md') || name.startsWith('.')) continue;
1469
1503
  const filePath = path.join(PRDS_DIR, name);
1470
1504
  try {
1471
- const parsed = parsePrd(filePath);
1505
+ const parsed = await parsePrd(filePath);
1472
1506
  const stat = await fsp.stat(filePath);
1473
1507
  out.push({
1474
1508
  slug: parsed.slug,
@@ -1479,7 +1513,7 @@ function registerScheduleHandlers() {
1479
1513
  mtimeMs: stat.mtimeMs,
1480
1514
  });
1481
1515
  } catch (e) {
1482
- console.warn('[scheduler] list-prds: skipping unparseable', name, e?.message);
1516
+ logs.writeLine({ level: 'warn', scope: 'scheduler', message: 'list-prds: skipping unparseable file', meta: { name, error: e?.message } });
1483
1517
  }
1484
1518
  }
1485
1519
  out.sort((a, b) => a.slug.localeCompare(b.slug, undefined, { numeric: true }));
@@ -1509,7 +1543,7 @@ async function init() {
1509
1543
 
1510
1544
  // If we boot up while paused with a resumeAt in the past, clear it. This
1511
1545
  // happens when the app was closed across the reset window.
1512
- const boot = readQueue();
1546
+ const boot = await readQueue();
1513
1547
  if (boot.paused && boot.paused.resumeAt && new Date(boot.paused.resumeAt).getTime() <= Date.now()) {
1514
1548
  await clearPause('boot-elapsed');
1515
1549
  } else if (boot.paused && boot.paused.resumeAt) {
@@ -1530,15 +1564,20 @@ async function init() {
1530
1564
  pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
1531
1565
  if (pollLoopTimer.unref) pollLoopTimer.unref();
1532
1566
 
1533
- // Supervisor: probe running jobs for wedged poll-loops.
1567
+ // Supervisor: probe running jobs for wedged poll-loops. Supervisor calls
1568
+ // its injected readQueue() synchronously from supervisorTick and applyAction,
1569
+ // so pass the sync variant; the 15-min probe cadence makes the blocking cost
1570
+ // negligible vs IPC handler latency.
1534
1571
  if (process.env.SM_SUPERVISOR_DISABLE !== '1') {
1535
- supervisor.startSupervisor({ readQueue, mutate });
1572
+ supervisor.startSupervisor({ readQueue: readQueueSync });
1536
1573
  }
1537
1574
 
1538
1575
  // Heartbeat: once per minute, log queue state for 24h visibility.
1576
+ // setInterval callback is sync; readQueueSync stays sync to avoid awaiting
1577
+ // inside the timer body (and the 60s cadence makes the cost moot).
1539
1578
  if (heartbeatInterval) clearInterval(heartbeatInterval);
1540
1579
  heartbeatInterval = setInterval(() => {
1541
- const s = readQueue();
1580
+ const s = readQueueSync();
1542
1581
  const counts = { pending: 0, running: 0, completed: 0, failed: 0 };
1543
1582
  for (const j of s.jobs) counts[j.status] = (counts[j.status] || 0) + 1;
1544
1583
  appendHeartbeat({
@@ -1560,8 +1599,10 @@ async function init() {
1560
1599
  if (pollLoopTimer) { clearTimeout(pollLoopTimer); pollLoopTimer = null; }
1561
1600
  backoffMs = 0;
1562
1601
  backoffNextAt = null;
1563
- // Clear any paused-but-resumeAt-elapsed state immediately.
1564
- const wakeState = readQueue();
1602
+ // Clear any paused-but-resumeAt-elapsed state immediately. Sync read:
1603
+ // the powerMonitor 'resume' callback fires rarely and isn't on the IPC
1604
+ // hot path; switching to async would require an IIFE wrapper for no gain.
1605
+ const wakeState = readQueueSync();
1565
1606
  if (wakeState.paused?.resumeAt && new Date(wakeState.paused.resumeAt).getTime() <= Date.now()) {
1566
1607
  clearPause('boot-elapsed').then(() => { runDueJobs().catch(() => {}); }).catch(() => {});
1567
1608
  }