claude-code-session-manager 0.8.3 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/LICENSE +21 -0
  2. package/dist/assets/{cssMode-DyaNC2Cs.js → cssMode-BCLoTYI0.js} +1 -1
  3. package/dist/assets/{editor.main-BhSGi_Jw.js → editor.main-UoasbVGy.js} +3 -3
  4. package/dist/assets/{freemarker2-DZH3si5v.js → freemarker2-dhfKZR7u.js} +1 -1
  5. package/dist/assets/{handlebars-DvzTd6uL.js → handlebars-DdpqwFuV.js} +1 -1
  6. package/dist/assets/{html-C5GmopAN.js → html-1oTJClkg.js} +1 -1
  7. package/dist/assets/{htmlMode-DwnrHwx1.js → htmlMode-CF1QbIg-.js} +1 -1
  8. package/dist/assets/index-DWDcKbgI.js +3046 -0
  9. package/dist/assets/index-eqxng9X2.css +32 -0
  10. package/dist/assets/{javascript-JqHrxiCa.js → javascript-BP_Q5MFx.js} +1 -1
  11. package/dist/assets/{jsonMode-8rZcy09i.js → jsonMode-BtjA-2w_.js} +1 -1
  12. package/dist/assets/{liquid-ClpD_v7G.js → liquid-DstuL8vm.js} +1 -1
  13. package/dist/assets/{lspLanguageFeatures-u0WgQBQz.js → lspLanguageFeatures-DvSiaY4f.js} +1 -1
  14. package/dist/assets/{mdx-DtViUgdm.js → mdx-qO-uvsJd.js} +1 -1
  15. package/dist/assets/{python-CaAvhRGm.js → python-CCPz_1cy.js} +1 -1
  16. package/dist/assets/{razor-saGNVU7l.js → razor-B7tCzkdh.js} +1 -1
  17. package/dist/assets/{tsMode-HZwWTCj8.js → tsMode-hUkEyjsH.js} +1 -1
  18. package/dist/assets/{typescript-BInV4PNE.js → typescript-BeXECzAk.js} +1 -1
  19. package/dist/assets/{whisperWorker-ivwFFLMj.js → whisperWorker-QfIS0sPF.js} +5 -5
  20. package/dist/assets/{xml-tgO806YR.js → xml-MRJd4GHf.js} +1 -1
  21. package/dist/assets/{yaml-CHApZArv.js → yaml-CzGliMNL.js} +1 -1
  22. package/dist/index.html +2 -2
  23. package/package.json +16 -1
  24. package/src/main/historyAggregator.cjs +208 -0
  25. package/src/main/index.cjs +4 -0
  26. package/src/main/ipcSchemas.cjs +15 -0
  27. package/src/main/lib/schedulerConfig.cjs +2 -0
  28. package/src/main/scheduler.cjs +604 -120
  29. package/src/main/supervisor.cjs +512 -0
  30. package/src/main/usage.cjs +44 -2
  31. package/src/preload/api.d.ts +64 -2
  32. package/src/preload/index.cjs +10 -0
  33. package/dist/assets/index-BGshD4Pw.js +0 -2976
  34. package/dist/assets/index-DCK87t79.css +0 -32
@@ -48,26 +48,57 @@ const { spawn } = require('node:child_process');
48
48
  const { ipcMain } = require('electron');
49
49
  const billing = require('./usage.cjs');
50
50
  const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
51
+ const supervisor = require('./supervisor.cjs');
51
52
  const {
52
53
  POLL_INTERVAL_MS,
53
54
  USAGE_REFRESH_INTERVAL_MS,
54
55
  MAX_JOB_DURATION_MS,
55
56
  } = require('./lib/schedulerConfig.cjs');
56
57
 
58
+ const MAX_INVESTIGATION_DURATION_MS = 30 * 60_000;
59
+
60
+ // After the agent emits a `result` event in its JSONL stream, the parent
61
+ // `claude -p` process should exit promptly. Real-world failure (2026-05-10
62
+ // cellar-publish): an agent emitted result=success, then spawned unbounded
63
+ // `until $(curl ...)` background bashes that kept the parent alive for 22
64
+ // minutes until manual intervention. The post-result watchdog catches this:
65
+ // if the process is still alive POST_RESULT_GRACE_MS after result, SIGTERM
66
+ // the whole process group; if still alive POST_RESULT_KILL_MS after SIGTERM,
67
+ // SIGKILL. The original `result.subtype` is preserved and used to map the
68
+ // kill exit code back to 0 so legit work isn't mismarked as failed.
69
+ const POST_RESULT_GRACE_MS = 90_000;
70
+ const POST_RESULT_KILL_MS = 30_000;
71
+ const RESULT_TAIL_POLL_MS = 5_000;
72
+ const RESULT_TAIL_BYTES = 8 * 1024;
73
+
74
+ // Idle-output watchdog: if the log file mtime stops advancing for this long
75
+ // while the process is still alive, the agent is hung mid-work (network
76
+ // stall, infinite tool loop, compaction wedge). User rule: anything not
77
+ // making progress for 20 minutes is presumed stuck. SIGTERM the process
78
+ // group, then SIGKILL after POST_RESULT_KILL_MS. The scheduler logs this
79
+ // distinctly from MAX_JOB_DURATION_MS so post-mortems can tell them apart.
80
+ const IDLE_OUTPUT_KILL_MS = 20 * 60_000;
81
+ const IDLE_CHECK_INTERVAL_MS = 60_000;
82
+
57
83
  const ROOT = path.join(os.homedir(), '.claude', 'session-manager', 'scheduled-plans');
58
84
  const PRDS_DIR = path.join(ROOT, 'prds');
59
85
  const RUNS_DIR = path.join(ROOT, 'runs');
86
+ const PRDS_ARCHIVE_DIR = path.join(ROOT, 'prds-archived');
60
87
  const QUEUE_PATH = path.join(ROOT, 'queue.json');
61
88
  const SCHEDULER_STATE_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-state.json');
62
89
  const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-heartbeat.log');
63
90
  const HEARTBEAT_MAX_BYTES = 1024 * 1024;
64
91
  const DEFAULT_PROJECT_CWD = path.join(os.homedir(), 'Projects', 'session-manager');
65
92
 
93
+ const ENV_CAP = process.env.SM_SCHEDULER_MAX_CONCURRENCY
94
+ ? Math.max(1, Math.min(20, parseInt(process.env.SM_SCHEDULER_MAX_CONCURRENCY, 10) || 4))
95
+ : null;
96
+
66
97
  const DEFAULT_CONFIG = {
67
98
  // Legacy on/off retained for backwards compat; v0.5+ uses firePolicy.
68
99
  enabled: false,
69
100
  offsetMinutes: 15,
70
- concurrencyCap: 5,
101
+ concurrencyCap: ENV_CAP ?? 4,
71
102
  defaultCwd: DEFAULT_PROJECT_CWD,
72
103
  // 'when-available' = poll usage and fire whenever utilization < threshold.
73
104
  // 'on-reset' = fire offsetMinutes after the next 5h reset (legacy).
@@ -76,6 +107,12 @@ const DEFAULT_CONFIG = {
76
107
  // For 'when-available'. Fire only when five_hour utilization < this percent.
77
108
  utilizationThreshold: 90,
78
109
  schemaVersion: 1,
110
+ supervisor: {
111
+ enabled: true,
112
+ intervalMinutes: 15,
113
+ maxConcurrentProbes: 2,
114
+ probeStaleThresholdMinutes: 10,
115
+ },
79
116
  };
80
117
 
81
118
  // ---------- fs helpers ----------
@@ -324,6 +361,8 @@ let consecutiveFailures = 0;
324
361
  let backoffMs = 0;
325
362
  let backoffNextAt = null;
326
363
  let firstFailureAt = null;
364
+ let firstNon429FailureAt = null; // tracks only transient/config failures; 429s don't count toward network-pause threshold
365
+ let lastFailureKind = null; // 'transient' | 'meter_rate_limited' | 'auth' | null
327
366
  let pauseClearedManuallyAt = null;
328
367
 
329
368
  // ---------- timer ----------
@@ -334,7 +373,9 @@ let resumeTimer = null;
334
373
  let pollLoopTimer = null;
335
374
  let rescheduleInterval = null;
336
375
  let heartbeatInterval = null;
337
- let isExecuting = false;
376
+ // In-memory set of slugs currently spawned in this process. Prevents
377
+ // double-spawn when runDueJobs() is called while jobs are in flight.
378
+ const runningSet = new Set();
338
379
  let cancelToken = { cancelled: false };
339
380
  let claudeBinPathCached = null;
340
381
 
@@ -569,22 +610,112 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
569
610
  cwd,
570
611
  env: childEnv,
571
612
  stdio: ['ignore', fd, fd],
613
+ // detached:true puts the child in its own process group so we can kill
614
+ // the entire descendant tree (including any stray background bashes the
615
+ // agent spawned) with `process.kill(-pid)`. Without this, child.kill()
616
+ // only kills the immediate `claude` process, leaving orphaned subprocs
617
+ // that keep the parent alive (the 2026-05-10 cellar-publish hang).
618
+ detached: true,
572
619
  });
573
620
 
574
- fs.writeSync(fd, `[scheduler] spawned pid=${child.pid}\n\n`);
621
+ fs.writeSync(fd, `[scheduler] spawned pid=${child.pid} (process group)\n\n`);
575
622
 
576
623
  // Fire-and-forget pid persistence — best effort.
577
624
  if (onPid) onPid(child.pid).catch(() => {});
578
625
 
626
+ // Track whether the agent has emitted a `result` event in its JSONL stream.
627
+ // null until seen; then one of "success" | "error_max_turns" | ... per the
628
+ // claude harness's result subtype taxonomy.
629
+ let agentResultSubtype = null;
630
+ let postResultTimer = null;
631
+ let postResultKillTimer = null;
632
+
633
+ const killTree = (signal) => {
634
+ // Kill the whole process group. Negative pid targets the group leader's
635
+ // group (only works because we spawned with detached:true).
636
+ try { process.kill(-child.pid, signal); return true; }
637
+ catch {
638
+ try { process.kill(child.pid, signal); return true; }
639
+ catch { return false; /* already dead */ }
640
+ }
641
+ };
642
+
643
+ // Tail the log for {"type":"result","subtype":"..."} events. When we see
644
+ // one, start the post-result grace timer — the agent has declared done,
645
+ // so the process should exit promptly. If not, something is hanging
646
+ // (the cellar-publish failure mode).
647
+ const resultTailer = setInterval(() => {
648
+ if (agentResultSubtype) return; // already seen; tailer will be cleared below
649
+ try {
650
+ const stat = fs.statSync(logPath);
651
+ if (stat.size === 0) return;
652
+ const n = Math.min(stat.size, RESULT_TAIL_BYTES);
653
+ const buf = Buffer.alloc(n);
654
+ const fdR = fs.openSync(logPath, 'r');
655
+ fs.readSync(fdR, buf, 0, n, stat.size - n);
656
+ fs.closeSync(fdR);
657
+ const m = buf.toString('utf8').match(/\{"type":"result","subtype":"([a-z_]+)"/);
658
+ if (!m) return;
659
+ agentResultSubtype = m[1];
660
+ fs.writeSync(fd, `\n[scheduler] result event detected (subtype=${agentResultSubtype}); ` +
661
+ `starting ${Math.round(POST_RESULT_GRACE_MS/1000)}s exit-grace timer\n`);
662
+ clearInterval(resultTailer);
663
+ postResultTimer = setTimeout(() => {
664
+ fs.writeSync(fd, `\n[scheduler] post-result grace expired (${Math.round(POST_RESULT_GRACE_MS/1000)}s); ` +
665
+ `child still alive — SIGTERM process group\n`);
666
+ killTree('SIGTERM');
667
+ postResultKillTimer = setTimeout(() => {
668
+ fs.writeSync(fd, `\n[scheduler] still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
669
+ killTree('SIGKILL');
670
+ }, POST_RESULT_KILL_MS);
671
+ if (postResultKillTimer.unref) postResultKillTimer.unref();
672
+ }, POST_RESULT_GRACE_MS);
673
+ if (postResultTimer.unref) postResultTimer.unref();
674
+ } catch { /* log not readable yet; try again */ }
675
+ }, RESULT_TAIL_POLL_MS);
676
+ if (resultTailer.unref) resultTailer.unref();
677
+
579
678
  // Kill the child if it runs past the maximum allowed duration.
580
679
  const watchdog = setTimeout(() => {
581
680
  fs.writeSync(fd, `\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
582
- try { child.kill('SIGKILL'); } catch { /* already dead */ }
681
+ killTree('SIGKILL');
583
682
  }, MAX_JOB_DURATION_MS);
584
683
  if (watchdog.unref) watchdog.unref();
585
684
 
586
- child.on('error', (err) => {
685
+ // Idle-output watchdog: poll log mtime every IDLE_CHECK_INTERVAL_MS; if
686
+ // it hasn't advanced in IDLE_OUTPUT_KILL_MS, presume the agent is stuck
687
+ // and SIGTERM the process group.
688
+ let idleKillTimer = null;
689
+ const idleChecker = setInterval(() => {
690
+ try {
691
+ const stat = fs.statSync(logPath);
692
+ const idleMs = Date.now() - stat.mtimeMs;
693
+ if (idleMs > IDLE_OUTPUT_KILL_MS) {
694
+ fs.writeSync(fd, `\n[scheduler] idle-output watchdog: log mtime stalled ` +
695
+ `${Math.round(idleMs/1000)}s (> ${Math.round(IDLE_OUTPUT_KILL_MS/1000)}s threshold) — SIGTERM process group\n`);
696
+ clearInterval(idleChecker);
697
+ killTree('SIGTERM');
698
+ idleKillTimer = setTimeout(() => {
699
+ fs.writeSync(fd, `\n[scheduler] idle watchdog: still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
700
+ killTree('SIGKILL');
701
+ }, POST_RESULT_KILL_MS);
702
+ if (idleKillTimer.unref) idleKillTimer.unref();
703
+ }
704
+ } catch { /* log not statable; skip */ }
705
+ }, IDLE_CHECK_INTERVAL_MS);
706
+ if (idleChecker.unref) idleChecker.unref();
707
+
708
+ const clearAllTimers = () => {
587
709
  clearTimeout(watchdog);
710
+ clearInterval(resultTailer);
711
+ clearInterval(idleChecker);
712
+ if (postResultTimer) clearTimeout(postResultTimer);
713
+ if (postResultKillTimer) clearTimeout(postResultKillTimer);
714
+ if (idleKillTimer) clearTimeout(idleKillTimer);
715
+ };
716
+
717
+ child.on('error', (err) => {
718
+ clearAllTimers();
588
719
  const durationMs = Date.now() - startedAt;
589
720
  fs.writeSync(fd, `\n[scheduler] error: ${err.message}\n`);
590
721
  closeFd();
@@ -592,132 +723,395 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
592
723
  resolve({ exitCode: -1, durationMs, error: err.message });
593
724
  });
594
725
 
595
- child.on('exit', (code) => {
596
- clearTimeout(watchdog);
726
+ child.on('exit', (code, signal) => {
727
+ clearAllTimers();
597
728
  const durationMs = Date.now() - startedAt;
598
- fs.writeSync(fd, `\n[scheduler] exit code=${code} duration=${Math.round(durationMs / 1000)}s\n`);
729
+ // If we SIGTERM'd because of the post-result watchdog AND the agent had
730
+ // emitted result=success, the work succeeded; only the cleanup hung.
731
+ // Map the kill exit code to 0 so the job is marked completed, not failed.
732
+ // Node's child.on('exit') reports either code (normal) or signal (killed);
733
+ // when killed by signal, code is null. We also check 143 (128+SIGTERM)
734
+ // and 137 (128+SIGKILL) in case the process exited via signal-as-code.
735
+ let effectiveCode = code;
736
+ const killedBySignal = signal === 'SIGTERM' || signal === 'SIGKILL' || code === 143 || code === 137 || code === null;
737
+ const mappedToSuccess = agentResultSubtype === 'success' && killedBySignal;
738
+ if (mappedToSuccess) {
739
+ effectiveCode = 0;
740
+ fs.writeSync(fd, `\n[scheduler] mapping exit code=${code} signal=${signal} → 0 ` +
741
+ `(result=success was emitted before kill)\n`);
742
+ }
743
+ fs.writeSync(fd, `\n[scheduler] exit code=${effectiveCode} (raw code=${code} signal=${signal}) ` +
744
+ `duration=${Math.round(durationMs / 1000)}s\n`);
599
745
  closeFd();
600
- const rateLimited = code !== 0 && detectRateLimitInLog(logPath);
601
- atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: code, rateLimited, startedAt, finishedAt: Date.now(), durationMs });
602
- resolve({ exitCode: code, durationMs, rateLimited });
746
+ const rateLimited = effectiveCode !== 0 && detectRateLimitInLog(logPath);
747
+ atomicWriteJson(metaPath, {
748
+ slug: job.slug, cwd, exitCode: effectiveCode, rateLimited,
749
+ startedAt, finishedAt: Date.now(), durationMs,
750
+ agentResultSubtype, mappedFromSignal: mappedToSuccess ? signal || `code=${code}` : null,
751
+ });
752
+ resolve({ exitCode: effectiveCode, durationMs, rateLimited });
603
753
  });
604
754
  });
605
755
  }
606
756
 
607
- async function runDueJobs() {
608
- if (isExecuting) return;
609
- isExecuting = true;
610
- cancelToken = { cancelled: false };
611
- try {
612
- const state = readQueue();
613
- if (state.paused) {
614
- console.log('[scheduler] runDueJobs skipped: paused');
615
- return;
757
/**
 * Pick the next batch of jobs to spawn this tick.
 *
 * A job is "in flight" when its slug is in `running` (spawned by this
 * process) or its queue entry has status 'running' (persisted in queue.json,
 * e.g. by a previous session that has not been orphan-reset yet). Jobs without
 * a `parallelGroup` are treated as group 99.
 *
 * The decision is driven by the lowest parallelGroup that still has pending
 * jobs not in `running`:
 *   1. No group in flight: start that group.
 *   2. It equals the lowest in-flight group: backfill from that same group.
 *   3. It is higher than the lowest in-flight group: return [] and wait for
 *      the earlier group to drain before advancing.
 *   4. It is lower than the lowest in-flight group (late arrival, e.g. a PRD
 *      that reconciled after a higher-numbered group was already picked):
 *      fire it now, alongside the active group, instead of starving it.
 * Only jobs from a single group are ever returned by one call, so a group
 * never bleeds into its neighbours.
 *
 * Free slots are `cap` minus every in-flight job, counted as the union of
 * `running` and the queue entries marked 'running'. (Taking the larger of the
 * two counts under-counts when the sets are disjoint — a slug just added to
 * `running` is still 'pending' in the queue snapshot — and lets the cap be
 * exceeded.)
 *
 * Runs in O(N) over `allJobs`.
 *
 * @param {Array<object>} allJobs - queue entries ({ slug, status, parallelGroup? }).
 * @param {Set<string>} running - slugs currently spawned by this process.
 * @param {number} cap - maximum number of jobs allowed in flight.
 * @returns {Array<object>} jobs to spawn now (possibly empty).
 */
function pickNextBatch(allJobs, running, cap) {
  const groupOf = (job) => job.parallelGroup ?? 99;

  const pending = allJobs.filter((j) => j.status === 'pending' && !running.has(j.slug));
  if (pending.length === 0) return [];

  // One pass over the queue: collect the groups that have something in
  // flight and count persisted-'running' entries this process is not tracking.
  const activeGroups = new Set();
  let persistedOnlyRunning = 0;
  for (const j of allJobs) {
    const trackedHere = running.has(j.slug);
    if (trackedHere || j.status === 'running') activeGroups.add(groupOf(j));
    if (!trackedHere && j.status === 'running') persistedOnlyRunning += 1;
  }

  // Union of in-process spawns and queue.json 'running' entries.
  const effectiveRunning = running.size + persistedOnlyRunning;
  const slots = cap - effectiveRunning;

  const lowestPendingGroup = pending.reduce(
    (min, j) => Math.min(min, groupOf(j)),
    Infinity,
  );
  const takeFromGroup = (group) => pending.filter((j) => groupOf(j) === group).slice(0, slots);

  if (activeGroups.size === 0) {
    // No active group — start the next group fresh.
    if (slots <= 0) {
      console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
      return [];
    }
    const batch = takeFromGroup(lowestPendingGroup);
    console.log(`[scheduler] concurrency: starting g${lowestPendingGroup} with ${batch.length} job(s) (cap ${cap})`);
    return batch;
  }

  const lowestActive = Math.min(...activeGroups);

  if (lowestPendingGroup > lowestActive) {
    // Earlier group still running — wait for it to drain before advancing.
    console.log(`[scheduler] concurrency: g${lowestActive} in flight, holding g${lowestPendingGroup}`);
    return [];
  }

  if (lowestPendingGroup < lowestActive) {
    // Late arrival: honor priority and run it in parallel with the active
    // group rather than holding it until that group drains.
    if (slots <= 0) {
      console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots for late-arrival g${lowestPendingGroup}`);
      return [];
    }
    const batch = takeFromGroup(lowestPendingGroup);
    console.log(`[scheduler] concurrency: firing late-arrival g${lowestPendingGroup} (${batch.length} job(s)) alongside active g${lowestActive}`);
    return batch;
  }

  // Same group as the one in flight: backfill whatever slots remain.
  if (slots <= 0) {
    console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
    return [];
  }
  const batch = takeFromGroup(lowestActive);
  if (batch.length > 0) {
    console.log(`[scheduler] concurrency: backfilling ${batch.length} into g${lowestActive} (${effectiveRunning}/${cap} running)`);
  }
  return batch;
}
853
+
854
/**
 * Tell whether `slug` names a fix plan, i.e. starts with digits followed by
 * `-fix-` (the `NN-fix-<base>` shape spawnInvestigation generates). Used to
 * avoid launching an investigation for a fix plan that itself failed.
 *
 * @param {string} slug - queue job slug.
 * @returns {boolean} true when the slug has the fix-plan prefix.
 */
function isFixPlanSlug(slug) {
  const fixPlanPrefix = /^\d+-fix-/;
  return fixPlanPrefix.test(slug);
}
623
861
 
624
- // Group by parallelGroup, ascending. Each group runs serially after the
625
- // previous group completes.
626
- const groups = new Map();
627
- for (const j of pending) {
628
- const g = j.parallelGroup ?? 99;
629
- if (!groups.has(g)) groups.set(g, []);
630
- groups.get(g).push(j);
862
/**
 * Read the last `bytes` bytes of a file and decode them as UTF-8.
 *
 * Best effort: any failure (missing file, unreadable file, invalid `bytes`)
 * yields ''. The file descriptor is always closed, including when sizing or
 * reading throws after the file was opened. The size is taken from the open
 * descriptor, so it cannot refer to a different file than the one being read.
 * Because the window starts at an arbitrary byte offset, the first character
 * may decode as U+FFFD if the cut lands inside a multi-byte sequence.
 *
 * @param {string} filePath - file to read.
 * @param {number} bytes - maximum number of trailing bytes to return.
 * @returns {string} decoded tail, or '' on error / empty window.
 */
function readTail(filePath, bytes) {
  let fd = null;
  try {
    fd = fs.openSync(filePath, 'r');
    const { size } = fs.fstatSync(fd);
    // Non-numeric, NaN, or negative requests collapse to an empty window.
    const requested = Math.max(0, Math.floor(Number(bytes)) || 0);
    const want = Math.min(size, requested);
    if (want === 0) return '';
    const buf = Buffer.alloc(want);
    // readSync may return fewer bytes than asked (file truncated meanwhile);
    // decode only what was actually read instead of trailing NUL padding.
    const got = fs.readSync(fd, buf, 0, want, size - want);
    return buf.toString('utf8', 0, got);
  } catch {
    return '';
  } finally {
    if (fd !== null) {
      try { fs.closeSync(fd); } catch { /* nothing useful to do on close failure */ }
    }
  }
}
878
+
879
+ /**
880
+ * Spawn an Opus investigation session for a failed job. The investigator's job
881
+ * is to read the failure log + original PRD, identify the root cause, and write
882
+ * a fix-plan PRD into prds/<NN>-fix-<base>.md. Reconcile picks it up; the next
883
+ * Sonnet slot fires it. Investigations themselves are NOT queue entries — they
884
+ * run out-of-band, so they don't consume the concurrency cap. They DO consume
885
+ * tokens, which the when-available throttle will reflect on the next poll.
886
+ *
887
+ * Skipped if the failed job is itself a fix-plan (avoids infinite recursion).
888
+ */
889
+ async function spawnInvestigation(failedJob, runDir) {
890
+ if (isFixPlanSlug(failedJob.slug)) {
891
+ console.log(`[scheduler] skip investigation: ${failedJob.slug} is itself a fix plan`);
892
+ return;
893
+ }
894
+
895
+ const failedLogPath = path.join(runDir, `${failedJob.slug}.log`);
896
+ const investigationLogPath = path.join(runDir, `${failedJob.slug}.investigation.log`);
897
+
898
+ let originalBody = '';
899
+ try {
900
+ originalBody = parsePrd(path.join(PRDS_DIR, `${failedJob.slug}.md`)).body;
901
+ } catch {
902
+ originalBody = failedJob.bodyPreview || '(original PRD missing from disk)';
903
+ }
904
+
905
+ const logTail = readTail(failedLogPath, 16 * 1024) || '(failed to read log)';
906
+
907
+ const baseSlug = failedJob.slug.replace(/^\d+-/, '');
908
+ const group = failedJob.parallelGroup ?? 99;
909
+ const fixSlug = `${String(group).padStart(2, '0')}-fix-${baseSlug}`;
910
+ const fixPath = path.join(PRDS_DIR, `${fixSlug}.md`);
911
+
912
+ if (fs.existsSync(fixPath)) {
913
+ console.log(`[scheduler] skip investigation: fix plan already exists at ${fixPath}`);
914
+ return;
915
+ }
916
+
917
+ const cwd = failedJob.cwd || DEFAULT_PROJECT_CWD;
918
+ const prompt = `You are investigating a failed scheduled job in the session-manager queue. Your ONLY job is to write a fix-plan PRD file. Do NOT attempt the fix yourself.
919
+
920
+ # Failed job
921
+ - Slug: ${failedJob.slug}
922
+ - Title: ${failedJob.title}
923
+ - cwd: ${cwd}
924
+ - Exit code: ${failedJob.exitCode}
925
+ - Full failure log: ${failedLogPath}
926
+
927
+ # Original PRD body (this is what the job was trying to do)
928
+ \`\`\`
929
+ ${originalBody}
930
+ \`\`\`
931
+
932
+ # Last ~16KB of the failure log (stream-json format from \`claude -p\`)
933
+ \`\`\`
934
+ ${logTail}
935
+ \`\`\`
936
+
937
+ # Your task
938
+ 1. Read the full failure log at ${failedLogPath} if the tail above isn't sufficient.
939
+ 2. Read source files in ${cwd} as needed to understand the context.
940
+ 3. Identify the root cause of the failure.
941
+ 4. Write a NEW fix-plan PRD file at exactly this path:
942
+
943
+ ${fixPath}
944
+
945
+ 5. The frontmatter MUST be exactly this format (no extra keys):
946
+ \`\`\`
947
+ ---
948
+ title: Fix: <short summary of the fix>
949
+ cwd: ${cwd}
950
+ parallelGroup: ${group}
951
+ estimateMinutes: <your time estimate>
952
+ ---
953
+ \`\`\`
954
+ 6. The PRD body MUST be self-contained — \`claude -p\` runs it on a fresh Sonnet session with NO conversation context. Include:
955
+ - Root-cause analysis (what went wrong and why)
956
+ - Concrete fix steps (specific files / commands / edits)
957
+ - Verification command(s) the next agent should run to confirm the fix
958
+ - Acceptance criteria
959
+
960
+ DO NOT attempt the fix. ONLY write the file. When the file exists, exit immediately.`;
961
+
962
+ const fd = fs.openSync(investigationLogPath, 'a');
963
+ fs.writeSync(fd, `[scheduler] investigation starting for ${failedJob.slug} at ${new Date().toISOString()}\n[scheduler] target fix PRD: ${fixPath}\n\n`);
964
+
965
+ const claudeBin = resolveClaudeBin();
966
+ const childEnv = cleanChildEnv();
967
+ const child = spawn(claudeBin, [
968
+ '-p', prompt,
969
+ '--model', 'opus',
970
+ '--dangerously-skip-permissions',
971
+ '--output-format', 'stream-json',
972
+ '--verbose',
973
+ ], {
974
+ cwd,
975
+ env: childEnv,
976
+ stdio: ['ignore', fd, fd],
977
+ });
978
+
979
+ fs.writeSync(fd, `[scheduler] investigation pid=${child.pid}\n\n`);
980
+
981
+ const watchdog = setTimeout(() => {
982
+ fs.writeSync(fd, `\n[scheduler] investigation watchdog SIGKILL after ${MAX_INVESTIGATION_DURATION_MS}ms\n`);
983
+ try { child.kill('SIGKILL'); } catch { /* already dead */ }
984
+ }, MAX_INVESTIGATION_DURATION_MS);
985
+ if (watchdog.unref) watchdog.unref();
986
+
987
+ child.on('error', (err) => {
988
+ clearTimeout(watchdog);
989
+ try { fs.writeSync(fd, `\n[scheduler] investigation error: ${err.message}\n`); } catch { /* */ }
990
+ try { fs.closeSync(fd); } catch { /* */ }
991
+ });
992
+
993
+ child.on('exit', (code) => {
994
+ clearTimeout(watchdog);
995
+ try { fs.writeSync(fd, `\n[scheduler] investigation exit code=${code}\n`); } catch { /* */ }
996
+ try { fs.closeSync(fd); } catch { /* */ }
997
+ if (fs.existsSync(fixPath)) {
998
+ console.log(`[scheduler] investigation produced fix plan: ${fixSlug}`);
999
+ } else {
1000
+ console.log(`[scheduler] investigation finished WITHOUT producing fix plan (slug=${failedJob.slug}, code=${code})`);
631
1001
  }
632
- const groupKeys = Array.from(groups.keys()).sort((a, b) => a - b);
1002
+ // Trigger a tick so the new fix plan is reconciled into the queue and fired.
1003
+ tickQueue().catch(() => {});
1004
+ });
1005
+ }
633
1006
 
634
- await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
1007
/**
 * Run one queued job end to end and record the outcome in the queue.
 *
 * Steps: mark the entry 'running', execute it via executeJob (persisting the
 * child pid when it is reported), pause the queue if the run was rate
 * limited, then write the terminal status. A job that ends 'failed' triggers
 * an out-of-band spawnInvestigation. Whatever happens, the slug is removed
 * from runningSet and tickQueue() is invoked so the queue can advance.
 *
 * If any step throws, the error is logged and — best effort — an entry still
 * marked 'running' is flipped to 'failed' (exitCode -1, same as executeJob's
 * spawn-error result). Otherwise the entry would stay 'running' with no
 * process behind it and keep being counted as an occupied slot / active group
 * by pickNextBatch.
 *
 * @param {object} job - queue entry to run (uses `job.slug`).
 * @param {string} runId - identifier of the run this spawn belongs to.
 * @param {string} runDir - directory receiving the job's log/meta files.
 * @param {string} defaultCwd - fallback working directory passed to executeJob.
 * @returns {Promise<void>} settles after bookkeeping; never rejects.
 */
async function spawnJob(job, runId, runDir, defaultCwd) {
  // Registered synchronously (before the first await) so a pickNextBatch call
  // made while this job is starting cannot select the same slug again.
  runningSet.add(job.slug);

  // Apply `apply(entry, state)` to this job's queue entry, if it still exists.
  const updateJob = (apply) => mutate((s) => {
    const entry = s.jobs.find((x) => x.slug === job.slug);
    if (entry) apply(entry, s);
  });

  try {
    await updateJob((entry) => {
      entry.status = 'running';
      entry.runId = runId;
      entry.startedAt = new Date().toISOString();
    });
    broadcast();

    // onPid persists the child PID into the running entry.
    const res = await executeJob(job, runDir, defaultCwd, async (pid) => {
      await updateJob((entry) => {
        entry.runtime = { pid, runId, startedAt: entry.startedAt };
      });
    });

    // Pause before writing the terminal status so the status update below
    // can see the pause state.
    if (res.rateLimited) {
      const resetIso = await refreshNextReset().catch(() => cachedNextReset);
      await setPaused('rate_limit', resetIso);
    }

    let failedJobSnapshot = null;
    await updateJob((entry, s) => {
      const treatAsPending = res.rateLimited || (s.paused && s.paused.reason === 'rate_limit');
      if (treatAsPending) {
        resetJobFields(entry, res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
        return;
      }
      entry.status = res.exitCode === 0 ? 'completed' : 'failed';
      entry.finishedAt = new Date().toISOString();
      entry.exitCode = res.exitCode;
      entry.error = res.error || null;
      delete entry.runtime;
      if (entry.status === 'failed') failedJobSnapshot = { ...entry };
    });
    broadcast();

    if (failedJobSnapshot) {
      // Fire-and-forget: the investigation runs outside the queue.
      spawnInvestigation(failedJobSnapshot, runDir).catch((e) => {
        console.error('[scheduler] spawnInvestigation error', job.slug, e);
      });
    }
  } catch (e) {
    console.error('[scheduler] spawnJob error', job.slug, e);
    // Don't leave a phantom 'running' entry behind. Only entries still marked
    // 'running' are touched, so an already-written terminal status is kept.
    try {
      await updateJob((entry) => {
        if (entry.status !== 'running') return;
        entry.status = 'failed';
        entry.finishedAt = new Date().toISOString();
        entry.exitCode = -1;
        entry.error = e instanceof Error ? e.message : String(e);
        delete entry.runtime;
      });
      broadcast();
    } catch (recoveryErr) {
      console.error('[scheduler] spawnJob could not mark job failed', job.slug, recoveryErr);
    }
  } finally {
    runningSet.delete(job.slug);
    // Each job completion is a signal to advance the queue.
    tickQueue().catch(() => {});
  }
}
1070
+
1071
// Tail of the tick chain. Every tickQueue() call appends its work behind the
// previous tick, so two ticks never evaluate the same pending jobs at once.
let tickTail = Promise.resolve();

/**
 * Queue one scheduling pass behind any pass already in progress.
 *
 * A pass re-reads the queue, bails out when the queue is paused or the
 * current cancel token is cancelled, reconciles, asks pickNextBatch what to
 * start, stamps lastRunAt, and launches each selected job without awaiting it
 * (spawnJob calls tickQueue() again when the job finishes).
 *
 * @returns {Promise<void>} settles when this pass has finished launching;
 *   rejects if the pass threw (the chain itself keeps going regardless).
 */
function tickQueue() {
  const runPass = async () => {
    const state = readQueue();
    if (state.paused) {
      console.log('[scheduler] tickQueue skipped: paused');
      return;
    }
    if (cancelToken.cancelled) return;

    reconcile(state);
    // The environment override, when set, wins over the persisted config.
    const cap = ENV_CAP ?? state.config.concurrencyCap;
    const toStart = pickNextBatch(state.jobs, runningSet, cap);
    if (toStart.length === 0) return;

    await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
    broadcast();

    const { runId, dir: runDir } = pickRunDir();
    for (const job of toStart) {
      if (cancelToken.cancelled) break;
      // Not awaited on purpose: the pass only launches, it does not wait.
      spawnJob(job, runId, runDir, state.config.defaultCwd).catch(() => {});
    }
  };

  const thisPass = tickTail.then(runPass);
  // Swallow failures on the chain so one bad pass does not block later ones.
  tickTail = thisPass.catch(() => {});
  return thisPass;
}
1103
+
1104
/**
 * Kick off a scheduling pass for a due run.
 *
 * Does nothing (beyond logging) while the queue is paused. Otherwise installs
 * a fresh cancel token, waits for one tickQueue() pass to launch its batch,
 * then clears the one-shot `scheduledFor` marker and broadcasts. It does not
 * wait for the launched jobs to finish.
 *
 * @returns {Promise<void>}
 */
async function runDueJobs() {
  const { paused } = readQueue();
  if (paused) {
    console.log('[scheduler] runDueJobs skipped: paused');
    return;
  }

  cancelToken = { cancelled: false };
  await tickQueue();

  // Clear the one-shot scheduledFor without waiting for jobs to settle.
  await mutate((s) => { s.scheduledFor = null; });
  broadcast();
}
722
1116
 
723
1117
  // ---------- when-available launch logic ----------
@@ -725,16 +1119,15 @@ async function runDueJobs() {
725
1119
/**
 * Under the 'when-available' fire policy, tick the queue when there is pending
 * work and the cached utilization is below the configured threshold.
 *
 * @param {object} state Queue state; reads `config.firePolicy`,
 *   `config.utilizationThreshold`, `paused` and `jobs`.
 * @returns {Promise<void>}
 */
async function maybeLaunchWhenAvailable(state) {
  const policyActive = state.config.firePolicy === 'when-available';
  if (!policyActive || state.paused) return;

  // Pending jobs that are not already tracked as running.
  const waiting = state.jobs.filter(
    ({ status, slug }) => status === 'pending' && !runningSet.has(slug),
  );
  if (waiting.length === 0) return;

  // No utilization reading cached yet: nothing to decide on.
  if (cachedUtilization == null) return;

  const overThreshold = cachedUtilization >= state.config.utilizationThreshold;
  if (overThreshold) {
    broadcast();
    return;
  }

  console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${waiting.length} pending, ${runningSet.size} running ticking`);
  tickQueue().catch((err) => console.error('[scheduler] tickQueue error', err));
}
739
1132
 
740
1133
  // ---------- poll loop with exponential backoff ----------
@@ -750,6 +1143,8 @@ async function pollLoop() {
750
1143
  backoffMs = 0;
751
1144
  backoffNextAt = null;
752
1145
  firstFailureAt = null;
1146
+ firstNon429FailureAt = null;
1147
+ lastFailureKind = null;
753
1148
  lastPollAt = Date.now();
754
1149
  lastPollOk = true;
755
1150
  persistSchedulerState();
@@ -764,6 +1159,19 @@ async function pollLoop() {
764
1159
  await clearPause('reset-recovered');
765
1160
  }
766
1161
 
1162
+ await maybeLaunchWhenAvailable(cur);
1163
+ broadcast();
1164
+ } else if (r.kind === 'meter_rate_limited') {
1165
+ // Billing meter is itself being rate-limited. Treat as "utilization unknown but safe":
1166
+ // fire available jobs anyway at utilization=0 rather than pausing the queue.
1167
+ lastPollAt = Date.now();
1168
+ lastPollOk = false;
1169
+ consecutiveFailures++;
1170
+ lastFailureKind = 'meter_rate_limited';
1171
+ // Don't update firstNon429FailureAt — 429s don't count toward the 30-min network-pause threshold.
1172
+ cachedUtilization = 0; // assume safe; fire any pending work
1173
+ console.log(`[scheduler] billing meter rate-limited (HTTP 429) — firing on heuristic (failure #${consecutiveFailures})`);
1174
+ const cur = readQueue();
767
1175
  await maybeLaunchWhenAvailable(cur);
768
1176
  broadcast();
769
1177
  } else {
@@ -773,16 +1181,19 @@ async function pollLoop() {
773
1181
  if (!firstFailureAt) firstFailureAt = Date.now();
774
1182
 
775
1183
  if (r.kind === 'auth') {
1184
+ lastFailureKind = 'auth';
776
1185
  console.error(`[scheduler] auth failure (HTTP ${r.httpStatus}): ${r.message}`);
777
1186
  await setPaused('auth', null);
778
1187
  } else {
779
- // transient or config — apply exponential backoff.
1188
+ // transient or config — apply exponential backoff and count toward 30-min threshold.
1189
+ lastFailureKind = 'transient';
1190
+ if (!firstNon429FailureAt) firstNon429FailureAt = Date.now();
780
1191
  backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
781
- const totalFailureMs = Date.now() - firstFailureAt;
1192
+ const totalNon429FailureMs = Date.now() - firstNon429FailureAt;
782
1193
  console.log(`[scheduler] transient failure #${consecutiveFailures}: ${r.kind} ${r.message ?? ''}; retry in ${backoffMs / 1000}s`);
783
1194
 
784
- // After 30 minutes of consecutive failures, set 'network' pause.
785
- if (totalFailureMs > 30 * 60_000) {
1195
+ // After 30 minutes of consecutive non-429 failures, set 'network' pause.
1196
+ if (totalNon429FailureMs > 30 * 60_000) {
786
1197
  const cur2 = readQueue();
787
1198
  if (!cur2.paused || cur2.paused.reason === 'network') {
788
1199
  await setPaused('network', null);
@@ -798,7 +1209,9 @@ async function pollLoop() {
798
1209
  lastPollAt = Date.now();
799
1210
  lastPollOk = false;
800
1211
  consecutiveFailures++;
1212
+ lastFailureKind = 'transient';
801
1213
  if (!firstFailureAt) firstFailureAt = Date.now();
1214
+ if (!firstNon429FailureAt) firstNon429FailureAt = Date.now();
802
1215
  backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
803
1216
  backoffNextAt = Date.now() + backoffMs;
804
1217
  persistSchedulerState();
@@ -813,6 +1226,7 @@ async function pollLoop() {
813
1226
 
814
1227
  function registerScheduleHandlers() {
815
1228
  ensureDirs();
1229
+ supervisor.registerHandlers();
816
1230
 
817
1231
  ipcMain.handle('schedule:state', async () => {
818
1232
  const state = readQueue();
@@ -847,6 +1261,7 @@ function registerScheduleHandlers() {
847
1261
  lastPollAt,
848
1262
  lastPollOk,
849
1263
  consecutiveFailures,
1264
+ lastFailureKind,
850
1265
  backoffNextAt,
851
1266
  nextResetCached: cachedNextReset,
852
1267
  pausedSince: state.paused ? Date.parse(state.paused.since) : null,
@@ -855,6 +1270,14 @@ function registerScheduleHandlers() {
855
1270
  };
856
1271
  });
857
1272
 
1273
// schedule:force-tick: fire pending jobs right away, regardless of what the
// billing-poll gate currently reports. Any existing pause is cleared first
// (same semantics as run-now).
ipcMain.handle('schedule:force-tick', async () => {
  const reportFailure = (err) => {
    console.error('[scheduler] runDueJobs error (force-tick)', err);
  };

  await clearPause('run-now');
  // Deliberately not awaited: the reply is sent as soon as the run is kicked off.
  runDueJobs().catch(reportFailure);
  return { ok: true };
});
1280
+
858
1281
  ipcMain.handle('schedule:set-config', async (_e, partial) => {
859
1282
  const { schemas: s } = require('./ipcSchemas.cjs');
860
1283
  let validated;
@@ -864,7 +1287,11 @@ function registerScheduleHandlers() {
864
1287
  return { ok: false, error: e?.message ?? 'invalid config' };
865
1288
  }
866
1289
  const config = await mutate((state) => {
867
- state.config = { ...state.config, ...validated };
1290
+ const { supervisor: supPartial, ...rest } = validated;
1291
+ state.config = { ...state.config, ...rest };
1292
+ if (supPartial !== undefined) {
1293
+ state.config.supervisor = { ...(state.config.supervisor ?? {}), ...supPartial };
1294
+ }
868
1295
  return state.config;
869
1296
  });
870
1297
  await rescheduleTimer();
@@ -913,6 +1340,58 @@ function registerScheduleHandlers() {
913
1340
  return { ok: true, nextReset: at };
914
1341
  });
915
1342
 
1343
// schedule:rescan: re-scan the prds/ folder and merge the result into
// queue.json. `schedule:state` already reconciles on read; this handler is the
// renderer's explicit refresh path and additionally broadcasts so every view
// updates.
ipcMain.handle('schedule:rescan', async () => {
  const rescan = (queue) => {
    reconcile(queue);
    return null;
  };
  await mutate(rescan);
  broadcast();
  return { ok: true };
});
1354
+
1355
// Archive all pending+failed PRDs and drop their entries from queue.json.
// Completed/running entries are kept. PRD files are moved (not deleted) to
// prds-archived/<ISO>/ so the user can recover them. Path containment is
// enforced — only files inside PRDS_DIR are moved.
//
// Resolves to { ok, archived, archivedTo }: `archived` counts the files that
// were actually moved and `archivedTo` is the archive directory (null when
// there was nothing to clear).
ipcMain.handle('schedule:clear-queue', async () => {
  ensureDirs();
  const ts = new Date().toISOString().replace(/[:.]/g, '-');
  const archiveDir = path.join(PRDS_ARCHIVE_DIR, ts);
  const state = readQueue();
  // A job tracked in runningSet is in flight even when its persisted status
  // still reads 'pending' (maybeLaunchWhenAvailable filters on the same pair),
  // so it must be treated as running and left alone.
  const victims = state.jobs.filter(
    (j) => (j.status === 'pending' || j.status === 'failed') && !runningSet.has(j.slug),
  );
  if (victims.length === 0) {
    return { ok: true, archived: 0, archivedTo: null };
  }
  fs.mkdirSync(archiveDir, { recursive: true });
  let archived = 0;
  for (const job of victims) {
    const src = path.resolve(path.join(PRDS_DIR, `${job.slug}.md`));
    // Containment check: never move anything that resolves outside PRDS_DIR.
    if (!src.startsWith(PRDS_DIR + path.sep)) continue;
    const dst = path.join(archiveDir, `${job.slug}.md`);
    try {
      await fsp.rename(src, dst);
      archived++;
    } catch (e) {
      // ENOENT: the .md is already gone (reconcile would drop it on next
      // read anyway). Either way, fall through and remove from queue.
      if (e?.code !== 'ENOENT') {
        console.warn('[scheduler] clear-queue: rename failed', job.slug, e?.message);
      }
    }
  }
  // Built once, outside the mutate callback.
  const victimSlugs = new Set(victims.map((j) => j.slug));
  await mutate((s) => {
    s.jobs = s.jobs.filter((j) => !victimSlugs.has(j.slug));
    reconcile(s);
    return null;
  });
  broadcast();
  return { ok: true, archived, archivedTo: archiveDir };
});
1394
+
916
1395
  ipcMain.handle('schedule:open-folder', async () => {
917
1396
  const { shell } = require('electron');
918
1397
  await shell.openPath(ROOT);
@@ -1051,6 +1530,11 @@ async function init() {
1051
1530
  pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
1052
1531
  if (pollLoopTimer.unref) pollLoopTimer.unref();
1053
1532
 
1533
+ // Supervisor: probe running jobs for wedged poll-loops.
1534
+ if (process.env.SM_SUPERVISOR_DISABLE !== '1') {
1535
+ supervisor.startSupervisor({ readQueue, mutate });
1536
+ }
1537
+
1054
1538
  // Heartbeat: once per minute, log queue state for 24h visibility.
1055
1539
  if (heartbeatInterval) clearInterval(heartbeatInterval);
1056
1540
  heartbeatInterval = setInterval(() => {