claude-code-session-manager 0.8.3 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/LICENSE +21 -0
  2. package/dist/assets/{cssMode-DyaNC2Cs.js → cssMode-SwUA7tV8.js} +1 -1
  3. package/dist/assets/{editor.main-BhSGi_Jw.js → editor.main-C0vxDQaJ.js} +3 -3
  4. package/dist/assets/{freemarker2-DZH3si5v.js → freemarker2-KX6gG2yg.js} +1 -1
  5. package/dist/assets/{handlebars-DvzTd6uL.js → handlebars-B9LxZbcv.js} +1 -1
  6. package/dist/assets/{html-C5GmopAN.js → html-B-8ZJyzc.js} +1 -1
  7. package/dist/assets/{htmlMode-DwnrHwx1.js → htmlMode-BrQu9A96.js} +1 -1
  8. package/dist/assets/index-CrdBAMX2.js +3044 -0
  9. package/dist/assets/index-DpbPBSiS.css +32 -0
  10. package/dist/assets/{javascript-JqHrxiCa.js → javascript-D3vhfNJL.js} +1 -1
  11. package/dist/assets/{jsonMode-8rZcy09i.js → jsonMode-CCVQ7oTr.js} +1 -1
  12. package/dist/assets/{liquid-ClpD_v7G.js → liquid-DnwPncmC.js} +1 -1
  13. package/dist/assets/{lspLanguageFeatures-u0WgQBQz.js → lspLanguageFeatures-DrwXiqW0.js} +1 -1
  14. package/dist/assets/{mdx-DtViUgdm.js → mdx-C6MKH-vG.js} +1 -1
  15. package/dist/assets/{python-CaAvhRGm.js → python-Dp1TzxJl.js} +1 -1
  16. package/dist/assets/{razor-saGNVU7l.js → razor-KRJat9pO.js} +1 -1
  17. package/dist/assets/{tsMode-HZwWTCj8.js → tsMode-BZ-CF_4O.js} +1 -1
  18. package/dist/assets/{typescript-BInV4PNE.js → typescript-CTLs4m8W.js} +1 -1
  19. package/dist/assets/{whisperWorker-ivwFFLMj.js → whisperWorker-QfIS0sPF.js} +5 -5
  20. package/dist/assets/{xml-tgO806YR.js → xml-DUE-XnsH.js} +1 -1
  21. package/dist/assets/{yaml-CHApZArv.js → yaml-DSuhPI0o.js} +1 -1
  22. package/dist/index.html +2 -2
  23. package/package.json +16 -1
  24. package/src/main/historyAggregator.cjs +208 -0
  25. package/src/main/index.cjs +4 -0
  26. package/src/main/ipcSchemas.cjs +15 -0
  27. package/src/main/lib/schedulerConfig.cjs +2 -0
  28. package/src/main/scheduler.cjs +551 -120
  29. package/src/main/supervisor.cjs +512 -0
  30. package/src/main/usage.cjs +44 -2
  31. package/src/preload/api.d.ts +59 -2
  32. package/src/preload/index.cjs +8 -0
  33. package/dist/assets/index-BGshD4Pw.js +0 -2976
  34. package/dist/assets/index-DCK87t79.css +0 -32
@@ -48,12 +48,38 @@ const { spawn } = require('node:child_process');
48
48
  const { ipcMain } = require('electron');
49
49
  const billing = require('./usage.cjs');
50
50
  const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
51
+ const supervisor = require('./supervisor.cjs');
51
52
  const {
52
53
  POLL_INTERVAL_MS,
53
54
  USAGE_REFRESH_INTERVAL_MS,
54
55
  MAX_JOB_DURATION_MS,
55
56
  } = require('./lib/schedulerConfig.cjs');
56
57
 
58
+ const MAX_INVESTIGATION_DURATION_MS = 30 * 60_000;
59
+
60
+ // After the agent emits a `result` event in its JSONL stream, the parent
61
+ // `claude -p` process should exit promptly. Real-world failure (2026-05-10
62
+ // cellar-publish): an agent emitted result=success, then spawned unbounded
63
+ // `until $(curl ...)` background bashes that kept the parent alive for 22
64
+ // minutes until manual intervention. The post-result watchdog catches this:
65
+ // if the process is still alive POST_RESULT_GRACE_MS after result, SIGTERM
66
+ // the whole process group; if still alive POST_RESULT_KILL_MS after SIGTERM,
67
+ // SIGKILL. The original `result.subtype` is preserved and used to map the
68
+ // kill exit code back to 0 so legit work isn't mismarked as failed.
69
+ const POST_RESULT_GRACE_MS = 90_000;
70
+ const POST_RESULT_KILL_MS = 30_000;
71
+ const RESULT_TAIL_POLL_MS = 5_000;
72
+ const RESULT_TAIL_BYTES = 8 * 1024;
73
+
74
+ // Idle-output watchdog: if the log file mtime stops advancing for this long
75
+ // while the process is still alive, the agent is hung mid-work (network
76
+ // stall, infinite tool loop, compaction wedge). User rule: anything not
77
+ // making progress for 20 minutes is presumed stuck. SIGTERM the process
78
+ // group, then SIGKILL after POST_RESULT_KILL_MS. The scheduler logs this
79
+ // distinctly from MAX_JOB_DURATION_MS so post-mortems can tell them apart.
80
+ const IDLE_OUTPUT_KILL_MS = 20 * 60_000;
81
+ const IDLE_CHECK_INTERVAL_MS = 60_000;
82
+
57
83
  const ROOT = path.join(os.homedir(), '.claude', 'session-manager', 'scheduled-plans');
58
84
  const PRDS_DIR = path.join(ROOT, 'prds');
59
85
  const RUNS_DIR = path.join(ROOT, 'runs');
@@ -63,11 +89,15 @@ const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'sc
63
89
  const HEARTBEAT_MAX_BYTES = 1024 * 1024;
64
90
  const DEFAULT_PROJECT_CWD = path.join(os.homedir(), 'Projects', 'session-manager');
65
91
 
92
+ const ENV_CAP = process.env.SM_SCHEDULER_MAX_CONCURRENCY
93
+ ? Math.max(1, Math.min(20, parseInt(process.env.SM_SCHEDULER_MAX_CONCURRENCY, 10) || 4))
94
+ : null;
95
+
66
96
  const DEFAULT_CONFIG = {
67
97
  // Legacy on/off retained for backwards compat; v0.5+ uses firePolicy.
68
98
  enabled: false,
69
99
  offsetMinutes: 15,
70
- concurrencyCap: 5,
100
+ concurrencyCap: ENV_CAP ?? 4,
71
101
  defaultCwd: DEFAULT_PROJECT_CWD,
72
102
  // 'when-available' = poll usage and fire whenever utilization < threshold.
73
103
  // 'on-reset' = fire offsetMinutes after the next 5h reset (legacy).
@@ -76,6 +106,12 @@ const DEFAULT_CONFIG = {
76
106
  // For 'when-available'. Fire only when five_hour utilization < this percent.
77
107
  utilizationThreshold: 90,
78
108
  schemaVersion: 1,
109
+ supervisor: {
110
+ enabled: true,
111
+ intervalMinutes: 15,
112
+ maxConcurrentProbes: 2,
113
+ probeStaleThresholdMinutes: 10,
114
+ },
79
115
  };
80
116
 
81
117
  // ---------- fs helpers ----------
@@ -324,6 +360,8 @@ let consecutiveFailures = 0;
324
360
  let backoffMs = 0;
325
361
  let backoffNextAt = null;
326
362
  let firstFailureAt = null;
363
+ let firstNon429FailureAt = null; // tracks only transient/config failures; 429s don't count toward network-pause threshold
364
+ let lastFailureKind = null; // 'transient' | 'meter_rate_limited' | 'auth' | null
327
365
  let pauseClearedManuallyAt = null;
328
366
 
329
367
  // ---------- timer ----------
@@ -334,7 +372,9 @@ let resumeTimer = null;
334
372
  let pollLoopTimer = null;
335
373
  let rescheduleInterval = null;
336
374
  let heartbeatInterval = null;
337
- let isExecuting = false;
375
+ // In-memory set of slugs currently spawned in this process. Prevents
376
+ // double-spawn when runDueJobs() is called while jobs are in flight.
377
+ const runningSet = new Set();
338
378
  let cancelToken = { cancelled: false };
339
379
  let claudeBinPathCached = null;
340
380
 
@@ -569,22 +609,112 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
569
609
  cwd,
570
610
  env: childEnv,
571
611
  stdio: ['ignore', fd, fd],
612
+ // detached:true puts the child in its own process group so we can kill
613
+ // the entire descendant tree (including any stray background bashes the
614
+ // agent spawned) with `process.kill(-pid)`. Without this, child.kill()
615
+ // only kills the immediate `claude` process, leaving orphaned subprocs
616
+ // that keep the parent alive (the 2026-05-10 cellar-publish hang).
617
+ detached: true,
572
618
  });
573
619
 
574
- fs.writeSync(fd, `[scheduler] spawned pid=${child.pid}\n\n`);
620
+ fs.writeSync(fd, `[scheduler] spawned pid=${child.pid} (process group)\n\n`);
575
621
 
576
622
  // Fire-and-forget pid persistence — best effort.
577
623
  if (onPid) onPid(child.pid).catch(() => {});
578
624
 
625
+ // Track whether the agent has emitted a `result` event in its JSONL stream.
626
+ // null until seen; then one of "success" | "error_max_turns" | ... per the
627
+ // claude harness's result subtype taxonomy.
628
+ let agentResultSubtype = null;
629
+ let postResultTimer = null;
630
+ let postResultKillTimer = null;
631
+
632
+ const killTree = (signal) => {
633
+ // Kill the whole process group. Negative pid targets the group leader's
634
+ // group (only works because we spawned with detached:true).
635
+ try { process.kill(-child.pid, signal); return true; }
636
+ catch {
637
+ try { process.kill(child.pid, signal); return true; }
638
+ catch { return false; /* already dead */ }
639
+ }
640
+ };
641
+
642
+ // Tail the log for {"type":"result","subtype":"..."} events. When we see
643
+ // one, start the post-result grace timer — the agent has declared done,
644
+ // so the process should exit promptly. If not, something is hanging
645
+ // (the cellar-publish failure mode).
646
+ const resultTailer = setInterval(() => {
647
+ if (agentResultSubtype) return; // already seen; tailer will be cleared below
648
+ try {
649
+ const stat = fs.statSync(logPath);
650
+ if (stat.size === 0) return;
651
+ const n = Math.min(stat.size, RESULT_TAIL_BYTES);
652
+ const buf = Buffer.alloc(n);
653
+ const fdR = fs.openSync(logPath, 'r');
654
+ fs.readSync(fdR, buf, 0, n, stat.size - n);
655
+ fs.closeSync(fdR);
656
+ const m = buf.toString('utf8').match(/\{"type":"result","subtype":"([a-z_]+)"/);
657
+ if (!m) return;
658
+ agentResultSubtype = m[1];
659
+ fs.writeSync(fd, `\n[scheduler] result event detected (subtype=${agentResultSubtype}); ` +
660
+ `starting ${Math.round(POST_RESULT_GRACE_MS/1000)}s exit-grace timer\n`);
661
+ clearInterval(resultTailer);
662
+ postResultTimer = setTimeout(() => {
663
+ fs.writeSync(fd, `\n[scheduler] post-result grace expired (${Math.round(POST_RESULT_GRACE_MS/1000)}s); ` +
664
+ `child still alive — SIGTERM process group\n`);
665
+ killTree('SIGTERM');
666
+ postResultKillTimer = setTimeout(() => {
667
+ fs.writeSync(fd, `\n[scheduler] still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
668
+ killTree('SIGKILL');
669
+ }, POST_RESULT_KILL_MS);
670
+ if (postResultKillTimer.unref) postResultKillTimer.unref();
671
+ }, POST_RESULT_GRACE_MS);
672
+ if (postResultTimer.unref) postResultTimer.unref();
673
+ } catch { /* log not readable yet; try again */ }
674
+ }, RESULT_TAIL_POLL_MS);
675
+ if (resultTailer.unref) resultTailer.unref();
676
+
579
677
  // Kill the child if it runs past the maximum allowed duration.
580
678
  const watchdog = setTimeout(() => {
581
679
  fs.writeSync(fd, `\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
582
- try { child.kill('SIGKILL'); } catch { /* already dead */ }
680
+ killTree('SIGKILL');
583
681
  }, MAX_JOB_DURATION_MS);
584
682
  if (watchdog.unref) watchdog.unref();
585
683
 
586
- child.on('error', (err) => {
684
+ // Idle-output watchdog: poll log mtime every IDLE_CHECK_INTERVAL_MS; if
685
+ // it hasn't advanced in IDLE_OUTPUT_KILL_MS, presume the agent is stuck
686
+ // and SIGTERM the process group.
687
+ let idleKillTimer = null;
688
+ const idleChecker = setInterval(() => {
689
+ try {
690
+ const stat = fs.statSync(logPath);
691
+ const idleMs = Date.now() - stat.mtimeMs;
692
+ if (idleMs > IDLE_OUTPUT_KILL_MS) {
693
+ fs.writeSync(fd, `\n[scheduler] idle-output watchdog: log mtime stalled ` +
694
+ `${Math.round(idleMs/1000)}s (> ${Math.round(IDLE_OUTPUT_KILL_MS/1000)}s threshold) — SIGTERM process group\n`);
695
+ clearInterval(idleChecker);
696
+ killTree('SIGTERM');
697
+ idleKillTimer = setTimeout(() => {
698
+ fs.writeSync(fd, `\n[scheduler] idle watchdog: still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
699
+ killTree('SIGKILL');
700
+ }, POST_RESULT_KILL_MS);
701
+ if (idleKillTimer.unref) idleKillTimer.unref();
702
+ }
703
+ } catch { /* log not statable; skip */ }
704
+ }, IDLE_CHECK_INTERVAL_MS);
705
+ if (idleChecker.unref) idleChecker.unref();
706
+
707
+ const clearAllTimers = () => {
587
708
  clearTimeout(watchdog);
709
+ clearInterval(resultTailer);
710
+ clearInterval(idleChecker);
711
+ if (postResultTimer) clearTimeout(postResultTimer);
712
+ if (postResultKillTimer) clearTimeout(postResultKillTimer);
713
+ if (idleKillTimer) clearTimeout(idleKillTimer);
714
+ };
715
+
716
+ child.on('error', (err) => {
717
+ clearAllTimers();
588
718
  const durationMs = Date.now() - startedAt;
589
719
  fs.writeSync(fd, `\n[scheduler] error: ${err.message}\n`);
590
720
  closeFd();
@@ -592,132 +722,395 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
592
722
  resolve({ exitCode: -1, durationMs, error: err.message });
593
723
  });
594
724
 
595
- child.on('exit', (code) => {
596
- clearTimeout(watchdog);
725
+ child.on('exit', (code, signal) => {
726
+ clearAllTimers();
597
727
  const durationMs = Date.now() - startedAt;
598
- fs.writeSync(fd, `\n[scheduler] exit code=${code} duration=${Math.round(durationMs / 1000)}s\n`);
728
+ // If we SIGTERM'd because of the post-result watchdog AND the agent had
729
+ // emitted result=success, the work succeeded; only the cleanup hung.
730
+ // Map the kill exit code to 0 so the job is marked completed, not failed.
731
+ // Node's child.on('exit') reports either code (normal) or signal (killed);
732
+ // when killed by signal, code is null. We also check 143 (128+SIGTERM)
733
+ // and 137 (128+SIGKILL) in case the process exited via signal-as-code.
734
+ let effectiveCode = code;
735
+ const killedBySignal = signal === 'SIGTERM' || signal === 'SIGKILL' || code === 143 || code === 137 || code === null;
736
+ const mappedToSuccess = agentResultSubtype === 'success' && killedBySignal;
737
+ if (mappedToSuccess) {
738
+ effectiveCode = 0;
739
+ fs.writeSync(fd, `\n[scheduler] mapping exit code=${code} signal=${signal} → 0 ` +
740
+ `(result=success was emitted before kill)\n`);
741
+ }
742
+ fs.writeSync(fd, `\n[scheduler] exit code=${effectiveCode} (raw code=${code} signal=${signal}) ` +
743
+ `duration=${Math.round(durationMs / 1000)}s\n`);
599
744
  closeFd();
600
- const rateLimited = code !== 0 && detectRateLimitInLog(logPath);
601
- atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: code, rateLimited, startedAt, finishedAt: Date.now(), durationMs });
602
- resolve({ exitCode: code, durationMs, rateLimited });
745
+ const rateLimited = effectiveCode !== 0 && detectRateLimitInLog(logPath);
746
+ atomicWriteJson(metaPath, {
747
+ slug: job.slug, cwd, exitCode: effectiveCode, rateLimited,
748
+ startedAt, finishedAt: Date.now(), durationMs,
749
+ agentResultSubtype, mappedFromSignal: mappedToSuccess ? signal || `code=${code}` : null,
750
+ });
751
+ resolve({ exitCode: effectiveCode, durationMs, rateLimited });
603
752
  });
604
753
  });
605
754
  }
606
755
 
607
- async function runDueJobs() {
608
- if (isExecuting) return;
609
- isExecuting = true;
610
- cancelToken = { cancelled: false };
611
- try {
612
- const state = readQueue();
613
- if (state.paused) {
614
- console.log('[scheduler] runDueJobs skipped: paused');
615
- return;
756
/**
 * Pick the next batch of jobs to spawn this tick.
 *
 * Rules:
 *   1. Only jobs with status 'pending' whose slug is not in `running` are
 *      candidates.
 *   2. A group is "active" when at least one of its jobs is in `running`
 *      (spawned by this process) or is marked 'running' in the queue
 *      snapshot (e.g. persisted by a previous session).
 *   3. No active group: start the lowest-numbered pending group.
 *   4. Lowest pending group is HIGHER than the lowest active group: return
 *      nothing and wait for the earlier group to drain.
 *   5. Lowest pending group EQUALS the lowest active group: backfill that
 *      group up to the free slots.
 *   6. Late-arrival — lowest pending group is LOWER than the lowest active
 *      group (a higher-priority PRD reconciled after a later group was
 *      already picked): fire it now, alongside the active group, instead of
 *      starving it until the active group drains.
 *
 * Jobs without a `parallelGroup` are treated as group 99.
 *
 * Slot accounting uses the UNION of in-process slugs and queue entries
 * marked 'running'. Taking the larger of the two counts undercounts when the
 * sets only partially overlap (for example a freshly spawned job whose
 * 'running' status is not in this snapshot yet, plus a leftover 'running'
 * entry from an earlier session), which would let the batch exceed `cap`.
 *
 * @param {Array<{slug: string, status: string, parallelGroup?: number}>} allJobs
 *   Queue snapshot.
 * @param {Set<string>} running Slugs currently spawned by this process.
 * @param {number} cap Maximum number of jobs allowed in flight.
 * @returns {Array<object>} Jobs to spawn now (possibly empty). One pass over
 *   `allJobs` plus one pass over the pending subset.
 */
function pickNextBatch(allJobs, running, cap) {
  const groupOf = (job) => job.parallelGroup ?? 99;

  const pending = allJobs.filter((j) => j.status === 'pending' && !running.has(j.slug));
  if (pending.length === 0) return [];

  // Single pass: collect active groups and every distinct in-flight slug.
  // Seeding with `running` keeps slugs that are no longer in the snapshot
  // counted against the cap.
  const activeGroups = new Set();
  const inFlightSlugs = new Set(running);
  for (const j of allJobs) {
    if (running.has(j.slug) || j.status === 'running') {
      activeGroups.add(groupOf(j));
      inFlightSlugs.add(j.slug);
    }
  }
  const effectiveRunning = inFlightSlugs.size;
  const slots = cap - effectiveRunning;

  const lowestPendingGroup = pending.reduce(
    (min, j) => Math.min(min, groupOf(j)),
    Infinity,
  );
  const takeFromGroup = (group) => pending.filter((j) => groupOf(j) === group).slice(0, slots);

  if (activeGroups.size === 0) {
    // No active group — start the next group fresh.
    if (slots <= 0) {
      console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
      return [];
    }
    const batch = takeFromGroup(lowestPendingGroup);
    console.log(`[scheduler] concurrency: starting g${lowestPendingGroup} with ${batch.length} job(s) (cap ${cap})`);
    return batch;
  }

  const lowestActive = Math.min(...activeGroups);

  if (lowestPendingGroup > lowestActive) {
    // Earlier group still running — wait for it to drain before advancing.
    console.log(`[scheduler] concurrency: g${lowestActive} in flight, holding g${lowestPendingGroup}`);
    return [];
  }

  if (lowestPendingGroup < lowestActive) {
    // Late-arrival: honor priority and run it in parallel with the active
    // group rather than holding it until that group drains.
    if (slots <= 0) {
      console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots for late-arrival g${lowestPendingGroup}`);
      return [];
    }
    const batch = takeFromGroup(lowestPendingGroup);
    console.log(`[scheduler] concurrency: firing late-arrival g${lowestPendingGroup} (${batch.length} job(s)) alongside active g${lowestActive}`);
    return batch;
  }

  // Same group as the one in flight: backfill the remaining slots.
  if (slots <= 0) {
    console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
    return [];
  }
  const batch = takeFromGroup(lowestActive);
  if (batch.length > 0) {
    console.log(`[scheduler] concurrency: backfilling ${batch.length} into g${lowestActive} (${effectiveRunning}/${cap} running)`);
  }
  return batch;
}
852
+
853
/**
 * Tell whether a slug looks like a fix plan, i.e. starts with one or more
 * digits followed by `-fix-`. Callers use this to avoid launching an
 * investigation for a job that is itself a fix plan.
 *
 * @param {string} slug Job slug to classify.
 * @returns {boolean} True when the slug matches the `NN-fix-...` shape.
 */
function isFixPlanSlug(slug) {
  const fixPlanPrefix = /^\d+-fix-/;
  return fixPlanPrefix.test(slug);
}
860
+
861
/**
 * Read the last `bytes` bytes of a file and decode them as UTF-8.
 *
 * The size is taken from the already-open descriptor, and the descriptor is
 * always closed in `finally`, so a failing read no longer leaks it.
 *
 * Note: the read starts at a byte offset, so the first character of the
 * result may be a partial multi-byte sequence.
 *
 * @param {string} filePath File to read.
 * @param {number} bytes Maximum number of trailing bytes to return.
 * @returns {string} The decoded tail, or '' when the file cannot be read.
 */
function readTail(filePath, bytes) {
  let fd = null;
  try {
    fd = fs.openSync(filePath, 'r');
    const { size } = fs.fstatSync(fd);
    const n = Math.min(size, bytes);
    const buf = Buffer.alloc(n);
    fs.readSync(fd, buf, 0, n, size - n);
    return buf.toString('utf8');
  } catch {
    return '';
  } finally {
    if (fd !== null) {
      try { fs.closeSync(fd); } catch { /* nothing useful to do if close fails */ }
    }
  }
}
877
+
878
/**
 * Spawn an out-of-band `claude -p --model opus` session that investigates a
 * failed job and is instructed to write a fix-plan PRD to
 * prds/<NN>-fix-<base>.md. The investigation is not added to the queue by
 * this function; when the child exits, tickQueue() is triggered so a newly
 * written fix plan can be reconciled and fired.
 *
 * Skipped when the failed job is itself a fix plan (avoids recursion) or
 * when the target fix-plan file already exists.
 *
 * Resource handling:
 *   - The investigation log descriptor is closed exactly once. Both 'error'
 *     and 'exit' may fire for one child, and closing a descriptor number a
 *     second time could hit an unrelated file that has since reused it.
 *   - The descriptor is also closed if spawning throws synchronously.
 *   - The child is spawned detached and the watchdog kills its process
 *     group, mirroring executeJob, so subprocesses started by the agent do
 *     not outlive the kill.
 *
 * @param {object} failedJob Snapshot of the failed queue entry; reads slug,
 *   title, cwd, exitCode, parallelGroup and bodyPreview.
 * @param {string} runDir Run directory holding `<slug>.log`; the
 *   investigation log is written next to it.
 * @returns {Promise<void>} Resolves once the child is spawned (or skipped);
 *   it does not wait for the investigation to finish.
 */
async function spawnInvestigation(failedJob, runDir) {
  if (isFixPlanSlug(failedJob.slug)) {
    console.log(`[scheduler] skip investigation: ${failedJob.slug} is itself a fix plan`);
    return;
  }

  const failedLogPath = path.join(runDir, `${failedJob.slug}.log`);
  const investigationLogPath = path.join(runDir, `${failedJob.slug}.investigation.log`);

  let originalBody = '';
  try {
    originalBody = parsePrd(path.join(PRDS_DIR, `${failedJob.slug}.md`)).body;
  } catch {
    originalBody = failedJob.bodyPreview || '(original PRD missing from disk)';
  }

  const logTail = readTail(failedLogPath, 16 * 1024) || '(failed to read log)';

  const baseSlug = failedJob.slug.replace(/^\d+-/, '');
  const group = failedJob.parallelGroup ?? 99;
  const fixSlug = `${String(group).padStart(2, '0')}-fix-${baseSlug}`;
  const fixPath = path.join(PRDS_DIR, `${fixSlug}.md`);

  if (fs.existsSync(fixPath)) {
    console.log(`[scheduler] skip investigation: fix plan already exists at ${fixPath}`);
    return;
  }

  const cwd = failedJob.cwd || DEFAULT_PROJECT_CWD;
  const prompt = `You are investigating a failed scheduled job in the session-manager queue. Your ONLY job is to write a fix-plan PRD file. Do NOT attempt the fix yourself.

# Failed job
- Slug: ${failedJob.slug}
- Title: ${failedJob.title}
- cwd: ${cwd}
- Exit code: ${failedJob.exitCode}
- Full failure log: ${failedLogPath}

# Original PRD body (this is what the job was trying to do)
\`\`\`
${originalBody}
\`\`\`

# Last ~16KB of the failure log (stream-json format from \`claude -p\`)
\`\`\`
${logTail}
\`\`\`

# Your task
1. Read the full failure log at ${failedLogPath} if the tail above isn't sufficient.
2. Read source files in ${cwd} as needed to understand the context.
3. Identify the root cause of the failure.
4. Write a NEW fix-plan PRD file at exactly this path:

${fixPath}

5. The frontmatter MUST be exactly this format (no extra keys):
\`\`\`
---
title: Fix: <short summary of the fix>
cwd: ${cwd}
parallelGroup: ${group}
estimateMinutes: <your time estimate>
---
\`\`\`
6. The PRD body MUST be self-contained — \`claude -p\` runs it on a fresh Sonnet session with NO conversation context. Include:
- Root-cause analysis (what went wrong and why)
- Concrete fix steps (specific files / commands / edits)
- Verification command(s) the next agent should run to confirm the fix
- Acceptance criteria

DO NOT attempt the fix. ONLY write the file. When the file exists, exit immediately.`;

  const fd = fs.openSync(investigationLogPath, 'a');
  let fdClosed = false;

  // Best-effort log write; never touches the descriptor after it is closed.
  const writeLog = (text) => {
    if (fdClosed) return;
    try { fs.writeSync(fd, text); } catch { /* logging is best effort */ }
  };
  // Close the descriptor at most once.
  const closeLog = () => {
    if (fdClosed) return;
    fdClosed = true;
    try { fs.closeSync(fd); } catch { /* already closed or invalid */ }
  };

  writeLog(`[scheduler] investigation starting for ${failedJob.slug} at ${new Date().toISOString()}\n[scheduler] target fix PRD: ${fixPath}\n\n`);

  let child;
  try {
    const claudeBin = resolveClaudeBin();
    const childEnv = cleanChildEnv();
    child = spawn(claudeBin, [
      '-p', prompt,
      '--model', 'opus',
      '--dangerously-skip-permissions',
      '--output-format', 'stream-json',
      '--verbose',
    ], {
      cwd,
      env: childEnv,
      stdio: ['ignore', fd, fd],
      // Own process group so the watchdog can kill the whole descendant tree.
      detached: true,
    });
  } catch (err) {
    // Synchronous spawn failure: release the log before propagating.
    writeLog(`\n[scheduler] investigation error: ${err.message}\n`);
    closeLog();
    throw err;
  }

  writeLog(`[scheduler] investigation pid=${child.pid}\n\n`);

  // Kill the process group; fall back to the single pid if that fails.
  const killTree = (signal) => {
    try { process.kill(-child.pid, signal); return true; }
    catch {
      try { process.kill(child.pid, signal); return true; }
      catch { return false; /* already dead */ }
    }
  };

  const watchdog = setTimeout(() => {
    writeLog(`\n[scheduler] investigation watchdog SIGKILL after ${MAX_INVESTIGATION_DURATION_MS}ms\n`);
    killTree('SIGKILL');
  }, MAX_INVESTIGATION_DURATION_MS);
  if (watchdog.unref) watchdog.unref();

  child.on('error', (err) => {
    clearTimeout(watchdog);
    writeLog(`\n[scheduler] investigation error: ${err.message}\n`);
    closeLog();
  });

  child.on('exit', (code) => {
    clearTimeout(watchdog);
    writeLog(`\n[scheduler] investigation exit code=${code}\n`);
    closeLog();
    if (fs.existsSync(fixPath)) {
      console.log(`[scheduler] investigation produced fix plan: ${fixSlug}`);
    } else {
      console.log(`[scheduler] investigation finished WITHOUT producing fix plan (slug=${failedJob.slug}, code=${code})`);
    }
    // Trigger a tick so the new fix plan is reconciled into the queue and fired.
    tickQueue().catch(() => {});
  });
}
633
1005
 
634
- await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
1006
/**
 * Run one queued job end to end: mark it running, execute it, record the
 * terminal status, and kick off an investigation when it failed.
 *
 * The slug is held in `runningSet` for the whole duration and released in
 * `finally`, after which another queue tick is requested. Errors are logged
 * here rather than propagated.
 *
 * @param {object} job Queue entry to run (identified by `job.slug`).
 * @param {string} runId Run identifier stored on the queue entry.
 * @param {string} runDir Directory passed to executeJob / spawnInvestigation.
 * @param {string} defaultCwd Default working directory passed to executeJob.
 * @returns {Promise<void>}
 */
async function spawnJob(job, runId, runDir, defaultCwd) {
  // Apply `update` to this job's entry in a queue state, if it is present.
  const withEntry = (queueState, update) => {
    const entry = queueState.jobs.find((candidate) => candidate.slug === job.slug);
    if (entry) update(entry);
  };

  runningSet.add(job.slug);
  try {
    await mutate((s) => withEntry(s, (entry) => {
      entry.status = 'running';
      entry.runId = runId;
      entry.startedAt = new Date().toISOString();
    }));
    broadcast();

    // Persist the child pid on the entry as soon as executeJob reports it.
    const persistPid = async (pid) => {
      await mutate((s) => withEntry(s, (entry) => {
        entry.runtime = { pid, runId, startedAt: entry.startedAt };
      }));
    };
    const res = await executeJob(job, runDir, defaultCwd, persistPid);

    // Pause first so the status write below sees the pause state.
    if (res.rateLimited) {
      const resetIso = await refreshNextReset().catch(() => cachedNextReset);
      await setPaused('rate_limit', resetIso);
    }

    let failedJobSnapshot = null;
    await mutate((s) => withEntry(s, (entry) => {
      const queueRateLimitPaused = s.paused && s.paused.reason === 'rate_limit';
      if (res.rateLimited || queueRateLimitPaused) {
        resetJobFields(entry, res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
        return;
      }
      entry.status = res.exitCode === 0 ? 'completed' : 'failed';
      entry.finishedAt = new Date().toISOString();
      entry.exitCode = res.exitCode;
      entry.error = res.error || null;
      delete entry.runtime;
      if (entry.status === 'failed') {
        failedJobSnapshot = { ...entry };
      }
    }));
    broadcast();

    if (failedJobSnapshot) {
      spawnInvestigation(failedJobSnapshot, runDir).catch((e) => {
        console.error('[scheduler] spawnInvestigation error', job.slug, e);
      });
    }
  } catch (e) {
    console.error('[scheduler] spawnJob error', job.slug, e);
  } finally {
    runningSet.delete(job.slug);
    // A finished job frees a slot, so ask the queue to advance.
    tickQueue().catch(() => {});
  }
}
1069
+
1070
// Tail of the tick chain. Every tickQueue() call appends its work here so
// ticks run one after another instead of racing on the same pending jobs.
let tickTail = Promise.resolve();

/**
 * Queue one scheduler tick behind any tick already in progress.
 *
 * A tick reads the queue, returns early when it is paused or cancelled,
 * reconciles it, asks pickNextBatch for the jobs to start, stamps
 * `lastRunAt`, and launches each picked job without waiting for it.
 *
 * @returns {Promise<void>} Settles when this tick has finished launching;
 *   rejects if the tick itself throws. The internal chain swallows that
 *   rejection so later ticks still run.
 */
function tickQueue() {
  const runTick = async () => {
    const state = readQueue();
    if (state.paused) {
      console.log('[scheduler] tickQueue skipped: paused');
      return;
    }
    if (cancelToken.cancelled) return;

    reconcile(state);
    const cap = ENV_CAP ?? state.config.concurrencyCap;
    const batch = pickNextBatch(state.jobs, runningSet, cap);
    if (batch.length === 0) return;

    await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
    broadcast();

    const { runId, dir: runDir } = pickRunDir();
    for (const job of batch) {
      if (cancelToken.cancelled) break;
      // Not awaited: spawnJob requests the next tick itself when it finishes.
      spawnJob(job, runId, runDir, state.config.defaultCwd).catch(() => {});
    }
  };

  const next = tickTail.then(runTick);
  tickTail = next.catch(() => {});
  return next;
}
1102
+
1103
/**
 * Fire the queue once: unless it is paused, reset the cancel token, run a
 * tick, then clear the one-shot `scheduledFor` marker and broadcast. Jobs
 * launched by the tick are not awaited.
 *
 * @returns {Promise<void>}
 */
async function runDueJobs() {
  const { paused } = readQueue();
  if (paused) {
    console.log('[scheduler] runDueJobs skipped: paused');
    return;
  }

  cancelToken = { cancelled: false };
  await tickQueue();

  // Clear the one-shot scheduledFor without waiting for jobs to settle.
  await mutate((queueState) => { queueState.scheduledFor = null; });
  broadcast();
}
722
1115
 
723
1116
  // ---------- when-available launch logic ----------
@@ -725,16 +1118,15 @@ async function runDueJobs() {
725
1118
  async function maybeLaunchWhenAvailable(state) {
726
1119
  if (state.config.firePolicy !== 'when-available') return;
727
1120
  if (state.paused) return;
728
- if (isExecuting) return;
729
- const pending = state.jobs.filter((j) => j.status === 'pending');
1121
+ const pending = state.jobs.filter((j) => j.status === 'pending' && !runningSet.has(j.slug));
730
1122
  if (pending.length === 0) return;
731
1123
  if (cachedUtilization === null || cachedUtilization === undefined) return;
732
1124
  if (cachedUtilization >= state.config.utilizationThreshold) {
733
1125
  broadcast();
734
1126
  return;
735
1127
  }
736
- console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pending.length} pending — firing`);
737
- runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
1128
+ console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pending.length} pending, ${runningSet.size} running ticking`);
1129
+ tickQueue().catch((e) => console.error('[scheduler] tickQueue error', e));
738
1130
  }
739
1131
 
740
1132
  // ---------- poll loop with exponential backoff ----------
@@ -750,6 +1142,8 @@ async function pollLoop() {
750
1142
  backoffMs = 0;
751
1143
  backoffNextAt = null;
752
1144
  firstFailureAt = null;
1145
+ firstNon429FailureAt = null;
1146
+ lastFailureKind = null;
753
1147
  lastPollAt = Date.now();
754
1148
  lastPollOk = true;
755
1149
  persistSchedulerState();
@@ -764,6 +1158,19 @@ async function pollLoop() {
764
1158
  await clearPause('reset-recovered');
765
1159
  }
766
1160
 
1161
+ await maybeLaunchWhenAvailable(cur);
1162
+ broadcast();
1163
+ } else if (r.kind === 'meter_rate_limited') {
1164
+ // Billing meter is itself being rate-limited. Treat as "utilization unknown but safe":
1165
+ // fire available jobs anyway at utilization=0 rather than pausing the queue.
1166
+ lastPollAt = Date.now();
1167
+ lastPollOk = false;
1168
+ consecutiveFailures++;
1169
+ lastFailureKind = 'meter_rate_limited';
1170
+ // Don't update firstNon429FailureAt — 429s don't count toward the 30-min network-pause threshold.
1171
+ cachedUtilization = 0; // assume safe; fire any pending work
1172
+ console.log(`[scheduler] billing meter rate-limited (HTTP 429) — firing on heuristic (failure #${consecutiveFailures})`);
1173
+ const cur = readQueue();
767
1174
  await maybeLaunchWhenAvailable(cur);
768
1175
  broadcast();
769
1176
  } else {
@@ -773,16 +1180,19 @@ async function pollLoop() {
773
1180
  if (!firstFailureAt) firstFailureAt = Date.now();
774
1181
 
775
1182
  if (r.kind === 'auth') {
1183
+ lastFailureKind = 'auth';
776
1184
  console.error(`[scheduler] auth failure (HTTP ${r.httpStatus}): ${r.message}`);
777
1185
  await setPaused('auth', null);
778
1186
  } else {
779
- // transient or config — apply exponential backoff.
1187
+ // transient or config — apply exponential backoff and count toward 30-min threshold.
1188
+ lastFailureKind = 'transient';
1189
+ if (!firstNon429FailureAt) firstNon429FailureAt = Date.now();
780
1190
  backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
781
- const totalFailureMs = Date.now() - firstFailureAt;
1191
+ const totalNon429FailureMs = Date.now() - firstNon429FailureAt;
782
1192
  console.log(`[scheduler] transient failure #${consecutiveFailures}: ${r.kind} ${r.message ?? ''}; retry in ${backoffMs / 1000}s`);
783
1193
 
784
- // After 30 minutes of consecutive failures, set 'network' pause.
785
- if (totalFailureMs > 30 * 60_000) {
1194
+ // After 30 minutes of consecutive non-429 failures, set 'network' pause.
1195
+ if (totalNon429FailureMs > 30 * 60_000) {
786
1196
  const cur2 = readQueue();
787
1197
  if (!cur2.paused || cur2.paused.reason === 'network') {
788
1198
  await setPaused('network', null);
@@ -798,7 +1208,9 @@ async function pollLoop() {
798
1208
  lastPollAt = Date.now();
799
1209
  lastPollOk = false;
800
1210
  consecutiveFailures++;
1211
+ lastFailureKind = 'transient';
801
1212
  if (!firstFailureAt) firstFailureAt = Date.now();
1213
+ if (!firstNon429FailureAt) firstNon429FailureAt = Date.now();
802
1214
  backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
803
1215
  backoffNextAt = Date.now() + backoffMs;
804
1216
  persistSchedulerState();
@@ -813,6 +1225,7 @@ async function pollLoop() {
813
1225
 
814
1226
  function registerScheduleHandlers() {
815
1227
  ensureDirs();
1228
+ supervisor.registerHandlers();
816
1229
 
817
1230
  ipcMain.handle('schedule:state', async () => {
818
1231
  const state = readQueue();
@@ -847,6 +1260,7 @@ function registerScheduleHandlers() {
847
1260
  lastPollAt,
848
1261
  lastPollOk,
849
1262
  consecutiveFailures,
1263
+ lastFailureKind,
850
1264
  backoffNextAt,
851
1265
  nextResetCached: cachedNextReset,
852
1266
  pausedSince: state.paused ? Date.parse(state.paused.since) : null,
@@ -855,6 +1269,14 @@ function registerScheduleHandlers() {
855
1269
  };
856
1270
  });
857
1271
 
1272
+ ipcMain.handle('schedule:force-tick', async () => {
1273
+ // Bypass the billing-poll gate entirely — fire pending jobs immediately regardless of meter state.
1274
+ // Clears any existing pause first (same semantics as run-now).
1275
+ await clearPause('run-now');
1276
+ runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error (force-tick)', e));
1277
+ return { ok: true };
1278
+ });
1279
+
858
1280
  ipcMain.handle('schedule:set-config', async (_e, partial) => {
859
1281
  const { schemas: s } = require('./ipcSchemas.cjs');
860
1282
  let validated;
@@ -864,7 +1286,11 @@ function registerScheduleHandlers() {
864
1286
  return { ok: false, error: e?.message ?? 'invalid config' };
865
1287
  }
866
1288
  const config = await mutate((state) => {
867
- state.config = { ...state.config, ...validated };
1289
+ const { supervisor: supPartial, ...rest } = validated;
1290
+ state.config = { ...state.config, ...rest };
1291
+ if (supPartial !== undefined) {
1292
+ state.config.supervisor = { ...(state.config.supervisor ?? {}), ...supPartial };
1293
+ }
868
1294
  return state.config;
869
1295
  });
870
1296
  await rescheduleTimer();
@@ -1051,6 +1477,11 @@ async function init() {
1051
1477
  pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
1052
1478
  if (pollLoopTimer.unref) pollLoopTimer.unref();
1053
1479
 
1480
+ // Supervisor: probe running jobs for wedged poll-loops.
1481
+ if (process.env.SM_SUPERVISOR_DISABLE !== '1') {
1482
+ supervisor.startSupervisor({ readQueue, mutate });
1483
+ }
1484
+
1054
1485
  // Heartbeat: once per minute, log queue state for 24h visibility.
1055
1486
  if (heartbeatInterval) clearInterval(heartbeatInterval);
1056
1487
  heartbeatInterval = setInterval(() => {