claude-code-session-manager 0.8.3 → 0.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/assets/{cssMode-DyaNC2Cs.js → cssMode-BCLoTYI0.js} +1 -1
- package/dist/assets/{editor.main-BhSGi_Jw.js → editor.main-UoasbVGy.js} +3 -3
- package/dist/assets/{freemarker2-DZH3si5v.js → freemarker2-dhfKZR7u.js} +1 -1
- package/dist/assets/{handlebars-DvzTd6uL.js → handlebars-DdpqwFuV.js} +1 -1
- package/dist/assets/{html-C5GmopAN.js → html-1oTJClkg.js} +1 -1
- package/dist/assets/{htmlMode-DwnrHwx1.js → htmlMode-CF1QbIg-.js} +1 -1
- package/dist/assets/index-DWDcKbgI.js +3046 -0
- package/dist/assets/index-eqxng9X2.css +32 -0
- package/dist/assets/{javascript-JqHrxiCa.js → javascript-BP_Q5MFx.js} +1 -1
- package/dist/assets/{jsonMode-8rZcy09i.js → jsonMode-BtjA-2w_.js} +1 -1
- package/dist/assets/{liquid-ClpD_v7G.js → liquid-DstuL8vm.js} +1 -1
- package/dist/assets/{lspLanguageFeatures-u0WgQBQz.js → lspLanguageFeatures-DvSiaY4f.js} +1 -1
- package/dist/assets/{mdx-DtViUgdm.js → mdx-qO-uvsJd.js} +1 -1
- package/dist/assets/{python-CaAvhRGm.js → python-CCPz_1cy.js} +1 -1
- package/dist/assets/{razor-saGNVU7l.js → razor-B7tCzkdh.js} +1 -1
- package/dist/assets/{tsMode-HZwWTCj8.js → tsMode-hUkEyjsH.js} +1 -1
- package/dist/assets/{typescript-BInV4PNE.js → typescript-BeXECzAk.js} +1 -1
- package/dist/assets/{whisperWorker-ivwFFLMj.js → whisperWorker-QfIS0sPF.js} +5 -5
- package/dist/assets/{xml-tgO806YR.js → xml-MRJd4GHf.js} +1 -1
- package/dist/assets/{yaml-CHApZArv.js → yaml-CzGliMNL.js} +1 -1
- package/dist/index.html +2 -2
- package/package.json +16 -1
- package/src/main/historyAggregator.cjs +208 -0
- package/src/main/index.cjs +4 -0
- package/src/main/ipcSchemas.cjs +15 -0
- package/src/main/lib/schedulerConfig.cjs +2 -0
- package/src/main/scheduler.cjs +604 -120
- package/src/main/supervisor.cjs +512 -0
- package/src/main/usage.cjs +44 -2
- package/src/preload/api.d.ts +64 -2
- package/src/preload/index.cjs +10 -0
- package/dist/assets/index-BGshD4Pw.js +0 -2976
- package/dist/assets/index-DCK87t79.css +0 -32
package/src/main/scheduler.cjs
CHANGED
|
@@ -48,26 +48,57 @@ const { spawn } = require('node:child_process');
|
|
|
48
48
|
const { ipcMain } = require('electron');
|
|
49
49
|
const billing = require('./usage.cjs');
|
|
50
50
|
const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
|
|
51
|
+
const supervisor = require('./supervisor.cjs');
|
|
51
52
|
const {
|
|
52
53
|
POLL_INTERVAL_MS,
|
|
53
54
|
USAGE_REFRESH_INTERVAL_MS,
|
|
54
55
|
MAX_JOB_DURATION_MS,
|
|
55
56
|
} = require('./lib/schedulerConfig.cjs');
|
|
56
57
|
|
|
58
|
+
const MAX_INVESTIGATION_DURATION_MS = 30 * 60_000;
|
|
59
|
+
|
|
60
|
+
// After the agent emits a `result` event in its JSONL stream, the parent
|
|
61
|
+
// `claude -p` process should exit promptly. Real-world failure (2026-05-10
|
|
62
|
+
// cellar-publish): an agent emitted result=success, then spawned unbounded
|
|
63
|
+
// `until $(curl ...)` background bashes that kept the parent alive for 22
|
|
64
|
+
// minutes until manual intervention. The post-result watchdog catches this:
|
|
65
|
+
// if the process is still alive POST_RESULT_GRACE_MS after result, SIGTERM
|
|
66
|
+
// the whole process group; if still alive POST_RESULT_KILL_MS after SIGTERM,
|
|
67
|
+
// SIGKILL. The original `result.subtype` is preserved and used to map the
|
|
68
|
+
// kill exit code back to 0 so legit work isn't mismarked as failed.
|
|
69
|
+
const POST_RESULT_GRACE_MS = 90_000;
|
|
70
|
+
const POST_RESULT_KILL_MS = 30_000;
|
|
71
|
+
const RESULT_TAIL_POLL_MS = 5_000;
|
|
72
|
+
const RESULT_TAIL_BYTES = 8 * 1024;
|
|
73
|
+
|
|
74
|
+
// Idle-output watchdog: if the log file mtime stops advancing for this long
|
|
75
|
+
// while the process is still alive, the agent is hung mid-work (network
|
|
76
|
+
// stall, infinite tool loop, compaction wedge). User rule: anything not
|
|
77
|
+
// making progress for 20 minutes is presumed stuck. SIGTERM the process
|
|
78
|
+
// group, then SIGKILL after POST_RESULT_KILL_MS. The scheduler logs this
|
|
79
|
+
// distinctly from MAX_JOB_DURATION_MS so post-mortems can tell them apart.
|
|
80
|
+
const IDLE_OUTPUT_KILL_MS = 20 * 60_000;
|
|
81
|
+
const IDLE_CHECK_INTERVAL_MS = 60_000;
|
|
82
|
+
|
|
57
83
|
const ROOT = path.join(os.homedir(), '.claude', 'session-manager', 'scheduled-plans');
|
|
58
84
|
const PRDS_DIR = path.join(ROOT, 'prds');
|
|
59
85
|
const RUNS_DIR = path.join(ROOT, 'runs');
|
|
86
|
+
const PRDS_ARCHIVE_DIR = path.join(ROOT, 'prds-archived');
|
|
60
87
|
const QUEUE_PATH = path.join(ROOT, 'queue.json');
|
|
61
88
|
const SCHEDULER_STATE_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-state.json');
|
|
62
89
|
const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-heartbeat.log');
|
|
63
90
|
const HEARTBEAT_MAX_BYTES = 1024 * 1024;
|
|
64
91
|
const DEFAULT_PROJECT_CWD = path.join(os.homedir(), 'Projects', 'session-manager');
|
|
65
92
|
|
|
93
|
+
const ENV_CAP = process.env.SM_SCHEDULER_MAX_CONCURRENCY
|
|
94
|
+
? Math.max(1, Math.min(20, parseInt(process.env.SM_SCHEDULER_MAX_CONCURRENCY, 10) || 4))
|
|
95
|
+
: null;
|
|
96
|
+
|
|
66
97
|
const DEFAULT_CONFIG = {
|
|
67
98
|
// Legacy on/off retained for backwards compat; v0.5+ uses firePolicy.
|
|
68
99
|
enabled: false,
|
|
69
100
|
offsetMinutes: 15,
|
|
70
|
-
concurrencyCap:
|
|
101
|
+
concurrencyCap: ENV_CAP ?? 4,
|
|
71
102
|
defaultCwd: DEFAULT_PROJECT_CWD,
|
|
72
103
|
// 'when-available' = poll usage and fire whenever utilization < threshold.
|
|
73
104
|
// 'on-reset' = fire offsetMinutes after the next 5h reset (legacy).
|
|
@@ -76,6 +107,12 @@ const DEFAULT_CONFIG = {
|
|
|
76
107
|
// For 'when-available'. Fire only when five_hour utilization < this percent.
|
|
77
108
|
utilizationThreshold: 90,
|
|
78
109
|
schemaVersion: 1,
|
|
110
|
+
supervisor: {
|
|
111
|
+
enabled: true,
|
|
112
|
+
intervalMinutes: 15,
|
|
113
|
+
maxConcurrentProbes: 2,
|
|
114
|
+
probeStaleThresholdMinutes: 10,
|
|
115
|
+
},
|
|
79
116
|
};
|
|
80
117
|
|
|
81
118
|
// ---------- fs helpers ----------
|
|
@@ -324,6 +361,8 @@ let consecutiveFailures = 0;
|
|
|
324
361
|
let backoffMs = 0;
|
|
325
362
|
let backoffNextAt = null;
|
|
326
363
|
let firstFailureAt = null;
|
|
364
|
+
let firstNon429FailureAt = null; // tracks only transient/config failures; 429s don't count toward network-pause threshold
|
|
365
|
+
let lastFailureKind = null; // 'transient' | 'meter_rate_limited' | 'auth' | null
|
|
327
366
|
let pauseClearedManuallyAt = null;
|
|
328
367
|
|
|
329
368
|
// ---------- timer ----------
|
|
@@ -334,7 +373,9 @@ let resumeTimer = null;
|
|
|
334
373
|
let pollLoopTimer = null;
|
|
335
374
|
let rescheduleInterval = null;
|
|
336
375
|
let heartbeatInterval = null;
|
|
337
|
-
|
|
376
|
+
// In-memory set of slugs currently spawned in this process. Prevents
|
|
377
|
+
// double-spawn when runDueJobs() is called while jobs are in flight.
|
|
378
|
+
const runningSet = new Set();
|
|
338
379
|
let cancelToken = { cancelled: false };
|
|
339
380
|
let claudeBinPathCached = null;
|
|
340
381
|
|
|
@@ -569,22 +610,112 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
569
610
|
cwd,
|
|
570
611
|
env: childEnv,
|
|
571
612
|
stdio: ['ignore', fd, fd],
|
|
613
|
+
// detached:true puts the child in its own process group so we can kill
|
|
614
|
+
// the entire descendant tree (including any stray background bashes the
|
|
615
|
+
// agent spawned) with `process.kill(-pid)`. Without this, child.kill()
|
|
616
|
+
// only kills the immediate `claude` process, leaving orphaned subprocs
|
|
617
|
+
// that keep the parent alive (the 2026-05-10 cellar-publish hang).
|
|
618
|
+
detached: true,
|
|
572
619
|
});
|
|
573
620
|
|
|
574
|
-
fs.writeSync(fd, `[scheduler] spawned pid=${child.pid}\n\n`);
|
|
621
|
+
fs.writeSync(fd, `[scheduler] spawned pid=${child.pid} (process group)\n\n`);
|
|
575
622
|
|
|
576
623
|
// Fire-and-forget pid persistence — best effort.
|
|
577
624
|
if (onPid) onPid(child.pid).catch(() => {});
|
|
578
625
|
|
|
626
|
+
// Track whether the agent has emitted a `result` event in its JSONL stream.
|
|
627
|
+
// null until seen; then one of "success" | "error_max_turns" | ... per the
|
|
628
|
+
// claude harness's result subtype taxonomy.
|
|
629
|
+
let agentResultSubtype = null;
|
|
630
|
+
let postResultTimer = null;
|
|
631
|
+
let postResultKillTimer = null;
|
|
632
|
+
|
|
633
|
+
const killTree = (signal) => {
|
|
634
|
+
// Kill the whole process group. Negative pid targets the group leader's
|
|
635
|
+
// group (only works because we spawned with detached:true).
|
|
636
|
+
try { process.kill(-child.pid, signal); return true; }
|
|
637
|
+
catch {
|
|
638
|
+
try { process.kill(child.pid, signal); return true; }
|
|
639
|
+
catch { return false; /* already dead */ }
|
|
640
|
+
}
|
|
641
|
+
};
|
|
642
|
+
|
|
643
|
+
// Tail the log for {"type":"result","subtype":"..."} events. When we see
|
|
644
|
+
// one, start the post-result grace timer — the agent has declared done,
|
|
645
|
+
// so the process should exit promptly. If not, something is hanging
|
|
646
|
+
// (the cellar-publish failure mode).
|
|
647
|
+
const resultTailer = setInterval(() => {
|
|
648
|
+
if (agentResultSubtype) return; // already seen; tailer will be cleared below
|
|
649
|
+
try {
|
|
650
|
+
const stat = fs.statSync(logPath);
|
|
651
|
+
if (stat.size === 0) return;
|
|
652
|
+
const n = Math.min(stat.size, RESULT_TAIL_BYTES);
|
|
653
|
+
const buf = Buffer.alloc(n);
|
|
654
|
+
const fdR = fs.openSync(logPath, 'r');
|
|
655
|
+
fs.readSync(fdR, buf, 0, n, stat.size - n);
|
|
656
|
+
fs.closeSync(fdR);
|
|
657
|
+
const m = buf.toString('utf8').match(/\{"type":"result","subtype":"([a-z_]+)"/);
|
|
658
|
+
if (!m) return;
|
|
659
|
+
agentResultSubtype = m[1];
|
|
660
|
+
fs.writeSync(fd, `\n[scheduler] result event detected (subtype=${agentResultSubtype}); ` +
|
|
661
|
+
`starting ${Math.round(POST_RESULT_GRACE_MS/1000)}s exit-grace timer\n`);
|
|
662
|
+
clearInterval(resultTailer);
|
|
663
|
+
postResultTimer = setTimeout(() => {
|
|
664
|
+
fs.writeSync(fd, `\n[scheduler] post-result grace expired (${Math.round(POST_RESULT_GRACE_MS/1000)}s); ` +
|
|
665
|
+
`child still alive — SIGTERM process group\n`);
|
|
666
|
+
killTree('SIGTERM');
|
|
667
|
+
postResultKillTimer = setTimeout(() => {
|
|
668
|
+
fs.writeSync(fd, `\n[scheduler] still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
|
|
669
|
+
killTree('SIGKILL');
|
|
670
|
+
}, POST_RESULT_KILL_MS);
|
|
671
|
+
if (postResultKillTimer.unref) postResultKillTimer.unref();
|
|
672
|
+
}, POST_RESULT_GRACE_MS);
|
|
673
|
+
if (postResultTimer.unref) postResultTimer.unref();
|
|
674
|
+
} catch { /* log not readable yet; try again */ }
|
|
675
|
+
}, RESULT_TAIL_POLL_MS);
|
|
676
|
+
if (resultTailer.unref) resultTailer.unref();
|
|
677
|
+
|
|
579
678
|
// Kill the child if it runs past the maximum allowed duration.
|
|
580
679
|
const watchdog = setTimeout(() => {
|
|
581
680
|
fs.writeSync(fd, `\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
|
|
582
|
-
|
|
681
|
+
killTree('SIGKILL');
|
|
583
682
|
}, MAX_JOB_DURATION_MS);
|
|
584
683
|
if (watchdog.unref) watchdog.unref();
|
|
585
684
|
|
|
586
|
-
|
|
685
|
+
// Idle-output watchdog: poll log mtime every IDLE_CHECK_INTERVAL_MS; if
|
|
686
|
+
// it hasn't advanced in IDLE_OUTPUT_KILL_MS, presume the agent is stuck
|
|
687
|
+
// and SIGTERM the process group.
|
|
688
|
+
let idleKillTimer = null;
|
|
689
|
+
const idleChecker = setInterval(() => {
|
|
690
|
+
try {
|
|
691
|
+
const stat = fs.statSync(logPath);
|
|
692
|
+
const idleMs = Date.now() - stat.mtimeMs;
|
|
693
|
+
if (idleMs > IDLE_OUTPUT_KILL_MS) {
|
|
694
|
+
fs.writeSync(fd, `\n[scheduler] idle-output watchdog: log mtime stalled ` +
|
|
695
|
+
`${Math.round(idleMs/1000)}s (> ${Math.round(IDLE_OUTPUT_KILL_MS/1000)}s threshold) — SIGTERM process group\n`);
|
|
696
|
+
clearInterval(idleChecker);
|
|
697
|
+
killTree('SIGTERM');
|
|
698
|
+
idleKillTimer = setTimeout(() => {
|
|
699
|
+
fs.writeSync(fd, `\n[scheduler] idle watchdog: still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
|
|
700
|
+
killTree('SIGKILL');
|
|
701
|
+
}, POST_RESULT_KILL_MS);
|
|
702
|
+
if (idleKillTimer.unref) idleKillTimer.unref();
|
|
703
|
+
}
|
|
704
|
+
} catch { /* log not statable; skip */ }
|
|
705
|
+
}, IDLE_CHECK_INTERVAL_MS);
|
|
706
|
+
if (idleChecker.unref) idleChecker.unref();
|
|
707
|
+
|
|
708
|
+
const clearAllTimers = () => {
|
|
587
709
|
clearTimeout(watchdog);
|
|
710
|
+
clearInterval(resultTailer);
|
|
711
|
+
clearInterval(idleChecker);
|
|
712
|
+
if (postResultTimer) clearTimeout(postResultTimer);
|
|
713
|
+
if (postResultKillTimer) clearTimeout(postResultKillTimer);
|
|
714
|
+
if (idleKillTimer) clearTimeout(idleKillTimer);
|
|
715
|
+
};
|
|
716
|
+
|
|
717
|
+
child.on('error', (err) => {
|
|
718
|
+
clearAllTimers();
|
|
588
719
|
const durationMs = Date.now() - startedAt;
|
|
589
720
|
fs.writeSync(fd, `\n[scheduler] error: ${err.message}\n`);
|
|
590
721
|
closeFd();
|
|
@@ -592,132 +723,395 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
592
723
|
resolve({ exitCode: -1, durationMs, error: err.message });
|
|
593
724
|
});
|
|
594
725
|
|
|
595
|
-
child.on('exit', (code) => {
|
|
596
|
-
|
|
726
|
+
child.on('exit', (code, signal) => {
|
|
727
|
+
clearAllTimers();
|
|
597
728
|
const durationMs = Date.now() - startedAt;
|
|
598
|
-
|
|
729
|
+
// If we SIGTERM'd because of the post-result watchdog AND the agent had
|
|
730
|
+
// emitted result=success, the work succeeded; only the cleanup hung.
|
|
731
|
+
// Map the kill exit code to 0 so the job is marked completed, not failed.
|
|
732
|
+
// Node's child.on('exit') reports either code (normal) or signal (killed);
|
|
733
|
+
// when killed by signal, code is null. We also check 143 (128+SIGTERM)
|
|
734
|
+
// and 137 (128+SIGKILL) in case the process exited via signal-as-code.
|
|
735
|
+
let effectiveCode = code;
|
|
736
|
+
const killedBySignal = signal === 'SIGTERM' || signal === 'SIGKILL' || code === 143 || code === 137 || code === null;
|
|
737
|
+
const mappedToSuccess = agentResultSubtype === 'success' && killedBySignal;
|
|
738
|
+
if (mappedToSuccess) {
|
|
739
|
+
effectiveCode = 0;
|
|
740
|
+
fs.writeSync(fd, `\n[scheduler] mapping exit code=${code} signal=${signal} → 0 ` +
|
|
741
|
+
`(result=success was emitted before kill)\n`);
|
|
742
|
+
}
|
|
743
|
+
fs.writeSync(fd, `\n[scheduler] exit code=${effectiveCode} (raw code=${code} signal=${signal}) ` +
|
|
744
|
+
`duration=${Math.round(durationMs / 1000)}s\n`);
|
|
599
745
|
closeFd();
|
|
600
|
-
const rateLimited =
|
|
601
|
-
atomicWriteJson(metaPath, {
|
|
602
|
-
|
|
746
|
+
const rateLimited = effectiveCode !== 0 && detectRateLimitInLog(logPath);
|
|
747
|
+
atomicWriteJson(metaPath, {
|
|
748
|
+
slug: job.slug, cwd, exitCode: effectiveCode, rateLimited,
|
|
749
|
+
startedAt, finishedAt: Date.now(), durationMs,
|
|
750
|
+
agentResultSubtype, mappedFromSignal: mappedToSuccess ? signal || `code=${code}` : null,
|
|
751
|
+
});
|
|
752
|
+
resolve({ exitCode: effectiveCode, durationMs, rateLimited });
|
|
603
753
|
});
|
|
604
754
|
});
|
|
605
755
|
}
|
|
606
756
|
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
757
|
+
/**
 * Pick the next batch of jobs to spawn this tick.
 *
 * Rules:
 *  1. Only jobs with status 'pending' that are not already in `running` are
 *     candidates. A missing `parallelGroup` is treated as group 99.
 *  2. A group is "active" when any of its jobs is in `running` (spawned by
 *     this process) or is still marked 'running' in the persisted queue.
 *  3. If the lowest pending group is HIGHER than the lowest active group,
 *     return nothing: the earlier group must drain before advancing.
 *  4. **Late-arrival**: if the lowest pending group is LOWER than the lowest
 *     active group (a higher-priority PRD reconciled after a later group was
 *     already picked), fire it immediately alongside the active group, up to
 *     the free slots.
 *  5. If the lowest pending group IS the lowest active group, backfill it up
 *     to the free slots.
 *  6. With no active group, start the lowest pending group fresh. A batch is
 *     always drawn from a single group, so groups never bleed into each other.
 *
 * Slot accounting counts the UNION of in-process spawns and persisted
 * 'running' rows. (Taking the max of the two sizes under-counts whenever they
 * refer to different jobs, e.g. a slug just added to `running` that is not yet
 * marked 'running' in the queue, next to a stale persisted 'running' row, and
 * would let the batch exceed `cap`.)
 *
 * @param {Array<{slug: string, status: string, parallelGroup?: number}>} allJobs
 *   Every job in the queue, in queue order.
 * @param {Set<string>} running Slugs currently spawned by this process.
 * @param {number} cap Maximum number of jobs allowed in flight.
 * @returns {Array<object>} Jobs to spawn now, in queue order (possibly empty).
 *   Single pass over `allJobs` plus one pass over the pending jobs.
 */
function pickNextBatch(allJobs, running, cap) {
  const groupOf = (job) => job.parallelGroup ?? 99;

  const pending = allJobs.filter((j) => j.status === 'pending' && !running.has(j.slug));
  if (pending.length === 0) return [];

  // One pass: collect active groups and count persisted-only 'running' rows.
  const activeGroups = new Set();
  let persistedOnlyRunning = 0;
  for (const j of allJobs) {
    if (running.has(j.slug)) {
      activeGroups.add(groupOf(j));
    } else if (j.status === 'running') {
      // Persisted from a previous session that hasn't been orphan-reset yet.
      activeGroups.add(groupOf(j));
      persistedOnlyRunning += 1;
    }
  }

  // Union of in-process spawns and persisted-only running rows.
  const effectiveRunning = running.size + persistedOnlyRunning;
  const slots = cap - effectiveRunning;

  const lowestPendingGroup = pending.reduce(
    (min, j) => Math.min(min, groupOf(j)),
    Infinity,
  );
  const takeFromGroup = (group) => pending.filter((j) => groupOf(j) === group).slice(0, slots);

  if (activeGroups.size === 0) {
    // No active group — start the next group fresh.
    if (slots <= 0) {
      console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
      return [];
    }
    const batch = takeFromGroup(lowestPendingGroup);
    console.log(`[scheduler] concurrency: starting g${lowestPendingGroup} with ${batch.length} job(s) (cap ${cap})`);
    return batch;
  }

  const lowestActive = Math.min(...activeGroups);

  if (lowestPendingGroup > lowestActive) {
    // Earlier group still running — wait for it to drain before advancing.
    console.log(`[scheduler] concurrency: g${lowestActive} in flight, holding g${lowestPendingGroup}`);
    return [];
  }

  if (lowestPendingGroup < lowestActive) {
    // Late-arrival: honor priority and run it in parallel with the active
    // group instead of starving it until the active group drains.
    if (slots <= 0) {
      console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots for late-arrival g${lowestPendingGroup}`);
      return [];
    }
    const batch = takeFromGroup(lowestPendingGroup);
    console.log(`[scheduler] concurrency: firing late-arrival g${lowestPendingGroup} (${batch.length} job(s)) alongside active g${lowestActive}`);
    return batch;
  }

  // Same group as the one in flight: backfill the remaining slots.
  if (slots <= 0) {
    console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
    return [];
  }
  const batch = takeFromGroup(lowestActive);
  if (batch.length > 0) {
    console.log(`[scheduler] concurrency: backfilling ${batch.length} into g${lowestActive} (${effectiveRunning}/${cap} running)`);
  }
  return batch;
}
|
|
853
|
+
|
|
854
|
+
/**
 * Tell whether a slug names a fix plan, i.e. starts with `<digits>-fix-`.
 * The investigation spawner uses this to refuse to investigate a failed fix
 * plan, so a failing fix cannot trigger another fix of itself.
 *
 * @param {string} slug Job slug to classify.
 * @returns {boolean} true when the slug has the fix-plan prefix.
 */
function isFixPlanSlug(slug) {
  const fixPlanPrefix = /^\d+-fix-/;
  return fixPlanPrefix.test(slug);
}
|
|
623
861
|
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
862
|
+
/**
 * Read the last `bytes` bytes of a file and decode them as UTF-8.
 *
 * The size is taken from the opened descriptor (fstat), not from a separate
 * path stat, so a file that is replaced between the two calls cannot yield a
 * mismatched offset. Only the bytes actually read are decoded, and the
 * descriptor is always closed, including when the read itself throws.
 *
 * @param {string} filePath File to read.
 * @param {number} bytes Maximum number of trailing bytes to return.
 * @returns {string} The decoded tail, or '' when the file is missing,
 *   unreadable, empty, or `bytes` is not a positive number.
 */
function readTail(filePath, bytes) {
  let fd;
  try {
    fd = fs.openSync(filePath, 'r');
    const { size } = fs.fstatSync(fd);
    const want = Math.floor(Math.min(size, bytes));
    // Also rejects NaN (non-numeric `bytes`), zero and negative requests.
    if (!(want > 0)) return '';
    const buf = Buffer.alloc(want);
    const got = fs.readSync(fd, buf, 0, want, size - want);
    return buf.toString('utf8', 0, got);
  } catch {
    return '';
  } finally {
    if (fd !== undefined) {
      try { fs.closeSync(fd); } catch { /* nothing more we can do */ }
    }
  }
}
|
|
878
|
+
|
|
879
|
+
/**
 * Spawn an Opus investigation session for a failed job. The investigator's job
 * is to read the failure log + original PRD, identify the root cause, and write
 * a fix-plan PRD into prds/<NN>-fix-<base>.md. Investigations are not queue
 * entries: this function starts the child and returns without waiting for it.
 * When the child exits, tickQueue() is triggered so a newly written fix plan
 * can be reconciled.
 *
 * Skipped when the failed job is itself a fix plan (avoids infinite recursion)
 * or when the target fix-plan file already exists.
 *
 * Process handling mirrors executeJob in this file: the child is spawned with
 * `detached: true` so the watchdog can kill the whole process group; killing
 * only the immediate `claude` process can leave background subprocesses alive.
 * The investigation log descriptor is closed exactly once, even when both
 * 'error' and 'exit' fire, and is released if spawning throws synchronously.
 *
 * @param {object} failedJob Snapshot of the failed queue entry (reads slug,
 *   title, cwd, exitCode, parallelGroup, bodyPreview).
 * @param {string} runDir Run directory holding `<slug>.log`; the investigation
 *   log is written next to it as `<slug>.investigation.log`.
 * @returns {Promise<void>} Resolves once the child has been started (or the
 *   investigation was skipped); rejects if the log cannot be opened or the
 *   child cannot be spawned.
 */
async function spawnInvestigation(failedJob, runDir) {
  if (isFixPlanSlug(failedJob.slug)) {
    console.log(`[scheduler] skip investigation: ${failedJob.slug} is itself a fix plan`);
    return;
  }

  const failedLogPath = path.join(runDir, `${failedJob.slug}.log`);
  const investigationLogPath = path.join(runDir, `${failedJob.slug}.investigation.log`);

  let originalBody = '';
  try {
    originalBody = parsePrd(path.join(PRDS_DIR, `${failedJob.slug}.md`)).body;
  } catch {
    originalBody = failedJob.bodyPreview || '(original PRD missing from disk)';
  }

  const logTail = readTail(failedLogPath, 16 * 1024) || '(failed to read log)';

  const baseSlug = failedJob.slug.replace(/^\d+-/, '');
  const group = failedJob.parallelGroup ?? 99;
  const fixSlug = `${String(group).padStart(2, '0')}-fix-${baseSlug}`;
  const fixPath = path.join(PRDS_DIR, `${fixSlug}.md`);

  if (fs.existsSync(fixPath)) {
    console.log(`[scheduler] skip investigation: fix plan already exists at ${fixPath}`);
    return;
  }

  const cwd = failedJob.cwd || DEFAULT_PROJECT_CWD;
  const prompt = `You are investigating a failed scheduled job in the session-manager queue. Your ONLY job is to write a fix-plan PRD file. Do NOT attempt the fix yourself.

# Failed job
- Slug: ${failedJob.slug}
- Title: ${failedJob.title}
- cwd: ${cwd}
- Exit code: ${failedJob.exitCode}
- Full failure log: ${failedLogPath}

# Original PRD body (this is what the job was trying to do)
\`\`\`
${originalBody}
\`\`\`

# Last ~16KB of the failure log (stream-json format from \`claude -p\`)
\`\`\`
${logTail}
\`\`\`

# Your task
1. Read the full failure log at ${failedLogPath} if the tail above isn't sufficient.
2. Read source files in ${cwd} as needed to understand the context.
3. Identify the root cause of the failure.
4. Write a NEW fix-plan PRD file at exactly this path:

${fixPath}

5. The frontmatter MUST be exactly this format (no extra keys):
\`\`\`
---
title: Fix: <short summary of the fix>
cwd: ${cwd}
parallelGroup: ${group}
estimateMinutes: <your time estimate>
---
\`\`\`
6. The PRD body MUST be self-contained — \`claude -p\` runs it on a fresh Sonnet session with NO conversation context. Include:
- Root-cause analysis (what went wrong and why)
- Concrete fix steps (specific files / commands / edits)
- Verification command(s) the next agent should run to confirm the fix
- Acceptance criteria

DO NOT attempt the fix. ONLY write the file. When the file exists, exit immediately.`;

  const fd = fs.openSync(investigationLogPath, 'a');

  // Close-once guard: a second closeSync on the same number could close an
  // unrelated descriptor that has since reused it.
  let fdClosed = false;
  const closeLog = () => {
    if (fdClosed) return;
    fdClosed = true;
    try { fs.closeSync(fd); } catch { /* */ }
  };
  // Best-effort log line; never throws (it is called from timers and event
  // handlers where an exception would be uncaught).
  const writeLog = (text) => {
    if (fdClosed) return;
    try { fs.writeSync(fd, text); } catch { /* */ }
  };

  let child;
  try {
    fs.writeSync(fd, `[scheduler] investigation starting for ${failedJob.slug} at ${new Date().toISOString()}\n[scheduler] target fix PRD: ${fixPath}\n\n`);

    const claudeBin = resolveClaudeBin();
    const childEnv = cleanChildEnv();
    child = spawn(claudeBin, [
      '-p', prompt,
      '--model', 'opus',
      '--dangerously-skip-permissions',
      '--output-format', 'stream-json',
      '--verbose',
    ], {
      cwd,
      env: childEnv,
      stdio: ['ignore', fd, fd],
      // Own process group, so the watchdog can take down any descendants too.
      detached: true,
    });
  } catch (err) {
    // Don't leak the log descriptor when setup fails before handlers exist.
    closeLog();
    throw err;
  }

  writeLog(`[scheduler] investigation pid=${child.pid}\n\n`);

  // Signal the whole process group; fall back to the direct child when the
  // group kill is rejected (e.g. no pid, or the group is already gone).
  const killTree = (signal) => {
    try {
      process.kill(-child.pid, signal);
    } catch {
      try { child.kill(signal); } catch { /* already dead */ }
    }
  };

  const watchdog = setTimeout(() => {
    writeLog(`\n[scheduler] investigation watchdog SIGKILL after ${MAX_INVESTIGATION_DURATION_MS}ms\n`);
    killTree('SIGKILL');
  }, MAX_INVESTIGATION_DURATION_MS);
  if (watchdog.unref) watchdog.unref();

  child.on('error', (err) => {
    clearTimeout(watchdog);
    writeLog(`\n[scheduler] investigation error: ${err.message}\n`);
    closeLog();
  });

  child.on('exit', (code, signal) => {
    clearTimeout(watchdog);
    // `code` is null when the child was killed by a signal, so record both.
    writeLog(`\n[scheduler] investigation exit code=${code} signal=${signal}\n`);
    closeLog();
    if (fs.existsSync(fixPath)) {
      console.log(`[scheduler] investigation produced fix plan: ${fixSlug}`);
    } else {
      console.log(`[scheduler] investigation finished WITHOUT producing fix plan (slug=${failedJob.slug}, code=${code})`);
    }
    // Trigger a tick so the new fix plan is reconciled into the queue and fired.
    tickQueue().catch(() => {});
  });
}
|
|
633
1006
|
|
|
634
|
-
|
|
1007
|
+
/**
 * Run one queued job end to end: mark it running, execute it, record the
 * terminal status, and (on a genuine failure) kick off an investigation.
 *
 * The slug is held in `runningSet` for the whole call so concurrent ticks do
 * not spawn it twice; it is always released in `finally`, which also triggers
 * another tick because a finished job frees a slot.
 *
 * Status rules after the child exits:
 *  - The run itself was rate limited -> pause the queue and reset the job to
 *    pending ('paused: rate limit').
 *  - The run failed while the queue is paused for a rate limit -> reset to
 *    pending ('paused: queue halted').
 *  - Otherwise -> 'completed' (exit code 0) or 'failed'. A job that exited 0
 *    is recorded as completed even when the queue is paused, so finished work
 *    is not re-run after the pause lifts.
 *
 * If anything throws along the way, the job row is moved out of 'running' on a
 * best-effort basis; a row left in 'running' would keep counting as an active
 * group and an occupied slot for pickNextBatch.
 *
 * @param {object} job Queue entry to run (reads `slug`).
 * @param {string} runId Identifier of the current run, stored on the job row.
 * @param {string} runDir Directory receiving the job's log/meta files.
 * @param {string} defaultCwd Working directory passed through to executeJob.
 * @returns {Promise<void>} Never rejects; errors are logged.
 */
async function spawnJob(job, runId, runDir, defaultCwd) {
  runningSet.add(job.slug);
  let res = null;
  try {
    await mutate((s) => {
      const idx = s.jobs.findIndex((x) => x.slug === job.slug);
      if (idx >= 0) {
        s.jobs[idx].status = 'running';
        s.jobs[idx].runId = runId;
        s.jobs[idx].startedAt = new Date().toISOString();
      }
    });
    broadcast();

    res = await executeJob(job, runDir, defaultCwd, async (pid) => {
      await mutate((s) => {
        const idx = s.jobs.findIndex((x) => x.slug === job.slug);
        if (idx >= 0) {
          s.jobs[idx].runtime = { pid, runId, startedAt: s.jobs[idx].startedAt };
        }
      });
    });

    // Rate-limit: pause before writing terminal status so the status mutate
    // below can read the pause state.
    if (res.rateLimited) {
      const resetIso = await refreshNextReset().catch(() => cachedNextReset);
      await setPaused('rate_limit', resetIso);
    }

    let failedJobSnapshot = null;
    await mutate((s) => {
      const i2 = s.jobs.findIndex((x) => x.slug === job.slug);
      if (i2 >= 0) {
        const queueRateLimited = Boolean(s.paused && s.paused.reason === 'rate_limit');
        // Only unfinished work goes back to pending; exit code 0 means done.
        const treatAsPending = res.rateLimited || (res.exitCode !== 0 && queueRateLimited);
        if (treatAsPending) {
          resetJobFields(s.jobs[i2], res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
        } else {
          s.jobs[i2].status = res.exitCode === 0 ? 'completed' : 'failed';
          s.jobs[i2].finishedAt = new Date().toISOString();
          s.jobs[i2].exitCode = res.exitCode;
          s.jobs[i2].error = res.error || null;
          delete s.jobs[i2].runtime;
          if (s.jobs[i2].status === 'failed') {
            // Copy now: the investigation runs after this mutate has returned.
            failedJobSnapshot = { ...s.jobs[i2] };
          }
        }
      }
    });
    broadcast();

    if (failedJobSnapshot) {
      // Fire-and-forget: investigations run out-of-band.
      spawnInvestigation(failedJobSnapshot, runDir).catch((e) => {
        console.error('[scheduler] spawnInvestigation error', job.slug, e);
      });
    }
  } catch (e) {
    console.error('[scheduler] spawnJob error', job.slug, e);
    // Best-effort recovery: don't leave this run's row stuck in 'running'.
    try {
      await mutate((s) => {
        const entry = s.jobs.find((x) => x.slug === job.slug);
        if (entry && entry.status === 'running' && entry.runId === runId) {
          const exitCode = res ? res.exitCode : -1;
          entry.status = exitCode === 0 ? 'completed' : 'failed';
          entry.finishedAt = new Date().toISOString();
          entry.exitCode = exitCode;
          entry.error = (res && res.error) || (e && e.message ? e.message : String(e));
          delete entry.runtime;
        }
      });
      broadcast();
    } catch (recoveryErr) {
      console.error('[scheduler] spawnJob recovery error', job.slug, recoveryErr);
    }
  } finally {
    runningSet.delete(job.slug);
    // Each job completion is a signal to advance the queue.
    tickQueue().catch(() => {});
  }
}
|
|
1070
|
+
|
|
1071
|
+
// Tail of the chain of scheduling passes. Every tickQueue() call appends its
// pass here, so two passes never look at the same pending jobs at once.
// Chaining promises is enough because pickNextBatch is synchronous and
// spawnJob is launched without being awaited.
let tickTail = Promise.resolve();

/**
 * Queue one scheduling pass behind any pass that is already in progress.
 *
 * A pass re-reads the queue and stops early when the queue is paused or the
 * current run has been cancelled. Otherwise it reconciles the snapshot, asks
 * pickNextBatch() which jobs to start (limited by ENV_CAP, falling back to
 * config.concurrencyCap), stamps lastRunAt, broadcasts, and launches every
 * picked job without awaiting it.
 *
 * @returns {Promise<void>} Settles once this pass has finished launching and
 *   rejects if the pass throws. Later passes still run, because the stored
 *   tail has its rejection swallowed.
 */
function tickQueue() {
  const runPass = async () => {
    const snapshot = readQueue();
    if (snapshot.paused) {
      console.log('[scheduler] tickQueue skipped: paused');
      return;
    }
    if (cancelToken.cancelled) {
      return;
    }

    reconcile(snapshot);
    const limit = ENV_CAP ?? snapshot.config.concurrencyCap;
    const toLaunch = pickNextBatch(snapshot.jobs, runningSet, limit);
    if (toLaunch.length === 0) {
      return;
    }

    await mutate((s) => {
      s.lastRunAt = new Date().toISOString();
    });
    broadcast();

    // All jobs launched by one pass share the same run id and run directory.
    const { runId, dir: runDir } = pickRunDir();
    for (const job of toLaunch) {
      // Re-checked per job so a cancel that arrives mid-batch stops further launches.
      if (cancelToken.cancelled) break;
      // Not awaited on purpose; the rejection handler only prevents an
      // unhandled-rejection report.
      spawnJob(job, runId, runDir, snapshot.config.defaultCwd).catch(() => {});
    }
  };

  const pass = tickTail.then(runPass);
  tickTail = pass.catch(() => {});
  return pass;
}
|
|
1103
|
+
|
|
1104
|
+
/**
 * Start a run of the queue unless it is currently paused.
 *
 * Installs a fresh, un-cancelled cancelToken, waits for a single tickQueue()
 * pass, then clears the one-shot `scheduledFor` field and broadcasts.
 *
 * @returns {Promise<void>}
 */
async function runDueJobs() {
  const { paused } = readQueue();
  if (paused) {
    console.log('[scheduler] runDueJobs skipped: paused');
    return;
  }

  cancelToken = { cancelled: false };
  await tickQueue();

  // tickQueue() only launches jobs and does not wait for them, so the
  // one-shot schedule is cleared here while they may still be running.
  await mutate((s) => {
    s.scheduledFor = null;
  });
  broadcast();
}
|
|
722
1116
|
|
|
723
1117
|
// ---------- when-available launch logic ----------
|
|
@@ -725,16 +1119,15 @@ async function runDueJobs() {
|
|
|
725
1119
|
async function maybeLaunchWhenAvailable(state) {
|
|
726
1120
|
if (state.config.firePolicy !== 'when-available') return;
|
|
727
1121
|
if (state.paused) return;
|
|
728
|
-
|
|
729
|
-
const pending = state.jobs.filter((j) => j.status === 'pending');
|
|
1122
|
+
const pending = state.jobs.filter((j) => j.status === 'pending' && !runningSet.has(j.slug));
|
|
730
1123
|
if (pending.length === 0) return;
|
|
731
1124
|
if (cachedUtilization === null || cachedUtilization === undefined) return;
|
|
732
1125
|
if (cachedUtilization >= state.config.utilizationThreshold) {
|
|
733
1126
|
broadcast();
|
|
734
1127
|
return;
|
|
735
1128
|
}
|
|
736
|
-
console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pending.length} pending —
|
|
737
|
-
|
|
1129
|
+
console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pending.length} pending, ${runningSet.size} running — ticking`);
|
|
1130
|
+
tickQueue().catch((e) => console.error('[scheduler] tickQueue error', e));
|
|
738
1131
|
}
|
|
739
1132
|
|
|
740
1133
|
// ---------- poll loop with exponential backoff ----------
|
|
@@ -750,6 +1143,8 @@ async function pollLoop() {
|
|
|
750
1143
|
backoffMs = 0;
|
|
751
1144
|
backoffNextAt = null;
|
|
752
1145
|
firstFailureAt = null;
|
|
1146
|
+
firstNon429FailureAt = null;
|
|
1147
|
+
lastFailureKind = null;
|
|
753
1148
|
lastPollAt = Date.now();
|
|
754
1149
|
lastPollOk = true;
|
|
755
1150
|
persistSchedulerState();
|
|
@@ -764,6 +1159,19 @@ async function pollLoop() {
|
|
|
764
1159
|
await clearPause('reset-recovered');
|
|
765
1160
|
}
|
|
766
1161
|
|
|
1162
|
+
await maybeLaunchWhenAvailable(cur);
|
|
1163
|
+
broadcast();
|
|
1164
|
+
} else if (r.kind === 'meter_rate_limited') {
|
|
1165
|
+
// Billing meter is itself being rate-limited. Treat as "utilization unknown but safe":
|
|
1166
|
+
// fire available jobs anyway at utilization=0 rather than pausing the queue.
|
|
1167
|
+
lastPollAt = Date.now();
|
|
1168
|
+
lastPollOk = false;
|
|
1169
|
+
consecutiveFailures++;
|
|
1170
|
+
lastFailureKind = 'meter_rate_limited';
|
|
1171
|
+
// Don't update firstNon429FailureAt — 429s don't count toward the 30-min network-pause threshold.
|
|
1172
|
+
cachedUtilization = 0; // assume safe; fire any pending work
|
|
1173
|
+
console.log(`[scheduler] billing meter rate-limited (HTTP 429) — firing on heuristic (failure #${consecutiveFailures})`);
|
|
1174
|
+
const cur = readQueue();
|
|
767
1175
|
await maybeLaunchWhenAvailable(cur);
|
|
768
1176
|
broadcast();
|
|
769
1177
|
} else {
|
|
@@ -773,16 +1181,19 @@ async function pollLoop() {
|
|
|
773
1181
|
if (!firstFailureAt) firstFailureAt = Date.now();
|
|
774
1182
|
|
|
775
1183
|
if (r.kind === 'auth') {
|
|
1184
|
+
lastFailureKind = 'auth';
|
|
776
1185
|
console.error(`[scheduler] auth failure (HTTP ${r.httpStatus}): ${r.message}`);
|
|
777
1186
|
await setPaused('auth', null);
|
|
778
1187
|
} else {
|
|
779
|
-
// transient or config — apply exponential backoff.
|
|
1188
|
+
// transient or config — apply exponential backoff and count toward 30-min threshold.
|
|
1189
|
+
lastFailureKind = 'transient';
|
|
1190
|
+
if (!firstNon429FailureAt) firstNon429FailureAt = Date.now();
|
|
780
1191
|
backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
|
|
781
|
-
const
|
|
1192
|
+
const totalNon429FailureMs = Date.now() - firstNon429FailureAt;
|
|
782
1193
|
console.log(`[scheduler] transient failure #${consecutiveFailures}: ${r.kind} ${r.message ?? ''}; retry in ${backoffMs / 1000}s`);
|
|
783
1194
|
|
|
784
|
-
// After 30 minutes of consecutive failures, set 'network' pause.
|
|
785
|
-
if (
|
|
1195
|
+
// After 30 minutes of consecutive non-429 failures, set 'network' pause.
|
|
1196
|
+
if (totalNon429FailureMs > 30 * 60_000) {
|
|
786
1197
|
const cur2 = readQueue();
|
|
787
1198
|
if (!cur2.paused || cur2.paused.reason === 'network') {
|
|
788
1199
|
await setPaused('network', null);
|
|
@@ -798,7 +1209,9 @@ async function pollLoop() {
|
|
|
798
1209
|
lastPollAt = Date.now();
|
|
799
1210
|
lastPollOk = false;
|
|
800
1211
|
consecutiveFailures++;
|
|
1212
|
+
lastFailureKind = 'transient';
|
|
801
1213
|
if (!firstFailureAt) firstFailureAt = Date.now();
|
|
1214
|
+
if (!firstNon429FailureAt) firstNon429FailureAt = Date.now();
|
|
802
1215
|
backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
|
|
803
1216
|
backoffNextAt = Date.now() + backoffMs;
|
|
804
1217
|
persistSchedulerState();
|
|
@@ -813,6 +1226,7 @@ async function pollLoop() {
|
|
|
813
1226
|
|
|
814
1227
|
function registerScheduleHandlers() {
|
|
815
1228
|
ensureDirs();
|
|
1229
|
+
supervisor.registerHandlers();
|
|
816
1230
|
|
|
817
1231
|
ipcMain.handle('schedule:state', async () => {
|
|
818
1232
|
const state = readQueue();
|
|
@@ -847,6 +1261,7 @@ function registerScheduleHandlers() {
|
|
|
847
1261
|
lastPollAt,
|
|
848
1262
|
lastPollOk,
|
|
849
1263
|
consecutiveFailures,
|
|
1264
|
+
lastFailureKind,
|
|
850
1265
|
backoffNextAt,
|
|
851
1266
|
nextResetCached: cachedNextReset,
|
|
852
1267
|
pausedSince: state.paused ? Date.parse(state.paused.since) : null,
|
|
@@ -855,6 +1270,14 @@ function registerScheduleHandlers() {
|
|
|
855
1270
|
};
|
|
856
1271
|
});
|
|
857
1272
|
|
|
1273
|
+
// IPC `schedule:force-tick`: fire pending jobs right away, bypassing the
// billing-poll gate regardless of meter state. Any existing pause is cleared
// first (same semantics as run-now).
ipcMain.handle('schedule:force-tick', async () => {
  await clearPause('run-now');

  // Deliberately not awaited: the reply is sent immediately and a failed run
  // is only logged.
  const reportFailure = (err) => {
    console.error('[scheduler] runDueJobs error (force-tick)', err);
  };
  runDueJobs().catch(reportFailure);

  return { ok: true };
});
|
|
1280
|
+
|
|
858
1281
|
ipcMain.handle('schedule:set-config', async (_e, partial) => {
|
|
859
1282
|
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
860
1283
|
let validated;
|
|
@@ -864,7 +1287,11 @@ function registerScheduleHandlers() {
|
|
|
864
1287
|
return { ok: false, error: e?.message ?? 'invalid config' };
|
|
865
1288
|
}
|
|
866
1289
|
const config = await mutate((state) => {
|
|
867
|
-
|
|
1290
|
+
const { supervisor: supPartial, ...rest } = validated;
|
|
1291
|
+
state.config = { ...state.config, ...rest };
|
|
1292
|
+
if (supPartial !== undefined) {
|
|
1293
|
+
state.config.supervisor = { ...(state.config.supervisor ?? {}), ...supPartial };
|
|
1294
|
+
}
|
|
868
1295
|
return state.config;
|
|
869
1296
|
});
|
|
870
1297
|
await rescheduleTimer();
|
|
@@ -913,6 +1340,58 @@ function registerScheduleHandlers() {
|
|
|
913
1340
|
return { ok: true, nextReset: at };
|
|
914
1341
|
});
|
|
915
1342
|
|
|
1343
|
+
// IPC `schedule:rescan`: reconcile the persisted queue inside a mutate() call
// and then broadcast so every view refreshes. The `schedule:state` handler
// already reconciles on read; this is the explicit refresh path for the
// renderer.
ipcMain.handle('schedule:rescan', async () => {
  const reconcileInPlace = (state) => {
    reconcile(state);
    return null;
  };
  await mutate(reconcileInPlace);

  broadcast();
  return { ok: true };
});
|
|
1354
|
+
|
|
1355
|
+
// IPC `schedule:clear-queue`: archive all pending and failed PRDs that are
// not currently in flight, and drop their entries from queue.json. Completed
// and running entries are kept. PRD files are moved (not deleted) to
// prds-archived/<ISO>/ so the user can recover them. Path containment is
// enforced: only files that resolve to a location inside PRDS_DIR are moved.
//
// Resolves to { ok: true, archived, archivedTo }. `archived` counts the files
// actually moved; `archivedTo` is the archive directory, or null when there
// was nothing to clear.
ipcMain.handle('schedule:clear-queue', async () => {
  ensureDirs();
  const ts = new Date().toISOString().replace(/[:.]/g, '-');
  const archiveDir = path.join(PRDS_ARCHIVE_DIR, ts);
  // Resolve the root once so the containment check compares two normalized
  // absolute paths even if PRDS_DIR is relative or not normalized.
  const prdsRoot = path.resolve(PRDS_DIR);

  const state = readQueue();
  // Slugs in runningSet belong to jobs that have already been launched. Skip
  // them even when their stored status is still 'pending', so a PRD is never
  // moved out from under a job that is using it.
  const victims = state.jobs.filter(
    (j) => (j.status === 'pending' || j.status === 'failed') && !runningSet.has(j.slug),
  );
  if (victims.length === 0) {
    return { ok: true, archived: 0, archivedTo: null };
  }

  // Async mkdir: avoids blocking the main process inside an IPC handler.
  await fsp.mkdir(archiveDir, { recursive: true });

  let archived = 0;
  for (const job of victims) {
    const src = path.resolve(prdsRoot, `${job.slug}.md`);
    if (!src.startsWith(prdsRoot + path.sep)) {
      // The slug resolves outside PRDS_DIR: leave the file alone. The queue
      // entry is still dropped below.
      console.warn('[scheduler] clear-queue: path outside PRDS_DIR, not moved', job.slug);
      continue;
    }
    const dst = path.join(archiveDir, `${job.slug}.md`);
    try {
      await fsp.rename(src, dst);
      archived++;
    } catch (e) {
      // ENOENT: the .md is already gone (reconcile would drop it on the next
      // read anyway). Either way, fall through and remove from queue.
      if (e?.code !== 'ENOENT') {
        console.warn('[scheduler] clear-queue: rename failed', job.slug, e?.message);
      }
    }
  }

  await mutate((s) => {
    const victimSlugs = new Set(victims.map((j) => j.slug));
    s.jobs = s.jobs.filter((j) => !victimSlugs.has(j.slug));
    reconcile(s);
    return null;
  });
  broadcast();
  return { ok: true, archived, archivedTo: archiveDir };
});
|
|
1394
|
+
|
|
916
1395
|
ipcMain.handle('schedule:open-folder', async () => {
|
|
917
1396
|
const { shell } = require('electron');
|
|
918
1397
|
await shell.openPath(ROOT);
|
|
@@ -1051,6 +1530,11 @@ async function init() {
|
|
|
1051
1530
|
pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
|
|
1052
1531
|
if (pollLoopTimer.unref) pollLoopTimer.unref();
|
|
1053
1532
|
|
|
1533
|
+
// Supervisor: probe running jobs for wedged poll-loops.
|
|
1534
|
+
if (process.env.SM_SUPERVISOR_DISABLE !== '1') {
|
|
1535
|
+
supervisor.startSupervisor({ readQueue, mutate });
|
|
1536
|
+
}
|
|
1537
|
+
|
|
1054
1538
|
// Heartbeat: once per minute, log queue state for 24h visibility.
|
|
1055
1539
|
if (heartbeatInterval) clearInterval(heartbeatInterval);
|
|
1056
1540
|
heartbeatInterval = setInterval(() => {
|