claude-code-session-manager 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/assets/{cssMode-DyaNC2Cs.js → cssMode-SwUA7tV8.js} +1 -1
- package/dist/assets/{editor.main-BhSGi_Jw.js → editor.main-C0vxDQaJ.js} +3 -3
- package/dist/assets/{freemarker2-DZH3si5v.js → freemarker2-KX6gG2yg.js} +1 -1
- package/dist/assets/{handlebars-DvzTd6uL.js → handlebars-B9LxZbcv.js} +1 -1
- package/dist/assets/{html-C5GmopAN.js → html-B-8ZJyzc.js} +1 -1
- package/dist/assets/{htmlMode-DwnrHwx1.js → htmlMode-BrQu9A96.js} +1 -1
- package/dist/assets/index-CrdBAMX2.js +3044 -0
- package/dist/assets/index-DpbPBSiS.css +32 -0
- package/dist/assets/{javascript-JqHrxiCa.js → javascript-D3vhfNJL.js} +1 -1
- package/dist/assets/{jsonMode-8rZcy09i.js → jsonMode-CCVQ7oTr.js} +1 -1
- package/dist/assets/{liquid-ClpD_v7G.js → liquid-DnwPncmC.js} +1 -1
- package/dist/assets/{lspLanguageFeatures-u0WgQBQz.js → lspLanguageFeatures-DrwXiqW0.js} +1 -1
- package/dist/assets/{mdx-DtViUgdm.js → mdx-C6MKH-vG.js} +1 -1
- package/dist/assets/{python-CaAvhRGm.js → python-Dp1TzxJl.js} +1 -1
- package/dist/assets/{razor-saGNVU7l.js → razor-KRJat9pO.js} +1 -1
- package/dist/assets/{tsMode-HZwWTCj8.js → tsMode-BZ-CF_4O.js} +1 -1
- package/dist/assets/{typescript-BInV4PNE.js → typescript-CTLs4m8W.js} +1 -1
- package/dist/assets/{whisperWorker-ivwFFLMj.js → whisperWorker-QfIS0sPF.js} +5 -5
- package/dist/assets/{xml-tgO806YR.js → xml-DUE-XnsH.js} +1 -1
- package/dist/assets/{yaml-CHApZArv.js → yaml-DSuhPI0o.js} +1 -1
- package/dist/index.html +2 -2
- package/package.json +16 -1
- package/src/main/historyAggregator.cjs +208 -0
- package/src/main/index.cjs +4 -0
- package/src/main/ipcSchemas.cjs +15 -0
- package/src/main/lib/schedulerConfig.cjs +2 -0
- package/src/main/scheduler.cjs +551 -120
- package/src/main/supervisor.cjs +512 -0
- package/src/main/usage.cjs +44 -2
- package/src/preload/api.d.ts +59 -2
- package/src/preload/index.cjs +8 -0
- package/dist/assets/index-BGshD4Pw.js +0 -2976
- package/dist/assets/index-DCK87t79.css +0 -32
package/src/main/scheduler.cjs
CHANGED
|
@@ -48,12 +48,38 @@ const { spawn } = require('node:child_process');
|
|
|
48
48
|
const { ipcMain } = require('electron');
|
|
49
49
|
const billing = require('./usage.cjs');
|
|
50
50
|
const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
|
|
51
|
+
const supervisor = require('./supervisor.cjs');
|
|
51
52
|
const {
|
|
52
53
|
POLL_INTERVAL_MS,
|
|
53
54
|
USAGE_REFRESH_INTERVAL_MS,
|
|
54
55
|
MAX_JOB_DURATION_MS,
|
|
55
56
|
} = require('./lib/schedulerConfig.cjs');
|
|
56
57
|
|
|
58
|
+
const MAX_INVESTIGATION_DURATION_MS = 30 * 60_000;
|
|
59
|
+
|
|
60
|
+
// After the agent emits a `result` event in its JSONL stream, the parent
|
|
61
|
+
// `claude -p` process should exit promptly. Real-world failure (2026-05-10
|
|
62
|
+
// cellar-publish): an agent emitted result=success, then spawned unbounded
|
|
63
|
+
// `until $(curl ...)` background bashes that kept the parent alive for 22
|
|
64
|
+
// minutes until manual intervention. The post-result watchdog catches this:
|
|
65
|
+
// if the process is still alive POST_RESULT_GRACE_MS after result, SIGTERM
|
|
66
|
+
// the whole process group; if still alive POST_RESULT_KILL_MS after SIGTERM,
|
|
67
|
+
// SIGKILL. The original `result.subtype` is preserved and used to map the
|
|
68
|
+
// kill exit code back to 0 so legit work isn't mismarked as failed.
|
|
69
|
+
const POST_RESULT_GRACE_MS = 90_000;
|
|
70
|
+
const POST_RESULT_KILL_MS = 30_000;
|
|
71
|
+
const RESULT_TAIL_POLL_MS = 5_000;
|
|
72
|
+
const RESULT_TAIL_BYTES = 8 * 1024;
|
|
73
|
+
|
|
74
|
+
// Idle-output watchdog: if the log file mtime stops advancing for this long
|
|
75
|
+
// while the process is still alive, the agent is hung mid-work (network
|
|
76
|
+
// stall, infinite tool loop, compaction wedge). User rule: anything not
|
|
77
|
+
// making progress for 20 minutes is presumed stuck. SIGTERM the process
|
|
78
|
+
// group, then SIGKILL after POST_RESULT_KILL_MS. The scheduler logs this
|
|
79
|
+
// distinctly from MAX_JOB_DURATION_MS so post-mortems can tell them apart.
|
|
80
|
+
const IDLE_OUTPUT_KILL_MS = 20 * 60_000;
|
|
81
|
+
const IDLE_CHECK_INTERVAL_MS = 60_000;
|
|
82
|
+
|
|
57
83
|
const ROOT = path.join(os.homedir(), '.claude', 'session-manager', 'scheduled-plans');
|
|
58
84
|
const PRDS_DIR = path.join(ROOT, 'prds');
|
|
59
85
|
const RUNS_DIR = path.join(ROOT, 'runs');
|
|
@@ -63,11 +89,15 @@ const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'sc
|
|
|
63
89
|
const HEARTBEAT_MAX_BYTES = 1024 * 1024;
|
|
64
90
|
const DEFAULT_PROJECT_CWD = path.join(os.homedir(), 'Projects', 'session-manager');
|
|
65
91
|
|
|
92
|
+
// Optional operator override for the scheduler's concurrency cap, read once
// at module load from SM_SCHEDULER_MAX_CONCURRENCY. It is null when the
// variable is unset or empty. Otherwise the value is parsed as a base-10
// integer (falling back to 4 when the parse yields NaN or 0) and clamped to
// the inclusive range 1..20.
const ENV_CAP = (() => {
  const rawCap = process.env.SM_SCHEDULER_MAX_CONCURRENCY;
  if (!rawCap) return null;
  const requested = parseInt(rawCap, 10) || 4;
  const upperBounded = Math.min(20, requested);
  return Math.max(1, upperBounded);
})();
|
|
95
|
+
|
|
66
96
|
const DEFAULT_CONFIG = {
|
|
67
97
|
// Legacy on/off retained for backwards compat; v0.5+ uses firePolicy.
|
|
68
98
|
enabled: false,
|
|
69
99
|
offsetMinutes: 15,
|
|
70
|
-
concurrencyCap:
|
|
100
|
+
concurrencyCap: ENV_CAP ?? 4,
|
|
71
101
|
defaultCwd: DEFAULT_PROJECT_CWD,
|
|
72
102
|
// 'when-available' = poll usage and fire whenever utilization < threshold.
|
|
73
103
|
// 'on-reset' = fire offsetMinutes after the next 5h reset (legacy).
|
|
@@ -76,6 +106,12 @@ const DEFAULT_CONFIG = {
|
|
|
76
106
|
// For 'when-available'. Fire only when five_hour utilization < this percent.
|
|
77
107
|
utilizationThreshold: 90,
|
|
78
108
|
schemaVersion: 1,
|
|
109
|
+
supervisor: {
|
|
110
|
+
enabled: true,
|
|
111
|
+
intervalMinutes: 15,
|
|
112
|
+
maxConcurrentProbes: 2,
|
|
113
|
+
probeStaleThresholdMinutes: 10,
|
|
114
|
+
},
|
|
79
115
|
};
|
|
80
116
|
|
|
81
117
|
// ---------- fs helpers ----------
|
|
@@ -324,6 +360,8 @@ let consecutiveFailures = 0;
|
|
|
324
360
|
let backoffMs = 0;
|
|
325
361
|
let backoffNextAt = null;
|
|
326
362
|
let firstFailureAt = null;
|
|
363
|
+
let firstNon429FailureAt = null; // tracks only transient/config failures; 429s don't count toward network-pause threshold
|
|
364
|
+
let lastFailureKind = null; // 'transient' | 'meter_rate_limited' | 'auth' | null
|
|
327
365
|
let pauseClearedManuallyAt = null;
|
|
328
366
|
|
|
329
367
|
// ---------- timer ----------
|
|
@@ -334,7 +372,9 @@ let resumeTimer = null;
|
|
|
334
372
|
let pollLoopTimer = null;
|
|
335
373
|
let rescheduleInterval = null;
|
|
336
374
|
let heartbeatInterval = null;
|
|
337
|
-
|
|
375
|
+
// In-memory set of slugs currently spawned in this process. Prevents
|
|
376
|
+
// double-spawn when runDueJobs() is called while jobs are in flight.
|
|
377
|
+
const runningSet = new Set();
|
|
338
378
|
let cancelToken = { cancelled: false };
|
|
339
379
|
let claudeBinPathCached = null;
|
|
340
380
|
|
|
@@ -569,22 +609,112 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
569
609
|
cwd,
|
|
570
610
|
env: childEnv,
|
|
571
611
|
stdio: ['ignore', fd, fd],
|
|
612
|
+
// detached:true puts the child in its own process group so we can kill
|
|
613
|
+
// the entire descendant tree (including any stray background bashes the
|
|
614
|
+
// agent spawned) with `process.kill(-pid)`. Without this, child.kill()
|
|
615
|
+
// only kills the immediate `claude` process, leaving orphaned subprocs
|
|
616
|
+
// that keep the parent alive (the 2026-05-10 cellar-publish hang).
|
|
617
|
+
detached: true,
|
|
572
618
|
});
|
|
573
619
|
|
|
574
|
-
fs.writeSync(fd, `[scheduler] spawned pid=${child.pid}\n\n`);
|
|
620
|
+
fs.writeSync(fd, `[scheduler] spawned pid=${child.pid} (process group)\n\n`);
|
|
575
621
|
|
|
576
622
|
// Fire-and-forget pid persistence — best effort.
|
|
577
623
|
if (onPid) onPid(child.pid).catch(() => {});
|
|
578
624
|
|
|
625
|
+
// Track whether the agent has emitted a `result` event in its JSONL stream.
|
|
626
|
+
// null until seen; then one of "success" | "error_max_turns" | ... per the
|
|
627
|
+
// claude harness's result subtype taxonomy.
|
|
628
|
+
let agentResultSubtype = null;
|
|
629
|
+
let postResultTimer = null;
|
|
630
|
+
let postResultKillTimer = null;
|
|
631
|
+
|
|
632
|
+
const killTree = (signal) => {
|
|
633
|
+
// Kill the whole process group. Negative pid targets the group leader's
|
|
634
|
+
// group (only works because we spawned with detached:true).
|
|
635
|
+
try { process.kill(-child.pid, signal); return true; }
|
|
636
|
+
catch {
|
|
637
|
+
try { process.kill(child.pid, signal); return true; }
|
|
638
|
+
catch { return false; /* already dead */ }
|
|
639
|
+
}
|
|
640
|
+
};
|
|
641
|
+
|
|
642
|
+
// Tail the log for {"type":"result","subtype":"..."} events. When we see
|
|
643
|
+
// one, start the post-result grace timer — the agent has declared done,
|
|
644
|
+
// so the process should exit promptly. If not, something is hanging
|
|
645
|
+
// (the cellar-publish failure mode).
|
|
646
|
+
const resultTailer = setInterval(() => {
|
|
647
|
+
if (agentResultSubtype) return; // already seen; tailer will be cleared below
|
|
648
|
+
try {
|
|
649
|
+
const stat = fs.statSync(logPath);
|
|
650
|
+
if (stat.size === 0) return;
|
|
651
|
+
const n = Math.min(stat.size, RESULT_TAIL_BYTES);
|
|
652
|
+
const buf = Buffer.alloc(n);
|
|
653
|
+
const fdR = fs.openSync(logPath, 'r');
|
|
654
|
+
fs.readSync(fdR, buf, 0, n, stat.size - n);
|
|
655
|
+
fs.closeSync(fdR);
|
|
656
|
+
const m = buf.toString('utf8').match(/\{"type":"result","subtype":"([a-z_]+)"/);
|
|
657
|
+
if (!m) return;
|
|
658
|
+
agentResultSubtype = m[1];
|
|
659
|
+
fs.writeSync(fd, `\n[scheduler] result event detected (subtype=${agentResultSubtype}); ` +
|
|
660
|
+
`starting ${Math.round(POST_RESULT_GRACE_MS/1000)}s exit-grace timer\n`);
|
|
661
|
+
clearInterval(resultTailer);
|
|
662
|
+
postResultTimer = setTimeout(() => {
|
|
663
|
+
fs.writeSync(fd, `\n[scheduler] post-result grace expired (${Math.round(POST_RESULT_GRACE_MS/1000)}s); ` +
|
|
664
|
+
`child still alive — SIGTERM process group\n`);
|
|
665
|
+
killTree('SIGTERM');
|
|
666
|
+
postResultKillTimer = setTimeout(() => {
|
|
667
|
+
fs.writeSync(fd, `\n[scheduler] still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
|
|
668
|
+
killTree('SIGKILL');
|
|
669
|
+
}, POST_RESULT_KILL_MS);
|
|
670
|
+
if (postResultKillTimer.unref) postResultKillTimer.unref();
|
|
671
|
+
}, POST_RESULT_GRACE_MS);
|
|
672
|
+
if (postResultTimer.unref) postResultTimer.unref();
|
|
673
|
+
} catch { /* log not readable yet; try again */ }
|
|
674
|
+
}, RESULT_TAIL_POLL_MS);
|
|
675
|
+
if (resultTailer.unref) resultTailer.unref();
|
|
676
|
+
|
|
579
677
|
// Kill the child if it runs past the maximum allowed duration.
|
|
580
678
|
const watchdog = setTimeout(() => {
|
|
581
679
|
fs.writeSync(fd, `\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
|
|
582
|
-
|
|
680
|
+
killTree('SIGKILL');
|
|
583
681
|
}, MAX_JOB_DURATION_MS);
|
|
584
682
|
if (watchdog.unref) watchdog.unref();
|
|
585
683
|
|
|
586
|
-
|
|
684
|
+
// Idle-output watchdog: poll log mtime every IDLE_CHECK_INTERVAL_MS; if
|
|
685
|
+
// it hasn't advanced in IDLE_OUTPUT_KILL_MS, presume the agent is stuck
|
|
686
|
+
// and SIGTERM the process group.
|
|
687
|
+
let idleKillTimer = null;
|
|
688
|
+
const idleChecker = setInterval(() => {
|
|
689
|
+
try {
|
|
690
|
+
const stat = fs.statSync(logPath);
|
|
691
|
+
const idleMs = Date.now() - stat.mtimeMs;
|
|
692
|
+
if (idleMs > IDLE_OUTPUT_KILL_MS) {
|
|
693
|
+
fs.writeSync(fd, `\n[scheduler] idle-output watchdog: log mtime stalled ` +
|
|
694
|
+
`${Math.round(idleMs/1000)}s (> ${Math.round(IDLE_OUTPUT_KILL_MS/1000)}s threshold) — SIGTERM process group\n`);
|
|
695
|
+
clearInterval(idleChecker);
|
|
696
|
+
killTree('SIGTERM');
|
|
697
|
+
idleKillTimer = setTimeout(() => {
|
|
698
|
+
fs.writeSync(fd, `\n[scheduler] idle watchdog: still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
|
|
699
|
+
killTree('SIGKILL');
|
|
700
|
+
}, POST_RESULT_KILL_MS);
|
|
701
|
+
if (idleKillTimer.unref) idleKillTimer.unref();
|
|
702
|
+
}
|
|
703
|
+
} catch { /* log not statable; skip */ }
|
|
704
|
+
}, IDLE_CHECK_INTERVAL_MS);
|
|
705
|
+
if (idleChecker.unref) idleChecker.unref();
|
|
706
|
+
|
|
707
|
+
const clearAllTimers = () => {
|
|
587
708
|
clearTimeout(watchdog);
|
|
709
|
+
clearInterval(resultTailer);
|
|
710
|
+
clearInterval(idleChecker);
|
|
711
|
+
if (postResultTimer) clearTimeout(postResultTimer);
|
|
712
|
+
if (postResultKillTimer) clearTimeout(postResultKillTimer);
|
|
713
|
+
if (idleKillTimer) clearTimeout(idleKillTimer);
|
|
714
|
+
};
|
|
715
|
+
|
|
716
|
+
child.on('error', (err) => {
|
|
717
|
+
clearAllTimers();
|
|
588
718
|
const durationMs = Date.now() - startedAt;
|
|
589
719
|
fs.writeSync(fd, `\n[scheduler] error: ${err.message}\n`);
|
|
590
720
|
closeFd();
|
|
@@ -592,132 +722,395 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
592
722
|
resolve({ exitCode: -1, durationMs, error: err.message });
|
|
593
723
|
});
|
|
594
724
|
|
|
595
|
-
child.on('exit', (code) => {
|
|
596
|
-
|
|
725
|
+
child.on('exit', (code, signal) => {
|
|
726
|
+
clearAllTimers();
|
|
597
727
|
const durationMs = Date.now() - startedAt;
|
|
598
|
-
|
|
728
|
+
// If we SIGTERM'd because of the post-result watchdog AND the agent had
|
|
729
|
+
// emitted result=success, the work succeeded; only the cleanup hung.
|
|
730
|
+
// Map the kill exit code to 0 so the job is marked completed, not failed.
|
|
731
|
+
// Node's child.on('exit') reports either code (normal) or signal (killed);
|
|
732
|
+
// when killed by signal, code is null. We also check 143 (128+SIGTERM)
|
|
733
|
+
// and 137 (128+SIGKILL) in case the process exited via signal-as-code.
|
|
734
|
+
let effectiveCode = code;
|
|
735
|
+
const killedBySignal = signal === 'SIGTERM' || signal === 'SIGKILL' || code === 143 || code === 137 || code === null;
|
|
736
|
+
const mappedToSuccess = agentResultSubtype === 'success' && killedBySignal;
|
|
737
|
+
if (mappedToSuccess) {
|
|
738
|
+
effectiveCode = 0;
|
|
739
|
+
fs.writeSync(fd, `\n[scheduler] mapping exit code=${code} signal=${signal} → 0 ` +
|
|
740
|
+
`(result=success was emitted before kill)\n`);
|
|
741
|
+
}
|
|
742
|
+
fs.writeSync(fd, `\n[scheduler] exit code=${effectiveCode} (raw code=${code} signal=${signal}) ` +
|
|
743
|
+
`duration=${Math.round(durationMs / 1000)}s\n`);
|
|
599
744
|
closeFd();
|
|
600
|
-
const rateLimited =
|
|
601
|
-
atomicWriteJson(metaPath, {
|
|
602
|
-
|
|
745
|
+
const rateLimited = effectiveCode !== 0 && detectRateLimitInLog(logPath);
|
|
746
|
+
atomicWriteJson(metaPath, {
|
|
747
|
+
slug: job.slug, cwd, exitCode: effectiveCode, rateLimited,
|
|
748
|
+
startedAt, finishedAt: Date.now(), durationMs,
|
|
749
|
+
agentResultSubtype, mappedFromSignal: mappedToSuccess ? signal || `code=${code}` : null,
|
|
750
|
+
});
|
|
751
|
+
resolve({ exitCode: effectiveCode, durationMs, rateLimited });
|
|
603
752
|
});
|
|
604
753
|
});
|
|
605
754
|
}
|
|
606
755
|
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
756
|
+
/**
 * Pick the next batch of jobs to spawn this tick.
 *
 * Jobs are ordered by `parallelGroup` (a missing group is treated as 99);
 * lower numbers run first. A job is "in flight" when its slug is in
 * `running` or its queue status is 'running'.
 *
 * Rules:
 *   1. Nothing pending (status 'pending' and not in `running`) → [].
 *   2. No group in flight → start the lowest pending group.
 *   3. Lowest pending group is HIGHER than the lowest in-flight group →
 *      hold ([]) until the earlier group drains.
 *   4. Late-arrival: lowest pending group is LOWER than the lowest in-flight
 *      group (a higher-priority PRD reconciled after a later group was
 *      already picked) → fire it now, alongside the active group.
 *   5. Lowest pending group EQUALS the lowest in-flight group → backfill
 *      more jobs from that same group.
 *
 * In every firing case the batch is limited to `cap - inFlight` slots, where
 * `inFlight` is the number of DISTINCT slugs that are either in `running` or
 * marked 'running' in the queue. Counting the union (rather than the larger
 * of the two counts) matters when the sets only partially overlap — e.g. a
 * slug already added to `running` whose queue status has not been flipped to
 * 'running' yet, next to persisted 'running' entries this process did not
 * spawn. The larger-of-two count undercounts there and lets the cap be
 * exceeded.
 *
 * @param {Array<{slug: string, status: string, parallelGroup?: number}>} allJobs
 *   Every job in the queue.
 * @param {Set<string>} running Slugs spawned by this process and still alive.
 * @param {number} cap Maximum number of jobs allowed in flight at once.
 * @returns {Array<object>} Job objects (from `allJobs`) to spawn; may be empty.
 *   Runs in a single pass over `allJobs` plus a pass over the pending jobs.
 */
function pickNextBatch(allJobs, running, cap) {
  const groupOf = (job) => job.parallelGroup ?? 99;

  const pending = allJobs.filter((j) => j.status === 'pending' && !running.has(j.slug));
  if (pending.length === 0) return [];

  // Single pass: collect the groups that have something in flight and the
  // distinct set of in-flight slugs. Seeding with `running` keeps slugs that
  // are tracked in-process but absent from allJobs counted against the cap.
  const activeGroups = new Set();
  const inFlightSlugs = new Set(running);
  for (const job of allJobs) {
    const markedRunning = job.status === 'running';
    if (markedRunning) inFlightSlugs.add(job.slug);
    if (markedRunning || running.has(job.slug)) activeGroups.add(groupOf(job));
  }
  const effectiveRunning = inFlightSlugs.size;
  const slots = cap - effectiveRunning;

  const lowestPendingGroup = pending.reduce(
    (min, j) => Math.min(min, groupOf(j)),
    Infinity,
  );

  // Up to `slots` pending jobs from one group, in queue order.
  const takeFromGroup = (group) => pending.filter((j) => groupOf(j) === group).slice(0, slots);

  if (activeGroups.size === 0) {
    // No active group — start the next group fresh.
    if (slots <= 0) {
      console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
      return [];
    }
    const batch = takeFromGroup(lowestPendingGroup);
    console.log(`[scheduler] concurrency: starting g${lowestPendingGroup} with ${batch.length} job(s) (cap ${cap})`);
    return batch;
  }

  const lowestActive = Math.min(...activeGroups);

  if (lowestPendingGroup > lowestActive) {
    // Earlier group still running — wait for it to drain before advancing.
    console.log(`[scheduler] concurrency: g${lowestActive} in flight, holding g${lowestPendingGroup}`);
    return [];
  }

  if (lowestPendingGroup < lowestActive) {
    // Late-arrival: a lower-numbered (higher-priority) PRD reconciled AFTER
    // a higher-numbered group was already picked. Without this branch the
    // pending PRD starves until the active group drains. Honor priority by
    // firing it now, in parallel with the active group. Strict serial
    // ordering still applies between groups that were both present at pick
    // time; this only covers the reconcile-race edge case.
    if (slots <= 0) {
      console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots for late-arrival g${lowestPendingGroup}`);
      return [];
    }
    const batch = takeFromGroup(lowestPendingGroup);
    console.log(`[scheduler] concurrency: firing late-arrival g${lowestPendingGroup} (${batch.length} job(s)) alongside active g${lowestActive}`);
    return batch;
  }

  // Same group as the one in flight — backfill whatever slots remain.
  if (slots <= 0) {
    console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
    return [];
  }
  const batch = takeFromGroup(lowestActive);
  if (batch.length > 0) {
    console.log(`[scheduler] concurrency: backfilling ${batch.length} into g${lowestActive} (${effectiveRunning}/${cap} running)`);
  }
  return batch;
}
|
|
852
|
+
|
|
853
|
+
/**
 * Tell whether a slug names a fix plan, i.e. starts with one or more digits
 * followed by `-fix-`. Used to avoid spawning an investigation for a failed
 * job that is itself a fix plan.
 *
 * @param {string} slug Job slug to classify.
 * @returns {boolean} true when the slug has the `NN-fix-` prefix.
 */
function isFixPlanSlug(slug) {
  const fixPlanPrefix = /^\d+-fix-/;
  const looksLikeFixPlan = fixPlanPrefix.test(slug);
  return looksLikeFixPlan;
}
|
|
860
|
+
|
|
861
|
+
/**
 * Read the last `bytes` bytes of a file and decode them as UTF-8.
 *
 * The size is taken from the already-open descriptor (fstat) so the offset
 * and the read refer to the same file even if the path is replaced in
 * between, and the descriptor is always closed — including when the read
 * throws. Only the bytes actually read are decoded, so a file that shrinks
 * between the size check and the read does not yield trailing NUL padding.
 *
 * The tail is cut at a byte offset, so a multi-byte character straddling the
 * cut may decode as a replacement character at the start of the result.
 *
 * @param {string} filePath File to read.
 * @param {number} bytes Maximum number of trailing bytes to return.
 * @returns {string} The decoded tail, or '' when the file is empty, `bytes`
 *   is not a positive number, or any filesystem error occurs.
 */
function readTail(filePath, bytes) {
  let fd = null;
  try {
    fd = fs.openSync(filePath, 'r');
    const { size } = fs.fstatSync(fd);
    const n = Math.min(size, bytes);
    if (n === 0) return '';
    // A negative or NaN `n` makes Buffer.alloc throw; the catch maps it to ''.
    const buf = Buffer.alloc(n);
    const bytesRead = fs.readSync(fd, buf, 0, n, size - n);
    return buf.toString('utf8', 0, bytesRead);
  } catch {
    return '';
  } finally {
    if (fd !== null) {
      try { fs.closeSync(fd); } catch { /* best effort */ }
    }
  }
}
|
|
877
|
+
|
|
878
|
+
/**
 * Spawn an Opus investigation session for a failed job. The investigator is
 * asked to read the failure log and the original PRD, identify the root
 * cause, and write a fix-plan PRD to prds/<NN>-fix-<base>.md. When the child
 * exits, a queue tick is triggered so a newly written fix plan can be picked
 * up. Investigations are not queue entries: this function does not touch
 * runningSet or the queue state, so they do not count against the
 * concurrency cap.
 *
 * Skipped (returns without spawning) when:
 *   - the failed job is itself a fix plan (avoids infinite recursion), or
 *   - the target fix-plan file already exists.
 *
 * Resource handling:
 *   - The investigation log descriptor is closed exactly once. Both 'error'
 *     and 'exit' can fire for the same child; closing or writing twice could
 *     hit an unrelated file that reused the descriptor number.
 *   - If resolving the binary or spawning throws synchronously, the
 *     descriptor is closed before the error propagates to the caller.
 *   - The child runs in its own process group (detached) and the watchdog
 *     kills the whole group, matching executeJob, so stray background
 *     processes started by the agent do not outlive the watchdog.
 *
 * @param {object} failedJob Snapshot of the failed queue entry. Reads `slug`,
 *   `title`, `cwd`, `exitCode`, `parallelGroup` and `bodyPreview`.
 * @param {string} runDir Directory holding `<slug>.log`; the investigation
 *   log is written next to it as `<slug>.investigation.log`.
 * @returns {Promise<void>} Resolves once the child has been spawned (not when
 *   it finishes). Rejects if the log cannot be opened or the spawn throws.
 */
async function spawnInvestigation(failedJob, runDir) {
  if (isFixPlanSlug(failedJob.slug)) {
    console.log(`[scheduler] skip investigation: ${failedJob.slug} is itself a fix plan`);
    return;
  }

  const failedLogPath = path.join(runDir, `${failedJob.slug}.log`);
  const investigationLogPath = path.join(runDir, `${failedJob.slug}.investigation.log`);

  let originalBody = '';
  try {
    originalBody = parsePrd(path.join(PRDS_DIR, `${failedJob.slug}.md`)).body;
  } catch {
    originalBody = failedJob.bodyPreview || '(original PRD missing from disk)';
  }

  const logTail = readTail(failedLogPath, 16 * 1024) || '(failed to read log)';

  // Fix-plan slug: same group prefix (zero-padded to 2 digits) + "-fix-" +
  // the original slug with its numeric prefix removed.
  const baseSlug = failedJob.slug.replace(/^\d+-/, '');
  const group = failedJob.parallelGroup ?? 99;
  const fixSlug = `${String(group).padStart(2, '0')}-fix-${baseSlug}`;
  const fixPath = path.join(PRDS_DIR, `${fixSlug}.md`);

  if (fs.existsSync(fixPath)) {
    console.log(`[scheduler] skip investigation: fix plan already exists at ${fixPath}`);
    return;
  }

  const cwd = failedJob.cwd || DEFAULT_PROJECT_CWD;
  // NOTE(review): originalBody and logTail are embedded verbatim inside
  // fenced sections. Content containing a code fence (or instructions) ends
  // up in the prompt of a session run with --dangerously-skip-permissions;
  // consider escaping or delimiting it more robustly.
  const prompt = `You are investigating a failed scheduled job in the session-manager queue. Your ONLY job is to write a fix-plan PRD file. Do NOT attempt the fix yourself.

# Failed job
- Slug: ${failedJob.slug}
- Title: ${failedJob.title}
- cwd: ${cwd}
- Exit code: ${failedJob.exitCode}
- Full failure log: ${failedLogPath}

# Original PRD body (this is what the job was trying to do)
\`\`\`
${originalBody}
\`\`\`

# Last ~16KB of the failure log (stream-json format from \`claude -p\`)
\`\`\`
${logTail}
\`\`\`

# Your task
1. Read the full failure log at ${failedLogPath} if the tail above isn't sufficient.
2. Read source files in ${cwd} as needed to understand the context.
3. Identify the root cause of the failure.
4. Write a NEW fix-plan PRD file at exactly this path:

${fixPath}

5. The frontmatter MUST be exactly this format (no extra keys):
\`\`\`
---
title: Fix: <short summary of the fix>
cwd: ${cwd}
parallelGroup: ${group}
estimateMinutes: <your time estimate>
---
\`\`\`
6. The PRD body MUST be self-contained — \`claude -p\` runs it on a fresh Sonnet session with NO conversation context. Include:
- Root-cause analysis (what went wrong and why)
- Concrete fix steps (specific files / commands / edits)
- Verification command(s) the next agent should run to confirm the fix
- Acceptance criteria

DO NOT attempt the fix. ONLY write the file. When the file exists, exit immediately.`;

  const fd = fs.openSync(investigationLogPath, 'a');
  let logOpen = true;
  // Best-effort scheduler note in the investigation log; no-op once closed.
  const writeLog = (text) => {
    if (!logOpen) return;
    try { fs.writeSync(fd, text); } catch { /* log write is best effort */ }
  };
  // Close the log descriptor at most once.
  const closeLog = () => {
    if (!logOpen) return;
    logOpen = false;
    try { fs.closeSync(fd); } catch { /* already closed */ }
  };

  writeLog(`[scheduler] investigation starting for ${failedJob.slug} at ${new Date().toISOString()}\n[scheduler] target fix PRD: ${fixPath}\n\n`);

  let child;
  try {
    const claudeBin = resolveClaudeBin();
    const childEnv = cleanChildEnv();
    child = spawn(claudeBin, [
      '-p', prompt,
      '--model', 'opus',
      '--dangerously-skip-permissions',
      '--output-format', 'stream-json',
      '--verbose',
    ], {
      cwd,
      env: childEnv,
      stdio: ['ignore', fd, fd],
      // Own process group so the watchdog can kill the whole descendant tree.
      detached: true,
    });
  } catch (err) {
    writeLog(`\n[scheduler] investigation error: ${err.message}\n`);
    closeLog();
    throw err;
  }

  writeLog(`[scheduler] investigation pid=${child.pid}\n\n`);

  // Signal the child's process group; fall back to the child alone if the
  // group kill is rejected (e.g. no pid because the spawn failed).
  const killTree = (signal) => {
    try {
      process.kill(-child.pid, signal);
    } catch {
      try { child.kill(signal); } catch { /* already dead */ }
    }
  };

  const watchdog = setTimeout(() => {
    writeLog(`\n[scheduler] investigation watchdog SIGKILL after ${MAX_INVESTIGATION_DURATION_MS}ms\n`);
    killTree('SIGKILL');
  }, MAX_INVESTIGATION_DURATION_MS);
  if (watchdog.unref) watchdog.unref();

  child.on('error', (err) => {
    clearTimeout(watchdog);
    writeLog(`\n[scheduler] investigation error: ${err.message}\n`);
    closeLog();
  });

  child.on('exit', (code) => {
    clearTimeout(watchdog);
    writeLog(`\n[scheduler] investigation exit code=${code}\n`);
    closeLog();
    if (fs.existsSync(fixPath)) {
      console.log(`[scheduler] investigation produced fix plan: ${fixSlug}`);
    } else {
      console.log(`[scheduler] investigation finished WITHOUT producing fix plan (slug=${failedJob.slug}, code=${code})`);
    }
    // Trigger a tick so the new fix plan is reconciled into the queue and fired.
    tickQueue().catch(() => {});
  });
}
|
|
633
1005
|
|
|
634
|
-
|
|
1006
|
+
/**
 * Run one queued job end to end and record its outcome in the queue.
 *
 * Steps:
 *   1. Add the slug to runningSet (guards against double-spawn) and mark the
 *      queue entry 'running' with its runId and start time.
 *   2. Await executeJob; once the child's pid is known it is stored on the
 *      entry's `runtime` field.
 *   3. If the run was rate-limited, pause the queue with reason 'rate_limit'
 *      before writing the terminal status.
 *   4. Write the terminal status:
 *        - rate-limited run, or a NON-zero exit while the queue is paused for
 *          rate_limit → entry is reset via resetJobFields so it runs again;
 *        - otherwise 'completed' (exit code 0) or 'failed'.
 *      A job that exited 0 is always recorded as completed, even when another
 *      job paused the queue for rate_limit in the meantime; resetting it
 *      would re-run work that already succeeded.
 *   5. For a failed job, start an out-of-band investigation (not awaited).
 *   6. Always remove the slug from runningSet and trigger a queue tick.
 *
 * Errors are logged, not rethrown, so the returned promise never rejects.
 *
 * @param {object} job Queue entry to run; `slug` identifies it in the queue.
 * @param {string} runId Identifier of the current run, stored on the entry.
 * @param {string} runDir Directory for this run's logs and metadata.
 * @param {string} defaultCwd Working directory passed through to executeJob.
 * @returns {Promise<void>}
 */
async function spawnJob(job, runId, runDir, defaultCwd) {
  runningSet.add(job.slug);
  try {
    await mutate((s) => {
      const idx = s.jobs.findIndex((x) => x.slug === job.slug);
      if (idx >= 0) {
        s.jobs[idx].status = 'running';
        s.jobs[idx].runId = runId;
        s.jobs[idx].startedAt = new Date().toISOString();
      }
    });
    broadcast();

    const res = await executeJob(job, runDir, defaultCwd, async (pid) => {
      await mutate((s) => {
        const idx = s.jobs.findIndex((x) => x.slug === job.slug);
        if (idx >= 0) {
          s.jobs[idx].runtime = { pid, runId, startedAt: s.jobs[idx].startedAt };
        }
      });
    });

    // Pause before writing the terminal status so the mutate below can read
    // the pause state.
    if (res.rateLimited) {
      const resetIso = await refreshNextReset().catch(() => cachedNextReset);
      await setPaused('rate_limit', resetIso);
    }

    let failedJobSnapshot = null;
    await mutate((s) => {
      const i2 = s.jobs.findIndex((x) => x.slug === job.slug);
      if (i2 < 0) return;

      const pausedForRateLimit = Boolean(s.paused && s.paused.reason === 'rate_limit');
      // Only unsuccessful runs are re-queued on a rate-limit pause; a clean
      // exit means the work is done.
      const treatAsPending = res.rateLimited || (pausedForRateLimit && res.exitCode !== 0);
      if (treatAsPending) {
        resetJobFields(s.jobs[i2], res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
        return;
      }

      s.jobs[i2].status = res.exitCode === 0 ? 'completed' : 'failed';
      s.jobs[i2].finishedAt = new Date().toISOString();
      s.jobs[i2].exitCode = res.exitCode;
      s.jobs[i2].error = res.error || null;
      delete s.jobs[i2].runtime;
      if (s.jobs[i2].status === 'failed') {
        // Copy the entry so the investigation sees the state at failure time.
        failedJobSnapshot = { ...s.jobs[i2] };
      }
    });
    broadcast();

    if (failedJobSnapshot) {
      // Fire-and-forget: the investigation runs out-of-band.
      spawnInvestigation(failedJobSnapshot, runDir).catch((e) => {
        console.error('[scheduler] spawnInvestigation error', job.slug, e);
      });
    }
  } catch (e) {
    console.error('[scheduler] spawnJob error', job.slug, e);
  } finally {
    runningSet.delete(job.slug);
    // Each job completion is a signal to advance the queue.
    tickQueue().catch(() => {});
  }
}
|
|
1069
|
+
|
|
1070
|
+
// Serialized ticker: every tickQueue() call is chained onto the previous one
// through `tickTail`, so two concurrent calls never race on the same pending
// jobs. A simple promise tail suffices since pickNextBatch is synchronous and
// spawnJob is fire-and-forget.
let tickTail = Promise.resolve();

/**
 * Runs one scheduling pass after all previously requested passes have settled.
 *
 * A pass is a no-op when the queue is paused, when the current cancel token is
 * cancelled, or when pickNextBatch() returns no jobs. Otherwise it stamps
 * `lastRunAt`, broadcasts, picks one run directory for the whole batch and
 * launches each job without awaiting it.
 *
 * @returns {Promise<void>} Settles once this pass has finished launching its
 *   batch (not when the launched jobs finish). Rejects if the pass itself
 *   throws; that rejection is deliberately kept off `tickTail` so one failed
 *   pass does not block later ones.
 */
function tickQueue() {
  const next = tickTail.then(async () => {
    const state = readQueue();
    if (state.paused) {
      console.log('[scheduler] tickQueue skipped: paused');
      return;
    }
    if (cancelToken.cancelled) return;

    reconcile(state);
    // An environment-provided cap, when set, overrides the configured one.
    const cap = ENV_CAP ?? state.config.concurrencyCap;
    const batch = pickNextBatch(state.jobs, runningSet, cap);
    if (batch.length === 0) return;

    await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
    broadcast();

    const { runId, dir: runDir } = pickRunDir();
    for (const job of batch) {
      // Cancellation may arrive between launches; stop starting new jobs.
      if (cancelToken.cancelled) break;
      // spawnJob is fire-and-forget; it calls tickQueue() on completion.
      // Log a rejected launch instead of dropping it silently, so a failure
      // to start a job is visible in the scheduler log.
      spawnJob(job, runId, runDir, state.config.defaultCwd).catch((e) => {
        console.error('[scheduler] spawnJob rejected', job.slug, e);
      });
    }
  });
  // Keep the chain alive even if this pass rejects; callers still observe the
  // rejection through the returned `next`.
  tickTail = next.catch(() => {});
  return next;
}
|
|
1102
|
+
|
|
1103
|
+
/**
 * Entry point for a scheduled (or forced) run of the queue.
 *
 * Returns early, with a log line, while the queue is paused. Otherwise it
 * installs a fresh, non-cancelled cancel token, awaits one tickQueue() pass,
 * then clears the `scheduledFor` field and broadcasts.
 *
 * @returns {Promise<void>} Settles after the tick has launched its batch and
 *   `scheduledFor` has been cleared; it does not wait for launched jobs.
 */
async function runDueJobs() {
  if (readQueue().paused) {
    console.log('[scheduler] runDueJobs skipped: paused');
    return;
  }

  // A new run starts with a clean cancellation flag.
  cancelToken = { cancelled: false };
  await tickQueue();

  // Clear the one-shot scheduledFor without waiting for jobs to settle.
  await mutate((queue) => {
    queue.scheduledFor = null;
  });
  broadcast();
}
|
|
722
1115
|
|
|
723
1116
|
// ---------- when-available launch logic ----------
|
|
@@ -725,16 +1118,15 @@ async function runDueJobs() {
|
|
|
725
1118
|
/**
 * Kicks the queue under the 'when-available' fire policy.
 *
 * Does nothing unless the policy is 'when-available', the queue is not
 * paused, at least one pending job is not already in `runningSet`, and a
 * cached utilization reading exists. If that reading is at or above
 * `config.utilizationThreshold` it only broadcasts; otherwise it logs and
 * starts a tickQueue() pass without awaiting it.
 *
 * @param {object} state Queue state; reads `config.firePolicy`,
 *   `config.utilizationThreshold`, `paused` and `jobs` (each job's `status`
 *   and `slug`).
 * @returns {Promise<void>}
 */
async function maybeLaunchWhenAvailable(state) {
  const policyActive = state.config.firePolicy === 'when-available';
  if (!policyActive) return;
  if (state.paused) return;

  // Jobs that are pending on disk but already tracked as running are excluded.
  const pending = state.jobs.filter((job) => {
    const isPending = job.status === 'pending';
    return isPending && !runningSet.has(job.slug);
  });
  if (pending.length === 0) return;

  // No utilization reading yet (null or undefined): nothing to decide on.
  if (cachedUtilization == null) return;

  const overThreshold = cachedUtilization >= state.config.utilizationThreshold;
  if (overThreshold) {
    broadcast();
    return;
  }

  console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pending.length} pending, ${runningSet.size} running — ticking`);
  // Not awaited: the tick runs in the background and its failure is logged.
  tickQueue().catch((e) => console.error('[scheduler] tickQueue error', e));
}
|
|
739
1131
|
|
|
740
1132
|
// ---------- poll loop with exponential backoff ----------
|
|
@@ -750,6 +1142,8 @@ async function pollLoop() {
|
|
|
750
1142
|
backoffMs = 0;
|
|
751
1143
|
backoffNextAt = null;
|
|
752
1144
|
firstFailureAt = null;
|
|
1145
|
+
firstNon429FailureAt = null;
|
|
1146
|
+
lastFailureKind = null;
|
|
753
1147
|
lastPollAt = Date.now();
|
|
754
1148
|
lastPollOk = true;
|
|
755
1149
|
persistSchedulerState();
|
|
@@ -764,6 +1158,19 @@ async function pollLoop() {
|
|
|
764
1158
|
await clearPause('reset-recovered');
|
|
765
1159
|
}
|
|
766
1160
|
|
|
1161
|
+
await maybeLaunchWhenAvailable(cur);
|
|
1162
|
+
broadcast();
|
|
1163
|
+
} else if (r.kind === 'meter_rate_limited') {
|
|
1164
|
+
// Billing meter is itself being rate-limited. Treat as "utilization unknown but safe":
|
|
1165
|
+
// fire available jobs anyway at utilization=0 rather than pausing the queue.
|
|
1166
|
+
lastPollAt = Date.now();
|
|
1167
|
+
lastPollOk = false;
|
|
1168
|
+
consecutiveFailures++;
|
|
1169
|
+
lastFailureKind = 'meter_rate_limited';
|
|
1170
|
+
// Don't update firstNon429FailureAt — 429s don't count toward the 30-min network-pause threshold.
|
|
1171
|
+
cachedUtilization = 0; // assume safe; fire any pending work
|
|
1172
|
+
console.log(`[scheduler] billing meter rate-limited (HTTP 429) — firing on heuristic (failure #${consecutiveFailures})`);
|
|
1173
|
+
const cur = readQueue();
|
|
767
1174
|
await maybeLaunchWhenAvailable(cur);
|
|
768
1175
|
broadcast();
|
|
769
1176
|
} else {
|
|
@@ -773,16 +1180,19 @@ async function pollLoop() {
|
|
|
773
1180
|
if (!firstFailureAt) firstFailureAt = Date.now();
|
|
774
1181
|
|
|
775
1182
|
if (r.kind === 'auth') {
|
|
1183
|
+
lastFailureKind = 'auth';
|
|
776
1184
|
console.error(`[scheduler] auth failure (HTTP ${r.httpStatus}): ${r.message}`);
|
|
777
1185
|
await setPaused('auth', null);
|
|
778
1186
|
} else {
|
|
779
|
-
// transient or config — apply exponential backoff.
|
|
1187
|
+
// transient or config — apply exponential backoff and count toward 30-min threshold.
|
|
1188
|
+
lastFailureKind = 'transient';
|
|
1189
|
+
if (!firstNon429FailureAt) firstNon429FailureAt = Date.now();
|
|
780
1190
|
backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
|
|
781
|
-
const
|
|
1191
|
+
const totalNon429FailureMs = Date.now() - firstNon429FailureAt;
|
|
782
1192
|
console.log(`[scheduler] transient failure #${consecutiveFailures}: ${r.kind} ${r.message ?? ''}; retry in ${backoffMs / 1000}s`);
|
|
783
1193
|
|
|
784
|
-
// After 30 minutes of consecutive failures, set 'network' pause.
|
|
785
|
-
if (
|
|
1194
|
+
// After 30 minutes of consecutive non-429 failures, set 'network' pause.
|
|
1195
|
+
if (totalNon429FailureMs > 30 * 60_000) {
|
|
786
1196
|
const cur2 = readQueue();
|
|
787
1197
|
if (!cur2.paused || cur2.paused.reason === 'network') {
|
|
788
1198
|
await setPaused('network', null);
|
|
@@ -798,7 +1208,9 @@ async function pollLoop() {
|
|
|
798
1208
|
lastPollAt = Date.now();
|
|
799
1209
|
lastPollOk = false;
|
|
800
1210
|
consecutiveFailures++;
|
|
1211
|
+
lastFailureKind = 'transient';
|
|
801
1212
|
if (!firstFailureAt) firstFailureAt = Date.now();
|
|
1213
|
+
if (!firstNon429FailureAt) firstNon429FailureAt = Date.now();
|
|
802
1214
|
backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
|
|
803
1215
|
backoffNextAt = Date.now() + backoffMs;
|
|
804
1216
|
persistSchedulerState();
|
|
@@ -813,6 +1225,7 @@ async function pollLoop() {
|
|
|
813
1225
|
|
|
814
1226
|
function registerScheduleHandlers() {
|
|
815
1227
|
ensureDirs();
|
|
1228
|
+
supervisor.registerHandlers();
|
|
816
1229
|
|
|
817
1230
|
ipcMain.handle('schedule:state', async () => {
|
|
818
1231
|
const state = readQueue();
|
|
@@ -847,6 +1260,7 @@ function registerScheduleHandlers() {
|
|
|
847
1260
|
lastPollAt,
|
|
848
1261
|
lastPollOk,
|
|
849
1262
|
consecutiveFailures,
|
|
1263
|
+
lastFailureKind,
|
|
850
1264
|
backoffNextAt,
|
|
851
1265
|
nextResetCached: cachedNextReset,
|
|
852
1266
|
pausedSince: state.paused ? Date.parse(state.paused.since) : null,
|
|
@@ -855,6 +1269,14 @@ function registerScheduleHandlers() {
|
|
|
855
1269
|
};
|
|
856
1270
|
});
|
|
857
1271
|
|
|
1272
|
+
ipcMain.handle('schedule:force-tick', async () => {
|
|
1273
|
+
// Bypass the billing-poll gate entirely — fire pending jobs immediately regardless of meter state.
|
|
1274
|
+
// Clears any existing pause first (same semantics as run-now).
|
|
1275
|
+
await clearPause('run-now');
|
|
1276
|
+
runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error (force-tick)', e));
|
|
1277
|
+
return { ok: true };
|
|
1278
|
+
});
|
|
1279
|
+
|
|
858
1280
|
ipcMain.handle('schedule:set-config', async (_e, partial) => {
|
|
859
1281
|
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
860
1282
|
let validated;
|
|
@@ -864,7 +1286,11 @@ function registerScheduleHandlers() {
|
|
|
864
1286
|
return { ok: false, error: e?.message ?? 'invalid config' };
|
|
865
1287
|
}
|
|
866
1288
|
const config = await mutate((state) => {
|
|
867
|
-
|
|
1289
|
+
const { supervisor: supPartial, ...rest } = validated;
|
|
1290
|
+
state.config = { ...state.config, ...rest };
|
|
1291
|
+
if (supPartial !== undefined) {
|
|
1292
|
+
state.config.supervisor = { ...(state.config.supervisor ?? {}), ...supPartial };
|
|
1293
|
+
}
|
|
868
1294
|
return state.config;
|
|
869
1295
|
});
|
|
870
1296
|
await rescheduleTimer();
|
|
@@ -1051,6 +1477,11 @@ async function init() {
|
|
|
1051
1477
|
pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
|
|
1052
1478
|
if (pollLoopTimer.unref) pollLoopTimer.unref();
|
|
1053
1479
|
|
|
1480
|
+
// Supervisor: probe running jobs for wedged poll-loops.
|
|
1481
|
+
if (process.env.SM_SUPERVISOR_DISABLE !== '1') {
|
|
1482
|
+
supervisor.startSupervisor({ readQueue, mutate });
|
|
1483
|
+
}
|
|
1484
|
+
|
|
1054
1485
|
// Heartbeat: once per minute, log queue state for 24h visibility.
|
|
1055
1486
|
if (heartbeatInterval) clearInterval(heartbeatInterval);
|
|
1056
1487
|
heartbeatInterval = setInterval(() => {
|