claude-code-session-manager 0.19.0 → 0.20.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assets/{TiptapBody-CO4q65kH.js → TiptapBody-Db7_uXrI.js} +1 -1
- package/dist/assets/{cssMode-0tbceX4i.js → cssMode-DFKJhhi6.js} +1 -1
- package/dist/assets/{freemarker2-Dv8wl_HH.js → freemarker2-DUat8x8o.js} +1 -1
- package/dist/assets/{handlebars-MzrjkW3b.js → handlebars-B2C1qhAI.js} +1 -1
- package/dist/assets/{html-C0YEYUHk.js → html-khtg0DVs.js} +1 -1
- package/dist/assets/{htmlMode-Bf9ccIo3.js → htmlMode-Jmhs-vfl.js} +1 -1
- package/dist/assets/{index-BsSklu93.css → index-BkkBX1z7.css} +1 -1
- package/dist/assets/{index-BXeFi7dA.js → index-pqnuXM14.js} +634 -624
- package/dist/assets/{javascript-BZhQgLYg.js → javascript-i1CXbgg4.js} +1 -1
- package/dist/assets/{jsonMode-XkKuSIs5.js → jsonMode-DXZaj-kR.js} +1 -1
- package/dist/assets/{liquid-B6fnroVU.js → liquid-Ds7jUF53.js} +1 -1
- package/dist/assets/{lspLanguageFeatures-BAIq7N4N.js → lspLanguageFeatures-B_15vO6X.js} +1 -1
- package/dist/assets/{mdx-DzH38OXA.js → mdx-DgrrLgTE.js} +1 -1
- package/dist/assets/{python-ak0De5ar.js → python-Cff3tPw3.js} +1 -1
- package/dist/assets/{razor-DC-IpQpX.js → razor-DlyG7FmM.js} +1 -1
- package/dist/assets/{tsMode-DaZCqNuS.js → tsMode-DRmmmttS.js} +1 -1
- package/dist/assets/{typescript-D5YkmMgh.js → typescript-DQFL2T1p.js} +1 -1
- package/dist/assets/{whisperWorker-CcsPqZUS.js → whisperWorker-Dbia1OpC.js} +15 -15
- package/dist/assets/{xml-8idHpw2C.js → xml-CwsJEzdU.js} +1 -1
- package/dist/assets/{yaml-Dm8NKlcv.js → yaml-BDsDjf-y.js} +1 -1
- package/dist/index.html +2 -2
- package/package.json +5 -2
- package/src/main/health.cjs +216 -0
- package/src/main/historyAggregator.cjs +15 -9
- package/src/main/index.cjs +7 -2
- package/src/main/ipcSchemas.cjs +43 -0
- package/src/main/kg.cjs +0 -0
- package/src/main/lib/reaperHelpers.cjs +67 -0
- package/src/main/lib/schedulerBatch.cjs +212 -0
- package/src/main/lib/schedulerConfig.cjs +9 -1
- package/src/main/scheduler.cjs +274 -125
- package/src/main/webRemote.cjs +916 -0
- package/src/preload/api.d.ts +78 -15
- package/src/preload/index.cjs +41 -8
- package/src/main/projectSkills.cjs +0 -124
package/src/main/scheduler.cjs
CHANGED
|
@@ -45,12 +45,14 @@ const fsp = require('node:fs/promises');
|
|
|
45
45
|
const path = require('node:path');
|
|
46
46
|
const os = require('node:os');
|
|
47
47
|
const { randomUUID } = require('node:crypto');
|
|
48
|
+
const { execFile } = require('node:child_process');
|
|
48
49
|
const { ipcMain } = require('electron');
|
|
49
50
|
const billing = require('./usage.cjs');
|
|
50
51
|
const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
|
|
51
52
|
const supervisor = require('./supervisor.cjs');
|
|
52
53
|
const { resolveClaudeBin } = require('./lib/claudeBin.cjs');
|
|
53
54
|
const { readTail } = require('./lib/fileTail.cjs');
|
|
55
|
+
const { claudePidAlive, classifyRunOutcome } = require('./lib/reaperHelpers.cjs');
|
|
54
56
|
const { openLog, withChildAndLog } = require('./lib/childWithLog.cjs');
|
|
55
57
|
const { sendIfAlive } = require('./lib/sendToRenderer.cjs');
|
|
56
58
|
const prdParser = require('./scheduler/prdParser.cjs');
|
|
@@ -62,6 +64,7 @@ const {
|
|
|
62
64
|
USAGE_REFRESH_INTERVAL_MS,
|
|
63
65
|
MAX_JOB_DURATION_MS,
|
|
64
66
|
} = require('./lib/schedulerConfig.cjs');
|
|
67
|
+
const { pickForProject, pickNextBatch, DEFAULT_PROJECT_CWD } = require('./lib/schedulerBatch.cjs');
|
|
65
68
|
|
|
66
69
|
const MAX_INVESTIGATION_DURATION_MS = 30 * 60_000;
|
|
67
70
|
|
|
@@ -88,6 +91,68 @@ const RESULT_TAIL_BYTES = 8 * 1024;
|
|
|
88
91
|
const IDLE_OUTPUT_KILL_MS = 20 * 60_000;
|
|
89
92
|
const IDLE_CHECK_INTERVAL_MS = 60_000;
|
|
90
93
|
|
|
94
|
+
// Appended to every scheduled job prompt so the queue can be RELIED ON to finish
|
|
95
|
+
// work to a consistent bar: review → security-review → verify → commit. Enforced
|
|
96
|
+
// centrally here (not per-PRD) so it applies to every current and future PRD.
|
|
97
|
+
// The commit step is also backstopped by the post-run commit guard below: a
|
|
98
|
+
// clean exit that leaves uncommitted changes is downgraded to needs_review.
|
|
99
|
+
const FINISH_PROTOCOL = `
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
# SCHEDULER FINISH PROTOCOL (mandatory — runs AFTER the work above)
|
|
103
|
+
|
|
104
|
+
Once every acceptance-criteria line above is satisfied, finish in this EXACT
|
|
105
|
+
sequence. Do not stop before the commit lands; committing is part of the job.
|
|
106
|
+
|
|
107
|
+
1. CODE REVIEW — run \`/code-review --fix\` on your changes and apply the fixes it
|
|
108
|
+
surfaces (correctness first). For any finding you judge a false positive, say
|
|
109
|
+
why in your result; do not silently skip it. If \`/code-review\` is not
|
|
110
|
+
available in this environment, do an equivalent careful self-review instead.
|
|
111
|
+
2. SECURITY REVIEW — run \`/security-review\` and address every finding (or
|
|
112
|
+
justify it). If unavailable, self-review the diff for injection, secrets,
|
|
113
|
+
path traversal, and unsafe input handling.
|
|
114
|
+
3. VERIFY — run the project's OWN check commands (typecheck / lint / tests — the
|
|
115
|
+
project's CLAUDE.md names them; infer from the repo if not) and make them
|
|
116
|
+
pass. Do not assume npm; use whatever the target project uses.
|
|
117
|
+
4. COMMIT — stage and commit ALL changes with a clear conventional message:
|
|
118
|
+
\`git add -A && git commit -m "<type>(<scope>): <summary>"\`.
|
|
119
|
+
|
|
120
|
+
A job that exits with uncommitted changes is treated as INCOMPLETE and flagged
|
|
121
|
+
for review. Do NOT add work beyond the acceptance criteria — this protocol is the
|
|
122
|
+
only post-AC work. If a review finding can't be fixed within scope, commit what
|
|
123
|
+
you have, describe the finding in the commit body, and note the follow-up in your
|
|
124
|
+
final result.`;
|
|
125
|
+
|
|
126
|
+
// Parse `git status --porcelain` (v1) output into a list of changed paths.
// Pure + exported for unit testing. Each porcelain line is "XY<space>PATH"
// (2 status chars + space), so the path starts at index 3; rename lines
// ("R  a -> b") keep the "a -> b" tail, which is fine for a human-facing
// dirty-file list.
//
// A trailing CR is stripped from every line so CRLF-terminated output does not
// leave "\r" glued to the path (it would leak into the reason string and the
// baseline comparison). Git C-quotes control characters inside path names, so
// a raw CR at end-of-line is never part of the path itself.
//
// @param {string|null|undefined} stdout raw porcelain text; nullish → []
// @returns {string[]} one entry per non-empty status line, in output order
function parsePorcelain(stdout) {
  return String(stdout || '')
    .split('\n')
    // Blank lines (and lines shorter than the 3-char prefix) slice to '' and
    // are dropped by the Boolean filter below.
    .map((line) => line.replace(/\r$/, '').slice(3))
    .filter(Boolean);
}
|
|
137
|
+
|
|
138
|
+
// Return the list of uncommitted paths in cwd, or null when the guard does not
// apply (cwd is empty, cwd is not a git work tree, git is missing, or the call
// errors or times out). Never throws and never rejects — a guard failure must
// not fail an otherwise-successful job.
//
// @param {string} cwd directory handed to `git -C`
// @returns {Promise<string[]|null>} parsed porcelain paths, or null to skip
function uncommittedChanges(cwd) {
  return new Promise((resolve) => {
    if (!cwd) { resolve(null); return; }
    try {
      execFile(
        'git',
        ['-C', cwd, 'status', '--porcelain'],
        {
          timeout: 10_000,
          windowsHide: true,
          // This is a background status probe: do not take the optional index
          // lock, so it cannot collide with git commands another job is
          // running in the same repository.
          env: { ...process.env, GIT_OPTIONAL_LOCKS: '0' },
        },
        (err, stdout) => {
          if (err) { resolve(null); return; } // not a repo / git missing → skip
          resolve(parsePorcelain(stdout));
        },
      );
    } catch {
      // execFile validates its arguments synchronously (e.g. a NUL byte in
      // cwd); without this the executor would reject and break "never throws".
      resolve(null);
    }
  });
}
|
|
155
|
+
|
|
91
156
|
const ROOT = path.join(os.homedir(), '.claude', 'session-manager', 'scheduled-plans');
|
|
92
157
|
const PRDS_DIR = path.join(ROOT, 'prds');
|
|
93
158
|
const RUNS_DIR = path.join(ROOT, 'runs');
|
|
@@ -96,7 +161,7 @@ const QUEUE_PATH = path.join(ROOT, 'queue.json');
|
|
|
96
161
|
const SCHEDULER_STATE_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-state.json');
|
|
97
162
|
const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-heartbeat.log');
|
|
98
163
|
const HEARTBEAT_MAX_BYTES = 1024 * 1024;
|
|
99
|
-
|
|
164
|
+
// DEFAULT_PROJECT_CWD imported from lib/schedulerBatch.cjs (single source of truth).
|
|
100
165
|
|
|
101
166
|
const ENV_CAP = process.env.SM_SCHEDULER_MAX_CONCURRENCY
|
|
102
167
|
? Math.max(1, Math.min(20, parseInt(process.env.SM_SCHEDULER_MAX_CONCURRENCY, 10) || 4))
|
|
@@ -677,7 +742,9 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
677
742
|
const prdPath = path.join(PRDS_DIR, `${job.slug}.md`);
|
|
678
743
|
try {
|
|
679
744
|
const parsed = await parsePrd(prdPath);
|
|
680
|
-
|
|
745
|
+
// Centrally enforce the review → security-review → verify → commit finish
|
|
746
|
+
// sequence on every job, regardless of what the PRD body says.
|
|
747
|
+
prompt = parsed.body + FINISH_PROTOCOL;
|
|
681
748
|
} catch (e) {
|
|
682
749
|
safeLog(`[scheduler] failed to read PRD: ${e?.message}\n`);
|
|
683
750
|
closeFd();
|
|
@@ -877,121 +944,10 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
877
944
|
});
|
|
878
945
|
}
|
|
879
946
|
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
* 1. Find the lowest parallelGroup that has pending jobs not already in
|
|
885
|
-
* runningSet.
|
|
886
|
-
* 2. If that group has jobs in runningSet (i.e., we're mid-group), backfill
|
|
887
|
-
* up to (cap - runningSet.size) more from the SAME group.
|
|
888
|
-
* 3. If the current group has NO jobs in runningSet (new group), and there
|
|
889
|
-
* are still jobs from an earlier group in runningSet, do nothing — wait
|
|
890
|
-
* for the earlier group to drain before advancing.
|
|
891
|
-
* 4. **Late-arrival**: if a lower-numbered (higher-priority) PRD reconciles
|
|
892
|
-
* AFTER a higher-numbered group was already picked, fire the late-arrival
|
|
893
|
-
* immediately in parallel with the active group rather than starving it
|
|
894
|
-
* until the active group drains. This handles the reconcile-race where
|
|
895
|
-
* a PRD file lands on disk between two pickNextBatch invocations.
|
|
896
|
-
* 5. A singleton group (unique NN, no other jobs share it) runs alone;
|
|
897
|
-
* no bleed into adjacent groups.
|
|
898
|
-
*
|
|
899
|
-
* Returns array of job objects to spawn. O(N) where N = pending.length.
|
|
900
|
-
*/
|
|
901
|
-
function pickNextBatch(allJobs, running, cap) {
|
|
902
|
-
const pending = allJobs.filter((j) => j.status === 'pending' && !running.has(j.slug));
|
|
903
|
-
if (pending.length === 0) return [];
|
|
904
|
-
|
|
905
|
-
// Lowest pending group (computed up-front so the failure gate can compare).
|
|
906
|
-
const lowestPendingGroup = pending.reduce(
|
|
907
|
-
(min, j) => Math.min(min, j.parallelGroup ?? 99),
|
|
908
|
-
Infinity,
|
|
909
|
-
);
|
|
910
|
-
|
|
911
|
-
// Cross-group failure gate: refuse to advance past a group with failed jobs.
|
|
912
|
-
// Without this, a failed foundation PRD (e.g. 03-doc-editor-foundation
|
|
913
|
-
// crashed with a NUL-byte spawn error on 2026-05-21) doesn't stop later
|
|
914
|
-
// groups (04, 05, 06...) from running and silently corrupting the project
|
|
915
|
-
// state. The user can re-queue the failed job (pending) or archive it to
|
|
916
|
-
// unblock the gate, but the default is to halt until the failure is
|
|
917
|
-
// acknowledged.
|
|
918
|
-
const blockingFailures = allJobs.filter((j) =>
|
|
919
|
-
(j.status === 'failed' || j.status === 'needs_review') &&
|
|
920
|
-
(j.parallelGroup ?? 99) < lowestPendingGroup,
|
|
921
|
-
);
|
|
922
|
-
if (blockingFailures.length > 0) {
|
|
923
|
-
const slugs = blockingFailures.map((j) => j.slug).join(', ');
|
|
924
|
-
console.log(`[scheduler] failure-gate: holding g${lowestPendingGroup} — ${blockingFailures.length} failed job(s) in earlier groups [${slugs}]. Reset to pending or archive to unblock.`);
|
|
925
|
-
return [];
|
|
926
|
-
}
|
|
927
|
-
|
|
928
|
-
// Groups with at least one job in flight: either tracked in runningSet
|
|
929
|
-
// (this process spawned it) or still marked 'running' in queue.json
|
|
930
|
-
// (persisted from a previous session that hasn't been orphan-reset yet).
|
|
931
|
-
const activeGroups = new Set();
|
|
932
|
-
for (const slug of running) {
|
|
933
|
-
const job = allJobs.find((j) => j.slug === slug);
|
|
934
|
-
if (job) activeGroups.add(job.parallelGroup ?? 99);
|
|
935
|
-
}
|
|
936
|
-
for (const j of allJobs) {
|
|
937
|
-
if (j.status === 'running' && !running.has(j.slug)) {
|
|
938
|
-
activeGroups.add(j.parallelGroup ?? 99);
|
|
939
|
-
}
|
|
940
|
-
}
|
|
941
|
-
// Total slots consumed: in-process spawns + queue.json running count.
|
|
942
|
-
const queueRunningCount = allJobs.filter((j) => j.status === 'running').length;
|
|
943
|
-
const effectiveRunning = Math.max(running.size, queueRunningCount);
|
|
944
|
-
|
|
945
|
-
// (lowestPendingGroup was computed up-front for the failure-gate check.)
|
|
946
|
-
|
|
947
|
-
if (activeGroups.size > 0) {
|
|
948
|
-
const lowestActive = Math.min(...activeGroups);
|
|
949
|
-
if (lowestPendingGroup > lowestActive) {
|
|
950
|
-
// Earlier group still running — wait for it to drain before advancing.
|
|
951
|
-
console.log(`[scheduler] concurrency: g${lowestActive} in flight, holding g${lowestPendingGroup}`);
|
|
952
|
-
return [];
|
|
953
|
-
}
|
|
954
|
-
if (lowestPendingGroup < lowestActive) {
|
|
955
|
-
// Late-arrival: a lower-numbered (higher-priority) PRD reconciled AFTER
|
|
956
|
-
// a higher-numbered group was already picked. Without this branch the
|
|
957
|
-
// pending PRD starves until the active group drains — the bug observed
|
|
958
|
-
// on 2026-05-10 where 118-studio-add-wave2-games (g118) was held while
|
|
959
|
-
// the g130 hardening trio ran. Honor priority: fire the late-arrival
|
|
960
|
-
// now, in parallel with the active group. (Strict serial group
|
|
961
|
-
// ordering still applies between groups that were both present at the
|
|
962
|
-
// time of picking; this only handles the reconcile-race edge case.)
|
|
963
|
-
const slots = cap - effectiveRunning;
|
|
964
|
-
if (slots <= 0) {
|
|
965
|
-
console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots for late-arrival g${lowestPendingGroup}`);
|
|
966
|
-
return [];
|
|
967
|
-
}
|
|
968
|
-
const batch = pending.filter((j) => (j.parallelGroup ?? 99) === lowestPendingGroup).slice(0, slots);
|
|
969
|
-
console.log(`[scheduler] concurrency: firing late-arrival g${lowestPendingGroup} (${batch.length} job(s)) alongside active g${lowestActive}`);
|
|
970
|
-
return batch;
|
|
971
|
-
}
|
|
972
|
-
// Backfill slots remaining in the current group.
|
|
973
|
-
const slots = cap - effectiveRunning;
|
|
974
|
-
if (slots <= 0) {
|
|
975
|
-
console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
|
|
976
|
-
return [];
|
|
977
|
-
}
|
|
978
|
-
const batch = pending.filter((j) => (j.parallelGroup ?? 99) === lowestActive).slice(0, slots);
|
|
979
|
-
if (batch.length > 0) {
|
|
980
|
-
console.log(`[scheduler] concurrency: backfilling ${batch.length} into g${lowestActive} (${effectiveRunning}/${cap} running)`);
|
|
981
|
-
}
|
|
982
|
-
return batch;
|
|
983
|
-
}
|
|
984
|
-
|
|
985
|
-
// No active group — start the next group fresh.
|
|
986
|
-
const slots = cap - effectiveRunning;
|
|
987
|
-
if (slots <= 0) {
|
|
988
|
-
console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
|
|
989
|
-
return [];
|
|
990
|
-
}
|
|
991
|
-
const batch = pending.filter((j) => (j.parallelGroup ?? 99) === lowestPendingGroup).slice(0, slots);
|
|
992
|
-
console.log(`[scheduler] concurrency: starting g${lowestPendingGroup} with ${batch.length} job(s) (cap ${cap})`);
|
|
993
|
-
return batch;
|
|
994
|
-
}
|
|
947
|
+
// pickNextBatch and pickForProject are defined in lib/schedulerBatch.cjs and
|
|
948
|
+
// required at the top of this file. Group-ordering gates are evaluated per
|
|
949
|
+
// project (keyed by cwd) so jobs in different repos run concurrently up to
|
|
950
|
+
// the cap; within one project, sequential-group semantics are preserved.
|
|
995
951
|
|
|
996
952
|
/**
|
|
997
953
|
* Recognize fix-plan slugs (NN-fix-...) so we don't recurse on a fix-plan that
|
|
@@ -1177,6 +1133,11 @@ async function spawnJob(job, runId, runDir, defaultCwd) {
|
|
|
1177
1133
|
});
|
|
1178
1134
|
await broadcast();
|
|
1179
1135
|
|
|
1136
|
+
// Commit-guard baseline: snapshot the working tree BEFORE the run so the
|
|
1137
|
+
// post-run check flags only paths THIS job left dirty, not pre-existing WIP.
|
|
1138
|
+
const guardCwd = job.cwd || defaultCwd;
|
|
1139
|
+
const guardBaseline = await uncommittedChanges(guardCwd);
|
|
1140
|
+
|
|
1180
1141
|
const res = await executeJob(job, runDir, defaultCwd, async (pid, sessionId, cwd) => {
|
|
1181
1142
|
await mutate((s) => {
|
|
1182
1143
|
const idx = s.jobs.findIndex((x) => x.slug === job.slug);
|
|
@@ -1220,6 +1181,36 @@ async function spawnJob(job, runId, runDir, defaultCwd) {
|
|
|
1220
1181
|
}
|
|
1221
1182
|
}
|
|
1222
1183
|
|
|
1184
|
+
// Commit guard: a clean exit that left NEW uncommitted changes means the
|
|
1185
|
+
// finish protocol's COMMIT step did not run. Surface it as needs_review
|
|
1186
|
+
// instead of letting it masquerade as 'completed' (the PRD 03/04
|
|
1187
|
+
// left-uncommitted incident). Two false-positive defenses:
|
|
1188
|
+
// - baseline DELTA: only files dirtied during THIS run count, so
|
|
1189
|
+
// pre-existing user WIP is excluded; and
|
|
1190
|
+
// - sibling skip: if another job is concurrently writing the same repo,
|
|
1191
|
+
// working-tree dirt can't be attributed to this job, so skip the guard.
|
|
1192
|
+
// Non-git cwds resolve to null and are skipped (the guard is best-effort).
|
|
1193
|
+
if (res.exitCode === 0 && !res.rateLimited && (!verifyResult || verifyResult.verdict === 'clean')) {
|
|
1194
|
+
const after = await uncommittedChanges(guardCwd);
|
|
1195
|
+
if (after && after.length > 0) {
|
|
1196
|
+
const baseSet = new Set(guardBaseline || []);
|
|
1197
|
+
const newlyDirty = after.filter((p) => !baseSet.has(p));
|
|
1198
|
+
const guardState = await readQueue().catch(() => ({ jobs: [] }));
|
|
1199
|
+
const siblingRunning = (guardState.jobs || []).some(
|
|
1200
|
+
(j) => j.slug !== job.slug && j.status === 'running' && (j.cwd || defaultCwd) === guardCwd,
|
|
1201
|
+
);
|
|
1202
|
+
if (newlyDirty.length > 0 && !siblingRunning) {
|
|
1203
|
+
const sample = newlyDirty.slice(0, 3).join(', ');
|
|
1204
|
+
verifyResult = {
|
|
1205
|
+
verdict: 'uncommitted_changes',
|
|
1206
|
+
reason: `finish protocol incomplete: ${newlyDirty.length} uncommitted file(s) left in working tree (e.g. ${sample})`,
|
|
1207
|
+
downgradeTo: 'needs_review',
|
|
1208
|
+
};
|
|
1209
|
+
console.log(`[scheduler] commit-guard: ${job.slug} left ${newlyDirty.length} files uncommitted → needs_review`);
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
1212
|
+
}
|
|
1213
|
+
|
|
1223
1214
|
let actuallyFailed = false;
|
|
1224
1215
|
let failedJobSnapshot = null;
|
|
1225
1216
|
await mutate((s) => {
|
|
@@ -1385,10 +1376,60 @@ async function maybeLaunchWhenAvailable(state) {
|
|
|
1385
1376
|
tickQueue().catch((e) => console.error('[scheduler] tickQueue error', e));
|
|
1386
1377
|
}
|
|
1387
1378
|
|
|
1379
|
+
// ---------- dead-process reaper ----------
|
|
1380
|
+
|
|
1381
|
+
/**
 * Scan the queue for jobs still marked 'running' whose recorded pid is no
 * longer reported alive by claudePidAlive, and finalize them from the run log:
 * 'completed' when classifyRunOutcome returns 'success', otherwise 'failed'.
 * Called at the start of every poll cycle. Conservative by design:
 *   - nothing happens while runningSet is empty;
 *   - a job with no runtime.pid yet (spawn mid-flight) is skipped;
 *   - a job whose pid is reported alive is skipped;
 *   - a job without a runId has no log to inspect and is treated as 'unknown'.
 * Liveness and log classification run BEFORE mutate() so the mutate callback
 * only does in-memory updates; the status is re-checked inside it in case the
 * job was finalized in between.
 * Exported so unit tests can invoke it directly.
 *
 * @returns {Promise<void>} always resolves; internal errors are logged as warnings
 */
async function reapDeadRunningJobs() {
  try {
    if (runningSet.size === 0) return; // fast path: no in-flight jobs
    const state = await readQueue();
    const dead = [];
    for (const j of state.jobs ?? []) {
      if (j.status !== 'running') continue;
      const pid = j.runtime?.pid;
      if (!pid) continue; // spawn may be mid-flight; give it a cycle
      if (claudePidAlive(pid)) continue;
      const logPath = j.runId
        ? path.join(RUNS_DIR, j.runId, `${j.slug}.log`)
        : null;
      const outcome = logPath ? classifyRunOutcome(logPath) : 'unknown';
      dead.push({ slug: j.slug, pid, outcome });
    }
    if (dead.length === 0) return;

    await mutate((s) => {
      for (const { slug, pid, outcome } of dead) {
        const idx = s.jobs.findIndex((x) => x.slug === slug);
        if (idx < 0 || s.jobs[idx].status !== 'running') continue; // race guard
        const success = outcome === 'success';
        s.jobs[idx].status = success ? 'completed' : 'failed';
        // Keep an exit code that was already recorded; default to 1 on failure.
        s.jobs[idx].exitCode = success ? 0 : (s.jobs[idx].exitCode ?? 1);
        s.jobs[idx].finishedAt = new Date().toISOString();
        s.jobs[idx].error = success ? null : `reaped: process gone, no success result in log (${outcome})`;
        delete s.jobs[idx].runtime;
        runningSet.delete(slug);
        console.log(`[scheduler] reaped dead job slug=${slug} pid=${pid} outcome=${outcome}`);
      }
    });

    await broadcast();
    // Fire-and-forget so the poll loop is not held up, but surface failures
    // the same way the scheduler's other tickQueue call site does.
    tickQueue().catch((e) => console.error('[scheduler] tickQueue error', e));
  } catch (e) {
    console.warn('[scheduler] reapDeadRunningJobs error', e?.message);
  }
}
|
|
1427
|
+
|
|
1388
1428
|
// ---------- poll loop with exponential backoff ----------
|
|
1389
1429
|
|
|
1390
1430
|
async function pollLoop() {
|
|
1391
1431
|
try {
|
|
1432
|
+
await reapDeadRunningJobs().catch(() => {});
|
|
1392
1433
|
const r = await billing.fetchUsage();
|
|
1393
1434
|
|
|
1394
1435
|
if (r.kind === 'ok') {
|
|
@@ -1736,11 +1777,22 @@ async function init() {
|
|
|
1736
1777
|
loadSchedulerState();
|
|
1737
1778
|
bootedAt = Date.now();
|
|
1738
1779
|
|
|
1739
|
-
// Boot reconciliation:
|
|
1740
|
-
//
|
|
1741
|
-
//
|
|
1742
|
-
//
|
|
1743
|
-
//
|
|
1780
|
+
// Boot reconciliation: finalize any job that was 'running' when the app died.
|
|
1781
|
+
// Check the run log first — a job that emitted result/success before the crash
|
|
1782
|
+
// should be marked 'completed', not 'failed', so it doesn't wedge the queue
|
|
1783
|
+
// via the failure-gate. Also kill any still-live orphan claude child to prevent
|
|
1784
|
+
// it from continuing to write to the project unsupervised (2026-05-21 incident).
|
|
1785
|
+
//
|
|
1786
|
+
// classifyRunOutcome calls readTail → fs.readFileSync (up to 64 KB per job).
|
|
1787
|
+
// Pre-compute all outcomes BEFORE entering the mutate lock so the blocking I/O
|
|
1788
|
+
// does not stall the event loop or hold the mutateTail chain during startup.
|
|
1789
|
+
const bootSnap = readQueueSync();
|
|
1790
|
+
const bootOutcomes = new Map();
|
|
1791
|
+
for (const j of bootSnap.jobs) {
|
|
1792
|
+
if (j.status !== 'running') continue;
|
|
1793
|
+
const logPath = j.runId ? path.join(RUNS_DIR, j.runId, `${j.slug}.log`) : null;
|
|
1794
|
+
bootOutcomes.set(j.slug, logPath ? classifyRunOutcome(logPath) : 'unknown');
|
|
1795
|
+
}
|
|
1744
1796
|
await mutate((state) => {
|
|
1745
1797
|
for (const j of state.jobs) {
|
|
1746
1798
|
if (j.status === 'running') {
|
|
@@ -1753,10 +1805,14 @@ async function init() {
|
|
|
1753
1805
|
console.log(`[scheduler] boot: SIGTERM'd orphan claude pid=${pid} for ${j.slug}`);
|
|
1754
1806
|
}
|
|
1755
1807
|
}
|
|
1756
|
-
j.
|
|
1757
|
-
|
|
1808
|
+
const outcome = bootOutcomes.get(j.slug) ?? 'unknown';
|
|
1809
|
+
const success = outcome === 'success';
|
|
1810
|
+
j.status = success ? 'completed' : 'failed';
|
|
1811
|
+
j.exitCode = success ? 0 : (j.exitCode ?? 1);
|
|
1812
|
+
j.error = success ? null : `orphaned: app restarted while running${killNote}`;
|
|
1758
1813
|
j.finishedAt = new Date().toISOString();
|
|
1759
1814
|
delete j.runtime;
|
|
1815
|
+
console.log(`[scheduler] boot reconcile: slug=${j.slug} outcome=${outcome} → ${j.status}`);
|
|
1760
1816
|
}
|
|
1761
1817
|
}
|
|
1762
1818
|
});
|
|
@@ -1833,4 +1889,97 @@ async function init() {
|
|
|
1833
1889
|
}
|
|
1834
1890
|
}
|
|
1835
1891
|
|
|
1836
|
-
|
|
1892
|
+
/**
 * Resolve `candidate` through any symlinks and confirm the result still lives
 * under `root`. `root` is resolved through symlinks as well, so a data
 * directory that is itself reached via a symlink does not make every
 * legitimate file look like an escape.
 *
 * @param {string} root directory the file must stay inside
 * @param {string} candidate path to resolve
 * @returns {Promise<string|null>} the resolved path, or null when it is outside root
 * @throws propagates fs errors (e.g. ENOENT) from realpath
 */
async function realpathWithinRoot(root, candidate) {
  const real = await fsp.realpath(candidate);
  const realRoot = await fsp.realpath(root);
  return real.startsWith(realRoot + path.sep) ? real : null;
}

// remote — callable from webRemote.cjs without going through IPC.
// Every method except getState resolves to an { ok, ... } result object;
// failures are reported as { ok: false, error } rather than thrown.
const remote = {
  /** Read the queue, reconcile it, persist it, and return the state payload (with paths). */
  async getState() {
    const state = await readQueue();
    await reconcile(state);
    await writeQueue(state);
    return buildScheduleStatePayload(state, { withPaths: true });
  },

  /**
   * Read the text of the PRD for `slug`.
   * @returns {Promise<{ok: true, text: string} | {ok: false, error: string}>}
   */
  async readPrd(slug) {
    const filePath = safeSlugPath(slug);
    if (!filePath) return { ok: false, error: 'invalid slug' };
    try {
      // realpath resolves symlinks; re-check the boundary to block a rogue agent
      // job that places a symlink inside PRDS_DIR pointing outside the safe root.
      const real = await realpathWithinRoot(PRDS_DIR, filePath);
      if (!real) return { ok: false, error: 'invalid slug' };
      const text = await fsp.readFile(real, 'utf8');
      return { ok: true, text };
    } catch (e) {
      return { ok: false, error: e?.message };
    }
  },

  /**
   * Read the full log of `slug` for run `runId`.
   * @returns {Promise<{ok: true, text: string} | {ok: false, error: string}>}
   */
  async readLog(slug, runId) {
    // path.join throws on non-string segments; reject them as a normal result.
    if (typeof slug !== 'string' || typeof runId !== 'string') {
      return { ok: false, error: 'invalid slug or runId' };
    }
    const logPath = path.resolve(path.join(RUNS_DIR, runId, `${slug}.log`));
    // Lexical check first: rejects ".." traversal without touching the disk.
    if (!logPath.startsWith(RUNS_DIR + path.sep)) {
      return { ok: false, error: 'invalid slug or runId' };
    }
    try {
      // realpath resolves symlinks; re-check the boundary to block a rogue agent
      // job that places a symlink inside RUNS_DIR pointing outside the safe root.
      const real = await realpathWithinRoot(RUNS_DIR, logPath);
      if (!real) return { ok: false, error: 'invalid slug or runId' };
      const text = await fsp.readFile(real, 'utf8');
      return { ok: true, text };
    } catch (e) {
      return { ok: false, error: e?.message };
    }
  },

  /**
   * Atomically write `body` as the PRD for `slug`.
   * @returns {Promise<{ok: true, bytesWritten: number} | {ok: false, error: string}>}
   */
  async writePrd(slug, body) {
    const resolved = safeSlugPath(slug);
    if (!resolved) return { ok: false, error: 'invalid slug' };
    try {
      await config.writeTextAtomic(resolved, body);
      // Report the on-disk size rather than the JS string length.
      const stat = await fsp.stat(resolved);
      return { ok: true, bytesWritten: stat.size };
    } catch (e) {
      return { ok: false, error: e?.message ?? 'write failed' };
    }
  },

  /**
   * Reset the queued job for `slug` via resetJobFields and broadcast the change.
   * @returns {Promise<{ok: true} | {ok: false, error: string}>}
   */
  async resetJob(slug) {
    if (!safeSlugPath(slug)) return { ok: false, error: 'invalid slug' };
    const found = await mutate((state) => {
      const idx = state.jobs.findIndex((j) => j.slug === slug);
      if (idx < 0) return false;
      resetJobFields(state.jobs[idx]);
      return true;
    });
    if (!found) return { ok: false, error: 'not found' };
    await broadcast();
    return { ok: true };
  },

  /**
   * Clear the pause and kick off runDueJobs in the background. Resolves once
   * the pause is cleared; run errors are only logged.
   * @returns {Promise<{ok: true}>}
   */
  async runNow() {
    await clearPause('run-now');
    runDueJobs().catch((e) => logs.writeLine({
      level: 'error', scope: 'scheduler',
      message: 'runDueJobs error (remote:run-now)', meta: { error: e?.message },
    }));
    return { ok: true };
  },

  /**
   * Shallow-merge `partial` into the scheduler config and reschedule the timer.
   * The `supervisor` key is merged one level deeper so a partial supervisor
   * update does not wipe its sibling settings.
   * @param {object} partial plain object of config overrides
   * @returns {Promise<{ok: true, config: object} | {ok: false, error: string}>}
   */
  async setConfig(partial) {
    // Destructuring null/undefined throws, and spreading a string or array
    // would merge index keys into the config — reject anything but an object.
    if (partial === null || typeof partial !== 'object' || Array.isArray(partial)) {
      return { ok: false, error: 'invalid config' };
    }
    const cfg = await mutate((state) => {
      const { supervisor: supPartial, ...rest } = partial;
      state.config = { ...state.config, ...rest };
      if (supPartial !== undefined) {
        state.config.supervisor = { ...(state.config.supervisor ?? {}), ...supPartial };
      }
      return state.config;
    });
    await rescheduleTimer();
    return { ok: true, config: cfg };
  },
};
|
|
1984
|
+
|
|
1985
|
+
module.exports = { registerScheduleHandlers, attachWindow, init, ROOT, PRDS_DIR, selectHistoryJobs, parsePorcelain, FINISH_PROTOCOL, remote, pickNextBatch, pickForProject, reapDeadRunningJobs };
|