claude-code-session-manager 0.20.0 → 0.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/dist/assets/{TiptapBody-COZHDXvn.js → TiptapBody-Db7_uXrI.js} +1 -1
  2. package/dist/assets/{cssMode-BGlgF50F.js → cssMode-DFKJhhi6.js} +1 -1
  3. package/dist/assets/{freemarker2-CwlJczaA.js → freemarker2-DUat8x8o.js} +1 -1
  4. package/dist/assets/{handlebars-C7ChleGP.js → handlebars-B2C1qhAI.js} +1 -1
  5. package/dist/assets/{html-C0XyedAq.js → html-khtg0DVs.js} +1 -1
  6. package/dist/assets/{htmlMode-DTJsOfuO.js → htmlMode-Jmhs-vfl.js} +1 -1
  7. package/dist/assets/{index-6poesY86.css → index-BkkBX1z7.css} +1 -1
  8. package/dist/assets/{index-C4joLNKY.js → index-pqnuXM14.js} +588 -578
  9. package/dist/assets/{javascript-CPRB5GUm.js → javascript-i1CXbgg4.js} +1 -1
  10. package/dist/assets/{jsonMode-DKBN0s8-.js → jsonMode-DXZaj-kR.js} +1 -1
  11. package/dist/assets/{liquid-CJmNIgnK.js → liquid-Ds7jUF53.js} +1 -1
  12. package/dist/assets/{lspLanguageFeatures-CIIba3v8.js → lspLanguageFeatures-B_15vO6X.js} +1 -1
  13. package/dist/assets/{mdx-BOiNk1a1.js → mdx-DgrrLgTE.js} +1 -1
  14. package/dist/assets/{python-5AV3HPYJ.js → python-Cff3tPw3.js} +1 -1
  15. package/dist/assets/{razor-6iMJA6dH.js → razor-DlyG7FmM.js} +1 -1
  16. package/dist/assets/{tsMode-WJISqg3-.js → tsMode-DRmmmttS.js} +1 -1
  17. package/dist/assets/{typescript-CnA0yZf9.js → typescript-DQFL2T1p.js} +1 -1
  18. package/dist/assets/{xml-BLkNwYO2.js → xml-CwsJEzdU.js} +1 -1
  19. package/dist/assets/{yaml-D6anZ1nO.js → yaml-BDsDjf-y.js} +1 -1
  20. package/dist/index.html +2 -2
  21. package/package.json +3 -1
  22. package/src/main/historyAggregator.cjs +15 -9
  23. package/src/main/index.cjs +7 -2
  24. package/src/main/ipcSchemas.cjs +43 -0
  25. package/src/main/kg.cjs +27 -17
  26. package/src/main/lib/reaperHelpers.cjs +67 -0
  27. package/src/main/lib/schedulerBatch.cjs +212 -0
  28. package/src/main/scheduler.cjs +173 -125
  29. package/src/main/webRemote.cjs +916 -0
  30. package/src/preload/api.d.ts +50 -9
  31. package/src/preload/index.cjs +34 -5
  32. package/src/main/projectSkills.cjs +0 -124
@@ -52,6 +52,7 @@ const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
52
52
  const supervisor = require('./supervisor.cjs');
53
53
  const { resolveClaudeBin } = require('./lib/claudeBin.cjs');
54
54
  const { readTail } = require('./lib/fileTail.cjs');
55
+ const { claudePidAlive, classifyRunOutcome } = require('./lib/reaperHelpers.cjs');
55
56
  const { openLog, withChildAndLog } = require('./lib/childWithLog.cjs');
56
57
  const { sendIfAlive } = require('./lib/sendToRenderer.cjs');
57
58
  const prdParser = require('./scheduler/prdParser.cjs');
@@ -63,6 +64,7 @@ const {
63
64
  USAGE_REFRESH_INTERVAL_MS,
64
65
  MAX_JOB_DURATION_MS,
65
66
  } = require('./lib/schedulerConfig.cjs');
67
+ const { pickForProject, pickNextBatch, DEFAULT_PROJECT_CWD } = require('./lib/schedulerBatch.cjs');
66
68
 
67
69
  const MAX_INVESTIGATION_DURATION_MS = 30 * 60_000;
68
70
 
@@ -159,7 +161,7 @@ const QUEUE_PATH = path.join(ROOT, 'queue.json');
159
161
  const SCHEDULER_STATE_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-state.json');
160
162
  const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-heartbeat.log');
161
163
  const HEARTBEAT_MAX_BYTES = 1024 * 1024;
162
- const DEFAULT_PROJECT_CWD = path.join(os.homedir(), 'Projects', 'session-manager');
164
+ // DEFAULT_PROJECT_CWD imported from lib/schedulerBatch.cjs (single source of truth).
163
165
 
164
166
  const ENV_CAP = process.env.SM_SCHEDULER_MAX_CONCURRENCY
165
167
  ? Math.max(1, Math.min(20, parseInt(process.env.SM_SCHEDULER_MAX_CONCURRENCY, 10) || 4))
@@ -942,122 +944,10 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
942
944
  });
943
945
  }
944
946
 
945
- /**
946
- * Pick the next batch of jobs to spawn this tick.
947
- *
948
- * Rules:
949
- * 1. Find the lowest parallelGroup that has pending jobs not already in
950
- * runningSet.
951
- * 2. If that group has jobs in runningSet (i.e., we're mid-group), backfill
952
- * up to (cap - runningSet.size) more from the SAME group.
953
- * 3. If the current group has NO jobs in runningSet (new group), and there
954
- * are still jobs from an earlier group in runningSet, do nothing — wait
955
- * for the earlier group to drain before advancing.
956
- * 4. **Late-arrival**: if a lower-numbered (higher-priority) PRD reconciles
957
- * AFTER a higher-numbered group was already picked, fire the late-arrival
958
- * immediately in parallel with the active group rather than starving it
959
- * until the active group drains. This handles the reconcile-race where
960
- * a PRD file lands on disk between two pickNextBatch invocations.
961
- * 5. A singleton group (unique NN, no other jobs share it) runs alone;
962
- * no bleed into adjacent groups.
963
- *
964
- * Returns array of job objects to spawn. O(N) where N = pending.length.
965
- */
966
- function pickNextBatch(allJobs, running, cap) {
967
- const pending = allJobs.filter((j) => j.status === 'pending' && !running.has(j.slug));
968
- if (pending.length === 0) return [];
969
-
970
- // Lowest pending group (computed up-front so the failure gate can compare).
971
- const lowestPendingGroup = pending.reduce(
972
- (min, j) => Math.min(min, j.parallelGroup ?? 99),
973
- Infinity,
974
- );
975
-
976
- // Cross-group failure gate: refuse to advance past a group with failed jobs.
977
- // Without this, a failed foundation PRD (e.g. 03-doc-editor-foundation
978
- // crashed with a NUL-byte spawn error on 2026-05-21) doesn't stop later
979
- // groups (04, 05, 06...) from running and silently corrupting the project
980
- // state. The user can re-queue the failed job (pending) or archive it to
981
- // unblock the gate, but the default is to halt until the failure is
982
- // acknowledged. (needs_review is NOT a blocker — it just means the job ran
983
- // but the verifier flagged something for human review.)
984
- const blockingFailures = allJobs.filter((j) =>
985
- j.status === 'failed' &&
986
- (j.parallelGroup ?? 99) < lowestPendingGroup,
987
- );
988
- if (blockingFailures.length > 0) {
989
- const slugs = blockingFailures.map((j) => j.slug).join(', ');
990
- console.log(`[scheduler] failure-gate: holding g${lowestPendingGroup} — ${blockingFailures.length} failed job(s) in earlier groups [${slugs}]. Reset to pending or archive to unblock.`);
991
- return [];
992
- }
993
-
994
- // Groups with at least one job in flight: either tracked in runningSet
995
- // (this process spawned it) or still marked 'running' in queue.json
996
- // (persisted from a previous session that hasn't been orphan-reset yet).
997
- const activeGroups = new Set();
998
- for (const slug of running) {
999
- const job = allJobs.find((j) => j.slug === slug);
1000
- if (job) activeGroups.add(job.parallelGroup ?? 99);
1001
- }
1002
- for (const j of allJobs) {
1003
- if (j.status === 'running' && !running.has(j.slug)) {
1004
- activeGroups.add(j.parallelGroup ?? 99);
1005
- }
1006
- }
1007
- // Total slots consumed: in-process spawns + queue.json running count.
1008
- const queueRunningCount = allJobs.filter((j) => j.status === 'running').length;
1009
- const effectiveRunning = Math.max(running.size, queueRunningCount);
1010
-
1011
- // (lowestPendingGroup was computed up-front for the failure-gate check.)
1012
-
1013
- if (activeGroups.size > 0) {
1014
- const lowestActive = Math.min(...activeGroups);
1015
- if (lowestPendingGroup > lowestActive) {
1016
- // Earlier group still running — wait for it to drain before advancing.
1017
- console.log(`[scheduler] concurrency: g${lowestActive} in flight, holding g${lowestPendingGroup}`);
1018
- return [];
1019
- }
1020
- if (lowestPendingGroup < lowestActive) {
1021
- // Late-arrival: a lower-numbered (higher-priority) PRD reconciled AFTER
1022
- // a higher-numbered group was already picked. Without this branch the
1023
- // pending PRD starves until the active group drains — the bug observed
1024
- // on 2026-05-10 where 118-studio-add-wave2-games (g118) was held while
1025
- // the g130 hardening trio ran. Honor priority: fire the late-arrival
1026
- // now, in parallel with the active group. (Strict serial group
1027
- // ordering still applies between groups that were both present at the
1028
- // time of picking; this only handles the reconcile-race edge case.)
1029
- const slots = cap - effectiveRunning;
1030
- if (slots <= 0) {
1031
- console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots for late-arrival g${lowestPendingGroup}`);
1032
- return [];
1033
- }
1034
- const batch = pending.filter((j) => (j.parallelGroup ?? 99) === lowestPendingGroup).slice(0, slots);
1035
- console.log(`[scheduler] concurrency: firing late-arrival g${lowestPendingGroup} (${batch.length} job(s)) alongside active g${lowestActive}`);
1036
- return batch;
1037
- }
1038
- // Backfill slots remaining in the current group.
1039
- const slots = cap - effectiveRunning;
1040
- if (slots <= 0) {
1041
- console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
1042
- return [];
1043
- }
1044
- const batch = pending.filter((j) => (j.parallelGroup ?? 99) === lowestActive).slice(0, slots);
1045
- if (batch.length > 0) {
1046
- console.log(`[scheduler] concurrency: backfilling ${batch.length} into g${lowestActive} (${effectiveRunning}/${cap} running)`);
1047
- }
1048
- return batch;
1049
- }
1050
-
1051
- // No active group — start the next group fresh.
1052
- const slots = cap - effectiveRunning;
1053
- if (slots <= 0) {
1054
- console.log(`[scheduler] concurrency: cap ${cap} reached (${effectiveRunning} running), no slots`);
1055
- return [];
1056
- }
1057
- const batch = pending.filter((j) => (j.parallelGroup ?? 99) === lowestPendingGroup).slice(0, slots);
1058
- console.log(`[scheduler] concurrency: starting g${lowestPendingGroup} with ${batch.length} job(s) (cap ${cap})`);
1059
- return batch;
1060
- }
947
+ // pickNextBatch and pickForProject are defined in lib/schedulerBatch.cjs and
948
+ // required at the top of this file. Group-ordering gates are evaluated per
949
+ // project (keyed by cwd) so jobs in different repos run concurrently up to
950
+ // the cap; within one project, sequential-group semantics are preserved.
1061
951
 
1062
952
  /**
1063
953
  * Recognize fix-plan slugs (NN-fix-...) so we don't recurse on a fix-plan that
@@ -1486,10 +1376,60 @@ async function maybeLaunchWhenAvailable(state) {
1486
1376
  tickQueue().catch((e) => console.error('[scheduler] tickQueue error', e));
1487
1377
  }
1488
1378
 
1379
+ // ---------- dead-process reaper ----------
1380
+
1381
+ /**
1382
+ * Scan running jobs, identify those whose claude process is provably dead, and
1383
+ * finalize them to completed/failed by reading the run log. Called once per
1384
+ * poll cycle. Conservative: a job with no runtime.pid yet (spawn mid-flight)
1385
+ * is always skipped. A job whose pid is alive (claudePidAlive) is always skipped.
1386
+ * Exported so unit tests can invoke it directly.
1387
+ */
1388
+ async function reapDeadRunningJobs() {
1389
+ try {
1390
+ if (runningSet.size === 0) return; // fast path: no in-flight jobs
1391
+ const state = await readQueue();
1392
+ const dead = [];
1393
+ for (const j of state.jobs) {
1394
+ if (j.status !== 'running') continue;
1395
+ const pid = j.runtime?.pid;
1396
+ if (!pid) continue; // spawn may be mid-flight; give it a cycle
1397
+ if (claudePidAlive(pid)) continue;
1398
+ const logPath = j.runId
1399
+ ? path.join(RUNS_DIR, j.runId, `${j.slug}.log`)
1400
+ : null;
1401
+ const outcome = logPath ? classifyRunOutcome(logPath) : 'unknown';
1402
+ dead.push({ slug: j.slug, pid, outcome });
1403
+ }
1404
+ if (dead.length === 0) return;
1405
+
1406
+ await mutate((s) => {
1407
+ for (const { slug, pid, outcome } of dead) {
1408
+ const idx = s.jobs.findIndex((x) => x.slug === slug);
1409
+ if (idx < 0 || s.jobs[idx].status !== 'running') continue; // race guard
1410
+ const success = outcome === 'success';
1411
+ s.jobs[idx].status = success ? 'completed' : 'failed';
1412
+ s.jobs[idx].exitCode = success ? 0 : (s.jobs[idx].exitCode ?? 1);
1413
+ s.jobs[idx].finishedAt = new Date().toISOString();
1414
+ s.jobs[idx].error = success ? null : `reaped: process gone, no success result in log (${outcome})`;
1415
+ delete s.jobs[idx].runtime;
1416
+ runningSet.delete(slug);
1417
+ console.log(`[scheduler] reaped dead job slug=${slug} pid=${pid} outcome=${outcome}`);
1418
+ }
1419
+ });
1420
+
1421
+ await broadcast();
1422
+ tickQueue().catch(() => {});
1423
+ } catch (e) {
1424
+ console.warn('[scheduler] reapDeadRunningJobs error', e?.message);
1425
+ }
1426
+ }
1427
+
1489
1428
  // ---------- poll loop with exponential backoff ----------
1490
1429
 
1491
1430
  async function pollLoop() {
1492
1431
  try {
1432
+ await reapDeadRunningJobs().catch(() => {});
1493
1433
  const r = await billing.fetchUsage();
1494
1434
 
1495
1435
  if (r.kind === 'ok') {
@@ -1837,11 +1777,22 @@ async function init() {
1837
1777
  loadSchedulerState();
1838
1778
  bootedAt = Date.now();
1839
1779
 
1840
- // Boot reconciliation: mark any job that was 'running' when the app died as
1841
- // 'failed', AND kill its detached claude child if still alive. Without the
1842
- // kill step the child keeps running as a zombie writing to the project on
1843
- // its own schedule, which is exactly what happened on 2026-05-21 (PID 78230
1844
- // writing PRD 05's output while the scheduler thought the job was orphaned).
1780
+ // Boot reconciliation: finalize any job that was 'running' when the app died.
1781
+ // Check the run log first: a job that emitted result/success before the crash
1782
+ // should be marked 'completed', not 'failed', so it doesn't wedge the queue
1783
+ // via the failure-gate. Also kill any still-live orphan claude child to prevent
1784
+ // it from continuing to write to the project unsupervised (2026-05-21 incident).
1785
+ //
1786
+ // classifyRunOutcome calls readTail → fs.readFileSync (up to 64 KB per job).
1787
+ // Pre-compute all outcomes BEFORE entering the mutate lock so the blocking I/O
1788
+ // does not hold the mutateTail chain during startup (the read is synchronous, so it still blocks the event loop briefly).
1789
+ const bootSnap = readQueueSync();
1790
+ const bootOutcomes = new Map();
1791
+ for (const j of bootSnap.jobs) {
1792
+ if (j.status !== 'running') continue;
1793
+ const logPath = j.runId ? path.join(RUNS_DIR, j.runId, `${j.slug}.log`) : null;
1794
+ bootOutcomes.set(j.slug, logPath ? classifyRunOutcome(logPath) : 'unknown');
1795
+ }
1845
1796
  await mutate((state) => {
1846
1797
  for (const j of state.jobs) {
1847
1798
  if (j.status === 'running') {
@@ -1854,10 +1805,14 @@ async function init() {
1854
1805
  console.log(`[scheduler] boot: SIGTERM'd orphan claude pid=${pid} for ${j.slug}`);
1855
1806
  }
1856
1807
  }
1857
- j.status = 'failed';
1858
- j.error = `orphaned: app restarted while running${killNote}`;
1808
+ const outcome = bootOutcomes.get(j.slug) ?? 'unknown';
1809
+ const success = outcome === 'success';
1810
+ j.status = success ? 'completed' : 'failed';
1811
+ j.exitCode = success ? 0 : (j.exitCode ?? 1);
1812
+ j.error = success ? null : `orphaned: app restarted while running${killNote}`;
1859
1813
  j.finishedAt = new Date().toISOString();
1860
1814
  delete j.runtime;
1815
+ console.log(`[scheduler] boot reconcile: slug=${j.slug} outcome=${outcome} → ${j.status}`);
1861
1816
  }
1862
1817
  }
1863
1818
  });
@@ -1934,4 +1889,97 @@ async function init() {
1934
1889
  }
1935
1890
  }
1936
1891
 
1937
- module.exports = { registerScheduleHandlers, attachWindow, init, ROOT, PRDS_DIR, selectHistoryJobs, parsePorcelain, FINISH_PROTOCOL };
1892
+ // remote: scheduler operations callable from webRemote.cjs without going through IPC.
1893
+ const remote = {
1894
+ async getState() {
1895
+ const state = await readQueue();
1896
+ await reconcile(state);
1897
+ await writeQueue(state);
1898
+ return buildScheduleStatePayload(state, { withPaths: true });
1899
+ },
1900
+
1901
+ async readPrd(slug) {
1902
+ const filePath = safeSlugPath(slug);
1903
+ if (!filePath) return { ok: false, error: 'invalid slug' };
1904
+ try {
1905
+ // realpath resolves symlinks; re-check boundary to block a rogue agent job
1906
+ // that places a symlink inside PRDS_DIR pointing outside the safe root.
1907
+ const real = await fsp.realpath(filePath);
1908
+ if (!real.startsWith(PRDS_DIR + path.sep)) {
1909
+ return { ok: false, error: 'invalid slug' };
1910
+ }
1911
+ const text = await fsp.readFile(real, 'utf8');
1912
+ return { ok: true, text };
1913
+ } catch (e) {
1914
+ return { ok: false, error: e?.message };
1915
+ }
1916
+ },
1917
+
1918
+ async readLog(slug, runId) {
1919
+ const logPath = path.resolve(path.join(RUNS_DIR, runId, `${slug}.log`));
1920
+ if (!logPath.startsWith(RUNS_DIR + path.sep)) {
1921
+ return { ok: false, error: 'invalid slug or runId' };
1922
+ }
1923
+ try {
1924
+ // realpath resolves symlinks; re-check boundary to block a rogue agent job
1925
+ // that places a symlink inside RUNS_DIR pointing outside the safe root.
1926
+ const real = await fsp.realpath(logPath);
1927
+ if (!real.startsWith(RUNS_DIR + path.sep)) {
1928
+ return { ok: false, error: 'invalid slug or runId' };
1929
+ }
1930
+ const text = await fsp.readFile(real, 'utf8');
1931
+ return { ok: true, text };
1932
+ } catch (e) {
1933
+ return { ok: false, error: e?.message };
1934
+ }
1935
+ },
1936
+
1937
+ async writePrd(slug, body) {
1938
+ const resolved = safeSlugPath(slug);
1939
+ if (!resolved) return { ok: false, error: 'invalid slug' };
1940
+ try {
1941
+ await config.writeTextAtomic(resolved, body);
1942
+ const stat = await fsp.stat(resolved);
1943
+ return { ok: true, bytesWritten: stat.size };
1944
+ } catch (e) {
1945
+ return { ok: false, error: e?.message ?? 'write failed' };
1946
+ }
1947
+ },
1948
+
1949
+ async resetJob(slug) {
1950
+ if (!safeSlugPath(slug)) return { ok: false, error: 'invalid slug' };
1951
+ const found = await mutate((state) => {
1952
+ const idx = state.jobs.findIndex((j) => j.slug === slug);
1953
+ if (idx < 0) return false;
1954
+ resetJobFields(state.jobs[idx]);
1955
+ return true;
1956
+ });
1957
+ if (!found) return { ok: false, error: 'not found' };
1958
+ await broadcast();
1959
+ return { ok: true };
1960
+ },
1961
+
1962
+ async runNow() {
1963
+ await clearPause('run-now');
1964
+ runDueJobs().catch((e) => logs.writeLine({
1965
+ level: 'error', scope: 'scheduler',
1966
+ message: 'runDueJobs error (remote:run-now)', meta: { error: e?.message },
1967
+ }));
1968
+ return { ok: true };
1969
+ },
1970
+
1971
+ async setConfig(partial) {
1972
+ const cfg = await mutate((state) => {
1973
+ const { supervisor: supPartial, ...rest } = partial;
1974
+ state.config = { ...state.config, ...rest };
1975
+ if (supPartial !== undefined) {
1976
+ state.config.supervisor = { ...(state.config.supervisor ?? {}), ...supPartial };
1977
+ }
1978
+ return state.config;
1979
+ });
1980
+ await rescheduleTimer();
1981
+ return { ok: true, config: cfg };
1982
+ },
1983
+ };
1984
+
1985
+ module.exports = { registerScheduleHandlers, attachWindow, init, ROOT, PRDS_DIR, selectHistoryJobs, parsePorcelain, FINISH_PROTOCOL, remote, pickNextBatch, pickForProject, reapDeadRunningJobs };