@yemi33/minions 0.1.1950 → 0.1.1952

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/dashboard/js/command-center.js +13 -2
  2. package/dashboard/js/modal-qa.js +10 -0
  3. package/dashboard/js/refresh.js +4 -0
  4. package/dashboard/js/render-dispatch.js +25 -0
  5. package/dashboard/js/render-other.js +109 -2
  6. package/dashboard/js/settings.js +1 -1
  7. package/dashboard/layout.html +2 -2
  8. package/dashboard/pages/engine.html +6 -0
  9. package/dashboard/slim.html +1987 -0
  10. package/dashboard/styles.css +8 -0
  11. package/dashboard.js +450 -40
  12. package/docs/completion-reports.md +25 -0
  13. package/docs/design-state-storage.md +1 -1
  14. package/docs/slim-ux/architecture-suggestions.md +467 -0
  15. package/docs/slim-ux/concepts.md +824 -0
  16. package/engine/ado-mcp-wrapper.js +33 -7
  17. package/engine/ado.js +123 -15
  18. package/engine/cc-worker-pool.js +41 -0
  19. package/engine/cleanup.js +71 -34
  20. package/engine/cli.js +37 -0
  21. package/engine/dispatch.js +32 -9
  22. package/engine/features.js +6 -0
  23. package/engine/gh-token.js +137 -0
  24. package/engine/github.js +166 -29
  25. package/engine/issues.js +29 -0
  26. package/engine/keep-process-sweep.js +397 -0
  27. package/engine/lifecycle.js +150 -33
  28. package/engine/playbook.js +17 -0
  29. package/engine/queries.js +71 -0
  30. package/engine/recovery.js +6 -0
  31. package/engine/shared.js +446 -14
  32. package/engine/spawn-agent.js +44 -2
  33. package/engine/timeout.js +34 -11
  34. package/engine/worktree-pool.js +410 -0
  35. package/engine.js +643 -119
  36. package/package.json +6 -3
  37. package/playbooks/review.md +2 -0
  38. package/playbooks/shared-rules.md +3 -1
  39. package/prompts/cc-system.md +24 -0
  40. package/engine/copilot-models.json +0 -5
package/engine/shared.js CHANGED
@@ -632,7 +632,23 @@ function sleepMs(ms) {
632
632
  }
633
633
  }
634
634
 
635
- const LOCK_STALE_MS = 60000; // 60 seconds — force-remove locks older than this
635
+ // P-b7d4e8f2 bumped from 60_000 to 300_000 once the reaper grew a PID-liveness
636
+ // guard (below). Holders that record their pid are protected from reap up to
637
+ // 5×LOCK_STALE_MS as long as `process.kill(pid, 0)` succeeds; the bump removes
638
+ // false-positive kills of legitimate slow operations (worktree adds, large state
639
+ // rewrites) while the PID guard keeps crashed-holder recovery fast.
640
+ const LOCK_STALE_MS = 300000; // 5 minutes — force-remove locks older than this
641
+
642
+ // Shared.js-local PID liveness check. Avoids a circular require on engine/cli.js
643
+ // (which has its own isPidAlive) and engine/timeout.js (which has
644
+ // isOsPidAliveForDispatch — but that one looks up pid from a side-channel
645
+ // pid-file, whereas the lock reaper already has the holder pid in-hand from the
646
+ // lock contents).
647
+ function isPidAlive(pid) {
648
+ if (!Number.isFinite(pid) || pid <= 0) return false;
649
+ try { process.kill(pid, 0); return true; }
650
+ catch { return false; }
651
+ }
636
652
 
637
653
  function withFileLock(lockPath, fn, {
638
654
  timeoutMs = 5000,
@@ -655,20 +671,54 @@ function withFileLock(lockPath, fn, {
655
671
  const dir = path.dirname(lockPath);
656
672
  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
657
673
  fd = fs.openSync(lockPath, 'wx');
674
+ // P-b7d4e8f2 — record holder identity so the stale-lock reaper can
675
+ // distinguish a still-alive slow holder from a crashed one. Best-effort:
676
+ // the lock's existence (not its contents) provides mutual exclusion, so
677
+ // a write failure here must NOT abort acquisition.
678
+ try {
679
+ fs.writeSync(fd, JSON.stringify({ pid: process.pid, ts: Date.now() }));
680
+ } catch { /* payload is advisory; lock semantics unaffected */ }
658
681
  break;
659
682
  } catch (err) {
660
683
  if (err.code !== 'EEXIST') throw err;
661
- // Check for stale lock — if lock file is older than LOCK_STALE_MS, force-remove it
684
+ // P-b7d4e8f2 — Stale-lock check combines mtime age with PID liveness:
685
+ // 1. If mtime <= LOCK_STALE_MS → never reap (recently active).
686
+ // 2. If JSON-parsable {pid, ts}:
687
+ // - dead PID → reap.
688
+ // - alive PID, mtime <= 5× → don't reap (legitimate slow holder).
689
+ // - alive PID, mtime > 5× → reap (last-resort guard against
690
+ // stuck holders that never released).
691
+ // 3. Legacy / empty / non-JSON lockfile → mtime-only path (reap).
662
692
  try {
663
693
  const stat = fs.statSync(lockPath);
664
- if (Date.now() - stat.mtimeMs > LOCK_STALE_MS) {
694
+ const mtimeAge = Date.now() - stat.mtimeMs;
695
+ if (mtimeAge > LOCK_STALE_MS) {
696
+ let holderPid = null;
665
697
  try {
666
- fs.unlinkSync(lockPath);
667
- } catch (unlinkErr) {
668
- // ENOENT: another process deleted the lock between stat and unlink — safe to retry
669
- if (unlinkErr.code !== 'ENOENT') throw unlinkErr;
698
+ const raw = fs.readFileSync(lockPath, 'utf8');
699
+ const parsed = JSON.parse(raw);
700
+ if (parsed && Number.isFinite(parsed.pid) && parsed.pid > 0) {
701
+ holderPid = parsed.pid;
702
+ }
703
+ } catch { /* legacy/empty/corrupt lock → fall through to mtime-only */ }
704
+
705
+ let shouldReap;
706
+ if (holderPid !== null) {
707
+ shouldReap = !isPidAlive(holderPid) || mtimeAge > LOCK_STALE_MS * 5;
708
+ } else {
709
+ // Legacy empty/non-JSON lockfile: trust mtime alone
710
+ shouldReap = true;
711
+ }
712
+
713
+ if (shouldReap) {
714
+ try {
715
+ fs.unlinkSync(lockPath);
716
+ } catch (unlinkErr) {
717
+ // ENOENT: another process deleted the lock between stat and unlink — safe to retry
718
+ if (unlinkErr.code !== 'ENOENT') throw unlinkErr;
719
+ }
720
+ continue; // lock just removed — retry immediately
670
721
  }
671
- continue; // lock just removed — retry immediately
672
722
  }
673
723
  } catch (staleErr) {
674
724
  // ENOENT from statSync: lock file disappeared between EEXIST and stat — retry will succeed
@@ -683,10 +733,24 @@ function withFileLock(lockPath, fn, {
683
733
  }
684
734
 
685
735
  try {
686
- return fn();
736
+ const result = fn();
737
+ // P-a3f9b2c1 — Defensive: detect a thenable return and throw synchronously.
738
+ // The `finally` below releases the lock immediately after `fn()` returns;
739
+ // an async callback would let the lock be released before its body
740
+ // completes, silently breaking mutual exclusion. Clean up our own fd /
741
+ // lock first, then throw so the caller cannot ignore the failure.
742
+ if (result && typeof result.then === 'function') {
743
+ try { fs.closeSync(fd); } catch { /* cleanup */ }
744
+ try { fs.unlinkSync(lockPath); } catch { /* cleanup */ }
745
+ fd = null; // suppress double-cleanup in `finally`
746
+ throw new Error('withFileLock: fn must be synchronous; got Promise. Use synchronous operations only.');
747
+ }
748
+ return result;
687
749
  } finally {
688
- try { fs.closeSync(fd); } catch { /* cleanup */ }
689
- try { fs.unlinkSync(lockPath); } catch { /* cleanup */ }
750
+ if (fd !== null) {
751
+ try { fs.closeSync(fd); } catch { /* cleanup */ }
752
+ try { fs.unlinkSync(lockPath); } catch { /* cleanup */ }
753
+ }
690
754
  }
691
755
  }
692
756
  throw lastErr;
@@ -867,7 +931,9 @@ function writeToInbox(agentId, slug, content, _inboxDir, metadata) {
867
931
  // ── Process Spawning ────────────────────────────────────────────────────────
868
932
  // All child process calls go through these to ensure windowsHide: true
869
933
 
870
- const { execSync: _execSync, spawnSync: _spawnSync, spawn: _spawn, exec: _cbExec } = require('child_process');
934
+ const { execSync: _execSync, spawnSync: _spawnSync, spawn: _spawn, exec: _cbExec, execFile: _cbExecFile } = require('child_process');
935
+ const { promisify: _promisify } = require('util');
936
+ const _execFileAsync = _promisify(_cbExecFile);
871
937
 
872
938
  function exec(cmd, opts = {}) {
873
939
  return _execSync(cmd, { windowsHide: true, ...opts });
@@ -908,6 +974,149 @@ function execAsync(cmd, opts = {}) {
908
974
  });
909
975
  }
910
976
 
977
+ // ── Argv-form (shell:false) helpers + ref/slug validators (P-a7c4d2e8) ──────
978
+ // These eliminate shell-injection vectors in `gh`/`git` calls that previously
979
+ // interpolated untrusted PR data (slugs from PR-link regex matches, branch
980
+ // names from GitHub/ADO API responses, agent stdout, etc.) into a shell
981
+ // string. Use these instead of execAsync wherever any argument is derived
982
+ // from an untrusted source.
983
+ //
984
+ // const out = await shared.shellSafeGh(['api', `repos/${shared.validateGhSlug(slug)}/pulls/${prNum}`]);
985
+ // await shared.shellSafeGit(['fetch', 'origin', shared.validateGitRef(branch)], { cwd });
986
+ //
987
+ // Validators throw on rejection; the wrapper helpers spawn via execFile
988
+ // (shell:false) so shell metacharacters in argv elements are inert.
989
+
990
+ // Tightened beyond the spec baseline regex to also block argument-injection
991
+ // (leading dash) and unsafe ref-format quirks the shell can't help with —
992
+ // `..` traversal, `@{`, `*`/`?` globs, leading/trailing/double slashes, and
993
+ // path components ending in `.lock`. Mirrors a conservative subset of
994
+ // `git check-ref-format`.
995
+ function validateGitRef(ref) {
996
+ const fail = (why) => {
997
+ const e = new Error(`Invalid git ref (${why}): ${JSON.stringify(String(ref).slice(0, 64))}`);
998
+ throw e;
999
+ };
1000
+ if (typeof ref !== 'string') fail('not a string');
1001
+ if (ref.length === 0) fail('empty');
1002
+ if (ref.length > 256) fail('too long');
1003
+ if (!/^[A-Za-z0-9._\/-]+$/.test(ref)) fail('disallowed character');
1004
+ if (ref.startsWith('-')) fail('leading dash');
1005
+ if (ref.startsWith('/') || ref.endsWith('/')) fail('leading or trailing slash');
1006
+ if (ref.includes('//')) fail('double slash');
1007
+ if (ref.endsWith('.')) fail('trailing dot');
1008
+ if (ref.includes('@{')) fail('ref expression @{');
1009
+ // Per-component checks (split on `/`).
1010
+ for (const part of ref.split('/')) {
1011
+ if (part.length === 0) fail('empty path component');
1012
+ if (part === '..' || part === '.') fail('dot path component');
1013
+ if (part.includes('..')) fail('double-dot in component');
1014
+ if (part.endsWith('.lock')) fail('component ends with .lock');
1015
+ if (part.startsWith('.')) fail('component starts with dot');
1016
+ }
1017
+ return ref;
1018
+ }
1019
+
1020
+ function validateGhSlug(slug) {
1021
+ const fail = (why) => {
1022
+ throw new Error(`Invalid GitHub slug (${why}): ${JSON.stringify(String(slug).slice(0, 64))}`);
1023
+ };
1024
+ if (typeof slug !== 'string') fail('not a string');
1025
+ if (slug.length === 0) fail('empty');
1026
+ if (slug.length > 256) fail('too long');
1027
+ if (slug !== slug.trim()) fail('surrounding whitespace');
1028
+ if (!/^[A-Za-z0-9._-]+\/[A-Za-z0-9._-]+$/.test(slug)) fail('disallowed character or shape');
1029
+ if (slug.startsWith('-') || slug.includes('/-')) fail('leading dash in component');
1030
+ return slug;
1031
+ }
1032
+
1033
+ // W-mp6k7ywi000fa33c — pure helper. Returns { ok: boolean, reason?: string }.
1034
+ // `ok: true` when `dirPath` exists AND contains either a `.git` directory OR
1035
+ // a `.git` worktree pointer file (a real file whose first line starts with
1036
+ // `gitdir:`). Anything else — missing dir, missing `.git`, `.git` as a
1037
+ // random non-pointer file — returns `ok: false` with a human-readable reason.
1038
+ //
1039
+ // No shelling out (no `git rev-parse`); just `fs.existsSync`/`fs.statSync`
1040
+ // and a tiny content sniff for the worktree pointer case. This catches the
1041
+ // failure mode where an agent ran in a partial copy of a repo (selective
1042
+ // `cp -r`) instead of `git worktree add`, which produced a directory that
1043
+ // looks file-by-file like a worktree but has no git linkage. See
1044
+ // W-mp6ha6q9000d58a5 for the real-world incident this prevents.
1045
+ function isValidGitWorktree(dirPath) {
1046
+ if (typeof dirPath !== 'string' || dirPath.length === 0) {
1047
+ return { ok: false, reason: 'cwd missing or not a string' };
1048
+ }
1049
+ let dirStat;
1050
+ try { dirStat = fs.statSync(dirPath); }
1051
+ catch (_e) { return { ok: false, reason: 'directory does not exist: ' + dirPath }; }
1052
+ if (!dirStat.isDirectory()) {
1053
+ return { ok: false, reason: 'path is not a directory: ' + dirPath };
1054
+ }
1055
+ const gitPath = path.join(dirPath, '.git');
1056
+ let gitStat;
1057
+ try { gitStat = fs.statSync(gitPath); }
1058
+ catch (_e) { return { ok: false, reason: 'no .git directory or worktree pointer at ' + dirPath }; }
1059
+ if (gitStat.isDirectory()) return { ok: true };
1060
+ if (gitStat.isFile()) {
1061
+ // Worktree pointer files contain "gitdir: <abs path>" on the first line.
1062
+ // A `.git` file that doesn't match this shape is a normal file, not a
1063
+ // valid worktree linkage — reject it.
1064
+ let head = '';
1065
+ try { head = fs.readFileSync(gitPath, { encoding: 'utf8', flag: 'r' }).slice(0, 256); }
1066
+ catch (e) { return { ok: false, reason: '.git file unreadable: ' + e.message }; }
1067
+ if (/^gitdir:\s*\S/.test(head)) return { ok: true };
1068
+ return { ok: false, reason: '.git file present but not a worktree pointer (no "gitdir:" prefix): ' + dirPath };
1069
+ }
1070
+ return { ok: false, reason: '.git entry is neither a file nor a directory: ' + gitPath };
1071
+ }
1072
+
1073
+ function shellSafeGh(args, opts = {}) {
1074
+ if (!Array.isArray(args)) {
1075
+ return Promise.reject(new TypeError('shellSafeGh: args must be an array'));
1076
+ }
1077
+ const { timeout, ...rest } = opts;
1078
+ return _execFileAsync('gh', args, {
1079
+ windowsHide: true,
1080
+ encoding: 'utf8',
1081
+ shell: false,
1082
+ ...rest,
1083
+ timeout: timeout || 30000,
1084
+ }).then(({ stdout }) => stdout);
1085
+ }
1086
+
1087
+ function shellSafeGit(args, opts = {}) {
1088
+ if (!Array.isArray(args)) {
1089
+ return Promise.reject(new TypeError('shellSafeGit: args must be an array'));
1090
+ }
1091
+ const { timeout, ...rest } = opts;
1092
+ return _execFileAsync('git', args, {
1093
+ windowsHide: true,
1094
+ encoding: 'utf8',
1095
+ shell: false,
1096
+ ...rest,
1097
+ timeout: timeout || 30000,
1098
+ }).then(({ stdout }) => stdout);
1099
+ }
1100
+
1101
+ // Sync argv-form helper for callers that aren't async (e.g. plan
1102
+ // materialization in materializePlansAsWorkItems). Uses execFileSync with
1103
+ // shell:false so argv elements are passed verbatim.
1104
+ function shellSafeGitSync(args, opts = {}) {
1105
+ if (!Array.isArray(args)) {
1106
+ throw new TypeError('shellSafeGitSync: args must be an array');
1107
+ }
1108
+ const { timeout, ...rest } = opts;
1109
+ const { execFileSync: _execFileSync } = require('child_process');
1110
+ return _execFileSync('git', args, {
1111
+ windowsHide: true,
1112
+ encoding: 'utf8',
1113
+ shell: false,
1114
+ stdio: 'pipe',
1115
+ ...rest,
1116
+ timeout: timeout || 30000,
1117
+ });
1118
+ }
1119
+
911
1120
  /**
912
1121
  * Detect the default branch for a git repo. Tries in order:
913
1122
  * 1. The configured mainBranch (if it exists as a local or remote ref)
@@ -1136,9 +1345,18 @@ const ENGINE_DEFAULTS = {
1136
1345
  prMergeMethod: 'squash', // merge method: squash, merge, rebase
1137
1346
  ignoredCommentAuthors: [], // comments from these authors are auto-closed and never trigger fixes
1138
1347
  botCommentLogins: [], // P-a3f9b2c1: opt-in shared-minions GH login list — comments from these logins are suppressed ONLY when body matches positive-signal markers (Verification SUCCESS / VERDICT:APPROVE / noop:true). Narrower than ignoredCommentAuthors which suppresses all comments by login.
1348
+ // W-mp76pw7a001da7c1 — Per-slug GitHub PAT routing. Map of `<owner>` (or `<owner>/*`,
1349
+ // or `*` for fleet default) to a `gh auth` account name. `engine/gh-token.js`
1350
+ // resolves the right token via `gh auth token --user <account> --hostname github.com`
1351
+ // and threads it as `GH_TOKEN` for that one shell-out, so the engine never depends
1352
+ // on which gh account is globally active. Empty `{}` (default) preserves legacy
1353
+ // behavior — every `gh` call uses the ambient identity. Example:
1354
+ // { "opg-microsoft": "yemishin_microsoft", "yemi33": "yemi33", "*": "yemi33" }
1355
+ ghAccounts: {},
1139
1356
  agentBusyReassignMs: 600000, // 10min — reassign work item to another agent if preferred agent is busy beyond this threshold
1140
1357
  ccEffort: null, // effort level for CC/doc-chat (null, 'low', 'medium', 'high')
1141
1358
  enablePreDispatchEval: true, // P-d2a9f6e5: cheap LLM gate before queueing — on by default. See engine/pre-dispatch-eval.js (Ripley §3 recommendation, 2026-05-11 architecture review). Validates from acceptance_criteria when present, falls back to description when criteria are absent but description is rich (≥80 chars). Fail-open on any validator error.
1359
+ completionNonceRequired: false, // P-d2a8f6c1 (agent trust boundary F8): when true, a missing `nonce` field in the completion JSON hard-fails the dispatch with failure_class:'completion-nonce-mismatch'. Default false for one release so older agents/runtime caches that haven't picked up the prompt change degrade with a warning instead of breaking. Mismatched nonces hard-fail regardless of this flag. See docs/completion-reports.md → "Trust boundary".
1142
1360
 
1143
1361
  // ── Runtime fleet (P-3b8e5f1d) ──────────────────────────────────────────────
1144
1362
  // Single source of truth for which CLI runtime + model every spawn uses.
@@ -1181,6 +1399,38 @@ const ENGINE_DEFAULTS = {
1181
1399
  maxMeetingHumanNotesBytes: 2 * 1024, // cap human note bullet lists injected into meeting prompts
1182
1400
  maxPipelineMeetingContextBytes: 16 * 1024, // cap aggregated meeting/dependency context for pipeline plan generation
1183
1401
  notesArchiveMaxFiles: 2000, // keep notes/archive bounded during periodic cleanup
1402
+ // ── Worktree pool (W-mp73ya3e000me6c5) ─────────────────────────────────────
1403
+ // Per-project warm pool: completed worktrees are parked detached at
1404
+ // origin/<main> instead of torn down, then re-borrowed by the next dispatch
1405
+ // for the same project. Saves the cold install/build cost on heavy projects
1406
+ // (constellation: bun install + Vite warmup; minions: npm install + test cache).
1407
+ // Default off — opt-in fleet-wide via engine.worktreePoolSize or per-project
1408
+ // via projects[].worktreePoolSize. See engine/worktree-pool.js + CLAUDE.md
1409
+ // "Worktree pool" section for the lifecycle and edge cases.
1410
+ worktreePoolSize: 0, // 0 = disabled (default); per-project override beats this
1411
+ worktreePoolIdleTtlMs: 6 * 3600 * 1000, // 6h — idle entries past TTL are evicted by cleanup
1412
+ // ── keep_processes (W-mp68q6ke0010de68) ────────────────────────────────────
1413
+ // Opt-in per-WI (`meta.keep_processes: true`) feature that lets an agent
1414
+ // declare specific descendant PIDs the engine MUST NOT reap on close. The
1415
+ // agent writes `agents/<id>/keep-pids.json` before exiting; spawn-agent's
1416
+ // close handler subtracts those PIDs from the reap set and the MCP sweep
1417
+ // adds them as anchors so the reachability walk doesn't classify their
1418
+ // children as stray. Hard caps below bound abuse and audit-log churn.
1419
+ keepProcesses: {
1420
+ enabled: true, // global kill switch; default ON since the feature is opt-in per-WI
1421
+ maxPerAgent: 5, // max PIDs honored per keep-pids.json file
1422
+ maxTtlMinutes: 1440, // 24h hard cap on expires_at
1423
+ defaultTtlMinutes: 60, // default TTL when meta.keep_processes_ttl_minutes is unset
1424
+ sweepEvery: 30, // ticks between TTL/dead-PID sweeps
1425
+ // W-mp6k7ywi000fa33c — when true (default), validateKeepPidsRecord rejects
1426
+ // a keep-pids.json whose `cwd` does not look like a real git worktree
1427
+ // (no `.git` dir or worktree-pointer file). Catches the failure mode where
1428
+ // an agent runs in a partial copy of a repo (selective `cp -r`) instead
1429
+ // of using `git worktree add`. Per-WI override: set
1430
+ // `meta.keep_processes_skip_workdir_check: true` for legitimate non-git
1431
+ // keep_processes use cases.
1432
+ requireGitWorkdir: true,
1433
+ },
1184
1434
  // Backward-compat: keep `engine.claude.*` field family deprecation tracker. Listed here so preflight
1185
1435
  // knows which subkeys to flag as deprecated. Do not consume `claude.*` in new code — use the runtime
1186
1436
  // adapter system (engine/runtimes/) and the resolveAgent*/resolveCc* helpers instead.
@@ -1586,6 +1836,27 @@ const WORK_TYPE = {
1586
1836
  MEETING: 'meeting', EXPLORE: 'explore', ASK: 'ask', TEST: 'test', DOCS: 'docs',
1587
1837
  };
1588
1838
 
1839
+ // Work types whose dispatch path requires a per-project git worktree. The
1840
+ // engine's spawnAgent uses the project's `localPath` as the worktree root —
1841
+ // without an owning project the rootDir falls back to MINIONS_DIR's parent,
1842
+ // which on Windows can collapse to a drive root and forever-fail
1843
+ // assertWorktreeOutsideProject. The dashboard ingress (POST /api/work-items
1844
+ // and /api/work-items/retry) refuses to create or re-spawn a project-less WI
1845
+ // of any type in this set when PROJECTS.length !== 1.
1846
+ //
1847
+ // Complement of engine.js READ_ONLY_ROOT_TASK_TYPES; `docs` is intentionally
1848
+ // also exempt because docs edits run at the Minions root, not in a project
1849
+ // worktree.
1850
+ const WORKTREE_REQUIRING_TYPES = new Set([
1851
+ WORK_TYPE.FIX,
1852
+ WORK_TYPE.IMPLEMENT,
1853
+ WORK_TYPE.IMPLEMENT_LARGE,
1854
+ WORK_TYPE.TEST,
1855
+ WORK_TYPE.VERIFY,
1856
+ WORK_TYPE.REVIEW,
1857
+ WORK_TYPE.DECOMPOSE,
1858
+ ]);
1859
+
1589
1860
  const PLAN_STATUS = {
1590
1861
  ACTIVE: 'active', AWAITING_APPROVAL: 'awaiting-approval', APPROVED: 'approved',
1591
1862
  PAUSED: 'paused', REJECTED: 'rejected', COMPLETED: 'completed',
@@ -1806,6 +2077,9 @@ const FAILURE_CLASS = {
1806
2077
  NETWORK_ERROR: 'network-error', // API rate limit, DNS, connectivity
1807
2078
  OUT_OF_CONTEXT: 'out-of-context', // Context window exhausted (token limit, context length)
1808
2079
  MAX_TURNS: 'max-turns', // Claude CLI error_max_turns — work in progress, retryable
2080
+ COMPLETION_NONCE_MISMATCH: 'completion-nonce-mismatch', // P-d2a8f6c1: completion JSON nonce did not match the per-spawn value injected via MINIONS_COMPLETION_NONCE — treat as forged/untrusted; ignore PR/noop/status fields from the report
2081
+ WORKTREE_PREFLIGHT: 'worktree-preflight', // Pre-spawn worktree validation rejected (nested-in-project, drive-root collapse) — never retryable
2082
+ INVALID_KEEP_PROCESSES_WORKDIR: 'invalid-keep-processes-workdir', // W-mp6k7ywi000fa33c: keep-pids.json declared a cwd that is not a real git worktree (likely a selective copy of the repo) — never retryable; agent must rerun in a real worktree
1809
2083
  UNKNOWN: 'unknown', // Unclassified failure
1810
2084
  };
1811
2085
  const ESCALATION_POLICY = {
@@ -1817,7 +2091,7 @@ const ESCALATION_POLICY = {
1817
2091
  };
1818
2092
 
1819
2093
  // Structured completion protocol — fields agents must produce in ```completion blocks
1820
- const COMPLETION_FIELDS = ['status', 'summary', 'files_changed', 'tests', 'pr', 'not_changed', 'failure_class', 'retryable', 'needs_rerun', 'verdict', 'artifacts'];
2094
+ const COMPLETION_FIELDS = ['status', 'summary', 'files_changed', 'tests', 'pr', 'not_changed', 'failure_class', 'retryable', 'needs_rerun', 'verdict', 'artifacts', 'nonce'];
1821
2095
 
1822
2096
  const DEFAULT_AGENT_METRICS = {
1823
2097
  tasksCompleted: 0, tasksErrored: 0,
@@ -2532,6 +2806,110 @@ function assertWorktreeOutsideProject(worktreePath, projectRoot) {
2532
2806
  throw err;
2533
2807
  }
2534
2808
 
2809
+ /**
2810
+ * Resolve the project root directory used as the parent for git worktree paths
2811
+ * during dispatch. Centralizes the fallback that engine spawnAgent used to do
2812
+ * inline (`project.localPath ? path.resolve(project.localPath) : path.resolve(MINIONS_DIR, '..')`).
2813
+ *
2814
+ * Why this helper exists: when a central work item (no `project` field) is
2815
+ * dispatched and MINIONS_DIR sits one level below a drive/filesystem root
2816
+ * (e.g. `D:\squad`), `path.resolve(MINIONS_DIR, '..')` collapses to the drive
2817
+ * root (`D:\`). The downstream worktree path then evaluates to `D:\worktrees\…`
2818
+ * which IS inside `D:\`, so `assertWorktreeOutsideProject` correctly rejects it
2819
+ * — but the dispatch loops forever because the throw happens in spawnAgent
2820
+ * without surfacing as a non-retryable failure (W-mp62taw2000ubcc3).
2821
+ *
2822
+ * Detect the collapse explicitly here and throw with a clear, actionable code
2823
+ * (`WORKTREE_ROOTDIR_COLLAPSED_TO_DRIVE_ROOT`). Callers should map this to a
2824
+ * non-retryable WORKTREE_PREFLIGHT failure so the dispatch fails fast instead
2825
+ * of silently re-dispatching every tick.
2826
+ *
2827
+ * @param {string|null|undefined} localPath — `project.localPath`, if any
2828
+ * @param {string} minionsDir — the MINIONS_DIR fallback anchor
2829
+ * @returns {string} — absolute path to use as rootDir
2830
+ * @throws {Error} — code WORKTREE_ROOTDIR_COLLAPSED_TO_DRIVE_ROOT on collapse
2831
+ */
2832
+ function resolveProjectRootDir(localPath, minionsDir) {
2833
+ if (localPath) return path.resolve(String(localPath));
2834
+ if (!minionsDir) {
2835
+ const err = new Error('Cannot resolve project rootDir: no project.localPath and no MINIONS_DIR provided.');
2836
+ err.code = 'WORKTREE_ROOTDIR_MISSING_BASE';
2837
+ throw err;
2838
+ }
2839
+ const fallback = path.resolve(String(minionsDir), '..');
2840
+ // path.parse(p).root === p means we hit the drive root (Windows `D:\`,
2841
+ // POSIX `/`, or UNC `\\server\share\`). A drive root is never a legitimate
2842
+ // project root for worktree placement — every sibling like `D:\worktrees\…`
2843
+ // is technically "inside" the drive root and would be rejected.
2844
+ if (path.parse(fallback).root === fallback) {
2845
+ const err = new Error(
2846
+ `Cannot resolve project rootDir for dispatch — MINIONS_DIR="${minionsDir}" parent collapses ` +
2847
+ `to filesystem/drive root "${fallback}", which cannot host worktrees. ` +
2848
+ `Either attach the work item to a project (POST /api/work-items with "project") or ` +
2849
+ `move MINIONS_DIR deeper than one directory below the drive root.`
2850
+ );
2851
+ err.code = 'WORKTREE_ROOTDIR_COLLAPSED_TO_DRIVE_ROOT';
2852
+ throw err;
2853
+ }
2854
+ return fallback;
2855
+ }
2856
+
2857
+ // ── Spawn cwd vs worktree placement (W-mp73x32w000l143d) ──────────────────────
2858
+ // Work types that don't need a git worktree — they read repo state but don't
2859
+ // produce code changes. Centralized here so engine.js spawnAgent and any
2860
+ // future caller (e.g. pipeline preflight) can share one definition.
2861
+ //
2862
+ // `docs` is intentionally NOT in this set: docs edits run at the Minions root
2863
+ // without a project worktree but ARE write-capable (they push commits to the
2864
+ // minions repo itself). It's the complement of WORKTREE_REQUIRING_TYPES minus
2865
+ // that one odd case.
2866
+ const READ_ONLY_ROOT_TASK_TYPES = new Set(['meeting', 'ask', 'explore', 'plan-to-prd', 'plan']);
2867
+
2868
+ /**
2869
+ * Resolve the agent's working directory and (when needed) the parent dir for
2870
+ * git worktree placement. Decouples the two concerns that spawnAgent used to
2871
+ * conflate (W-mp73x32w000l143d):
2872
+ *
2873
+ * 1. **cwd** — where the agent process actually runs. For read-only types
2874
+ * this is the project root (or MINIONS_DIR fallback for rootless WIs).
2875
+ * For code-mutating types this is the worktree placement parent until
2876
+ * `git worktree add` succeeds, after which spawnAgent reassigns it to
2877
+ * the worktree path.
2878
+ *
2879
+ * 2. **worktreeRootDir** — the parent directory `git worktree add` is run
2880
+ * from. Only meaningful when a worktree will actually be created. For
2881
+ * read-only types we deliberately return `null` so the caller can skip
2882
+ * the drive-root preflight that fires when MINIONS_DIR sits one level
2883
+ * below a filesystem root (resolveProjectRootDir's collapse case).
2884
+ *
2885
+ * NOTE: Pipeline branches (engine.js `isPipelineBranchName`) override this —
2886
+ * they always need a worktree even for read-only types because the worktree
2887
+ * IS the pipeline's isolated workspace. The caller must detect the pipeline
2888
+ * branch case and recompute worktreeRootDir via `resolveProjectRootDir`.
2889
+ *
2890
+ * @param {{ localPath?: string|null }|null|undefined} project
2891
+ * @param {string} type — work type (e.g. 'fix', 'explore', 'meeting')
2892
+ * @param {string} minionsDir — MINIONS_DIR fallback anchor
2893
+ * @returns {{ cwd: string|null, worktreeRootDir: string|null }}
2894
+ * - For read-only types: { cwd: <project dir or MINIONS_DIR>, worktreeRootDir: null }
2895
+ * - For code-mutating types: { cwd: null, worktreeRootDir: <project root> }
2896
+ * (caller defaults cwd to worktreeRootDir before worktree creation)
2897
+ * @throws {Error} WORKTREE_ROOTDIR_COLLAPSED_TO_DRIVE_ROOT (code-mutating only)
2898
+ * or WORKTREE_ROOTDIR_MISSING_BASE if neither anchor present.
2899
+ */
2900
+ function resolveSpawnPaths(project, type, minionsDir) {
2901
+ const isReadOnly = READ_ONLY_ROOT_TASK_TYPES.has(type);
2902
+ if (isReadOnly) {
2903
+ if (project?.localPath) return { cwd: path.resolve(String(project.localPath)), worktreeRootDir: null };
2904
+ if (minionsDir) return { cwd: path.resolve(String(minionsDir)), worktreeRootDir: null };
2905
+ const err = new Error('Cannot resolve cwd for read-only spawn: no project.localPath and no MINIONS_DIR provided.');
2906
+ err.code = 'WORKTREE_ROOTDIR_MISSING_BASE';
2907
+ throw err;
2908
+ }
2909
+ const worktreeRootDir = resolveProjectRootDir(project?.localPath, minionsDir);
2910
+ return { cwd: null, worktreeRootDir };
2911
+ }
2912
+
2535
2913
  // ── HTTP Origin Allowlist & Security Headers ─────────────────────────────────
2536
2914
  // Pure helpers used by dashboard.js to gate mutating requests against an
2537
2915
  // explicit allowlist of local origins and to attach uniform security response
@@ -3329,6 +3707,49 @@ function listAllProcesses() {
3329
3707
  return process.platform === 'win32' ? _winListProcesses() : _unixListProcesses();
3330
3708
  }
3331
3709
 
3710
// Decide whether the process behind `pid` looks like a Minions agent by
// inspecting its command line for the substrings `claude` or `copilot`
// (case-insensitive). That also covers the `node spawn-agent.js --runtime
// <name>` wrapper and the `gh copilot` fallback. It guards against recycled
// or orphaned PIDs at two call sites:
//   - engine/cleanup.js: checked before killing a PID read from
//     engine/tmp/pid-*.pid
//   - engine/timeout.js: checked before parking a dispatch as still-alive,
//     since a live OS PID may belong to an unrelated process
//
// Where the command line comes from:
//   - Windows: PowerShell Get-CimInstance, CommandLine of that one PID.
//   - Otherwise: /proc/<pid>/cmdline with NUL separators turned into spaces;
//     if that read throws (e.g. no /proc on macOS), `ps -p <pid> -o command=`.
//
// The answer is false for a non-integer or non-positive PID, for any failure
// to read the command line, for an empty command line, and when neither
// substring is present. False is the conservative result for both callers:
// cleanup skips the kill and timeout treats the PID as dead.
function isProcessCommandLineMatchingAgent(pid) {
  const pidNum = Number(pid);
  const isUsablePid = Number.isInteger(pidNum) && pidNum > 0;
  if (!isUsablePid) return false;

  // Platform-specific read of the raw command line; throws when every
  // available source fails.
  const readCommandLine = () => {
    if (process.platform === 'win32') {
      const psOutput = _execSync(
        `powershell -NoProfile -NonInteractive -Command "(Get-CimInstance Win32_Process -Filter 'ProcessId=${pidNum}').CommandLine"`,
        { stdio: ['ignore', 'pipe', 'ignore'], timeout: 5000, windowsHide: true, encoding: 'utf8' }
      );
      return String(psOutput || '').trim();
    }
    try {
      const rawArgs = fs.readFileSync(`/proc/${pidNum}/cmdline`);
      return rawArgs.toString('utf8').replace(/\0/g, ' ').trim();
    } catch {
      // /proc is unavailable or unreadable for this PID, so ask ps instead.
      const psOutput = _execSync(
        `ps -p ${pidNum} -o command=`,
        { stdio: ['ignore', 'pipe', 'ignore'], timeout: 3000, encoding: 'utf8' }
      );
      return String(psOutput || '').trim();
    }
  };

  let commandLine;
  try {
    commandLine = readCommandLine();
  } catch {
    return false;
  }
  if (!commandLine) return false;

  const haystack = commandLine.toLowerCase();
  return ['claude', 'copilot'].some((needle) => haystack.includes(needle));
}
3752
+
3332
3753
  function _buildChildMap(processes) {
3333
3754
  const childMap = new Map();
3334
3755
  for (const p of processes) {
@@ -3755,6 +4176,12 @@ module.exports = {
3755
4176
  exec,
3756
4177
  execAsync,
3757
4178
  execSilent,
4179
+ shellSafeGh,
4180
+ shellSafeGit,
4181
+ shellSafeGitSync,
4182
+ validateGitRef,
4183
+ validateGhSlug,
4184
+ isValidGitWorktree,
3758
4185
  resolveMainBranch,
3759
4186
  run,
3760
4187
  runFile,
@@ -3770,7 +4197,7 @@ module.exports = {
3770
4197
  runtimeConfigWarnings,
3771
4198
  projectWorkSourceWarnings,
3772
4199
  backfillProjectWorkSourceDefaults,
3773
- WI_STATUS, DONE_STATUSES, PLAN_TERMINAL_STATUSES, WORK_TYPE, PLAN_STATUS, PRD_ITEM_STATUS, PRD_MATERIALIZABLE, PR_STATUS, PR_POLLABLE_STATUSES, PR_PENDING_REASON, DISPATCH_RESULT, trackReviewMetric, queuePlanToPrd, extractPlanDeclaredProject,
4200
+ WI_STATUS, DONE_STATUSES, PLAN_TERMINAL_STATUSES, WORK_TYPE, WORKTREE_REQUIRING_TYPES, PLAN_STATUS, PRD_ITEM_STATUS, PRD_MATERIALIZABLE, PR_STATUS, PR_POLLABLE_STATUSES, PR_PENDING_REASON, DISPATCH_RESULT, trackReviewMetric, queuePlanToPrd, extractPlanDeclaredProject,
3774
4201
  WATCH_STATUS, WATCH_TARGET_TYPE, WATCH_CONDITION, WATCH_ABSOLUTE_CONDITIONS, WATCH_ACTION_TYPE,
3775
4202
  WATCH_STALLED_DEFAULT_TICKS, WATCH_STUCK_STAGE_DEFAULT_TICKS,
3776
4203
  PIPELINE_STATUS, STAGE_TYPE, MEETING_STATUS, AGENT_STATUS,
@@ -3830,6 +4257,9 @@ module.exports = {
3830
4257
  isPathInsideOrEqual,
3831
4258
  parseWorktreePorcelain,
3832
4259
  assertWorktreeOutsideProject,
4260
+ resolveProjectRootDir,
4261
+ resolveSpawnPaths,
4262
+ READ_ONLY_ROOT_TASK_TYPES,
3833
4263
  isLiveCommandCenterPath,
3834
4264
  describeCcProtectedPaths,
3835
4265
  renderCcSystemPrompt,
@@ -3855,6 +4285,7 @@ module.exports = {
3855
4285
  killImmediate,
3856
4286
  killByPidImmediate,
3857
4287
  killByPidsImmediate,
4288
+ isProcessCommandLineMatchingAgent,
3858
4289
  listAllProcesses,
3859
4290
  listProcessDescendants,
3860
4291
  listProcessReachable,
@@ -3862,6 +4293,7 @@ module.exports = {
3862
4293
  _purgeReservedFiles, // exported for testing
3863
4294
  _WIN_RESERVED_NAMES, // exported for testing
3864
4295
  LOCK_STALE_MS,
4296
+ isPidAlive,
3865
4297
  flushLogs,
3866
4298
  redactSecrets,
3867
4299
  slugify,
@@ -38,6 +38,7 @@ const path = require('path');
38
38
  const { runFile, cleanChildEnv, killGracefully, killImmediate, killByPidsImmediate, listProcessDescendants, ts, resolveEngineCacheDir } = require('./shared');
39
39
  const { resolveRuntime } = require('./runtimes');
40
40
  const { acquireAdoTokenSync, isLikelyAdoToken } = require('./ado-token');
41
+ const keepProcessSweep = require('./keep-process-sweep');
41
42
 
42
43
  // ─── Pure helpers (exported for tests) ──────────────────────────────────────
43
44
 
@@ -534,8 +535,49 @@ function main() {
534
535
  if (trackedDescendants.size || gotFirstOutput) {
535
536
  snapshotDescendants();
536
537
  if (trackedDescendants.size) {
537
- const reaped = killByPidsImmediate([...trackedDescendants]);
538
- try { fs.appendFileSync(debugPath, `DESCENDANTS reaped=${reaped}/${trackedDescendants.size}\n`); } catch {}
538
+ // W-mp68q6ke0010de68 opt-in keep_processes flag: agents whose work
539
+ // item carried `meta.keep_processes: true` may have written
540
+ // `agents/<id>/keep-pids.json` declaring specific descendant PIDs the
541
+ // engine MUST NOT reap. Resolve agentId from MINIONS_LIVE_OUTPUT_PATH
542
+ // (engine.js:1451 sets it to agents/<agentId>/live-output.log) and
543
+ // subtract validated, alive PIDs from the kill set. Missing or
544
+ // invalid file → fall through to today's behavior (reap everything).
545
+ let toKillPids = [...trackedDescendants];
546
+ let kept = [];
547
+ let keepRecord = null;
548
+ let keepReason = null;
549
+ try {
550
+ const liveOut = process.env.MINIONS_LIVE_OUTPUT_PATH;
551
+ const agentId = liveOut ? path.basename(path.dirname(liveOut)) : '';
552
+ if (agentId) {
553
+ // W-mp6k7ywi000fa33c — per-WI override (set by engine when
554
+ // meta.keep_processes_skip_workdir_check is true) bypasses the
555
+ // requireGitWorkdir check so legitimate non-git keep_processes
556
+ // use cases (e.g., a daemon under /tmp) still anchor.
557
+ const reapOpts = process.env.MINIONS_KEEP_PROCESSES_SKIP_WORKDIR_CHECK === '1'
558
+ ? { requireGitWorkdir: false }
559
+ : {};
560
+ const plan = keepProcessSweep.computeReapPlan(toKillPids, agentId, reapOpts);
561
+ toKillPids = plan.toKill;
562
+ kept = plan.kept;
563
+ keepRecord = plan.record;
564
+ keepReason = plan.reason;
565
+ }
566
+ } catch (e) {
567
+ try { fs.appendFileSync(debugPath, `KEEP-PIDS error: ${e.message}\n`); } catch {}
568
+ }
569
+ if (kept.length) {
570
+ try {
571
+ fs.appendFileSync(
572
+ debugPath,
573
+ `KEPT pids=[${kept.join(',')}] purpose="${(keepRecord?.purpose || '').slice(0, 200)}" wi=${keepRecord?.wi_id || ''}\n`,
574
+ );
575
+ } catch {}
576
+ } else if (keepReason) {
577
+ try { fs.appendFileSync(debugPath, `KEEP-PIDS skipped: ${keepReason}\n`); } catch {}
578
+ }
579
+ const reaped = toKillPids.length ? killByPidsImmediate(toKillPids) : 0;
580
+ try { fs.appendFileSync(debugPath, `DESCENDANTS reaped=${reaped}/${toKillPids.length} kept=${kept.length}\n`); } catch {}
539
581
  }
540
582
  }
541
583
  // Prefer the 'exit' event's code/signal when present — see note above.