@yemi33/minions 0.1.1949 → 0.1.1951

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/dashboard/js/command-center.js +9 -0
  2. package/dashboard/js/modal-qa.js +10 -0
  3. package/dashboard/js/refresh.js +4 -0
  4. package/dashboard/js/render-dispatch.js +25 -0
  5. package/dashboard/js/render-other.js +109 -2
  6. package/dashboard/js/settings.js +1 -1
  7. package/dashboard/layout.html +2 -2
  8. package/dashboard/pages/engine.html +6 -0
  9. package/dashboard/slim.html +1987 -0
  10. package/dashboard/styles.css +8 -0
  11. package/dashboard.js +450 -40
  12. package/docs/completion-reports.md +25 -0
  13. package/docs/design-state-storage.md +1 -1
  14. package/docs/slim-ux/architecture-suggestions.md +467 -0
  15. package/docs/slim-ux/concepts.md +824 -0
  16. package/engine/ado-mcp-wrapper.js +33 -7
  17. package/engine/ado.js +123 -15
  18. package/engine/cc-worker-pool.js +41 -0
  19. package/engine/cleanup.js +71 -34
  20. package/engine/cli.js +37 -0
  21. package/engine/dispatch.js +32 -9
  22. package/engine/features.js +6 -0
  23. package/engine/gh-token.js +137 -0
  24. package/engine/github.js +166 -29
  25. package/engine/issues.js +29 -0
  26. package/engine/keep-process-sweep.js +397 -0
  27. package/engine/lifecycle.js +150 -33
  28. package/engine/playbook.js +17 -0
  29. package/engine/queries.js +71 -0
  30. package/engine/recovery.js +6 -0
  31. package/engine/shared.js +481 -30
  32. package/engine/spawn-agent.js +44 -2
  33. package/engine/timeout.js +34 -11
  34. package/engine/worktree-pool.js +410 -0
  35. package/engine.js +643 -119
  36. package/package.json +6 -3
  37. package/playbooks/review.md +2 -0
  38. package/playbooks/shared-rules.md +3 -1
  39. package/prompts/cc-system.md +24 -0
  40. package/engine/copilot-models.json +0 -5
package/engine.js CHANGED
@@ -23,6 +23,7 @@
23
23
 
24
24
  const fs = require('fs');
25
25
  const path = require('path');
26
+ const crypto = require('crypto');
26
27
  const shared = require('./engine/shared');
27
28
  const { exec, execAsync, execSilent, runFile, ts, ENGINE_DEFAULTS,
28
29
  WI_STATUS, DONE_STATUSES, WORK_TYPE, PLAN_STATUS, PRD_ITEM_STATUS, PRD_MATERIALIZABLE, PR_STATUS, DISPATCH_RESULT, AGENT_STATUS,
@@ -103,7 +104,9 @@ const mutatePullRequests = shared.mutatePullRequests;
103
104
  const withFileLock = shared.withFileLock;
104
105
 
105
106
  const CHECKPOINT_CAP_FAIL_REASON = 'Exceeded 3 checkpoint-resumes; manual intervention required';
106
- const READ_ONLY_ROOT_TASK_TYPES = new Set(['meeting', 'ask', 'explore', 'plan-to-prd', 'plan']);
107
+ // W-mp73x32w000l143d: shared.READ_ONLY_ROOT_TASK_TYPES is the canonical set;
108
+ // re-aliased here for the existing call sites in this file.
109
+ const READ_ONLY_ROOT_TASK_TYPES = shared.READ_ONLY_ROOT_TASK_TYPES;
107
110
 
108
111
  function isPipelineBranchName(branchName) {
109
112
  return typeof branchName === 'string' && branchName.startsWith('pipeline/');
@@ -123,6 +126,10 @@ const steering = require('./engine/steering');
123
126
 
124
127
  const { runCleanup } = require('./engine/cleanup');
125
128
 
129
+ // ─── Worktree pool (W-mp73ya3e000me6c5 — opt-in cross-branch warm reuse) ────
130
+
131
+ const worktreePool = require('./engine/worktree-pool');
132
+
126
133
  // ─── State Readers (delegated to engine/queries.js) ─────────────────────────
127
134
 
128
135
  const { getConfig, getControl, getDispatch, getNotes,
@@ -201,7 +208,7 @@ async function pruneAncestorDeps(deps, gitOpts, cwd) {
201
208
  for (let j = 0; j < deps.length; j++) {
202
209
  if (i === j || ancestorIndices.has(j)) continue;
203
210
  try {
204
- await execAsync(`git merge-base --is-ancestor "origin/${deps[i].branch}" "origin/${deps[j].branch}"`, { ...gitOpts, cwd });
211
+ await shared.shellSafeGit(['merge-base', '--is-ancestor', `origin/${deps[i].branch}`, `origin/${deps[j].branch}`], { ...gitOpts, cwd });
205
212
  // deps[i] is an ancestor of deps[j] — prune deps[i]
206
213
  ancestorIndices.add(i);
207
214
  break;
@@ -221,14 +228,14 @@ async function preflightMergeSimulation(deps, mainRef, gitOpts, cwd) {
221
228
  for (let i = 0; i < deps.length; i++) {
222
229
  const depBranch = deps[i].branch;
223
230
  try {
224
- const result = await execAsync(`git merge-tree --write-tree "${currentRef}" "origin/${depBranch}"`, { ...gitOpts, cwd });
231
+ const result = await shared.shellSafeGit(['merge-tree', '--write-tree', currentRef, `origin/${depBranch}`], { ...gitOpts, cwd });
225
232
  const treeSha = (typeof result === 'string' ? result : (result.stdout?.toString?.() || '')).trim().split('\n')[0];
226
233
  if (!treeSha) return { ok: true }; // can't parse tree SHA, skip pre-flight
227
234
  // Create temp commit to chain for next dep (skip for last dep — no chaining needed)
228
235
  if (i < deps.length - 1) {
229
236
  try {
230
- const commitResult = await execAsync(
231
- `git commit-tree "${treeSha}" -p "${currentRef}" -p "origin/${depBranch}" -m "preflight-merge"`,
237
+ const commitResult = await shared.shellSafeGit(
238
+ ['commit-tree', treeSha, '-p', currentRef, '-p', `origin/${depBranch}`, '-m', 'preflight-merge'],
232
239
  { ...gitOpts, cwd }
233
240
  );
234
241
  const commitSha = (typeof commitResult === 'string' ? commitResult : (commitResult.stdout?.toString?.() || '')).trim();
@@ -501,8 +508,8 @@ async function syncReusedWorktree(rootDir, worktreePath, branchName, gitOpts = {
501
508
  // even on slow links.
502
509
  let onOrigin = true;
503
510
  try {
504
- await execAsync(
505
- `git ls-remote --exit-code --heads origin "${branchName}"`,
511
+ await shared.shellSafeGit(
512
+ ['ls-remote', '--exit-code', '--heads', 'origin', branchName],
506
513
  { ...gitOpts, cwd: rootDir, timeout: 5000 },
507
514
  );
508
515
  } catch (e) {
@@ -515,15 +522,15 @@ async function syncReusedWorktree(rootDir, worktreePath, branchName, gitOpts = {
515
522
  log('info', `Branch ${branchName} not on origin yet — first push pending; skipping fetch/pull`);
516
523
  return { skipped: true, reason: 'no-upstream' };
517
524
  }
518
- try { await execAsync(`git fetch origin "${branchName}"`, { ...gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
519
- try { await execAsync(`git pull origin "${branchName}"`, { ...gitOpts, cwd: worktreePath }); } catch (e) { log('warn', 'git: ' + e.message); }
525
+ try { await shared.shellSafeGit(['fetch', 'origin', branchName], { ...gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
526
+ try { await shared.shellSafeGit(['pull', 'origin', branchName], { ...gitOpts, cwd: worktreePath }); } catch (e) { log('warn', 'git: ' + e.message); }
520
527
  return { skipped: false };
521
528
  }
522
529
 
523
530
  // Find an existing worktree already checked out on a given branch
524
531
  async function findExistingWorktree(repoDir, branchName) {
525
532
  try {
526
- const out = await execAsync(`git worktree list --porcelain`, { cwd: repoDir, timeout: 10000 });
533
+ const out = await shared.shellSafeGit(['worktree', 'list', '--porcelain'], { cwd: repoDir, timeout: 10000 });
527
534
  const found = shared.parseWorktreePorcelain(out).find(w => w.branch === branchName);
528
535
  if (found && fs.existsSync(found.path)) return found.path;
529
536
  } catch (e) { log('warn', 'git: ' + e.message); }
@@ -553,17 +560,24 @@ function removeStaleIndexLock(rootDir) {
553
560
  } catch (e) { log('warn', 'git: ' + e.message); }
554
561
  }
555
562
 
556
- async function runWorktreeAdd(rootDir, worktreePath, args, gitOpts, worktreeCreateRetries) {
563
+ async function runWorktreeAdd(rootDir, worktreePath, addArgs, gitOpts, worktreeCreateRetries) {
564
+ // P-a7c4d2e8 (F3): argv-form `git worktree add` — `addArgs` is an array of
565
+ // additional arguments (typically a branch name, optionally preceded by
566
+ // `-b <newBranch>`). The worktreePath is passed as its own argv element, so
567
+ // it is never shell-interpolated. NOTE: no `--` separator is inserted before it.
568
+ if (!Array.isArray(addArgs)) {
569
+ throw new TypeError('runWorktreeAdd: addArgs must be an array');
570
+ }
557
571
  let lastErr = null;
558
572
  const retries = Math.max(0, Number(worktreeCreateRetries) || 0);
559
573
  for (let attempt = 0; attempt <= retries; attempt++) {
560
574
  try {
561
575
  if (attempt > 0) {
562
- try { await execAsync('git worktree prune', { ...gitOpts, cwd: rootDir, timeout: 15000 }); } catch (e) { log('warn', 'git: ' + e.message); }
576
+ try { await shared.shellSafeGit(['worktree', 'prune'], { ...gitOpts, cwd: rootDir, timeout: 15000 }); } catch (e) { log('warn', 'git: ' + e.message); }
563
577
  removeStaleIndexLock(rootDir);
564
578
  log('warn', `Retrying git worktree add (attempt ${attempt + 1}/${retries + 1}) for ${path.basename(worktreePath)}`);
565
579
  }
566
- await execAsync(`git worktree add "${worktreePath}" ${args}`, { ...gitOpts, cwd: rootDir });
580
+ await shared.shellSafeGit(['worktree', 'add', worktreePath, ...addArgs], { ...gitOpts, cwd: rootDir });
567
581
  return;
568
582
  } catch (err) {
569
583
  lastErr = err;
@@ -584,7 +598,7 @@ async function pruneStaleWorktreeForBranch(rootDir, branchName, gitOpts) {
584
598
  if (!branchName) return 0;
585
599
  let trees = [];
586
600
  try {
587
- const out = await execAsync(`git worktree list --porcelain`, { ...gitOpts, cwd: rootDir, timeout: 10000 });
601
+ const out = await shared.shellSafeGit(['worktree', 'list', '--porcelain'], { ...gitOpts, cwd: rootDir, timeout: 10000 });
588
602
  trees = shared.parseWorktreePorcelain(out);
589
603
  } catch (e) {
590
604
  log('warn', `pruneStaleWorktreeForBranch list: ${e.message?.split('\n')[0]}`);
@@ -595,14 +609,14 @@ async function pruneStaleWorktreeForBranch(rootDir, branchName, gitOpts) {
595
609
  let removed = 0;
596
610
  for (const w of stale) {
597
611
  try {
598
- await execAsync(`git worktree remove -f -f "${w.path}"`, { ...gitOpts, cwd: rootDir, timeout: 15000 });
612
+ await shared.shellSafeGit(['worktree', 'remove', '-f', '-f', w.path], { ...gitOpts, cwd: rootDir, timeout: 15000 });
599
613
  removed++;
600
614
  log('warn', `Removed stale worktree entry for ${branchName} at missing path ${w.path}${w.locked ? ' (was locked)' : ''}`);
601
615
  } catch (e) {
602
616
  log('warn', `git worktree remove -f -f failed for stale ${w.path}: ${e.message?.split('\n')[0]}`);
603
617
  }
604
618
  }
605
- try { await execAsync(`git worktree prune`, { ...gitOpts, cwd: rootDir, timeout: 10000 }); } catch { /* best-effort */ }
619
+ try { await shared.shellSafeGit(['worktree', 'prune'], { ...gitOpts, cwd: rootDir, timeout: 10000 }); } catch { /* best-effort */ }
606
620
  return removed;
607
621
  }
608
622
 
@@ -717,8 +731,8 @@ async function recoverPartialWorktree(rootDir, worktreePath, branchName, gitOpts
717
731
  if (existingWt && fs.existsSync(existingWt)) return true;
718
732
  if (!fs.existsSync(worktreePath)) return false;
719
733
  try {
720
- await execAsync(`git -C "${worktreePath}" rev-parse --is-inside-work-tree`, { ...gitOpts, timeout: 10000 });
721
- await execAsync(`git -C "${worktreePath}" rev-parse --abbrev-ref HEAD`, { ...gitOpts, timeout: 10000 });
734
+ await shared.shellSafeGit(['-C', worktreePath, 'rev-parse', '--is-inside-work-tree'], { ...gitOpts, timeout: 10000 });
735
+ await shared.shellSafeGit(['-C', worktreePath, 'rev-parse', '--abbrev-ref', 'HEAD'], { ...gitOpts, timeout: 10000 });
722
736
  log('warn', `Recovered partially-created worktree for ${branchName} at ${worktreePath}`);
723
737
  return true;
724
738
  } catch {
@@ -755,12 +769,73 @@ async function spawnAgent(dispatchItem, config) {
755
769
  }
756
770
  const metaProjectFields = metaProject && typeof metaProject === 'object' ? metaProject : {};
757
771
  const project = projectResolution.project ? { ...projectResolution.project, ...metaProjectFields } : {};
758
- const rootDir = project.localPath ? path.resolve(project.localPath) : path.resolve(MINIONS_DIR, '..');
759
-
760
- // Determine working directory
761
- let cwd = rootDir;
772
+ // W-mp73x32w000l143d: decouple agent cwd from worktree placement.
773
+ // resolveSpawnPaths returns:
774
+ // - read-only types: { cwd: <project dir or MINIONS_DIR>, worktreeRootDir: null }
775
+ // (no drive-root preflight — these tasks don't need a worktree)
776
+ // - code-mutating types: { cwd: null, worktreeRootDir: <project root> }
777
+ // (caller defaults cwd to worktreeRootDir; drive-root collapse throws
778
+ // WORKTREE_ROOTDIR_COLLAPSED_TO_DRIVE_ROOT — same fail-fast behavior as
779
+ // the legacy resolveProjectRootDir call this replaced).
780
+ // Pipeline branches force a worktree even for read-only types — handled
781
+ // immediately after the resolver call below.
782
+ const _preBranchName = meta?.branch ? sanitizeBranch(meta.branch) : null;
783
+ let cwd, worktreeRootDir;
784
+ try {
785
+ ({ cwd, worktreeRootDir } = shared.resolveSpawnPaths(project, type, MINIONS_DIR));
786
+ } catch (rootErr) {
787
+ if (rootErr?.code === 'WORKTREE_ROOTDIR_COLLAPSED_TO_DRIVE_ROOT' || rootErr?.code === 'WORKTREE_ROOTDIR_MISSING_BASE') {
788
+ log('error', `spawnAgent: project rootDir resolution failed for ${id}: ${rootErr.message}`);
789
+ // Prompt files haven't been written yet at this point — no cleanup needed.
790
+ completeDispatch(
791
+ id,
792
+ DISPATCH_RESULT.ERROR,
793
+ rootErr.message.slice(0, 800),
794
+ 'Pre-spawn worktree preflight rejected — see failure_class for the specific cause.',
795
+ { failureClass: FAILURE_CLASS.WORKTREE_PREFLIGHT, agentRetryable: false },
796
+ );
797
+ cleanupTempAgent(agentId);
798
+ return null;
799
+ }
800
+ throw rootErr;
801
+ }
802
+ // Pipeline branches need a worktree even for read-only types (the worktree
803
+ // IS the pipeline's isolated workspace). When we detect a pipeline branch
804
+ // on a read-only type, recompute worktreeRootDir so the worktree creation
805
+ // block has a placement parent — and so the drive-root preflight still fires.
806
+ if (worktreeRootDir === null && isPipelineBranchName(_preBranchName)) {
807
+ try {
808
+ worktreeRootDir = shared.resolveProjectRootDir(project.localPath, MINIONS_DIR);
809
+ } catch (rootErr) {
810
+ if (rootErr?.code === 'WORKTREE_ROOTDIR_COLLAPSED_TO_DRIVE_ROOT' || rootErr?.code === 'WORKTREE_ROOTDIR_MISSING_BASE') {
811
+ log('error', `spawnAgent: pipeline-branch rootDir resolution failed for ${id}: ${rootErr.message}`);
812
+ completeDispatch(
813
+ id,
814
+ DISPATCH_RESULT.ERROR,
815
+ rootErr.message.slice(0, 800),
816
+ 'Pre-spawn worktree preflight rejected — see failure_class for the specific cause.',
817
+ { failureClass: FAILURE_CLASS.WORKTREE_PREFLIGHT, agentRetryable: false },
818
+ );
819
+ cleanupTempAgent(agentId);
820
+ return null;
821
+ }
822
+ throw rootErr;
823
+ }
824
+ }
825
+ // Legacy local alias: downstream git ops (worktree add, prune, fetch) and
826
+ // the `cwd === rootDir` safety warn at line ~1387 reference `rootDir`. For
827
+ // read-only rootless tasks (no worktree, no branch) this is null — the
828
+ // rootDir-referencing code paths only run inside `if (branchName)` /
829
+ // `if (worktreePath)` guards, so a null rootDir is safe there.
830
+ const rootDir = worktreeRootDir;
831
+
832
+ // Determine working directory. For code-mutating types the resolver
833
+ // returned cwd: null and we default to the worktree placement parent
834
+ // (matches legacy behavior — reassigned to the worktreePath after
835
+ // `git worktree add` succeeds at line ~1078).
836
+ if (cwd == null) cwd = rootDir;
762
837
  let worktreePath = null;
763
- let branchName = meta?.branch ? sanitizeBranch(meta.branch) : null;
838
+ let branchName = _preBranchName;
764
839
  const worktreeCreateTimeout = Math.max(60000, Number(engineConfig.worktreeCreateTimeout) || ENGINE_DEFAULTS.worktreeCreateTimeout);
765
840
  const worktreeCreateRetries = Math.max(0, Math.min(3, Number(engineConfig.worktreeCreateRetries) || ENGINE_DEFAULTS.worktreeCreateRetries));
766
841
  const _gitOpts = { stdio: 'pipe', timeout: 30000, windowsHide: true, env: shared.gitEnv() };
@@ -778,14 +853,24 @@ async function spawnAgent(dispatchItem, config) {
778
853
  safeUnlink(completionReportPath);
779
854
  } catch (e) { log('warn', `completion report setup: ${e.message}`); }
780
855
  }
856
+ // P-d2a8f6c1 (agent trust boundary F8): per-spawn cryptographic nonce. The
857
+ // engine injects this into the agent env as MINIONS_COMPLETION_NONCE and
858
+ // requires the agent to echo it back in the completion JSON. On parse, the
859
+ // engine compares report.nonce against the in-memory value below; on
860
+ // mismatch the report is treated as forged (e.g. a prompt-injected agent
861
+ // writing into a sibling agent's completion path) and discarded. See
862
+ // engine/lifecycle.js:runPostCompletionHooks and docs/completion-reports.md.
863
+ const completionNonce = crypto.randomBytes(16).toString('hex');
781
864
  const completionReportInstruction = completionReportPath ? [
782
865
  '## Completion Report',
783
866
  '',
784
867
  `Before exiting, write a JSON completion report to: \`${completionReportPath}\``,
785
868
  '',
786
- 'Use this shape: {"status":"success|partial|failed","summary":"...","verdict":"approved|changes-requested|null","pr":"PR URL or id if relevant","failure_class":"...","retryable":true|false,"needs_rerun":true|false,"artifacts":[{"type":"note|plan|prd|pr|file","path":"relative/path/or/url","title":"short label"}]}.',
869
+ 'Use this shape: {"status":"success|partial|failed","summary":"...","verdict":"approved|changes-requested|null","pr":"PR URL or id if relevant","failure_class":"...","retryable":true|false,"needs_rerun":true|false,"nonce":"<value of MINIONS_COMPLETION_NONCE env var>","artifacts":[{"type":"note|plan|prd|pr|file","path":"relative/path/or/url","title":"short label"}]}.',
787
870
  'This report is the primary completion signal; fenced completion blocks are only a fallback.',
788
871
  '',
872
+ `**Trust nonce (REQUIRED):** copy the exact value of the \`MINIONS_COMPLETION_NONCE\` environment variable into the report's \`nonce\` field. The engine validates this on read; mismatched or missing nonces are treated as untrusted and the dispatch is failed with \`failure_class: 'completion-nonce-mismatch'\`. Do not invent, regenerate, or share this value across reports.`,
873
+ '',
789
874
  ].join('\n') : '';
790
875
  const buildFullTaskPrompt = (promptBody) => {
791
876
  const taskPromptWithSteering = pendingSteering.prompt
@@ -807,6 +892,22 @@ async function spawnAgent(dispatchItem, config) {
807
892
  const sysPromptPath = path.join(tmpDir, `sysprompt-${safeId}.md`);
808
893
  safeWrite(sysPromptPath, systemPrompt);
809
894
  const _cleanupPromptFiles = () => { safeUnlink(promptPath); safeUnlink(sysPromptPath); };
895
+ // Convert a WORKTREE_NESTED_IN_PROJECT throw into a fail-fast non-retryable
896
+ // dispatch failure (W-mp62taw2000ubcc3). The error's `.code` is set by
897
+ // shared.assertWorktreeOutsideProject so we don't have to parse the message.
898
+ // Returns nothing; every caller must `return null` from spawnAgent right after calling it.
899
+ const _failWorktreePreflight = (assertErr) => {
900
+ log('error', `spawnAgent: worktree preflight rejected for ${id}: ${assertErr.message}`);
901
+ _cleanupPromptFiles();
902
+ completeDispatch(
903
+ id,
904
+ DISPATCH_RESULT.ERROR,
905
+ assertErr.message.slice(0, 800),
906
+ 'Pre-spawn worktree preflight rejected — see failure_class for the specific cause. Recompute will produce the same rejection until the underlying configuration changes.',
907
+ { failureClass: FAILURE_CLASS.WORKTREE_PREFLIGHT, agentRetryable: false },
908
+ );
909
+ cleanupTempAgent(agentId);
910
+ };
810
911
  _phaseT.afterPrompt = Date.now();
811
912
 
812
913
  if (branchName) {
@@ -819,8 +920,13 @@ async function spawnAgent(dispatchItem, config) {
819
920
  worktreePath = path.resolve(rootDir, engineConfig.worktreeRoot || '../worktrees', wtDirName);
820
921
  // Refuse to spawn into a worktree path that's inside the project root —
821
922
  // nested worktrees cause glob/grep to match both copies (mirror writes).
822
- // Throws on violation; caught by the outer try/catch which fails dispatch.
823
- shared.assertWorktreeOutsideProject(worktreePath, rootDir);
923
+ // WORKTREE_NESTED_IN_PROJECT is non-retryable: the recompute on the next
924
+ // tick will produce the same path. Fail fast (W-mp62taw2000ubcc3).
925
+ try { shared.assertWorktreeOutsideProject(worktreePath, rootDir); }
926
+ catch (assertErr) {
927
+ if (assertErr?.code === 'WORKTREE_NESTED_IN_PROJECT') { _failWorktreePreflight(assertErr); return null; }
928
+ throw assertErr;
929
+ }
824
930
 
825
931
  // If branch is already checked out in an existing worktree, reuse it
826
932
  _phaseT.findExistingStart = Date.now();
@@ -830,7 +936,11 @@ async function spawnAgent(dispatchItem, config) {
830
936
  // Same guard for reuse — a previously-created bad worktree must not
831
937
  // be silently reused either; the cleanup sweep flags these so the
832
938
  // operator can remove them.
833
- shared.assertWorktreeOutsideProject(existingWt, rootDir);
939
+ try { shared.assertWorktreeOutsideProject(existingWt, rootDir); }
940
+ catch (assertErr) {
941
+ if (assertErr?.code === 'WORKTREE_NESTED_IN_PROJECT') { _failWorktreePreflight(assertErr); return null; }
942
+ throw assertErr;
943
+ }
834
944
  worktreePath = existingWt;
835
945
  log('info', `Reusing existing worktree for ${branchName}: ${existingWt}`);
836
946
  // Probe origin first — locally-created branches that were never pushed
@@ -840,30 +950,84 @@ async function spawnAgent(dispatchItem, config) {
840
950
  await syncReusedWorktree(rootDir, existingWt, branchName, _gitOpts);
841
951
  _phaseT.reuseSyncEnd = Date.now();
842
952
  } else if (READ_ONLY_ROOT_TASK_TYPES.has(type) && !isPipelineBranchName(branchName)) {
843
- // Read-only tasks — no worktree needed, run in rootDir
844
- log('info', `${type}: read-only task, no worktree needed — running in rootDir`);
953
+ // Read-only tasks — no worktree needed, run in cwd from resolveSpawnPaths
954
+ // (project.localPath or MINIONS_DIR). W-mp73x32w000l143d.
955
+ log('info', `${type}: read-only task, no worktree needed — running in cwd ${cwd}`);
845
956
  branchName = null;
846
957
  worktreePath = null;
847
958
  } else {
848
959
  _phaseT.createWorktreeStart = Date.now();
960
+
961
+ // ── Pool borrow (W-mp73ya3e000me6c5) ────────────────────────────────
962
+ // Try to borrow a warm worktree from the per-project pool BEFORE the
963
+ // existing fresh-create path. Default-off (`worktreePoolSize: 0`); when
964
+ // enabled, this saves the cold install/build cost on heavy projects.
965
+ // Borrow only fires when the branch is brand-new (no upstream yet) so
966
+ // we don't disrupt fix-tasks targeting existing PR branches. Any git
967
+ // failure during the checkout evicts the entry and falls through to
968
+ // the unchanged fresh-create logic.
969
+ let borrowedFromPool = false;
970
+ const _isSharedForPool = meta?.branchStrategy === 'shared-branch' || meta?.useExistingBranch;
971
+ const _poolProject = project.name || 'default';
972
+ const _poolSize = worktreePool.getProjectPoolSize(_poolProject, config);
973
+ if (_poolSize > 0 && !_isSharedForPool && branchName) {
974
+ let _branchOnRemote = true;
975
+ try {
976
+ await shared.shellSafeGit(
977
+ ['ls-remote', '--exit-code', '--heads', 'origin', branchName],
978
+ { ..._gitOpts, cwd: rootDir, timeout: 5000 },
979
+ );
980
+ } catch (e) { if (e && e.code === 2) _branchOnRemote = false; }
981
+ if (!_branchOnRemote) {
982
+ const borrowed = worktreePool.tryBorrow(_poolProject, id);
983
+ if (borrowed && borrowed.path && fs.existsSync(borrowed.path)) {
984
+ try { shared.assertWorktreeOutsideProject(borrowed.path, rootDir); }
985
+ catch (assertErr) {
986
+ if (assertErr?.code === 'WORKTREE_NESTED_IN_PROJECT') {
987
+ worktreePool.evictEntry(borrowed.path, 'nested-in-project');
988
+ _failWorktreePreflight(assertErr); return null;
989
+ }
990
+ throw assertErr;
991
+ }
992
+ try {
993
+ const _mainRef = sanitizeBranch(shared.resolveMainBranch(rootDir, project.mainBranch));
994
+ await shared.shellSafeGit(['fetch', 'origin', _mainRef], { ..._gitOpts, cwd: rootDir, timeout: 30000 });
995
+ // -B force-creates/resets the branch so a stale local ref from a
996
+ // prior occupant does not block the checkout.
997
+ await shared.shellSafeGit(['checkout', '-B', branchName, `origin/${_mainRef}`], { ..._gitOpts, cwd: borrowed.path, timeout: 30000 });
998
+ worktreePath = borrowed.path;
999
+ borrowedFromPool = true;
1000
+ log('info', `worktree-pool: borrowed warm worktree for ${_poolProject}/${branchName}: ${borrowed.path}`);
1001
+ } catch (borrowErr) {
1002
+ log('warn', `worktree-pool: borrow checkout failed for ${branchName} at ${borrowed.path}: ${borrowErr.message} — evicting and falling through to fresh create`);
1003
+ worktreePool.evictEntry(borrowed.path, 'borrow-checkout-failed');
1004
+ }
1005
+ }
1006
+ }
1007
+ }
1008
+
849
1009
  try {
850
1010
  if (!fs.existsSync(worktreePath)) {
851
1011
  const isSharedBranch = meta?.branchStrategy === 'shared-branch' || meta?.useExistingBranch;
852
1012
  // Prune stale worktree entries before creating (handles leftover entries from crashed runs)
853
- try { await execAsync(`git worktree prune`, { ..._gitOpts, cwd: rootDir, timeout: 10000 }); } catch (e) { log('warn', 'git: ' + e.message); }
1013
+ try { await shared.shellSafeGit(['worktree', 'prune'], { ..._gitOpts, cwd: rootDir, timeout: 10000 }); } catch (e) { log('warn', 'git: ' + e.message); }
854
1014
  // Remove stale index.lock before creating worktree (Windows crashes can leave this behind)
855
1015
  removeStaleIndexLock(rootDir);
856
1016
 
857
1017
  if (isSharedBranch) {
858
1018
  log('info', `Creating worktree for shared branch: ${worktreePath} on ${branchName}`);
859
- try { await execAsync(`git fetch origin "${branchName}"`, { ..._gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
1019
+ try { await shared.shellSafeGit(['fetch', 'origin', branchName], { ..._gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
860
1020
  try {
861
- await runWorktreeAdd(rootDir, worktreePath, `"${branchName}"`, _worktreeGitOpts, worktreeCreateRetries);
1021
+ await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
862
1022
  } catch (eShared) {
863
1023
  if (eShared.message?.includes('already used by worktree') || eShared.message?.includes('already checked out')) {
864
1024
  const existingWtPath = await findExistingWorktree(rootDir, branchName);
865
1025
  if (existingWtPath && fs.existsSync(existingWtPath)) {
866
- shared.assertWorktreeOutsideProject(existingWtPath, rootDir);
1026
+ try { shared.assertWorktreeOutsideProject(existingWtPath, rootDir); }
1027
+ catch (assertErr) {
1028
+ if (assertErr?.code === 'WORKTREE_NESTED_IN_PROJECT') { _failWorktreePreflight(assertErr); return null; }
1029
+ throw assertErr;
1030
+ }
867
1031
  log('info', `Shared branch ${branchName} already checked out at ${existingWtPath} — reusing`);
868
1032
  worktreePath = existingWtPath;
869
1033
  } else {
@@ -873,42 +1037,42 @@ async function spawnAgent(dispatchItem, config) {
873
1037
  const pruned = await pruneStaleWorktreeForBranch(rootDir, branchName, _gitOpts);
874
1038
  if (pruned > 0) {
875
1039
  log('info', `Pruned ${pruned} stale worktree entry(ies) for shared branch ${branchName}; retrying worktree add`);
876
- await runWorktreeAdd(rootDir, worktreePath, `"${branchName}"`, _worktreeGitOpts, 0);
1040
+ await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, 0);
877
1041
  } else { throw eShared; }
878
1042
  }
879
1043
  } else if (eShared.message?.includes('invalid reference') || eShared.message?.includes('not a valid ref')) {
880
1044
  // Branch doesn't exist yet (first item in plan) — create it from main
881
1045
  const mainRef = sanitizeBranch(shared.resolveMainBranch(rootDir, project.mainBranch));
882
1046
  log('info', `Shared branch ${branchName} not found — creating from ${mainRef}`);
883
- await runWorktreeAdd(rootDir, worktreePath, `-b "${branchName}" ${mainRef}`, _worktreeGitOpts, worktreeCreateRetries);
1047
+ await runWorktreeAdd(rootDir, worktreePath, ['-b', branchName, mainRef], _worktreeGitOpts, worktreeCreateRetries);
884
1048
  } else { throw eShared; }
885
1049
  }
886
1050
  } else {
887
1051
  log('info', `Creating worktree: ${worktreePath} on branch ${branchName}`);
888
1052
  const mainRef = sanitizeBranch(shared.resolveMainBranch(rootDir, project.mainBranch));
889
1053
  try {
890
- await runWorktreeAdd(rootDir, worktreePath, `-b "${branchName}" ${mainRef}`, _worktreeGitOpts, worktreeCreateRetries);
1054
+ await runWorktreeAdd(rootDir, worktreePath, ['-b', branchName, mainRef], _worktreeGitOpts, worktreeCreateRetries);
891
1055
  } catch (e1) {
892
1056
  const branchExists = e1.message?.includes('already exists');
893
1057
  log('warn', `Worktree -b failed for ${branchName}: ${e1.message?.split('\n')[0]}`);
894
1058
  if (!branchExists) {
895
1059
  // Transient error (lock, timeout) — prune, clean, and retry -b once more
896
1060
  log('info', `Retrying -b create after prune for ${branchName}`);
897
- try { await execAsync(`git worktree prune`, { ..._gitOpts, cwd: rootDir, timeout: 15000 }); } catch { /* optional */ }
1061
+ try { await shared.shellSafeGit(['worktree', 'prune'], { ..._gitOpts, cwd: rootDir, timeout: 15000 }); } catch { /* optional */ }
898
1062
  removeStaleIndexLock(rootDir);
899
1063
  // Clean up partial worktree directory from failed attempt
900
1064
  try { if (fs.existsSync(worktreePath)) fs.rmSync(worktreePath, { recursive: true, force: true }); } catch { /* optional */ }
901
1065
  try {
902
- await runWorktreeAdd(rootDir, worktreePath, `-b "${branchName}" ${mainRef}`, _worktreeGitOpts, 0);
1066
+ await runWorktreeAdd(rootDir, worktreePath, ['-b', branchName, mainRef], _worktreeGitOpts, 0);
903
1067
  } catch (e1b) {
904
1068
  log('error', `Worktree -b retry also failed for ${branchName}: ${e1b.message?.split('\n')[0]}`);
905
1069
  throw e1b;
906
1070
  }
907
1071
  } else {
908
1072
  // Branch already exists — try checkout without -b
909
- try { await execAsync(`git fetch origin "${branchName}"`, { ..._gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
1073
+ try { await shared.shellSafeGit(['fetch', 'origin', branchName], { ..._gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
910
1074
  try {
911
- await runWorktreeAdd(rootDir, worktreePath, `"${branchName}"`, _worktreeGitOpts, worktreeCreateRetries);
1075
+ await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
912
1076
  log('info', `Reusing existing branch: ${branchName}`);
913
1077
  } catch (e2) {
914
1078
  // "already checked out" or "already used by worktree" — find and reuse or recover
@@ -930,17 +1094,21 @@ async function spawnAgent(dispatchItem, config) {
930
1094
  log('warn', `Branch ${branchName} actively used by another agent at ${existingWtPath} — cannot create worktree`);
931
1095
  throw e2;
932
1096
  }
933
- shared.assertWorktreeOutsideProject(existingWtPath, rootDir);
1097
+ try { shared.assertWorktreeOutsideProject(existingWtPath, rootDir); }
1098
+ catch (assertErr) {
1099
+ if (assertErr?.code === 'WORKTREE_NESTED_IN_PROJECT') { _failWorktreePreflight(assertErr); return null; }
1100
+ throw assertErr;
1101
+ }
934
1102
  log('info', `Branch ${branchName} already checked out at ${existingWtPath} — reusing`);
935
1103
  worktreePath = existingWtPath;
936
1104
  } else if (existingWtPath && !fs.existsSync(existingWtPath)) {
937
1105
  log('warn', `Branch ${branchName} tracked in missing dir ${existingWtPath} — pruning and recreating`);
938
- try { await execAsync(`git worktree prune`, { ..._gitOpts, cwd: rootDir, timeout: 10000 }); } catch (e) { log('warn', 'git: ' + e.message); }
939
- await runWorktreeAdd(rootDir, worktreePath, `"${branchName}"`, _worktreeGitOpts, worktreeCreateRetries);
1106
+ try { await shared.shellSafeGit(['worktree', 'prune'], { ..._gitOpts, cwd: rootDir, timeout: 10000 }); } catch (e) { log('warn', 'git: ' + e.message); }
1107
+ await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
940
1108
  log('info', `Recovered worktree for ${branchName} after stale entry prune`);
941
1109
  } else {
942
- try { await execAsync(`git worktree prune`, { ..._gitOpts, cwd: rootDir, timeout: 10000 }); } catch (e) { log('warn', 'git: ' + e.message); }
943
- await runWorktreeAdd(rootDir, worktreePath, `"${branchName}"`, _worktreeGitOpts, worktreeCreateRetries);
1110
+ try { await shared.shellSafeGit(['worktree', 'prune'], { ..._gitOpts, cwd: rootDir, timeout: 10000 }); } catch (e) { log('warn', 'git: ' + e.message); }
1111
+ await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
944
1112
  }
945
1113
  } else {
946
1114
  throw e2;
@@ -951,7 +1119,7 @@ async function spawnAgent(dispatchItem, config) {
951
1119
  }
952
1120
  } else if (meta?.branchStrategy === 'shared-branch') {
953
1121
  log('info', `Pulling latest on shared branch ${branchName}`);
954
- try { await execAsync(`git pull origin "${branchName}"`, { ..._gitOpts, cwd: worktreePath }); } catch (e) { log('warn', 'git: ' + e.message); }
1122
+ try { await shared.shellSafeGit(['pull', 'origin', branchName], { ..._gitOpts, cwd: worktreePath }); } catch (e) { log('warn', 'git: ' + e.message); }
955
1123
  }
956
1124
  } catch (err) {
957
1125
  if (await recoverPartialWorktree(rootDir, worktreePath, branchName, _gitOpts)) {
@@ -1027,7 +1195,7 @@ async function spawnAgent(dispatchItem, config) {
1027
1195
  }
1028
1196
  const fetchResults = await Promise.allSettled(
1029
1197
  fetchable.map(({ branch: depBranch }) =>
1030
- execAsync(`git fetch origin "${depBranch}"`, { ..._gitOpts, cwd: rootDir }).then(() => depBranch)
1198
+ shared.shellSafeGit(['fetch', 'origin', depBranch], { ..._gitOpts, cwd: rootDir }).then(() => depBranch)
1031
1199
  )
1032
1200
  );
1033
1201
  const hasFetchFailures = fetchResults.some(r => r.status === 'rejected');
@@ -1047,10 +1215,10 @@ async function spawnAgent(dispatchItem, config) {
1047
1215
  // If remote ref missing, check if branch exists locally and push it (#782)
1048
1216
  if (errMsg.includes('couldn\'t find remote ref') || errMsg.includes('not found in upstream')) {
1049
1217
  try {
1050
- await execAsync(`git rev-parse --verify "refs/heads/${failedBranch}"`, { ..._gitOpts, cwd: rootDir });
1218
+ await shared.shellSafeGit(['rev-parse', '--verify', `refs/heads/${failedBranch}`], { ..._gitOpts, cwd: rootDir });
1051
1219
  // Branch exists locally — push it to origin
1052
1220
  log('info', `Dependency ${failedBranch} exists locally but not on remote — pushing to origin`);
1053
- await execAsync(`git push origin "${failedBranch}"`, { ..._gitOpts, cwd: rootDir, timeout: 60000 });
1221
+ await shared.shellSafeGit(['push', 'origin', failedBranch], { ..._gitOpts, cwd: rootDir, timeout: 60000 });
1054
1222
  log('info', `Successfully pushed local-only dependency branch ${failedBranch} to origin`);
1055
1223
  recoveredBranches.add(failedBranch);
1056
1224
  continue;
@@ -1089,7 +1257,7 @@ async function spawnAgent(dispatchItem, config) {
1089
1257
  const ancestorChecks = await Promise.all(
1090
1258
  prunedDeps.map(async ({ branch: depBranch }) => {
1091
1259
  try {
1092
- await execAsync(`git merge-base --is-ancestor "origin/${depBranch}" HEAD`, { ..._gitOpts, cwd: worktreePath });
1260
+ await shared.shellSafeGit(['merge-base', '--is-ancestor', `origin/${depBranch}`, 'HEAD'], { ..._gitOpts, cwd: worktreePath });
1093
1261
  return true;
1094
1262
  } catch (_) { return false; }
1095
1263
  })
@@ -1122,9 +1290,9 @@ async function spawnAgent(dispatchItem, config) {
1122
1290
  let stashed = false;
1123
1291
  if (!depMergeFailed && !skipDepMerge && prunedDeps.length > 0) {
1124
1292
  try {
1125
- const statusOut = (await execAsync('git status --porcelain', { ..._gitOpts, cwd: worktreePath })).stdout.toString().trim();
1293
+ const statusOut = (await shared.shellSafeGit(['status', '--porcelain'], { ..._gitOpts, cwd: worktreePath })).stdout.toString().trim();
1126
1294
  if (statusOut) {
1127
- await execAsync('git stash push --include-untracked -m "engine: stash before dep re-merge"', { ..._gitOpts, cwd: worktreePath });
1295
+ await shared.shellSafeGit(['stash', 'push', '--include-untracked', '-m', 'engine: stash before dep re-merge'], { ..._gitOpts, cwd: worktreePath });
1128
1296
  stashed = true;
1129
1297
  log('info', `Stashed uncommitted changes in ${branchName} before dep merge`);
1130
1298
  }
@@ -1135,27 +1303,27 @@ async function spawnAgent(dispatchItem, config) {
1135
1303
  if (!depMergeFailed && !skipDepMerge) {
1136
1304
  for (const { branch: depBranch, prId } of prunedDeps) {
1137
1305
  try {
1138
- await execAsync(`git merge "origin/${depBranch}" --no-edit`, { ..._gitOpts, cwd: worktreePath });
1306
+ await shared.shellSafeGit(['merge', `origin/${depBranch}`, '--no-edit'], { ..._gitOpts, cwd: worktreePath });
1139
1307
  log('info', `Merged dependency branch ${depBranch} (${prId}) into worktree ${branchName}`);
1140
1308
  } catch (mergeErr) {
1141
1309
  // Merge failed — possibly due to diverged history from a force-pushed (rebased) dep branch.
1142
1310
  // Abort partial merge, reset worktree to clean main base, and re-merge all deps from scratch.
1143
1311
  log('warn', `Merge of ${depBranch} into ${branchName} failed: ${mergeErr.message} — attempting reset and re-merge of all deps`);
1144
- try { await execAsync(`git merge --abort`, { ..._gitOpts, cwd: worktreePath }); } catch (_) { /* no merge in progress */ }
1312
+ try { await shared.shellSafeGit(['merge', '--abort'], { ..._gitOpts, cwd: worktreePath }); } catch (_) { /* no merge in progress */ }
1145
1313
  const mainRef = sanitizeBranch(shared.resolveMainBranch(rootDir, project.mainBranch));
1146
1314
  try {
1147
- await execAsync(`git reset --hard "origin/${mainRef}"`, { ..._gitOpts, cwd: worktreePath });
1315
+ await shared.shellSafeGit(['reset', '--hard', `origin/${mainRef}`], { ..._gitOpts, cwd: worktreePath });
1148
1316
  log('info', `Reset worktree ${branchName} to origin/${mainRef} for clean dep re-merge`);
1149
1317
  // Re-merge ALL pruned dep branches from scratch on clean base
1150
1318
  for (const { branch: reBranch, prId: rePrId } of prunedDeps) {
1151
- await execAsync(`git merge "origin/${reBranch}" --no-edit`, { ..._gitOpts, cwd: worktreePath });
1319
+ await shared.shellSafeGit(['merge', `origin/${reBranch}`, '--no-edit'], { ..._gitOpts, cwd: worktreePath });
1152
1320
  log('info', `Re-merged dependency branch ${reBranch} (${rePrId}) into worktree ${branchName}`);
1153
1321
  }
1154
1322
  log('info', `Successfully re-merged all ${prunedDeps.length} dep branches after reset for ${branchName}`);
1155
1323
  } catch (resetErr) {
1156
1324
  const errOutput = (resetErr.message || '') + '\n' + (resetErr.stdout?.toString?.() || '') + '\n' + (resetErr.stderr?.toString?.() || '');
1157
1325
  log('warn', `Failed to reset and re-merge deps for ${branchName}: ${resetErr.message}`);
1158
- try { await execAsync(`git merge --abort`, { ..._gitOpts, cwd: worktreePath }); } catch (_) { /* no merge in progress */ }
1326
+ try { await shared.shellSafeGit(['merge', '--abort'], { ..._gitOpts, cwd: worktreePath }); } catch (_) { /* no merge in progress */ }
1159
1327
  // Post-mortem: incremental simulation to identify which dep caused the conflict (#958)
1160
1328
  // Uses same chained merge-tree approach as pre-flight to catch inter-dep conflicts
1161
1329
  const pmMainRef = sanitizeBranch(shared.resolveMainBranch(rootDir, project.mainBranch));
@@ -1172,8 +1340,8 @@ async function spawnAgent(dispatchItem, config) {
1172
1340
  for (const { branch: reBranch2 } of prunedDeps) {
1173
1341
  try {
1174
1342
  const mainRef2 = sanitizeBranch(shared.resolveMainBranch(rootDir, project.mainBranch));
1175
- const mergeBase = (await execAsync(`git merge-base "origin/${mainRef2}" "origin/${reBranch2}"`, { ..._gitOpts, cwd: rootDir })).stdout.toString().trim();
1176
- const treeResult = await execAsync(`git merge-tree "${mergeBase}" "origin/${mainRef2}" "origin/${reBranch2}"`, { ..._gitOpts, cwd: rootDir });
1343
+ const mergeBase = (await shared.shellSafeGit(['merge-base', `origin/${mainRef2}`, `origin/${reBranch2}`], { ..._gitOpts, cwd: rootDir })).stdout.toString().trim();
1344
+ const treeResult = await shared.shellSafeGit(['merge-tree', mergeBase, `origin/${mainRef2}`, `origin/${reBranch2}`], { ..._gitOpts, cwd: rootDir });
1177
1345
  const treeOutput = treeResult.stdout?.toString?.() || '';
1178
1346
  if (treeOutput.includes('<<<<<<<') || treeOutput.includes('changed in both')) {
1179
1347
  depConflictBranch = reBranch2;
@@ -1196,7 +1364,7 @@ async function spawnAgent(dispatchItem, config) {
1196
1364
  // Restore stashed changes after dep merge (#973)
1197
1365
  if (stashed) {
1198
1366
  try {
1199
- await execAsync('git stash pop', { ..._gitOpts, cwd: worktreePath });
1367
+ await shared.shellSafeGit(['stash', 'pop'], { ..._gitOpts, cwd: worktreePath });
1200
1368
  log('info', `Restored stashed changes in ${branchName} after dep merge`);
1201
1369
  } catch (popErr) {
1202
1370
  log('warn', `git stash pop failed in ${branchName}: ${popErr.message} — stash preserved for agent`);
@@ -1407,7 +1575,16 @@ async function spawnAgent(dispatchItem, config) {
1407
1575
  // Spawn the claude process
1408
1576
  const childEnv = shared.cleanChildEnv();
1409
1577
  if (completionReportPath) childEnv.MINIONS_COMPLETION_REPORT = completionReportPath;
1578
+ if (completionNonce) childEnv.MINIONS_COMPLETION_NONCE = completionNonce;
1410
1579
  childEnv.MINIONS_REPO_HOST = getRepoHost(project);
1580
+ // W-mp6k7ywi000fa33c — per-WI override that bypasses the requireGitWorkdir
1581
+ // check on agents/<id>/keep-pids.json. Plumbed via env so spawn-agent's
1582
+ // close handler (which runs in the subprocess and computes the reap plan)
1583
+ // can honor it without re-reading the WI meta.
1584
+ if (meta?.item?.meta?.keep_processes_skip_workdir_check
1585
+ || dispatchItem.meta?.keep_processes_skip_workdir_check) {
1586
+ childEnv.MINIONS_KEEP_PROCESSES_SKIP_WORKDIR_CHECK = '1';
1587
+ }
1411
1588
 
1412
1589
  if (getRepoHost(project) === 'ado') {
1413
1590
  // Inject cached ADO token so ADO agents skip re-authentication (#998).
@@ -1445,18 +1622,30 @@ async function spawnAgent(dispatchItem, config) {
1445
1622
  }
1446
1623
  } catch { /* rotation is best-effort — overwrite still happens below */ }
1447
1624
 
1448
- // Stamp the log synchronously before spawn. If the synchronous write throws,
1449
- // we still attempt to spawn (log-file stamp failure must not block the agent),
1450
- // but we note it so the diagnostic trail isn't silent either.
1625
+ // Setup-and-spawn block guarded by partial-setup cleanup (P-f4d2e8a1).
1626
+ // If anything between log-stamping and activeProcesses.set throws, the
1627
+ // catch below tears down every artifact we managed to create so the work
1628
+ // item isn't left "stuck active" with an orphan PID file, log fd, or
1629
+ // realActivityMap entry. The dispatch loop's existing null-return /
1630
+ // throw handler (engine.js ~5184) already re-queues the work item;
1631
+ // here we just guarantee in-memory + on-disk state is consistent.
1632
+ let pidFilePath, logFd, registeredInActivityMap, registeredInActiveProcesses, proc;
1633
+ // The PID file at this path is written asynchronously by the spawned
1634
+ // child (engine/spawn-agent.js:423) once it starts. We compute the path
1635
+ // upfront so the catch block can unlink it whether spawn fails before
1636
+ // the child wrote it (no-op) or after (real cleanup).
1637
+ pidFilePath = promptPath.replace(/prompt-/, 'pid-').replace(/\.md$/, '.pid');
1451
1638
  try {
1452
- safeWrite(liveOutputPath, `# Live output for ${agentId} ${id}\n# Started: ${startedAt}\n# Task: ${dispatchItem.task}\n[${new Date().toISOString()}] spawn: agent=${agentId} item=${id}\n\n`);
1453
- } catch (stubErr) {
1454
- log('warn', `Failed to stamp live-output stub for ${agentId} (${id}): ${stubErr.message}`);
1455
- }
1639
+ // Stamp the log synchronously before spawn. If the synchronous write throws,
1640
+ // we still attempt to spawn (log-file stamp failure must not block the agent),
1641
+ // but we note it so the diagnostic trail isn't silent either.
1642
+ try {
1643
+ safeWrite(liveOutputPath, `# Live output for ${agentId} — ${id}\n# Started: ${startedAt}\n# Task: ${dispatchItem.task}\n[${new Date().toISOString()}] spawn: agent=${agentId} item=${id}\n\n`);
1644
+ } catch (stubErr) {
1645
+ log('warn', `Failed to stamp live-output stub for ${agentId} (${id}): ${stubErr.message}`);
1646
+ }
1456
1647
 
1457
- let proc;
1458
- _phaseT.spawnCallStart = Date.now();
1459
- try {
1648
+ _phaseT.spawnCallStart = Date.now();
1460
1649
  // `detached: true` puts the agent in its own process group (POSIX) / job
1461
1650
  // object (Windows), so when the engine dies — gracefully via stop, abruptly
1462
1651
  // via taskkill, or because of a crash — the agent keeps running and can be
@@ -1470,41 +1659,65 @@ async function spawnAgent(dispatchItem, config) {
1470
1659
  detached: true,
1471
1660
  });
1472
1661
  _phaseT.spawnCallEnd = Date.now();
1662
+
1663
+ // Seed realActivityMap and stamp PID immediately — BEFORE any handlers (#W-mo25loq8kjer).
1664
+ // Why NOW, not later in the function:
1665
+ // 1. Error-handler race. The `proc.on('error', ...)` handler below calls realActivityMap.delete(id)
1666
+ // on synchronous spawn failures. Seeding before registering handlers ensures delete sees a value
1667
+ // to clear rather than leaving an absent-then-absent no-op that downstream code must guard.
1668
+ // 2. Orphan diagnostics. The PID line gives timeout.js a deterministic way to tell "spawn died
1669
+ // before first write" (stub-only log) from "process started and is hung" (stub + pid line).
1670
+ realActivityMap.set(id, Date.now());
1671
+ registeredInActivityMap = true;
1672
+ try {
1673
+ fs.appendFileSync(liveOutputPath, `[${new Date().toISOString()}] pid: ${proc.pid ?? 'unknown'}\n`);
1674
+ } catch { /* log stamp is best-effort — don't block spawn on fs failure */ }
1675
+
1676
+ const initialProcInfo = {
1677
+ proc,
1678
+ agentId,
1679
+ startedAt,
1680
+ runtimeName,
1681
+ sessionId: cachedSessionId,
1682
+ _completionNonce: completionNonce,
1683
+ ...(cachedSessionId ? {
1684
+ _runtimeResumeAt: Date.now(),
1685
+ _runtimeResumeAwaitingFirstOutput: true,
1686
+ } : {}),
1687
+ _pendingSteeringFiles: pendingSteering.entries,
1688
+ };
1689
+ activeProcesses.set(id, initialProcInfo);
1690
+ registeredInActiveProcesses = true;
1473
1691
  } catch (spawnErr) {
1474
- // Synchronous spawn failure — record it to the (already-stamped) log so the
1475
- // orphan detector's "logSize > stub-only" check can tell this apart from a
1476
- // hung process. Then rethrow so the dispatch loop handles it normally.
1477
- try { fs.appendFileSync(liveOutputPath, `[${new Date().toISOString()}] spawn-failed: ${spawnErr.message}\n[process-exit] spawn-failed\n`); } catch { /* cleanup-only best effort */ }
1692
+ // Partial-setup cleanup (P-f4d2e8a1): tear down every artifact in the
1693
+ // reverse order it was created. Each step is conditional + best-effort
1694
+ // so cleanup itself never throws on top of the original error.
1695
+ if (proc === undefined) {
1696
+ // Synchronous spawn failure — record it to the (already-stamped) log so the
1697
+ // orphan detector's "logSize > stub-only" check can tell this apart from a
1698
+ // hung process. Preserves the diagnostic the prior inline catch wrote.
1699
+ try { fs.appendFileSync(liveOutputPath, `[${new Date().toISOString()}] spawn-failed: ${spawnErr.message}\n[process-exit] spawn-failed\n`); } catch { /* cleanup-only best effort */ }
1700
+ } else if (proc && typeof proc.kill === 'function') {
1701
+ // spawn() returned a handle but a later registration step threw —
1702
+ // kill the orphan child so it doesn't run unmonitored.
1703
+ try { proc.kill('SIGKILL'); } catch { /* already exited */ }
1704
+ }
1705
+ if (registeredInActiveProcesses) {
1706
+ try { activeProcesses.delete(id); } catch { /* map.delete never throws but be defensive */ }
1707
+ }
1708
+ if (registeredInActivityMap) {
1709
+ try { realActivityMap.delete(id); } catch { /* defensive */ }
1710
+ }
1711
+ if (logFd !== undefined) {
1712
+ try { fs.closeSync(logFd); } catch { /* fd may already be closed */ }
1713
+ }
1714
+ if (pidFilePath) {
1715
+ try { safeUnlink(pidFilePath); } catch { /* may not exist yet */ }
1716
+ }
1478
1717
  cleanupTempAgent(agentId);
1479
1718
  throw spawnErr;
1480
1719
  }
1481
1720
 
1482
- // Seed realActivityMap and stamp PID immediately — BEFORE any handlers (#W-mo25loq8kjer).
1483
- // Why NOW, not later in the function:
1484
- // 1. Error-handler race. The `proc.on('error', ...)` handler below calls realActivityMap.delete(id)
1485
- // on synchronous spawn failures. Seeding before registering handlers ensures delete sees a value
1486
- // to clear rather than leaving an absent-then-absent no-op that downstream code must guard.
1487
- // 2. Orphan diagnostics. The PID line gives timeout.js a deterministic way to tell "spawn died
1488
- // before first write" (stub-only log) from "process started and is hung" (stub + pid line).
1489
- realActivityMap.set(id, Date.now());
1490
- try {
1491
- fs.appendFileSync(liveOutputPath, `[${new Date().toISOString()}] pid: ${proc.pid ?? 'unknown'}\n`);
1492
- } catch { /* log stamp is best-effort — don't block spawn on fs failure */ }
1493
-
1494
- const initialProcInfo = {
1495
- proc,
1496
- agentId,
1497
- startedAt,
1498
- runtimeName,
1499
- sessionId: cachedSessionId,
1500
- ...(cachedSessionId ? {
1501
- _runtimeResumeAt: Date.now(),
1502
- _runtimeResumeAwaitingFirstOutput: true,
1503
- } : {}),
1504
- _pendingSteeringFiles: pendingSteering.entries,
1505
- };
1506
- activeProcesses.set(id, initialProcInfo);
1507
-
1508
1721
  // Emit per-phase timing for spawn-latency analysis. One structured line per
1509
1722
  // dispatch; grep `[spawn-timing]` to aggregate. Null phases didn't run for
1510
1723
  // this dispatch (e.g. stale_head only runs for fix tasks; dep_* only when
@@ -1677,6 +1890,10 @@ async function spawnAgent(dispatchItem, config) {
1677
1890
  const spawnScript = path.join(ENGINE_DIR, 'spawn-agent.js');
1678
1891
  const childEnv = shared.cleanChildEnv();
1679
1892
  if (completionReportPath) childEnv.MINIONS_COMPLETION_REPORT = completionReportPath;
1893
+ // P-d2a8f6c1: preserve the per-dispatch nonce across steering resume so
1894
+ // the agent's completion JSON still validates after the resumed turn.
1895
+ // The dispatch id is the unit of trust, not the spawn instance.
1896
+ if (completionNonce) childEnv.MINIONS_COMPLETION_NONCE = completionNonce;
1680
1897
  childEnv.MINIONS_LIVE_OUTPUT_PATH = liveOutputPath;
1681
1898
  childEnv.MINIONS_REPO_HOST = getRepoHost(project);
1682
1899
  if (getRepoHost(project) === 'ado') {
@@ -1686,6 +1903,11 @@ async function spawnAgent(dispatchItem, config) {
1686
1903
  if (adoToken) childEnv.MINIONS_ADO_TOKEN = adoToken;
1687
1904
  } catch { /* non-fatal */ }
1688
1905
  }
1906
+ // W-mp6k7ywi000fa33c — propagate keep_processes workdir-check override across steering resume.
1907
+ if (dispatchItem.meta?.item?.meta?.keep_processes_skip_workdir_check
1908
+ || dispatchItem.meta?.keep_processes_skip_workdir_check) {
1909
+ childEnv.MINIONS_KEEP_PROCESSES_SKIP_WORKDIR_CHECK = '1';
1910
+ }
1689
1911
  let resumeProc;
1690
1912
  try {
1691
1913
  // detached so the resumed steering session also survives engine death (matches initial spawn)
@@ -1716,6 +1938,8 @@ async function spawnAgent(dispatchItem, config) {
1716
1938
  startedAt: procInfo.startedAt,
1717
1939
  runtimeName,
1718
1940
  sessionId: steerSessionId,
1941
+ // P-d2a8f6c1: keep the per-dispatch nonce alive across the steering resume.
1942
+ _completionNonce: procInfo._completionNonce || completionNonce,
1719
1943
  _runtimeResumeAt: Date.now(),
1720
1944
  _runtimeResumeAwaitingFirstOutput: true,
1721
1945
  _pendingSteeringFiles: mergePendingSteeringEntries(
@@ -1851,30 +2075,127 @@ async function spawnAgent(dispatchItem, config) {
1851
2075
  }
1852
2076
 
1853
2077
  // Parse output and run all post-completion hooks
1854
- const { resultSummary, autoRecovered, completionContractFailure, structuredCompletion, agentReportedFailure, agentRetryable } = await runPostCompletionHooks(dispatchItem, agentId, code, stdout, config);
2078
+ // P-d2a8f6c1: hand the per-spawn nonce to lifecycle so it can validate the
2079
+ // report's `nonce` field. Read it from activeProcesses BEFORE any later
2080
+ // delete clears the entry.
2081
+ const expectedNonce = activeProcesses.get(id)?._completionNonce || null;
2082
+ const completionNonceRequired = engineConfig.completionNonceRequired ?? ENGINE_DEFAULTS.completionNonceRequired;
2083
+ const { resultSummary, autoRecovered, completionContractFailure, structuredCompletion, agentReportedFailure, agentRetryable, nonceMismatch } = await runPostCompletionHooks(dispatchItem, agentId, code, stdout, config, { expectedNonce, completionNonceRequired });
1855
2084
  const retryableDecision = typeof agentRetryable === 'boolean' ? agentRetryable : failureInfo.retryable;
1856
2085
 
2086
+ // W-mp6k7ywi000fa33c — keep_processes acceptance gate. When the work
2087
+ // item carried `meta.keep_processes: true` and produced a keep-pids.json
2088
+ // sidecar whose `cwd` does not look like a real git worktree (default
2089
+ // `requireGitWorkdir: true` in ENGINE_DEFAULTS.keepProcesses), reject
2090
+ // the file and force the dispatch to fail with a dedicated failure
2091
+ // class. spawn-agent's close handler has already reaped the kept PIDs
2092
+ // (the same validation runs there via computeReapPlan), so the engine's
2093
+ // job here is just (a) flip the dispatch outcome to ERROR, (b) emit an
2094
+ // inbox alert that the responsible agent will see on its next dispatch,
2095
+ // and (c) delete the now-rejected sidecar so it does not accumulate.
2096
+ //
2097
+ // Per-WI override: `meta.keep_processes_skip_workdir_check: true` skips
2098
+ // the gate entirely (legitimate non-git keep_processes use cases).
2099
+ let keepProcessesWorkdirFailure = null;
2100
+ {
2101
+ const _wiMeta = dispatchItem.meta?.item?.meta || {};
2102
+ const _kpEnabled = !!_wiMeta.keep_processes
2103
+ || !!dispatchItem.meta?.keep_processes;
2104
+ const _kpSkipWorkdir = !!_wiMeta.keep_processes_skip_workdir_check
2105
+ || !!dispatchItem.meta?.keep_processes_skip_workdir_check;
2106
+ if (_kpEnabled && !_kpSkipWorkdir && ENGINE_DEFAULTS.keepProcesses?.requireGitWorkdir !== false) {
2107
+ try {
2108
+ const keepProcessSweep = require('./engine/keep-process-sweep');
2109
+ const evalResult = keepProcessSweep.evaluateKeepPidsAcceptance(agentId, { requireGitWorkdir: true });
2110
+ if (evalResult.exists && evalResult.isWorkdirRejection) {
2111
+ keepProcessesWorkdirFailure = {
2112
+ reason: evalResult.reason,
2113
+ cwd: evalResult.recordedCwd || '',
2114
+ filePath: evalResult.filePath,
2115
+ };
2116
+ // Delete the sidecar so it does not anchor stale PIDs on later
2117
+ // sweeps and does not show up as "malformed" forever.
2118
+ try { fs.unlinkSync(evalResult.filePath); } catch (_e) { /* gone or busy */ }
2119
+ log('warn', `keep-processes acceptance: REJECTED ${agentId} (${id}) — ${evalResult.reason}; PIDs reaped by spawn-agent, sidecar deleted`);
2120
+ // Emit inbox alert so the agent sees this on its next turn.
2121
+ try {
2122
+ const wiId = dispatchItem.meta?.item?.id || '';
2123
+ const slug = `keep-processes-workdir-${agentId}`;
2124
+ const alertBody = [
2125
+ `# keep_processes setup REJECTED for ${agentId}`,
2126
+ '',
2127
+ `Your kept-PIDs setup at \`${evalResult.recordedCwd || '<unknown>'}\` failed validation: ${evalResult.reason}.`,
2128
+ 'The directory is not a git worktree. PIDs were NOT protected and will be reaped.',
2129
+ '',
2130
+ wiId ? `Work item: ${wiId}` : '',
2131
+ `Agent: ${agentId}`,
2132
+ `Dispatch: ${id}`,
2133
+ '',
2134
+ 'Why this matters: a keep_processes work item that runs in a non-git directory',
2135
+ 'is almost always a partial copy of a repo (a selective `cp -r`). The Minions',
2136
+ 'cleanup sweep cannot reason about such directories safely; later sweeps may',
2137
+ 'rmSync subdirs treating them as separate worktrees. Re-run the work item',
2138
+ 'inside a real `git worktree add` directory, or set',
2139
+ '`meta.keep_processes_skip_workdir_check: true` on the work item if you',
2140
+ 'genuinely intend to keep PIDs alive in a non-git directory.',
2141
+ '',
2142
+ ].join('\n');
2143
+ writeInboxAlert(slug, alertBody);
2144
+ } catch (alertErr) {
2145
+ log('warn', `keep-processes acceptance: failed to emit inbox alert for ${agentId}: ${alertErr.message}`);
2146
+ }
2147
+ } else if (evalResult.exists && !evalResult.accepted) {
2148
+ // Non-workdir validation failure (oversize pids, bad TTL, etc.) —
2149
+ // already handled by validateKeepPidsRecord; just log for audit.
2150
+ log('warn', `keep-processes acceptance: ${agentId} (${id}) sidecar rejected — ${evalResult.reason} (not a workdir failure)`);
2151
+ }
2152
+ } catch (e) {
2153
+ log('warn', `keep-processes acceptance check failed for ${agentId} (${id}): ${e.message}`);
2154
+ }
2155
+ }
2156
+ }
2157
+
1857
2158
  // Move from active to completed in dispatch (single source of truth for agent status)
1858
2159
  // autoRecovered: agent failed after creating PRs — treat as success
1859
2160
  const hardContractFail = completionContractFailure?.severity === 'hard'
1860
2161
  || completionContractFailure?.nonTerminal === true;
1861
- const effectiveResult = hardContractFail ? DISPATCH_RESULT.ERROR : (((code === 0 && !agentReportedFailure) || autoRecovered) ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR);
2162
+ // P-d2a8f6c1: nonce mismatch (or missing+required) is a security failure —
2163
+ // override effectiveResult to ERROR and surface the dedicated failure_class.
2164
+ // We mark the work item as failed (processWorkItemFailure NOT suppressed)
2165
+ // so the dispatch is not silently retried by the auto-recovery path.
2166
+ const nonceFail = nonceMismatch && nonceMismatch.severity === 'hard';
2167
+ // W-mp6k7ywi000fa33c — keep_processes workdir rejection is a hard
2168
+ // failure: the agent's claim that "everything was set up correctly" is
2169
+ // structurally false. Force ERROR so the dispatch is not silently treated
2170
+ // as success even when exit code is 0.
2171
+ const keepProcessesWorkdirFail = !!keepProcessesWorkdirFailure;
2172
+ const effectiveResult = (hardContractFail || nonceFail || keepProcessesWorkdirFail)
2173
+ ? DISPATCH_RESULT.ERROR
2174
+ : (((code === 0 && !agentReportedFailure) || autoRecovered) ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR);
1862
2175
  const finalCompletionReportPath = structuredCompletion?._path || dispatchItem.meta?.completionReportPath || shared.dispatchCompletionReportPath(id);
1863
2176
  const completionOpts = {
1864
2177
  ...(finalCompletionReportPath ? { completionReportPath: finalCompletionReportPath } : {}),
1865
2178
  ...(structuredCompletion ? { structuredCompletion } : {}),
1866
2179
  };
1867
- const completeOpts = hardContractFail
1868
- ? { ...completionOpts, processWorkItemFailure: false }
1869
- : (effectiveResult === DISPATCH_RESULT.ERROR ? {
1870
- ...completionOpts,
1871
- ...(failureClass ? { failureClass } : {}),
1872
- ...(typeof retryableDecision === 'boolean' ? { agentRetryable: retryableDecision } : {}),
1873
- ...(structuredCompletion?.failure_class ? { failureClass: structuredCompletion.failure_class } : {}),
1874
- } : completionOpts);
2180
+ const completeOpts = keepProcessesWorkdirFail
2181
+ ? { ...completionOpts, failureClass: FAILURE_CLASS.INVALID_KEEP_PROCESSES_WORKDIR, agentRetryable: false }
2182
+ : (nonceFail
2183
+ ? { ...completionOpts, failureClass: nonceMismatch.failureClass, agentRetryable: false }
2184
+ : (hardContractFail
2185
+ ? { ...completionOpts, processWorkItemFailure: false }
2186
+ : (effectiveResult === DISPATCH_RESULT.ERROR ? {
2187
+ ...completionOpts,
2188
+ ...(failureClass ? { failureClass } : {}),
2189
+ ...(typeof retryableDecision === 'boolean' ? { agentRetryable: retryableDecision } : {}),
2190
+ ...(structuredCompletion?.failure_class ? { failureClass: structuredCompletion.failure_class } : {}),
2191
+ } : completionOpts)));
1875
2192
  // Extract last 5 non-empty stderr lines as error context when exit code is non-zero
1876
2193
  let errorReason = '';
1877
- if (hardContractFail) {
2194
+ if (keepProcessesWorkdirFail) {
2195
+ errorReason = `invalid_keep_processes_workdir: ${keepProcessesWorkdirFailure.reason} (cwd=${keepProcessesWorkdirFailure.cwd || '<unknown>'})`.slice(0, 300);
2196
+ } else if (nonceFail) {
2197
+ errorReason = nonceMismatch.reason || 'completion nonce mismatch';
2198
+ } else if (hardContractFail) {
1878
2199
  errorReason = completionContractFailure.reason || 'PR attachment contract failed';
1879
2200
  } else if (agentReportedFailure) {
1880
2201
  errorReason = structuredCompletion
@@ -1894,8 +2215,76 @@ async function spawnAgent(dispatchItem, config) {
1894
2215
  const hint = diagnoseEmptyOutput(failureClass, code, elapsedMs);
1895
2216
  if (hint) errorReason = errorReason ? `${hint} ${errorReason}` : hint;
1896
2217
  }
2218
+
2219
+ // ── Pool return (W-mp73ya3e000me6c5) ────────────────────────────────────
2220
+ // Park a healthy worktree as IDLE in the per-project pool so the next
2221
+ // dispatch can reuse it without paying the cold install/build cost. Run
2222
+ // BEFORE completeDispatch so the dispatch is still in dispatch.active
2223
+ // during the git ops — otherwise pruneStale() racing on another tick
2224
+ // could see borrowedBy as orphaned and evict the entry mid-return.
2225
+ // Skipped when keep_processes PIDs are still alive: the worktree may be
2226
+ // the cwd of a left-running dev server or watcher.
2227
+ if (effectiveResult === DISPATCH_RESULT.SUCCESS && worktreePath && fs.existsSync(worktreePath)) {
2228
+ let _keepPidsAlive = false;
2229
+ try {
2230
+ const _ks = require('./engine/keep-process-sweep');
2231
+ const _anchorRes = _ks.getActiveAnchorPidsForAgent(agentId);
2232
+ if (_anchorRes && _anchorRes.pids && _anchorRes.pids.size > 0) _keepPidsAlive = true;
2233
+ } catch (_e) { /* keep-process-sweep import optional — fall through */ }
2234
+
2235
+ const _projForReturn = project?.name || 'default';
2236
+ const _poolSizeReturn = worktreePool.getProjectPoolSize(_projForReturn, config);
2237
+ if (!_keepPidsAlive && _poolSizeReturn > 0) {
2238
+ try {
2239
+ const _mainRefRet = sanitizeBranch(shared.resolveMainBranch(rootDir, project?.mainBranch));
2240
+ await shared.shellSafeGit(['reset', '--hard', 'HEAD'], { ..._gitOpts, cwd: worktreePath, timeout: 30000 });
2241
+ // -fd preserves gitignored files (node_modules, .vite, .next caches) — that's the whole point.
2242
+ await shared.shellSafeGit(['clean', '-fd'], { ..._gitOpts, cwd: worktreePath, timeout: 30000 });
2243
+ await shared.shellSafeGit(['fetch', 'origin', _mainRefRet], { ..._gitOpts, cwd: rootDir, timeout: 30000 });
2244
+ // Detach at origin/<main> — local main is typically checked out in
2245
+ // the project root and git refuses two checkouts of the same branch.
2246
+ await shared.shellSafeGit(['checkout', '--detach', `origin/${_mainRefRet}`], { ..._gitOpts, cwd: worktreePath, timeout: 30000 });
2247
+ const _outcome = worktreePool.returnToPool(_projForReturn, worktreePath, {
2248
+ poolSize: _poolSizeReturn,
2249
+ branch: branchName || '',
2250
+ });
2251
+ log('info', `worktree-pool: return outcome=${_outcome} for ${_projForReturn} at ${worktreePath}`);
2252
+ if (_outcome === 'rejected') {
2253
+ // Capacity rejected — drop any stale entry so cleanup can reap normally.
2254
+ worktreePool.evictEntry(worktreePath, 'capacity-rejected');
2255
+ }
2256
+ } catch (returnErr) {
2257
+ log('warn', `worktree-pool: return failed for ${worktreePath}: ${returnErr.message} — evicting from pool`);
2258
+ worktreePool.evictEntry(worktreePath, 'return-git-failed');
2259
+ }
2260
+ } else if (_keepPidsAlive) {
2261
+ // Skip the pool — the worktree is in use by left-running processes.
2262
+ // Make sure no stale entry lingers (defensive).
2263
+ worktreePool.evictEntry(worktreePath, 'keep-processes-alive');
2264
+ }
2265
+ }
2266
+
1897
2267
  completeDispatch(id, effectiveResult, errorReason, resultSummary, completeOpts);
1898
2268
 
2269
+ // W-mp6k7ywi000fa33c — surface the workdir-rejection on the WI so the
2270
+ // dashboard pending-reason area shows the missing structure instead of
2271
+ // a bare failure_class label. _pendingReason on a failed item is treated
2272
+ // by the dashboard as "additional context" rather than a queue gate.
2273
+ if (keepProcessesWorkdirFailure && dispatchItem.meta?.item?.id) {
2274
+ try {
2275
+ const wiPath = resolveWorkItemPath(dispatchItem.meta);
2276
+ if (wiPath) {
2277
+ mutateJsonFileLocked(wiPath, data => {
2278
+ if (!Array.isArray(data)) return data;
2279
+ const wi = data.find(i => i.id === dispatchItem.meta.item.id);
2280
+ if (!wi) return data;
2281
+ wi._pendingReason = `invalid_keep_processes_workdir: ${keepProcessesWorkdirFailure.reason}`.slice(0, 500);
2282
+ return data;
2283
+ });
2284
+ }
2285
+ } catch (e) { log('warn', `keep-processes acceptance: failed to set _pendingReason: ${e.message}`); }
2286
+ }
2287
+
1899
2288
  // Cleanup temp files (including PID file now that dispatch is complete)
1900
2289
  try { fs.unlinkSync(sysPromptPath); } catch { /* cleanup */ }
1901
2290
  try { fs.unlinkSync(promptPath); } catch { /* cleanup */ }
@@ -1974,6 +2363,7 @@ async function spawnAgent(dispatchItem, config) {
1974
2363
  startedAt,
1975
2364
  runtimeName,
1976
2365
  sessionId: existingProcInfo.sessionId || cachedSessionId,
2366
+ _completionNonce: existingProcInfo._completionNonce || completionNonce,
1977
2367
  _pendingSteeringFiles: mergePendingSteeringEntries(existingProcInfo._pendingSteeringFiles, pendingSteering.entries),
1978
2368
  });
1979
2369
 
@@ -2288,6 +2678,51 @@ function safePrdFilenameForProject(projectName, suffix) {
2288
2678
  return path.basename(resolved);
2289
2679
  }
2290
2680
 
2681
/**
 * Atomically reserve a unique PRD filename in `prdDir` and write `content`
 * into it (P-9b7e5d3c).
 *
 * Replaces the racy "fs.existsSync(...) then write" pattern: the file is
 * created with the exclusive `'wx'` flag, so when two callers race for the
 * same name exactly one create succeeds and the other moves on to the next
 * candidate instead of silently overwriting.
 *
 * Candidate schedule (100 attempts total):
 *   attempt 0      → baseFileName          (e.g. "slug.json")
 *   attempt 1..99  → `${stem}-${attempt}${ext}` (e.g. "slug-1.json")
 *
 * @param {string} prdDir - Directory in which the file is created.
 * @param {string} baseFileName - Preferred file name (basename, with or without extension).
 * @param {string|Buffer} [content=''] - Payload to write; a falsy value leaves the reserved file empty.
 * @returns {string} The basename that was actually reserved.
 * @throws {Error} Any non-EEXIST error from opening the file, any write/close
 *   error, or 'Could not reserve unique PRD filename after 100 attempts'
 *   once every candidate name is taken.
 */
function reservePrdFilename(prdDir, baseFileName, content = '') {
  const ext = path.extname(baseFileName);
  const stem = ext ? baseFileName.slice(0, -ext.length) : baseFileName;
  for (let attempt = 0; attempt < 100; attempt++) {
    const candidateName = attempt === 0 ? baseFileName : `${stem}-${attempt}${ext}`;
    const candidatePath = path.join(prdDir, candidateName);
    let fd;
    try {
      // 'wx' = create exclusively; fails with EEXIST when the name is taken.
      fd = fs.openSync(candidatePath, 'wx');
    } catch (err) {
      if (err && err.code === 'EEXIST') continue;
      throw err;
    }
    let written = false;
    try {
      // writeFileSync on an fd writes the entire payload; a single
      // fs.writeSync call is allowed to write fewer bytes than requested.
      if (content) fs.writeFileSync(fd, content);
      written = true;
    } finally {
      try {
        fs.closeSync(fd);
      } finally {
        // A failed write must not leave an empty/partial placeholder behind:
        // it would permanently occupy the name and look like a corrupt PRD.
        if (!written) {
          try { fs.unlinkSync(candidatePath); } catch { /* best-effort cleanup; the write error is what propagates */ }
        }
      }
    }
    return candidateName;
  }
  throw new Error('Could not reserve unique PRD filename after 100 attempts');
}
2725
+
2291
2726
  function materializePlansAsWorkItems(config) {
2292
2727
  if (!fs.existsSync(PRD_DIR)) { try { fs.mkdirSync(PRD_DIR, { recursive: true }); } catch (e) { log('warn', 'create PRD directory: ' + e.message); } }
2293
2728
  const writePrdLocked = (fileName, data) => {
@@ -2409,9 +2844,11 @@ function materializePlansAsWorkItems(config) {
2409
2844
  const parsed = JSON.parse(stripped);
2410
2845
  if (parsed.missing_features) {
2411
2846
  const jsonName = mf.replace(/\.md$/, '.json');
2412
- writePrdLocked(jsonName, parsed);
2847
+ // Atomic open ('wx' + retry) so two parallel migrations on the
2848
+ // same slug don't both overwrite a single PRD (P-9b7e5d3c).
2849
+ const reserved = reservePrdFilename(checkDir, jsonName, JSON.stringify(parsed, null, 2));
2413
2850
  try { fs.unlinkSync(path.join(checkDir, mf)); } catch { /* cleanup */ }
2414
- log('info', `Plan enforcement: moved ${mf} → prd/${jsonName} (PRDs must be .json in prd/)`);
2851
+ log('info', `Plan enforcement: moved ${mf} → prd/${reserved} (PRDs must be .json in prd/)`);
2415
2852
  }
2416
2853
  } catch {} // Not JSON — it's a proper plan .md, leave it
2417
2854
  }
@@ -2424,9 +2861,11 @@ function materializePlansAsWorkItems(config) {
2424
2861
  try {
2425
2862
  const parsed = safeJson(path.join(PLANS_DIR, jf));
2426
2863
  if (parsed?.missing_features) {
2427
- writePrdLocked(jf, parsed);
2864
+ // Atomic open ('wx' + retry) so two parallel migrations on the
2865
+ // same slug don't both overwrite a single PRD (P-9b7e5d3c).
2866
+ const reserved = reservePrdFilename(PRD_DIR, jf, JSON.stringify(parsed, null, 2));
2428
2867
  try { fs.unlinkSync(path.join(PLANS_DIR, jf)); } catch { /* cleanup */ }
2429
- log('info', `Auto-migrated PRD ${jf} from plans/ to prd/`);
2868
+ log('info', `Auto-migrated PRD ${jf} from plans/ to prd/${reserved}`);
2430
2869
  }
2431
2870
  } catch (e) { log('warn', 'migrate PRD from plans: ' + e.message); }
2432
2871
  }
@@ -2669,12 +3108,20 @@ function materializePlansAsWorkItems(config) {
2669
3108
  const reconciled = reconcileItemsWithPrs(existingItems, allPrsForReconcile, { onlyIds: newlyCreatedIds });
2670
3109
  if (reconciled > 0) log('info', `Plan reconciliation: marked ${reconciled} item(s) as done → ${projName}`);
2671
3110
 
2672
- // PRD removal sync: cancel pending work items whose PRD item was removed from the plan
3111
+ // PRD removal sync: cancel pending work items whose PRD item was removed from the plan.
3112
+ // IMPORTANT: decomposed sub-items (children spawned by the decompose agent for
3113
+ // implement:large parents) carry parent_id and have IDs of the shape
3114
+ // <parentId>-a/-b/-c. They are NEVER written back into the PRD's missing_features
3115
+ // array, so an id-only predicate cancels them every time the materializer creates a
3116
+ // new WI for the same plan. Accept WIs whose parent_id matches any current PRD id
3117
+ // (one-hop only — decompose produces a single level of children today).
2673
3118
  const currentPrdIds = new Set(plan.missing_features.map(f => f.id));
2674
3119
  let cancelled = 0;
2675
3120
  for (const wi of existingItems) {
2676
3121
  if (wi.status !== WI_STATUS.PENDING || wi.sourcePlan !== file) continue;
2677
- if (!currentPrdIds.has(wi.id)) {
3122
+ const ownIdInPrd = currentPrdIds.has(wi.id);
3123
+ const parentInPrd = wi.parent_id && currentPrdIds.has(wi.parent_id);
3124
+ if (!ownIdInPrd && !parentInPrd) {
2678
3125
  wi.status = WI_STATUS.CANCELLED;
2679
3126
  wi.cancelledAt = ts();
2680
3127
  wi.cancelReason = `PRD item removed from ${file}`;
@@ -2718,8 +3165,9 @@ function materializePlansAsWorkItems(config) {
2718
3165
  const mainBranch = shared.resolveMainBranch(root, firstProject.mainBranch);
2719
3166
  const branch = sanitizeBranch(plan.feature_branch);
2720
3167
  // Create branch from main (idempotent — ignores if exists)
2721
- exec(`git branch "${branch}" "${mainBranch}" 2>/dev/null || true`, { cwd: root, stdio: 'pipe' });
2722
- exec(`git push -u origin "${branch}" 2>/dev/null || true`, { cwd: root, stdio: 'pipe' });
3168
+ // P-a7c4d2e8 (F3): argv-form (shell:false) replaces shell-piped exec.
3169
+ try { shared.shellSafeGitSync(['branch', branch, mainBranch], { cwd: root }); } catch { /* idempotent — branch may already exist */ }
3170
+ try { shared.shellSafeGitSync(['push', '-u', 'origin', branch], { cwd: root }); } catch (e) { log('warn', `git push -u origin ${branch} (pre-create): ${e.message?.split('\n')[0]}`); }
2723
3171
  log('info', `Shared branch pre-created: ${branch} for plan ${file}`);
2724
3172
  } catch (err) {
2725
3173
  log('warn', `Failed to pre-create shared branch for ${file}: ${err.message}`);
@@ -3437,6 +3885,13 @@ function renderProjectWorkItemPromptForAgent(item, workType, agentId, config, pr
3437
3885
  pr_url: item.pr_url || item.prUrl || '',
3438
3886
  reviewer: item.reviewer || 'Reviewer',
3439
3887
  review_note: item.review_note || item.reviewNote || item.description || item.title || 'See PR thread comments',
3888
+ // W-mp68q6ke0010de68 — opt-in keep_processes hint plumbed via item.meta.
3889
+ // Truthy `keep_processes` triggers the playbook hint section; missing flag
3890
+ // means the agent never learns the feature exists (default-off).
3891
+ keep_processes: !!(item.meta && item.meta.keep_processes),
3892
+ keep_processes_ttl_minutes: item.meta && Number.isFinite(Number(item.meta.keep_processes_ttl_minutes))
3893
+ ? Math.floor(Number(item.meta.keep_processes_ttl_minutes))
3894
+ : '',
3440
3895
  };
3441
3896
  const cpResult = buildWorkItemDispatchVars(item, vars, config, {
3442
3897
  worktreePath: vars.worktree_path || root,
@@ -4661,6 +5116,23 @@ let tickRunning = false;
4661
5116
  let _tickStartedAt = 0;
4662
5117
  const TICK_TIMEOUT_MS = 300000; // 5 min — force-release tick lock if stuck
4663
5118
 
5119
+ // P-c2e5a1d9-a — Generation counter that is incremented on every tick start
5120
+ // AND every time the force-release branch reclaims a hung tick. The in-flight
5121
+ // (hung) tickInner captures its own `myGeneration` at entry; if a force-release
5122
+ // later bumps `tickGeneration`, the stale tick can detect the mismatch via
5123
+ // `_isTickStale()` and abort instead of mutating shared state alongside the
5124
+ // fresh tick that took over. Sub-task -a wires the scaffolding + a single
5125
+ // guard call after the heartbeat write; per-phase guards land in -b.
5126
+ let tickGeneration = 0;
5127
+
5128
/**
 * Report whether the tick identified by `gen` has been superseded.
 *
 * Compares `gen` against the module-level `tickGeneration` counter; on a
 * mismatch a warning is logged and `true` is returned so the caller can bail
 * out. Returns `false` (and logs nothing) when the generations match.
 */
function _isTickStale(gen) {
  const superseded = gen !== tickGeneration;
  if (superseded) log('warn', 'Tick generation mismatch, aborting stale tick');
  return superseded;
}
5135
+
4664
5136
  function _pollIntervalMsFromTicks(ticks, tickIntervalMs) {
4665
5137
  const normalizedTicks = Math.max(1, Number(ticks) || 1);
4666
5138
  const normalizedTickInterval = Math.max(1, Number(tickIntervalMs) || ENGINE_DEFAULTS.tickInterval);
@@ -4683,6 +5155,10 @@ async function tick() {
4683
5155
  log('error', `Tick hung for ${Math.round((Date.now() - _tickStartedAt) / 1000)}s — force-releasing lock`);
4684
5156
  tickRunning = false;
4685
5157
  _tickStartedAt = 0;
5158
+ // P-c2e5a1d9-a — Bump generation so the in-flight (hung) tickInner sees
5159
+ // a mismatch via `_isTickStale()` and bails out before mutating shared
5160
+ // state alongside the fresh tick that's about to take over.
5161
+ tickGeneration++;
4686
5162
  }
4687
5163
  return;
4688
5164
  }
@@ -4699,6 +5175,11 @@ async function tick() {
4699
5175
  }
4700
5176
 
4701
5177
  async function tickInner() {
5178
+ // P-c2e5a1d9-a — Capture this tick's generation as the very first statement
5179
+ // so any guard later in this function can detect a force-release that
5180
+ // reclaimed the lock while we were still running.
5181
+ const myGeneration = ++tickGeneration;
5182
+
4702
5183
  const control = getControl();
4703
5184
  if (control.state !== 'running' && control.state !== 'stopping') {
4704
5185
  log('info', `Engine state is "${control.state}" — exiting process`);
@@ -4708,6 +5189,11 @@ async function tickInner() {
4708
5189
  // Write heartbeat so dashboard can detect stale engine
4709
5190
  try { mutateControl(c => ({ ...c, heartbeat: Date.now() })); } catch (e) { log('warn', 'write heartbeat: ' + e.message); }
4710
5191
 
5192
+ // P-c2e5a1d9-a — Initial wiring guard: bail immediately if a force-release
5193
+ // reclaimed our lock while the heartbeat write was in flight. Per-phase
5194
+ // guards inside the rest of tickInner are sub-task -b's scope.
5195
+ if (_isTickStale(myGeneration)) return;
5196
+
4711
5197
  const config = getConfig();
4712
5198
  tickCount++;
4713
5199
  const now = Date.now();
@@ -4737,6 +5223,23 @@ async function tickInner() {
4737
5223
  // 2.5. Periodic cleanup + MCP sync (every 10 ticks = ~5 minutes)
4738
5224
  if (tickCount % 10 === 0) {
4739
5225
  try { await runCleanup(config); } catch (e) { log('warn', `runCleanup: ${e.message}`); }
5226
+ if (_isTickStale(myGeneration)) return;
5227
+ }
5228
+
5229
+ // 2.52. keep_processes TTL/dead-PID sweep (W-mp68q6ke0010de68). Walks
5230
+ // agents/*/keep-pids.json, kills+unlinks expired entries, silently unlinks
5231
+ // entries whose PIDs are all gone, leaves malformed files alone. Cheap (one
5232
+ // readdir + N small file reads), bounded by ENGINE_DEFAULTS.keepProcesses.
5233
+ const keepSweepEvery = Math.max(1, ENGINE_DEFAULTS.keepProcesses?.sweepEvery || 30);
5234
+ if (ENGINE_DEFAULTS.keepProcesses?.enabled !== false && tickCount % keepSweepEvery === 0) {
5235
+ safe('sweepKeepProcesses', () => {
5236
+ const { sweepKeepProcesses } = require('./engine/keep-process-sweep');
5237
+ const stats = sweepKeepProcesses();
5238
+ if (stats.scanned > 0 && (stats.expiredFiles || stats.deadFiles || stats.malformed)) {
5239
+ log('info', `keep-processes sweep: scanned=${stats.scanned} expired=${stats.expiredFiles} dead=${stats.deadFiles} malformed=${stats.malformed} killed=${stats.killedPids}`);
5240
+ }
5241
+ });
5242
+ if (_isTickStale(myGeneration)) return;
4740
5243
  }
4741
5244
 
4742
5245
  // 2.55. Check persistent watches (3 tick-equivalents, default ~3 minutes)
@@ -4831,7 +5334,9 @@ async function tickInner() {
4831
5334
  log('info', '[gh] PR status poll skipped — throttled');
4832
5335
  }
4833
5336
  if (statusPolls.length) await Promise.allSettled(statusPolls);
5337
+ if (_isTickStale(myGeneration)) return;
4834
5338
  try { await processPendingRebases(config); } catch (err) { log('warn', `Pending rebase processing error: ${err?.message || err}`); }
5339
+ if (_isTickStale(myGeneration)) return;
4835
5340
  // Sync PR status back to PRD items (missing → done when active PR exists)
4836
5341
  try { syncPrdFromPrs(config); } catch (err) { log('warn', `PRD sync error: ${err?.message || err}`); }
4837
5342
  // Check if any plans can be marked completed (all features done/in-pr)
@@ -4875,12 +5380,14 @@ async function tickInner() {
4875
5380
  log('info', '[gh] PR comment poll skipped — throttled');
4876
5381
  }
4877
5382
  if (commentPolls.length) await Promise.allSettled(commentPolls);
5383
+ if (_isTickStale(myGeneration)) return;
4878
5384
  // Reconciliation runs regardless of poll flags — it's a recovery sweep, not a convenience poll
4879
5385
  // Reconciliation also parallelized — ADO and GitHub reconciliation are independent
4880
5386
  const reconcilePolls = [];
4881
5387
  reconcilePolls.push(reconcilePrs(config).catch(err => { log('warn', `ADO PR reconciliation error: ${err?.message || err}${err?.stack ? ' | ' + err.stack.split('\n')[1]?.trim() : ''}`); }));
4882
5388
  reconcilePolls.push(ghReconcilePrs(config).catch(err => { log('warn', `GitHub PR reconciliation error: ${err?.message || err}${err?.stack ? ' | ' + err.stack.split('\n')[1]?.trim() : ''}`); }));
4883
5389
  await Promise.allSettled(reconcilePolls);
5390
+ if (_isTickStale(myGeneration)) return;
4884
5391
  }
4885
5392
 
4886
5393
  // 2.9. Stalled dispatch detection — auto-retry failed items blocking the graph (every 20 ticks = ~10 min)
@@ -4903,6 +5410,7 @@ async function tickInner() {
4903
5410
  const dispatchKeysToClear = [];
4904
5411
  const cooldownKeysToClear = [];
4905
5412
 
5413
+ if (_isTickStale(myGeneration)) return;
4906
5414
  mutateWorkItems(wiPath, items => {
4907
5415
  let changed = false;
4908
5416
  const failedIds = new Set(items.filter(w => w.status === WI_STATUS.FAILED).map(w => w.id));
@@ -4959,6 +5467,7 @@ async function tickInner() {
4959
5467
  // Clear dispatch entries AFTER work-items lock is released (no nested locks)
4960
5468
  for (const key of dispatchKeysToClear) {
4961
5469
  try {
5470
+ if (_isTickStale(myGeneration)) return;
4962
5471
  mutateDispatch((dp) => {
4963
5472
  dp.completed = dp.completed.filter(d => d.meta?.dispatchKey !== key);
4964
5473
  return dp;
@@ -4997,6 +5506,7 @@ async function tickInner() {
4997
5506
  try { pruneStalePrDispatches(config); } catch (e) { log('warn', 'prune stale PR dispatches: ' + e.message); }
4998
5507
  let discoveryOk = true;
4999
5508
  try { await discoverWork(config); } catch (e) { log('warn', 'discoverWork: ' + e.message); discoveryOk = false; }
5509
+ if (_isTickStale(myGeneration)) return;
5000
5510
 
5001
5511
  // 4. Update snapshot
5002
5512
  safe('updateSnapshot', () => updateSnapshot(config));
@@ -5022,6 +5532,7 @@ async function tickInner() {
5022
5532
  const pa = itemPriority[a.meta?.item?.priority] ?? 1, pb = itemPriority[b.meta?.item?.priority] ?? 1;
5023
5533
  return pa - pb;
5024
5534
  });
5535
+ if (_isTickStale(myGeneration)) return;
5025
5536
  mutateDispatch((dp) => {
5026
5537
  dp.pending = dispatch.pending;
5027
5538
  dp.active = dispatch.active || dp.active;
@@ -5069,6 +5580,7 @@ async function tickInner() {
5069
5580
  delete item.skipReason;
5070
5581
  refreshDeferredWorkItemPrompt(item, config);
5071
5582
  try {
5583
+ if (_isTickStale(myGeneration)) return;
5072
5584
  mutateDispatch((dp) => {
5073
5585
  const p = (dp.pending || []).find(d => d.id === item.id);
5074
5586
  if (p) {
@@ -5185,6 +5697,7 @@ async function tickInner() {
5185
5697
  log('error', `spawnAgent exception for ${item.id}: ${spawnErr.message}`);
5186
5698
  proc = null;
5187
5699
  }
5700
+ if (_isTickStale(myGeneration)) return;
5188
5701
  if (proc === null) {
5189
5702
  // spawnAgent failed (e.g., worktree creation error). It already called
5190
5703
  // completeDispatch internally which handles retry logic, but log at the
@@ -5198,6 +5711,7 @@ async function tickInner() {
5198
5711
  ? path.join(ENGINE_DIR, '..', 'work-items.json')
5199
5712
  : item.meta.project?.name ? projectWorkItemsPath({ name: item.meta.project.name, localPath: item.meta.project.localPath }) : null;
5200
5713
  if (wiPath) {
5714
+ if (_isTickStale(myGeneration)) return;
5201
5715
  mutateWorkItems(wiPath, items => {
5202
5716
  const wi = items.find(i => i.id === item.meta.item.id);
5203
5717
  if (wi && wi.status === WI_STATUS.DISPATCHED) {
@@ -5267,6 +5781,7 @@ async function tickInner() {
5267
5781
  }
5268
5782
  }
5269
5783
  if (skipReasonChanged) {
5784
+ if (_isTickStale(myGeneration)) return;
5270
5785
  mutateDispatch((dp) => { dp.pending = postDispatch.pending; return dp; });
5271
5786
  }
5272
5787
  }
@@ -5300,6 +5815,7 @@ module.exports = {
5300
5815
  // Discovery
5301
5816
  discoverWork, discoverFromPrs, discoverFromWorkItems, discoverCentralWorkItems,
5302
5817
  materializePlansAsWorkItems,
5818
+ reservePrdFilename, // exported for testing (P-9b7e5d3c)
5303
5819
  sweepStaleArchivedPrdBackups, // exported for testing
5304
5820
 
5305
5821
  // Shared helpers (used by lifecycle.js and tests)
@@ -5333,6 +5849,14 @@ module.exports = {
5333
5849
  // Tick
5334
5850
  tick,
5335
5851
  resolveMaxConcurrent, _pollIntervalMsFromTicks, _shouldRunPeriodicPhase, // exported for testing
5852
+ // P-c2e5a1d9-a — exported for testing the tick-generation force-release path
5853
+ _isTickStale,
5854
+ get tickGeneration() { return tickGeneration; },
5855
+ set tickGeneration(v) { tickGeneration = v; },
5856
+ get tickRunning() { return tickRunning; },
5857
+ set tickRunning(v) { tickRunning = v; },
5858
+ get _tickStartedAt() { return _tickStartedAt; },
5859
+ set _tickStartedAt(v) { _tickStartedAt = v; },
5336
5860
  };
5337
5861
 
5338
5862
  // ─── Entrypoint ─────────────────────────────────────────────────────────────