@yemi33/minions 0.1.2219 → 0.1.2221

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -379,6 +379,7 @@ async function openSettings() {
379
379
  settingsField('Worktree Root', 'set-worktreeRoot', e.worktreeRoot || '../worktrees', '', 'Relative or absolute path for git worktrees; on Windows prefer a short path like C:\\wt') +
380
380
  settingsField('Assert-Clean Status Probe Timeout', 'set-assertCleanStatusTimeoutMs', e.assertCleanStatusTimeoutMs || 10000, 'ms', 'Timeout for the `git status --porcelain` preflight probe in assertCleanSharedWorktree. Raise this (e.g. 60000) on GVFS/Plastic-backed monorepos where sparse hydration runs longer than the 10s default. On timeout the engine quarantines the bad worktree and emits a RETRYABLE failure so the next dispatch starts fresh. Clamped 1000–120000ms.') +
381
381
  settingsField('Orphan-holder scan timeout', 'set-orphanHolderScanTimeoutMs', e.orphanHolderScanTimeoutMs || 5000, 'ms', 'Cap on the cross-platform scan (PowerShell / /proc walk / lsof) that identifies the OS process holding a stuck worktree\'s cwd. Bumps up only matter on heavily-loaded hosts where the holder scan races the sweep tick. Clamped 1000–30000ms.') +
382
+ settingsField('Worktree-holder reap probe timeout', 'set-statusProbeKillTimeoutMs', e.statusProbeKillTimeoutMs || 12000, 'ms', 'Windows-only. Timeout for the pre-quarantine worktree-holder reap probe (legacy git.exe-cmdline sweep + PEB-CWD scan that kills an orphaned agent process tree whose CWD pins the worktree dir, causing EBUSY on rename AND `git worktree remove --force`). The old hardcoded 2000ms was the proximate cause of the reaper being a no-op under concurrent-agent load (PowerShell cold-start + process enumeration exceeded 2s → ETIMEDOUT → reaped nothing). ETIMEDOUT is always swallowed (reap nothing; quarantine still proceeds), so raising this only widens the success window. Clamped 2000–60000ms.') +
382
383
  '</div>';
383
384
 
384
385
  const paneCopilot =
@@ -998,6 +999,7 @@ async function saveSettings() {
998
999
  ccUseWorkerPool: !!document.getElementById('set-ccUseWorkerPool')?.checked,
999
1000
  autoReapOrphanWorktreeHolders: !!document.getElementById('set-autoReapOrphanWorktreeHolders')?.checked,
1000
1001
  orphanHolderScanTimeoutMs: document.getElementById('set-orphanHolderScanTimeoutMs')?.value,
1002
+ statusProbeKillTimeoutMs: document.getElementById('set-statusProbeKillTimeoutMs')?.value,
1001
1003
  adoPollEnabled: document.getElementById('set-adoPollEnabled').checked,
1002
1004
  ghPollEnabled: document.getElementById('set-ghPollEnabled').checked,
1003
1005
  pollingPaused: document.getElementById('set-pollingPaused').checked,
@@ -186,7 +186,11 @@
186
186
  .layout {
187
187
  display: grid;
188
188
  grid-template-columns: minmax(0, 820px) minmax(0, 575px);
189
- grid-template-rows: 1fr 1fr;
189
+ /* Status row is content-sized (`auto`) so the right-side control panel
190
+ (Watches + Knowledge) grows to fit and never scrolls internally — no
191
+ fixed/max height. History takes the remaining space and scrolls within
192
+ its own region. (W-mqgx8qkv0007218d) */
193
+ grid-template-rows: auto minmax(0, 1fr);
190
194
  grid-template-areas:
191
195
  "actions status"
192
196
  "actions history";
@@ -1100,6 +1104,10 @@
1100
1104
 
1101
1105
  /* ── Status panel sub-divisions: Team cards over System tiles,
1102
1106
  separated by a divider line (no text headers). ──── */
1107
+ /* The Status panel is the right-side control panel. Its body never scrolls
1108
+ internally — the panel grows to fit Team cards + Watches/Knowledge tiles
1109
+ (the status grid row is `auto`-sized). (W-mqgx8qkv0007218d) */
1110
+ .panel-status .panel-body { overflow: visible; }
1103
1111
  .team-section { margin-bottom: 14px; }
1104
1112
  .cockpit-section { padding-top: 14px; border-top: 1px solid var(--border); }
1105
1113
 
package/dashboard.js CHANGED
@@ -10437,6 +10437,12 @@ What would you like to discuss or change? When you're happy, say "approve" and I
10437
10437
  // 30s ceiling (the scan runs once per orphan-sweep tick and a slow
10438
10438
  // scan must not block the rest of the sweep for minutes).
10439
10439
  orphanHolderScanTimeoutMs: [1000, 30000],
10440
+ // W-mqila0t5 — Windows worktree-holder reap probe timeout. 2s floor
10441
+ // (PowerShell cold-start + PEB enumeration needs headroom; the old
10442
+ // 2000ms hardcode was the no-op bug); 60s ceiling (the reap runs once
10443
+ // on the quarantine path and ETIMEDOUT is swallowed, so a long probe
10444
+ // never wedges dispatch — it just falls through to force-remove).
10445
+ statusProbeKillTimeoutMs: [2000, 60000],
10440
10446
  idleAlertMinutes: [1], shutdownTimeout: [30000], restartGracePeriod: [60000],
10441
10447
  meetingRoundTimeout: [60000],
10442
10448
  // W-mq066js7000fff1f-c (Gap B/C): steering safety-net knobs.
@@ -19,7 +19,7 @@ tick()
19
19
  2.5 runCleanup() Periodic cleanup (every 60 ticks ≈ 10min)
20
20
  2.52 sweepKeepProcesses() keep_processes TTL/dead-PID sweep (every 180 ticks)
21
21
  2.53 sweepManagedSpawn() managed_spawn TTL/dead-PID/log-rotate sweep (every 180 ticks)
22
- 2.54 pruneWorktreesPeriodic() Periodic worktree GC: in-root + out-of-root git registry sweep (every worktreePruneIntervalTicks ≈ 30 ticks; catches Windows EPERM/EBUSY stragglers and `git worktree list` entries outside worktreeRoot)
22
+ 2.54 pruneWorktreesPeriodic() Periodic worktree GC: in-root + out-of-root git registry sweep + locked-`initializing` missing-dir reclaim (every worktreePruneIntervalTicks ≈ 30 ticks; catches Windows EPERM/EBUSY stragglers, `git worktree list` entries outside worktreeRoot, and branch-bricking locked missing-dir entries plain prune can't reap)
23
23
  2.55 checkWatches() Persistent watch jobs (every 18 tick-equivalents)
24
24
  2.6 pollPrStatus() Poll ADO + GitHub for build, review, merge status (wall-clock cadence from prPollStatusEvery × tickInterval, default ≈ 12min)
25
25
  processPendingRebases() Run any rebase work queued from the previous tick
@@ -160,6 +160,123 @@ trailing `git worktree prune --expire=now` (drops registry entries whose
160
160
  dirs are already gone) still runs regardless of ownership. The marker is
161
161
  gitignored so it never shows up in any worktree's `git status`.
162
162
 
163
+ ### Locked-`initializing` missing-dir reclaim (W-mqifblkf00149df5)
164
+
165
+ A third pruner, `worktree-gc.js#reclaimMissingDirWorktrees`, closes a
166
+ branch-bricking gap the two scanners above are blind to. An interrupted
167
+ `git worktree add` (engine crash / kill during git's `initializing` lock
168
+ phase) leaves a `.git/worktrees/<id>/` ADMIN entry behind — often carrying
169
+ a `locked` file, reason `initializing` — whose backing working directory
170
+ never materialized. The entry still claims the branch, so every later
171
+ `git worktree add <branch>` fails with
172
+ `'<branch>' is already used by worktree at '<path>'`. Because the entry is
173
+ LOCKED, **plain `git worktree prune` skips it by design** (so does the
174
+ out-of-root sweep's `prune --expire=now`), and the in-root scanner walks
175
+ the filesystem where the dir is already gone — neither can reap it. The
176
+ branch stays bricked for all future dispatches until a human runs
177
+ `git worktree unlock` + `git worktree remove -f -f`.
178
+
179
+ `reclaimMissingDirWorktrees` runs per project on the
180
+ `pruneWorktreesPeriodic` cadence (via `cleanup.js#runPeriodicWorktreeSweep`).
181
+ For each `git worktree list --porcelain` entry it reclaims with
182
+ `git worktree remove -f -f <path>` (the **double force** is required — a
183
+ single `-f` errors `cannot remove a locked working tree`) followed by
184
+ `git worktree prune`, but ONLY for the genuinely-safe case:
185
+
186
+ 1. backing `path` does NOT exist on disk (`!fs.existsSync`) — a present
187
+ dir may hold unpushed work and is **never** force-removed by this sweep;
188
+ 2. no non-terminal dispatch still claims that exact path
189
+ (`shared.isWorktreePathLive`, fail-open: skip/leak rather than nuke); and
190
+ 3. it is not the main checkout (i.e. `path` is not the project root).
191
+
192
+ The same `remove -f -f` reaper (`engine.js#pruneStaleWorktreeForBranch`) is
193
+ also wired into **both** spawn-time worktree-creation recovery paths —
194
+ `_branchOnRemote === true` AND the fresh-create `else` branch — so a
195
+ dispatch that hits the stale entry at `git worktree add` time recovers
196
+ in-line instead of erroring out.
197
+
198
+ ### Dir-PRESENT CWD-pinned quarantine reap (W-mqila0t5)
199
+
200
+ The sibling failure mode to the missing-dir case above: the backing dir is
201
+ fully PRESENT (a real checkout), but the dirty-reused-worktree quarantine in
202
+ `engine.js#_quarantineDirtyWorktree` cannot move it aside. The rename returns
203
+ `EBUSY`, and the `git worktree remove --force` fallback ALSO returns `EBUSY`,
204
+ so the dispatch ends `WORKTREE_QUARANTINE_ENV_BLOCKED` and the WI
205
+ auto-recovery loop re-queues into the identical wall.
206
+
207
+ **Root cause.** The lock holder is *not* a file handle and *not* (only) a
208
+ `git.exe` — it is an ORPHANED AGENT PROCESS TREE left by the prior failed
209
+ dispatch: a `copilot.exe` (the agent CLI) plus its `cmd.exe` / `node.exe` /
210
+ `powershell.exe` children, all still alive with their CURRENT WORKING
211
+ DIRECTORY set to the worktree root. On Windows a live process CWD pins the
212
+ directory, so rename / rmdir / `git worktree remove --force` all return
213
+ `EBUSY` / `ERROR_SHARING_VIOLATION` until those processes die. Two diagnostic
214
+ traps:
215
+
216
+ - The Windows **Restart Manager (rstrtmgr.dll) returns ZERO lockers** — it is
217
+ file-handle based, but the lock here is a DIRECTORY handle (CWD). Any
218
+ file-handle detector false-negatives.
219
+ - A **CommandLine match misses it** — `Win32_Process` does not expose CWD, and
220
+ the CWD-pinning child's argv never mentions the worktree path.
221
+
222
+ **The reap (`engine.js#_reapWorktreeHolders`, called before the rename).**
223
+ Windows-only; three layers, each fail-open:
224
+
225
+ - **Layer 0** — legacy `git.exe`-by-cmdline sweep
226
+ (`_killGitDescendantsForWorktree`).
227
+ - **Layer 1 — ownership reap (preferred, precise).** For each TERMINAL
228
+ dispatch that owned this worktree path (discovered via
229
+ `_findTerminalWorktreeOwners`, which reads the dispatch store's non-live
230
+ sections), look up its recorded root PID from the PID file
231
+ (`shared.findDispatchPidFile`) and kill that PID plus its full descendant
232
+ tree (`shared.listProcessDescendants` → `shared.killImmediate`). Because it
233
+ only ever kills a tree the engine itself spawned for THIS path, it can never
234
+ touch a live sibling.
235
+ - **Layer 2 — CWD-scan reap (fallback for orphans the map / PID files no
236
+ longer track, e.g. after an engine restart).** `shared.findProcessCwdHolders`
237
+ reads each process's REAL working directory out of its PEB
238
+ (`NtQueryInformationProcess` → `PebBaseAddress`, then `ReadProcessMemory` of
239
+ `RTL_USER_PROCESS_PARAMETERS.CurrentDirectory.DosPath`) and returns only
240
+ processes whose CWD is at/under THIS worktree; each is killed. A live sibling
241
+ agent running in its OWN worktree has a CWD under a DIFFERENT path and is
242
+ never reaped.
243
+
244
+ **Mandatory gates (every reap):**
245
+
246
+ 1. POSIX is a no-op (the `EBUSY` race is Windows-specific).
247
+ 2. Master switch `engine.statusProbeKillDescendantsWin32` (default ON).
248
+ 3. `shared.isWorktreePathLive(path, { excludeDispatchId })` must be false —
249
+ fail-open: on a live claim OR on a SQL error / throw the WHOLE reap is
250
+ SKIPPED (leak a worktree rather than nuke a live agent's unpushed work).
251
+ This check is **re-run at kill time** before Layer 2 (W-mqinlicl): the PEB
252
+ probe can take several seconds, and liveness flips on the engine tick — a
253
+ "not live" snapshot from the top of the function is unsafe to kill on.
254
+ Verified live: a path flagged orphan-pinned during recon was reused by a
255
+ fresh dispatch ~2 min later; the kill-time re-check (fail-open) skips the
256
+ Layer 2 kills once the path goes live, so a newly-live agent is never nuked.
257
+ 4. Only CWDs at/under THIS worktreePath are ever reaped.
258
+
259
+ **Two call sites (W-mqinlicl §6).** `_reapWorktreeHolders` runs both BEFORE the
260
+ quarantine rename (`_quarantineDirtyWorktree`) AND at **dispatch-end GC**
261
+ (`worktree-gc.js#gcDispatchWorktreeIfOrphan`, via the injected `reapHolders`
262
+ hook) right before `shared.removeWorktree`. The dispatch-end call passes the
263
+ ending dispatch's id as `ownerDispatchId` (terminal-owner discovery misses it —
264
+ the owner is still `active`) plus `excludeDispatchId`, so a dispatch always
265
+ reaps the orphaned descendant tree it spawned before releasing its worktree.
266
+ This closes the matching `removeWorktree` `EPERM`/`ETIMEDOUT` removal failures
267
+ at the same root cause as the quarantine `EBUSY`. The reap is best-effort: a
268
+ throw never blocks the removal that follows.
269
+
270
+ **Probe timeout.** `engine.statusProbeKillTimeoutMs` (default 12000ms, clamped
271
+ 2000–60000; Settings → Worker Pool & Worktrees → "Worktree-holder reap probe
272
+ timeout"). The old hardcoded 2000ms was the proximate cause of the reaper
273
+ being a no-op exactly under concurrent-agent load: a PowerShell cold-start +
274
+ process / PEB enumeration over a busy box routinely exceeded 2s, so the probe
275
+ died with `cmd.exe ETIMEDOUT` and reaped nothing. ETIMEDOUT and any other
276
+ throw from a probe are always swallowed (reap nothing), so the quarantine
277
+ still proceeds to its rename / force-remove fallback — raising the timeout
278
+ only widens the window in which the reap can succeed.
279
+
163
280
  ## Holder identification + opt-in auto-reap (W-mq6f2fe0000557fa)
164
281
 
165
282
  Orphan-sweep escalations also run `shared.findProcessesWithCwdInside(wt)`
package/engine/cleanup.js CHANGED
@@ -1568,6 +1568,11 @@ function scrubStaleMetrics() {
1568
1568
  * - `pruneOrphanWorktreesFromGitRegistry`: walks `git worktree list` per
1569
1569
  * project and reaps registered worktrees outside the configured
1570
1570
  * `worktreeRoot` (covers `D:/tmp-*`, `D:/squad-worktrees/*`, etc.).
1571
+ * - `reclaimMissingDirWorktrees` (W-mqifblkf00149df5): walks
1572
+ * `git worktree list` per project and force-removes (`remove -f -f`) the
1573
+ * LOCKED-`initializing` entries whose backing dir is MISSING — the
1574
+ * branch-bricking case plain `prune` can't touch. Gated by missing-dir +
1575
+ * `isWorktreePathLive`.
1571
1576
  *
1572
1577
  * The first two pruners share the SAME `dispatchSnap` from `queries.getDispatch()`
1573
1578
  * so live work isn't yanked between scans. Returns aggregate stats for
@@ -1577,13 +1582,14 @@ function runPeriodicWorktreeSweep(config) {
1577
1582
  const worktreeGc = require('./worktree-gc');
1578
1583
  const projects = getProjects(config);
1579
1584
  if (projects.length === 0) {
1580
- return { scanned: 0, kept: 0, evicted: 0, failed: 0, outOfRootEvicted: 0, prunedRegistry: 0 };
1585
+ return { scanned: 0, kept: 0, evicted: 0, failed: 0, outOfRootEvicted: 0, prunedRegistry: 0, missingDirReclaimed: 0, missingDirSkippedLive: 0 };
1581
1586
  }
1582
1587
  const dispatchSnap = getDispatch();
1583
1588
  const worktreeRootRel = config?.engine?.worktreeRoot || ENGINE_DEFAULTS.worktreeRoot;
1584
1589
  const _log = (lvl, msg) => log(lvl, msg);
1585
1590
 
1586
1591
  let scanned = 0, kept = 0, evicted = 0, failed = 0, outOfRootEvicted = 0, prunedRegistry = 0;
1592
+ let missingDirReclaimed = 0, missingDirSkippedLive = 0;
1587
1593
  const _writeToInbox = (a, s, c) => { try { return shared.writeToInbox(a, s, c); } catch (_e) { return false; } };
1588
1594
  try {
1589
1595
  const r1 = worktreeGc.pruneOrphanWorktrees({
@@ -1606,7 +1612,22 @@ function runPeriodicWorktreeSweep(config) {
1606
1612
  prunedRegistry += r2.prunedRegistry || 0;
1607
1613
  } catch (e) { log('warn', `worktree-gc periodic out-of-root: ${e.message}`); }
1608
1614
 
1609
- return { scanned, kept, evicted, failed, outOfRootEvicted, prunedRegistry };
1615
+ // W-mqifblkf00149df5 reclaim locked-`initializing` worktree entries whose
1616
+ // backing dir is MISSING. Neither pruner above touches these: the in-root
1617
+ // scanner walks the filesystem (the dir is gone, so it's invisible) and the
1618
+ // out-of-root scanner deliberately skips dirs inside worktreeRoot and only
1619
+ // runs a plain `prune` that SKIPS locked entries. Without this, an
1620
+ // interrupted `git worktree add` bricks the branch until a human intervenes.
1621
+ try {
1622
+ const r3 = worktreeGc.reclaimMissingDirWorktrees({
1623
+ projects, log: _log, config,
1624
+ });
1625
+ missingDirReclaimed += r3.reclaimed || 0;
1626
+ missingDirSkippedLive += r3.skippedLive || 0;
1627
+ failed += r3.failed || 0;
1628
+ } catch (e) { log('warn', `worktree-gc periodic missing-dir reclaim: ${e.message}`); }
1629
+
1630
+ return { scanned, kept, evicted, failed, outOfRootEvicted, prunedRegistry, missingDirReclaimed, missingDirSkippedLive };
1610
1631
  }
1611
1632
 
1612
1633
  // ─── Exports ─────────────────────────────────────────────────────────────────
package/engine/shared.js CHANGED
@@ -2705,6 +2705,18 @@ const ENGINE_DEFAULTS = {
2705
2705
  // incidents are closed by (1b) alone — but both are cheap and idempotent.
2706
2706
  statusProbeUseNoOptionalLocks: true,
2707
2707
  statusProbeKillDescendantsWin32: true,
2708
+ // W-mqila0t5 — generous timeout (ms) for the Windows worktree-holder reap
2709
+ // probe used before/under quarantine. The old hardcoded 2000ms was the
2710
+ // proximate cause of the reaper being a no-op exactly under concurrent-agent
2711
+ // load: a PowerShell cold-start + Win32_Process / PEB-CWD enumeration over a
2712
+ // busy box routinely exceeds 2s, so the probe died with cmd.exe ETIMEDOUT and
2713
+ // reaped nothing — leaving the orphaned, CWD-pinning agent process tree alive
2714
+ // and the worktree dir EBUSY-pinned for rename AND `git worktree remove
2715
+ // --force`. Both the legacy git.exe-cmdline sweep and the new CWD-scan +
2716
+ // ownership reap use this. ETIMEDOUT is always swallowed (reap-nothing), so
2717
+ // raising this only widens the window in which the reap can succeed; it never
2718
+ // blocks the quarantine's rename/force-remove fallback. Clamped 2000–60000ms.
2719
+ statusProbeKillTimeoutMs: 12000,
2708
2720
  completionReportRetentionDays: 90, // retain completion report sidecars beyond capped dispatch history
2709
2721
  completionReportMaxFiles: 5000, // hard cap for completion report sidecars during cleanup
2710
2722
  // P-bfa2c-cors-wildcard: extra Origins permitted to receive an
@@ -7498,6 +7510,120 @@ function _findMacProcessesWithCwdInside(resolved, timeoutMs, now) {
7498
7510
  return out;
7499
7511
  }
7500
7512
 
7513
+ // W-mqila0t5 — Windows-only PEB-based CWD probe used by the worktree-holder
7514
+ // reaper. Unlike findProcessesWithCwdInside (whose Windows path is a
7515
+ // CommandLine-basename heuristic — Win32_Process does NOT expose the working
7516
+ // directory), this reads each process's REAL current directory out of its PEB
7517
+ // (NtQueryInformationProcess → PebBaseAddress, then ReadProcessMemory of
7518
+ // RTL_USER_PROCESS_PARAMETERS.CurrentDirectory.DosPath). That's the only way
7519
+ // to find a process that pins a directory via its CWD — e.g. an orphaned
7520
+ // copilot.exe / cmd.exe / node.exe agent child whose cwd is the worktree root
7521
+ // but whose argv never mentions the path (so the cmdline heuristic
7522
+ // false-negatives, and the Restart Manager / file-handle detectors report ZERO
7523
+ // lockers because the lock is a DIRECTORY handle, not a file handle).
7524
+ //
7525
+ // Returns [{ pid, name, cwd }] for processes whose CWD is at-or-under
7526
+ // `worktreePath`. POSIX returns [] (the EBUSY race is Windows-specific; POSIX
7527
+ // reaping stays a no-op). Fail-open: any error / timeout / spawn failure →
7528
+ // [] (reap nothing). x64 offsets only (PebBaseAddress@8, PEB+0x20 →
7529
+ // ProcessParameters, RTL_USER_PROCESS_PARAMETERS+0x38 → CurrentDirectory).
7530
/**
 * Build the PowerShell source for the PEB-based working-directory probe.
 *
 * The generated script declares P/Invoke bindings for
 * NtQueryInformationProcess / OpenProcess / CloseHandle / ReadProcessMemory,
 * defines `Get-ProcCwd` (which returns `$null` whenever any step fails), then
 * walks `Get-Process` and prints one `pid|name|cwd` line for every process
 * whose lower-cased CWD equals the target or starts with the target followed
 * by a backslash.
 *
 * @param {string} psTarget Directory to match. It is interpolated verbatim
 *   into a single-quoted PowerShell literal, so the caller must already have
 *   doubled any embedded single quotes.
 * @returns {string} The script text, lines joined with "\n".
 */
function _windowsCwdProbeScript(psTarget) {
  // Errors stay silent; the target is normalised once up front.
  const preamble = [
    "$ErrorActionPreference='SilentlyContinue'",
    `$target = '${psTarget}'.TrimEnd('\\').ToLower()`,
  ];

  // C# member definitions compiled by Add-Type. The closing `"@` has no
  // leading whitespace, which the here-string terminator requires.
  const nativeBindings = [
    'Add-Type -Namespace MinionsPeb -Name Native -MemberDefinition @"',
    '[DllImport("ntdll.dll")] public static extern int NtQueryInformationProcess(IntPtr h, int cls, byte[] info, int len, ref int ret);',
    '[DllImport("kernel32.dll")] public static extern IntPtr OpenProcess(int access, bool inherit, int pid);',
    '[DllImport("kernel32.dll")] public static extern bool CloseHandle(IntPtr h);',
    '[DllImport("kernel32.dll")] public static extern bool ReadProcessMemory(IntPtr h, IntPtr baseAddr, byte[] buf, int size, ref int read);',
    '"@',
  ];

  // Get-ProcCwd: PEB base is read at byte 8 of the info buffer, then
  // PEB+0x20 and +0x38 are followed to the current-directory string.
  const readCwdFunction = [
    'function Get-ProcCwd($procId) {',
    ' $h = [MinionsPeb.Native]::OpenProcess(0x410, $false, $procId)',
    ' if ($h -eq [IntPtr]::Zero) { return $null }',
    ' try {',
    ' $pbi = New-Object byte[] 48; $ret = 0',
    ' if ([MinionsPeb.Native]::NtQueryInformationProcess($h, 0, $pbi, 48, [ref]$ret) -ne 0) { return $null }',
    ' $pebBase = [System.BitConverter]::ToInt64($pbi, 8)',
    ' if ($pebBase -eq 0) { return $null }',
    ' $p8 = New-Object byte[] 8; $r = 0',
    ' if (-not [MinionsPeb.Native]::ReadProcessMemory($h, [IntPtr]($pebBase + 0x20), $p8, 8, [ref]$r)) { return $null }',
    ' $params = [System.BitConverter]::ToInt64($p8, 0)',
    ' if ($params -eq 0) { return $null }',
    ' $us = New-Object byte[] 16',
    ' if (-not [MinionsPeb.Native]::ReadProcessMemory($h, [IntPtr]($params + 0x38), $us, 16, [ref]$r)) { return $null }',
    ' $len = [System.BitConverter]::ToUInt16($us, 0)',
    ' $bufPtr = [System.BitConverter]::ToInt64($us, 8)',
    ' if ($len -le 0 -or $bufPtr -eq 0) { return $null }',
    ' $sb = New-Object byte[] $len',
    ' if (-not [MinionsPeb.Native]::ReadProcessMemory($h, [IntPtr]$bufPtr, $sb, $len, [ref]$r)) { return $null }',
    ' return [System.Text.Encoding]::Unicode.GetString($sb, 0, $len)',
    ' } finally { [void][MinionsPeb.Native]::CloseHandle($h) }',
    '}',
  ];

  // Enumerate every process and report the ones at or under the target.
  const scanLoop = [
    'foreach ($p in Get-Process) {',
    ' $cwd = Get-ProcCwd $p.Id',
    ' if (-not $cwd) { continue }',
    " $c = $cwd.TrimEnd('\\').ToLower()",
    " if ($c -eq $target -or $c.StartsWith($target + '\\')) {",
    ' Write-Output ("{0}|{1}|{2}" -f $p.Id, $p.ProcessName, $cwd)',
    ' }',
    '}',
  ];

  return [...preamble, ...nativeBindings, ...readCwdFunction, ...scanLoop].join('\n');
}
7572
+
7573
/**
 * Parse the probe's `pid|name|cwd` output into holder records.
 *
 * Blank lines, lines with fewer than three `|`-separated fields, and lines
 * whose first field is not a positive integer are dropped. A CWD that itself
 * contains `|` is re-joined from the remaining fields.
 *
 * @param {*} raw Probe stdout; anything falsy or whitespace-only yields [].
 * @param {string} resolved Worktree path that every reported CWD must equal
 *   or sit beneath (case-insensitive, trailing separators ignored).
 * @returns {Array<{pid: number, name: string, cwd: string}>} Accepted
 *   holders, in input order, with `cwd` exactly as reported.
 */
function _parseCwdHolderLines(raw, resolved) {
  const text = raw ? String(raw) : '';
  if (text.trim() === '') return [];

  const stripTrailingSeparators = (p) => p.replace(/[\\/]+$/g, '');
  const wantedDir = stripTrailingSeparators(resolved).toLowerCase();
  const wantedPrefix = (stripTrailingSeparators(resolved) + path.sep).toLowerCase();

  const holders = [];
  for (const rawLine of text.split(/\r?\n/)) {
    // An empty line splits into a single field and is rejected by the
    // length check, so no separate blank-line test is needed.
    const fields = rawLine.trim().split('|');
    if (fields.length >= 3) {
      const [pidText, nameText, ...cwdPieces] = fields;
      const pid = Number(pidText);
      if (Number.isInteger(pid) && pid > 0) {
        const cwd = cwdPieces.join('|');
        // Re-check containment here even though the script already filters,
        // so a process outside this worktree is never reported.
        const candidate = stripTrailingSeparators(cwd).toLowerCase();
        if (candidate === wantedDir || candidate.startsWith(wantedPrefix)) {
          holders.push({ pid, name: nameText || '', cwd });
        }
      }
    }
  }
  return holders;
}
7595
+
7596
/**
 * Find processes whose current working directory is at or under
 * `worktreePath`, using the PEB-based PowerShell probe.
 *
 * Fail-open throughout: a non-Windows platform, bad input, a write failure,
 * a spawn failure or a timeout all produce [] so the caller reaps nothing.
 *
 * @param {string} worktreePath Directory whose CWD holders are wanted.
 * @param {{timeoutMs?: number}|null} [opts] `timeoutMs` caps the PowerShell
 *   run; any non-positive or missing value falls back to
 *   `ENGINE_DEFAULTS.statusProbeKillTimeoutMs` (or 12000).
 * @returns {Array<{pid: number, name: string, cwd: string}>} Matching
 *   processes as parsed by `_parseCwdHolderLines`.
 */
function findProcessCwdHolders(worktreePath, opts = {}) {
  if (process.platform !== 'win32') return [];
  if (!worktreePath || typeof worktreePath !== 'string') return [];
  let resolved;
  try { resolved = path.resolve(worktreePath); } catch { return []; }
  if (!resolved) return [];
  // `opts = {}` only covers `undefined`; an explicit `null` must not throw.
  const requestedTimeout = Number(opts?.timeoutMs);
  const timeoutMs = requestedTimeout > 0
    ? requestedTimeout
    : (ENGINE_DEFAULTS.statusProbeKillTimeoutMs || 12000);
  const psTarget = resolved.replace(/'/g, "''");
  // PowerShell writes redirected stdout in the OEM console code page by
  // default, while the output below is decoded as utf8. Switch the console to
  // BOM-less UTF-8 first so non-ASCII CWDs survive the round trip; the guard
  // keeps the probe running if the setter is unavailable.
  const script = [
    'try { [Console]::OutputEncoding = New-Object System.Text.UTF8Encoding $false } catch {}',
    _windowsCwdProbeScript(psTarget),
  ].join('\n');
  // Run from a temp .ps1 file to avoid fragile inline-quote escaping of the
  // embedded C#. Fail-open at every step.
  let scriptPath = null;
  let raw;
  try {
    const dir = _dispatchTmpRoot();
    try { fs.mkdirSync(dir, { recursive: true }); } catch { /* exists */ }
    scriptPath = path.join(dir, `cwd-probe-${process.pid}-${Date.now()}.ps1`);
    // Windows PowerShell decodes a BOM-less script with the ANSI code page,
    // which corrupts a non-ASCII worktree path embedded in `$target`. The
    // leading U+FEFF is written as a UTF-8 BOM so the file is read as UTF-8.
    fs.writeFileSync(scriptPath, `\uFEFF${script}`, 'utf8');
    raw = _execSync(
      `powershell -NoProfile -NonInteractive -ExecutionPolicy Bypass -File "${scriptPath}"`,
      { encoding: 'utf8', stdio: ['ignore', 'pipe', 'ignore'], timeout: timeoutMs, windowsHide: true, maxBuffer: 4 * 1024 * 1024 }
    );
  } catch { raw = ''; /* ETIMEDOUT / spawn fail / write fail → reap nothing */ }
  finally {
    if (scriptPath) { try { fs.unlinkSync(scriptPath); } catch { /* best-effort */ } }
  }
  return _parseCwdHolderLines(raw, resolved);
}
7626
+
7501
7627
  // W-mq6f2fe0000557fa — clear the per-path failure cooldown entry so the
7502
7628
  // post-holder-reap retry can attempt removeWorktree even though the path
7503
7629
  // has already failed >= 3 times. Without this, the cooldown silently
@@ -8711,6 +8837,7 @@ module.exports = {
8711
8837
  isProcessCommandLineMatchingAgent,
8712
8838
  listAllProcesses,
8713
8839
  findProcessesWithCwdInside,
8840
+ findProcessCwdHolders,
8714
8841
  clearWorktreeFailureCache,
8715
8842
  listProcessDescendants,
8716
8843
  listProcessReachable,
@@ -477,6 +477,15 @@ function gcDispatchWorktreeIfOrphan(opts) {
477
477
  excludeDispatchId = null,
478
478
  config = null,
479
479
  writeToInbox = null,
480
+ // W-mqinlicl §6 — optional pre-remove holder reaper. The dispatch-end
481
+ // caller (engine.js) injects `_reapWorktreeHolders` so that before we try
482
+ // to remove this worktree we kill the ending dispatch's own orphaned
483
+ // descendant tree (copilot/node/cmd children whose CWD pins the dir). This
484
+ // closes the same EBUSY/EPERM/ETIMEDOUT removal failures the quarantine
485
+ // path's reap closes — a dispatch should always reap the tree it spawned
486
+ // before releasing its worktree. Windows-only + isWorktreePathLive-gated
487
+ // inside the reaper; a no-op everywhere else. Best-effort: never blocks GC.
488
+ reapHolders = null,
480
489
  } = opts || {};
481
490
  const decision = shouldGcDispatchWorktree(opts);
482
491
  if (!decision.gc) {
@@ -488,6 +497,18 @@ function gcDispatchWorktreeIfOrphan(opts) {
488
497
  const _removeFn = typeof removeWorktree === 'function' ? removeWorktree : shared.removeWorktree;
489
498
  const _rmOpts = excludeDispatchId ? { excludeDispatchId } : undefined;
490
499
  const resolved = (() => { try { return path.resolve(worktreePath); } catch { return worktreePath; } })();
500
+ // W-mqinlicl §6 — reap the ending dispatch's own orphaned descendant tree
501
+ // BEFORE attempting removal. The owning dispatch is still 'active' at this
502
+ // point, so it is passed as `ownerDispatchId` (terminal-owner discovery would
503
+ // miss it) and as `excludeDispatchId` so the reaper's live-guard ignores this
504
+ // dispatch's own row. Failure here must never block the removal that follows.
505
+ if (typeof reapHolders === 'function') {
506
+ try {
507
+ reapHolders(worktreePath, { excludeDispatchId, ownerDispatchId: excludeDispatchId });
508
+ } catch (reapErr) {
509
+ log('warn', `worktree-gc: pre-remove reap threw for ${worktreePath}: ${reapErr && reapErr.message}`);
510
+ }
511
+ }
491
512
  try {
492
513
  const removed = _removeFn(worktreePath, gitRoot, worktreeRoot, _rmOpts);
493
514
  if (removed) {
@@ -971,11 +992,151 @@ function pruneOrphanWorktreesFromGitRegistry(opts) {
971
992
  return result;
972
993
  }
973
994
 
995
+ /**
996
+ * Missing-dir reclaim sweep (W-mqifblkf00149df5): repo-wide, branch-agnostic
997
+ * reaper for worktree ADMIN entries whose backing working directory is MISSING
998
+ * on disk. An interrupted `git worktree add` (engine crash / kill during git's
999
+ * `initializing` lock phase) leaves a `.git/worktrees/<id>/` registration
1000
+ * behind — often with a `locked` file, reason `initializing` — whose working
1001
+ * dir never materialized. Because the entry is LOCKED, `git worktree prune`
1002
+ * (and `pruneOrphanWorktreesFromGitRegistry`'s `prune --expire=now`) SKIP it
1003
+ * by design, so it survives forever and BRICKS the branch it claims: every
1004
+ * later `git worktree add <branch>` fails with
1005
+ * `'<branch>' is already used by worktree at '<path>'`. Recovery used to need
1006
+ * a human running `git worktree unlock` + `git worktree remove -f -f`.
1007
+ *
1008
+ * This sweep reclaims ONLY the genuinely-safe case, for each project repo:
1009
+ * 1. backing `path` does NOT exist on disk (`!fs.existsSync`) — a present
1010
+ * dir may hold unpushed work and is OUT OF SCOPE (never force-removed
1011
+ * here), AND
1012
+ * 2. no non-terminal dispatch still claims that exact path
1013
+ * (`shared.isWorktreePathLive` — fail-open: leak/skip rather than nuke),
1014
+ * AND
1015
+ * 3. it is not the main checkout (`path === project root`).
1016
+ * For those entries it runs `git worktree remove -f -f <path>` (the double
1017
+ * force beats `locked: initializing`; plain prune cannot) and, when at least
1018
+ * one entry was reclaimed for a repo, a trailing `git worktree prune`.
1019
+ *
1020
+ * Injection points (all optional — default to the real implementations):
1021
+ * projects, log, fs, execSilent, parseWorktreePorcelain, isWorktreePathLive,
1022
+ * db.
1023
+ *
1024
+ * Returns `{ scanned, reclaimed, skippedLive, failed, perProject }` where
1025
+ * `scanned` counts only the missing-dir candidates considered (dirs that still
1026
+ * exist are skipped before scanning).
1027
+ */
1028
+ function reclaimMissingDirWorktrees(opts) {
1029
+ opts = opts || {};
1030
+ const projects = Array.isArray(opts.projects) ? opts.projects : [];
1031
+ const log = typeof opts.log === 'function' ? opts.log : _noopLog;
1032
+ const _fs = opts.fs || fs;
1033
+ const _execSilent = typeof opts.execSilent === 'function'
1034
+ ? opts.execSilent
1035
+ : shared.execSilent;
1036
+ const _parseWorktreePorcelain = typeof opts.parseWorktreePorcelain === 'function'
1037
+ ? opts.parseWorktreePorcelain
1038
+ : shared.parseWorktreePorcelain;
1039
+ const _isWorktreePathLive = typeof opts.isWorktreePathLive === 'function'
1040
+ ? opts.isWorktreePathLive
1041
+ : shared.isWorktreePathLive;
1042
+ const _liveOpts = opts.db ? { db: opts.db } : undefined;
1043
+
1044
+ const result = { scanned: 0, reclaimed: 0, skippedLive: 0, failed: 0, perProject: {} };
1045
+ const _seenAbs = new Set(); // dedup across projects sharing a parent repo
1046
+
1047
+ for (const project of projects) {
1048
+ if (!project || !project.localPath) continue;
1049
+ let rootDir;
1050
+ try { rootDir = path.resolve(String(project.localPath)); } catch { continue; }
1051
+ let rootExists = false;
1052
+ try { rootExists = _fs.existsSync(rootDir); } catch { rootExists = false; }
1053
+ if (!rootExists) continue;
1054
+
1055
+ const projStats = { scanned: 0, reclaimed: 0, skippedLive: 0, failed: 0 };
1056
+ let raw = '';
1057
+ try {
1058
+ raw = String(_execSilent('git --no-optional-locks worktree list --porcelain', {
1059
+ cwd: rootDir, timeout: 15000, windowsHide: true,
1060
+ }) || '');
1061
+ } catch (e) {
1062
+ log('warn', `worktree-gc: missing-dir reclaim list failed for ${project.name || rootDir}: ${e.message}`);
1063
+ result.perProject[project.name || rootDir] = projStats;
1064
+ continue;
1065
+ }
1066
+
1067
+ let trees;
1068
+ try { trees = _parseWorktreePorcelain(raw); }
1069
+ catch (e) {
1070
+ log('warn', `worktree-gc: missing-dir reclaim parse failed for ${project.name || rootDir}: ${e.message}`);
1071
+ result.perProject[project.name || rootDir] = projStats;
1072
+ continue;
1073
+ }
1074
+
1075
+ let reclaimedHere = 0;
1076
+ for (const wt of trees) {
1077
+ if (!wt || !wt.path) continue;
1078
+ let wtAbs;
1079
+ try { wtAbs = path.resolve(wt.path); } catch { continue; }
1080
+ if (wtAbs === rootDir) continue; // never the main checkout
1081
+ if (_seenAbs.has(wtAbs)) continue;
1082
+ _seenAbs.add(wtAbs);
1083
+
1084
+ // SAFETY 1 — only reclaim entries whose backing dir is genuinely MISSING.
1085
+ // A present dir may hold unpushed work; force-removing it is out of scope.
1086
+ let dirExists = true;
1087
+ try { dirExists = _fs.existsSync(wtAbs); } catch { dirExists = true; }
1088
+ if (dirExists) continue;
1089
+
1090
+ projStats.scanned++;
1091
+ result.scanned++;
1092
+
1093
+ // SAFETY 2 — never reclaim a path a non-terminal dispatch still claims
1094
+ // (fail-open: isWorktreePathLive returns true when SQL is unreachable).
1095
+ let live = true;
1096
+ try { live = !!_isWorktreePathLive(wt.path, _liveOpts); }
1097
+ catch (_e) { live = true; }
1098
+ if (live) {
1099
+ projStats.skippedLive++; result.skippedLive++;
1100
+ log('debug', `worktree-gc: missing-dir reclaim skipping live-claimed ${wtAbs}`);
1101
+ continue;
1102
+ }
1103
+
1104
+ try {
1105
+ _execSilent(`git worktree remove -f -f "${wtAbs}"`, {
1106
+ cwd: rootDir, timeout: 15000, windowsHide: true,
1107
+ });
1108
+ reclaimedHere++;
1109
+ projStats.reclaimed++; result.reclaimed++;
1110
+ try { shared.bumpWorktreeGcMetric('missingDirReclaimed'); } catch { /* metric optional */ }
1111
+ log('info', `worktree-gc: reclaimed missing-dir worktree ${wtAbs}${wt.locked ? ' (was locked)' : ''}${wt.branch ? ` [branch ${wt.branch}]` : ''}`);
1112
+ } catch (e) {
1113
+ projStats.failed++; result.failed++;
1114
+ log('warn', `worktree-gc: missing-dir reclaim remove -f -f failed for ${wtAbs}: ${(e.message || '').split('\n')[0]}`);
1115
+ }
1116
+ }
1117
+
1118
+ // Only prune when we actually reaped something — keeps the no-stale-entry
1119
+ // path a pure no-op (no extra git spawn).
1120
+ if (reclaimedHere > 0) {
1121
+ try {
1122
+ _execSilent('git worktree prune', { cwd: rootDir, timeout: 10000, windowsHide: true });
1123
+ } catch (e) {
1124
+ log('warn', `worktree-gc: missing-dir reclaim prune failed for ${project.name || rootDir}: ${e.message}`);
1125
+ }
1126
+ }
1127
+
1128
+ result.perProject[project.name || rootDir] = projStats;
1129
+ }
1130
+
1131
+ return result;
1132
+ }
1133
+
974
1134
  module.exports = {
975
1135
  shouldGcDispatchWorktree,
976
1136
  gcDispatchWorktreeIfOrphan,
977
1137
  pruneOrphanWorktrees,
978
1138
  pruneOrphanWorktreesFromGitRegistry,
1139
+ reclaimMissingDirWorktrees, // W-mqifblkf00149df5 — locked-initializing missing-dir reaper
979
1140
  // exported for testing (W-mq5o6bvy000x7191)
980
1141
  _resetStuckPathsForTesting,
981
1142
  _stuckPaths,
package/engine.js CHANGED
@@ -1078,15 +1078,25 @@ async function _renameWithRetry(src, dst, opts = {}) {
1078
1078
  throw lastErr;
1079
1079
  }
1080
1080
 
1081
// W-mqila0t5 — resolve the worktree-holder reap probe timeout, honoring a
// `config.engine.statusProbeKillTimeoutMs` override (the dashboard Settings
// control persists there) and falling back to the ENGINE_DEFAULTS default.
// The Settings help text documents the override as "Clamped 2000–60000ms", so
// the clamp is enforced here at the point of use as well: a hand-edited config
// below 2000ms would otherwise reintroduce the ETIMEDOUT no-op this setting
// exists to avoid, and an oversized value would stall the caller.
// Fail-safe: any error reading config, or a non-numeric / non-positive value,
// falls back to the default.
//
// Returns: timeout in milliseconds.
function _reapProbeTimeoutMs() {
  const MIN_TIMEOUT_MS = 2000;
  const MAX_TIMEOUT_MS = 60000;
  try {
    const v = Number((getConfig() || {}).engine?.statusProbeKillTimeoutMs);
    if (Number.isFinite(v) && v > 0) {
      return Math.min(MAX_TIMEOUT_MS, Math.max(MIN_TIMEOUT_MS, v));
    }
  } catch { /* fall through to default */ }
  return ENGINE_DEFAULTS.statusProbeKillTimeoutMs || 12000;
}
1092
+
1081
1093
  // W-mq5n1zx5 — Layer 2a: on Windows, kill any live `git.exe` descendants
1082
1094
  // whose command line points at the quarantine target path. We never
1083
1095
  // tracked the PID of the `git status --porcelain` child that the probe
1084
1096
  // timed out on, so we can't kill by PID — instead we shell out to
1085
1097
  // PowerShell's CIM cmdlets to find matching processes and Stop-Process
1086
- // them. Cheap (<2s) and idempotent if no descendants are alive, the
1087
- // CIM query returns nothing and exits 0. POSIX is a no-op (the EBUSY
1088
- // race is Windows-specific). Best-effort; failure is logged but never
1089
- // blocks the quarantine.
1098
+ // them. POSIX is a no-op (the EBUSY race is Windows-specific). Best-effort;
1099
+ // failure is logged but never blocks the quarantine.
1090
1100
  function _killGitDescendantsForWorktree(worktreePath) {
1091
1101
  if (process.platform !== 'win32') return { killed: 0, skipped: true };
1092
1102
  if (!ENGINE_DEFAULTS.statusProbeKillDescendantsWin32) return { killed: 0, skipped: true };
@@ -1100,7 +1110,7 @@ function _killGitDescendantsForWorktree(worktreePath) {
1100
1110
  "if ($matches) { $matches | ForEach-Object { Stop-Process -Id $_.ProcessId -Force; $_.ProcessId }; }",
1101
1111
  ].join(' ');
1102
1112
  try {
1103
- const out = shared.execSilent(`powershell -NoProfile -Command "${ps.replace(/"/g, '\\"')}"`, { timeout: 2000, encoding: 'utf8' });
1113
+ const out = shared.execSilent(`powershell -NoProfile -Command "${ps.replace(/"/g, '\\"')}"`, { timeout: _reapProbeTimeoutMs(), encoding: 'utf8' });
1104
1114
  const killed = String(out || '').split(/\r?\n/).map(s => s.trim()).filter(Boolean).length;
1105
1115
  if (killed > 0) log('info', `_killGitDescendantsForWorktree: killed ${killed} git.exe descendant(s) holding ${worktreePath}`);
1106
1116
  return { killed };
@@ -1110,6 +1120,188 @@ function _killGitDescendantsForWorktree(worktreePath) {
1110
1120
  }
1111
1121
  }
1112
1122
 
1123
// W-mqila0t5 — discover the dispatch id(s) that previously OWNED a worktree
// path and are now TERMINAL (i.e. not pending/active). Used by Layer 1 of the
// holder reaper to look up and kill the exact descendant tree the engine
// itself spawned for this worktree. Reading from the dispatch store (all
// sections) means we still find the owner after it has gone terminal — that's
// precisely the orphan case (the prior failed dispatch left a CWD-pinning
// child alive). Active/pending owners are deliberately EXCLUDED: a live owner
// is handled by the isWorktreePathLive skip, never by a kill.
// Injectable via _deps.readDispatchSectioned for testing.
//
// Params:
//   worktreePath — path to match; compared after shared._normalizeWorktreePath.
//   _deps        — optional { readDispatchSectioned } override.
// Returns: array of unique dispatch ids in first-seen order; [] when the path
// is empty / normalizes to nothing, or when the store cannot be read.
function _findTerminalWorktreeOwners(worktreePath, _deps = {}) {
  if (!worktreePath) return [];
  const target = shared._normalizeWorktreePath(worktreePath);
  if (!target) return [];
  let sectioned;
  try {
    const reader = typeof _deps.readDispatchSectioned === 'function'
      ? _deps.readDispatchSectioned
      : require('./engine/dispatch-store').readDispatchSectioned;
    sectioned = reader();
  } catch {
    // Second attempt through an absolute path; if that fails too, report no
    // owners rather than throwing into the reaper.
    try {
      sectioned = require(path.join(__dirname, 'engine', 'dispatch-store')).readDispatchSectioned();
    } catch { return []; }
  }
  if (!sectioned || typeof sectioned !== 'object') return [];
  // A Set keeps insertion order and gives O(1) dedup; the same dispatch id can
  // appear more than once across sections/rows.
  const owners = new Set();
  for (const [status, rows] of Object.entries(sectioned)) {
    // Skip the live sections — a live owner must never be reaped here.
    if (status === 'pending' || status === 'active') continue;
    if (!Array.isArray(rows)) continue;
    for (const rec of rows) {
      if (!rec || !rec.id) continue;
      const wt = rec.worktreePath || rec.meta?.worktreePath || '';
      if (wt && shared._normalizeWorktreePath(wt) === target) owners.add(rec.id);
    }
  }
  return [...owners];
}
1162
+
1163
// W-mqila0t5 — broaden the pre-quarantine worktree-holder reaper to also close
// the dir-PRESENT + CWD-pinned + EBUSY variant. The old path only nuked git.exe
// processes matched by command line; the real lock holder is an ORPHANED AGENT
// PROCESS TREE (copilot/cmd/node/powershell) whose CWD is the worktree root —
// invisible to a command-line match and to file-handle detectors (the lock is
// a DIRECTORY handle, not a file handle). Complementary layers, all hard-gated:
//
//   Layer 0 — legacy git.exe-by-cmdline sweep (_killGitDescendantsForWorktree).
//   Layer 1 — OWNERSHIP reap: for each TERMINAL dispatch that owned this
//             worktree (plus an explicitly-supplied `ownerDispatchId`, used by
//             the dispatch-end GC path where the owning dispatch is still
//             'active' and so isn't found by terminal-owner discovery), kill
//             its recorded root PID (from the PID file) plus the full
//             descendant tree.
//   Layer 2 — CWD-SCAN reap (fallback for orphans the map/PID files no longer
//             track, e.g. after an engine restart): the CWD-holder probe finds
//             procs whose CWD is under THIS worktree and kills them. A holder
//             whose reported CWD normalizes outside THIS worktreePath is never
//             killed here.
//
// MANDATORY GATES: non-win32 platforms are a no-op; the master
// statusProbeKillDescendantsWin32 switch gates everything; and
// isWorktreePathLive(path,{excludeDispatchId}) must be false (fail-open: on a
// throw we SKIP the whole reap and leak the worktree rather than risk nuking a
// live agent's unpushed work).
// The live check is RE-RUN at kill time before Layer 2 (W-mqinlicl TOCTOU fix):
// the CWD probe can take several seconds, and liveness can flip meanwhile — a
// stale "not live" snapshot from the top of the function is unsafe to kill on.
// Every probe is wrapped so an ETIMEDOUT / throw is swallowed (reap nothing)
// and the caller (quarantine OR dispatch-end removal) still proceeds.
//
// Params:
//   worktreePath — worktree directory whose holders should be reaped.
//   opts         — { excludeDispatchId?, ownerDispatchId?, _deps? }; `_deps`
//                  may override platform, isWorktreePathLive, kill,
//                  listProcessDescendants, findTerminalOwners,
//                  findDispatchPidFile, readPid, findProcessCwdHolders,
//                  killGitDescendants and readDispatchSectioned (tests).
// Returns: { layer0, layer1, layer2, skipped, reason } kill counters per layer;
// `layer2Skipped: 'live-at-kill-time'` is added when the TOCTOU guard fires.
function _reapWorktreeHolders(worktreePath, opts = {}) {
  // A default parameter only covers `undefined`; tolerate an explicit null too.
  const options = opts || {};
  const result = { layer0: 0, layer1: 0, layer2: 0, skipped: false, reason: null };
  const deps = options._deps || {};
  const skip = (reason) => { result.skipped = true; result.reason = reason; return result; };
  // Thrown values are not guaranteed to be Errors; logging must never throw.
  const errText = (e) => (e && e.message);

  const platform = deps.platform || process.platform;
  if (platform !== 'win32') return skip('posix');
  if (!ENGINE_DEFAULTS.statusProbeKillDescendantsWin32) return skip('disabled');
  if (!worktreePath) return skip('no-path');

  // Fallbacks are thunks so a default is only looked up when no override exists.
  const pick = (injected, fallback) => (typeof injected === 'function' ? injected : fallback());

  const excludeDispatchId = options.excludeDispatchId || null;
  const ownerDispatchId = options.ownerDispatchId || null;
  const liveOpts = excludeDispatchId ? { excludeDispatchId } : {};
  const isLiveFn = pick(deps.isWorktreePathLive, () => shared.isWorktreePathLive);
  const killFn = pick(deps.kill, () => shared.killImmediate);
  const listDescendantsFn = pick(deps.listProcessDescendants, () => shared.listProcessDescendants);
  const findOwnersFn = pick(deps.findTerminalOwners, () => (wt) => _findTerminalWorktreeOwners(wt, deps));
  const findPidFileFn = pick(deps.findDispatchPidFile, () => shared.findDispatchPidFile);
  const readPidFn = pick(deps.readPid, () => (file) => {
    try { return parseInt(String(fs.readFileSync(file, 'utf8')).trim(), 10); } catch { return null; }
  });
  const listCwdHoldersFn = pick(deps.findProcessCwdHolders, () => shared.findProcessCwdHolders);
  // Layer 0 sweep is injectable so tests can exercise Layers 1/2 in isolation.
  const layer0Fn = pick(deps.killGitDescendants, () => _killGitDescendantsForWorktree);

  // MANDATORY GATE — fail-open: any throw OR a live claim → skip the whole reap.
  let live;
  try { live = isLiveFn(worktreePath, liveOpts); }
  catch (e) {
    log('warn', `_reapWorktreeHolders: isWorktreePathLive threw for ${worktreePath} (${errText(e)}) — fail-open skip`);
    return skip('live-guard-threw');
  }
  if (live) return skip('live');

  // Layer 0 — legacy git.exe-by-cmdline sweep.
  try { result.layer0 = (layer0Fn(worktreePath).killed) || 0; }
  catch { /* best-effort */ }

  // Layer 1 — ownership reap of terminal owners' (plus the supplied
  // ownerDispatchId's) recorded PID trees. It only targets PIDs recorded for
  // specific dispatches and is not gated on a second liveness check.
  // NOTE(review): Layer 0 and descendant enumeration may themselves take time,
  // so the top-of-function snapshot can be stale here — confirm that is fine.
  try {
    const owners = new Set(findOwnersFn(worktreePath) || []);
    if (ownerDispatchId) owners.add(ownerDispatchId);
    _reapRecordedOwnerTrees(result, owners, { findPidFileFn, readPidFn, listDescendantsFn, killFn });
    if (result.layer1 > 0) {
      log('info', `_reapWorktreeHolders: Layer 1 reaped ${result.layer1} recorded PID(s) for owner(s) of ${worktreePath}`);
    }
  } catch (e) {
    log('warn', `_reapWorktreeHolders: Layer 1 ownership reap threw for ${worktreePath} (${errText(e)}) — continuing`);
  }

  // Layer 2 — CWD-scan reap. Fail-open: ETIMEDOUT / throw → kill nothing more.
  try {
    const timeoutMs = _reapProbeTimeoutMs();
    const target = shared._normalizeWorktreePath(worktreePath);
    const holders = listCwdHoldersFn(worktreePath, { timeoutMs }) || [];
    // TOCTOU re-check (W-mqinlicl §4): the probe above can take several
    // seconds, and a fresh dispatch can reuse this worktree mid-probe — killing
    // on the stale snapshot would murder a live agent. Re-confirm liveness at
    // kill time; fail-open (skip the Layer 2 kills, do NOT touch any PID) on a
    // live claim OR on any throw.
    let stillLive;
    try { stillLive = isLiveFn(worktreePath, liveOpts); }
    catch (e) {
      log('warn', `_reapWorktreeHolders: Layer 2 live re-check threw for ${worktreePath} (${errText(e)}) — fail-open, skipping Layer 2 kills`);
      stillLive = true;
    }
    if (stillLive) {
      result.layer2Skipped = 'live-at-kill-time';
      log('warn', `_reapWorktreeHolders: Layer 2 skipped — ${worktreePath} went live during the CWD probe (TOCTOU guard)`);
    } else {
      _reapCwdHolders(result, holders, target, killFn);
      if (result.layer2 > 0) {
        log('info', `_reapWorktreeHolders: Layer 2 reaped ${result.layer2} CWD-pinning process(es) under ${worktreePath}`);
      }
    }
  } catch (e) {
    log('warn', `_reapWorktreeHolders: Layer 2 CWD-scan reap threw for ${worktreePath} (${errText(e)}) — continuing (caller proceeds)`);
  }

  return result;
}

// Layer 1 helper: resolve each owner dispatch's recorded root PID, expand it to
// its descendant tree, then kill every collected PID (never this process),
// bumping result.layer1 once per kill call that did not throw.
function _reapRecordedOwnerTrees(result, ownerIds, fns) {
  const { findPidFileFn, readPidFn, listDescendantsFn, killFn } = fns;
  const toKill = new Set();
  for (const ownerId of ownerIds) {
    let rootPid = null;
    try {
      const pidFile = findPidFileFn(ownerId);
      if (pidFile) rootPid = readPidFn(pidFile);
    } catch { rootPid = null; }
    // A missing/unreadable/garbage PID file yields null or NaN — skip the owner.
    if (!Number.isInteger(rootPid) || rootPid <= 0) continue;
    toKill.add(rootPid);
    try { for (const d of (listDescendantsFn(rootPid) || [])) toKill.add(d); }
    catch { /* descendant enum best-effort */ }
  }
  for (const pid of toKill) {
    if (!Number.isInteger(pid) || pid <= 0 || pid === process.pid) continue;
    try { killFn({ pid }); result.layer1++; }
    catch { /* keep reaping the rest */ }
  }
}

// Layer 2 helper: kill each probed CWD holder (never this process), bumping
// result.layer2 once per kill call that did not throw.
function _reapCwdHolders(result, holders, target, killFn) {
  for (const h of holders) {
    const pid = Number(h && h.pid);
    if (!Number.isInteger(pid) || pid <= 0 || pid === process.pid) continue;
    // Defense-in-depth: NEVER reap a holder whose reported CWD resolves
    // under a DIFFERENT worktree (a live sibling agent). The probe is expected
    // to filter already; re-confirm so a stub / future caller can't widen it.
    // NOTE(review): the prefix test assumes _normalizeWorktreePath yields
    // '/'-separated paths — confirm against its implementation.
    if (h && h.cwd && target) {
      const c = shared._normalizeWorktreePath(h.cwd);
      if (c && c !== target && !c.startsWith(target + '/')) continue;
    }
    try { killFn({ pid }); result.layer2++; }
    catch { /* keep reaping the rest */ }
  }
}
1304
+
1113
1305
  // W-mq5n1zx5 — Layer 3a: bump the rolling counters for quarantine rename
1114
1306
  // outcomes (attempts, success, successAfterRetry, fallbackForceRemove,
1115
1307
  // totalFailure). Best-effort; metrics failures must not break the
@@ -1473,12 +1665,18 @@ async function _quarantineDirtyWorktree(rootDir, worktreePath, branchName, gitOp
1473
1665
  return { quarantinedPath: null, backupRef: null, skipped: true };
1474
1666
  }
1475
1667
 
1476
- // W-mq5n1zx5 Layer 2a: pre-emptively kill any git.exe descendants whose
1477
- // command line points at the worktree. The status-probe child that timed
1478
- // out earlier may still be alive holding packfile handles; if so, the
1479
- // rename below will fail with EBUSY/EPERM until the descendant exits.
1480
- // POSIX is a no-op. Best-effort; failure here is non-fatal.
1481
- _killGitDescendantsForWorktree(worktreePath);
1668
+ // W-mq5n1zx5 / W-mqila0t5 Layer 0–2: pre-emptively reap any process holding
1669
+ // the worktree dir before we rename it. The status-probe child that timed out
1670
+ // earlier may still hold packfile handles (Layer 0: git.exe by cmdline), and —
1671
+ // the variant firing in production — an ORPHANED AGENT PROCESS TREE
1672
+ // (copilot/cmd/node/powershell) from the prior failed dispatch may still be
1673
+ // alive with its CWD set to the worktree root, pinning the dir so rename AND
1674
+ // `git worktree remove --force` both return EBUSY. Layer 1 reaps the recorded
1675
+ // PID tree of any terminal owning dispatch; Layer 2 PEB-scans for any process
1676
+ // whose CWD is under this worktree. All layers are Windows-only, gated on the
1677
+ // live-worktree guard, and fail-open — failure here is non-fatal and the
1678
+ // quarantine still proceeds to its rename / force-remove fallback.
1679
+ _reapWorktreeHolders(worktreePath, { excludeDispatchId: diag.dispatchId });
1482
1680
 
1483
1681
  // W-mq5n1zx5 Layer 1a: rename with jittered backoff. Replaces the bare
1484
1682
  // `fs.renameSync(worktreePath, quarantinedPath)` that used to throw
@@ -2466,14 +2664,31 @@ async function spawnAgent(dispatchItem, config) {
2466
2664
  log('info', `Branch ${branchName} already checked out at ${existingWtPath} — reusing`);
2467
2665
  worktreePath = existingWtPath;
2468
2666
  worktreeReused = true;
2469
- } else if (existingWtPath && !fs.existsSync(existingWtPath)) {
2470
- log('warn', `Branch ${branchName} tracked in missing dir ${existingWtPath} — pruning and recreating`);
2471
- try { await shared.shellSafeGit(['worktree', 'prune'], { ..._gitOpts, cwd: rootDir, timeout: 10000 }); } catch (e) { log('warn', 'git: ' + e.message); }
2472
- await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
2473
- log('info', `Recovered worktree for ${branchName} after stale entry prune`);
2474
2667
  } else {
2475
- try { await shared.shellSafeGit(['worktree', 'prune'], { ..._gitOpts, cwd: rootDir, timeout: 10000 }); } catch (e) { log('warn', 'git: ' + e.message); }
2476
- await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
2668
+ // Gap B (W-mqifblkf00149df5): the branch is registered to a
2669
+ // worktree whose backing dir is MISSING — either
2670
+ // findExistingWorktree returned null (it filters missing
2671
+ // dirs) or it resolved a non-null path that no longer
2672
+ // exists. A plain `git worktree prune` SKIPS locked:
2673
+ // initializing entries BY DESIGN, so a crash-left stale
2674
+ // entry survives and the retry fails identically, bricking
2675
+ // the branch for every future dispatch. Route through the
2676
+ // remove -f -f reaper (pruneStaleWorktreeForBranch),
2677
+ // mirroring the _branchOnRemote === true recovery above, so
2678
+ // BOTH paths can reap locked-initializing missing-dir entries.
2679
+ const pruned = await pruneStaleWorktreeForBranch(rootDir, branchName, _gitOpts);
2680
+ if (pruned > 0) {
2681
+ log('info', `Pruned ${pruned} stale worktree entry(ies) for ${branchName}; retrying worktree add`);
2682
+ await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, 0);
2683
+ log('info', `Recovered worktree for ${branchName} after stale entry prune`);
2684
+ } else {
2685
+ // Prune was a no-op — the holder is a real worktree
2686
+ // (at-or-inside project root, or git's view drifted).
2687
+ // Surface non-retryably with the holder path.
2688
+ const holder = _parseAlreadyUsedHolderPath(e2.message) || _parseAlreadyUsedHolderPath(e1.message);
2689
+ if (holder) { _failBranchHeldByExternalWorktree(branchName, holder, e2); return null; }
2690
+ throw e2;
2691
+ }
2477
2692
  }
2478
2693
  } else {
2479
2694
  throw e2;
@@ -4668,6 +4883,10 @@ async function spawnAgent(dispatchItem, config) {
4668
4883
  // default worktreePoolSize:0 config. Pool-return uses the same
4669
4884
  // plumbing; this is the matching wiring on the orphan-GC side.
4670
4885
  excludeDispatchId: id,
4886
+ // W-mqinlicl §6 — reap THIS dispatch's own orphaned descendant tree
4887
+ // (CWD-pinning copilot/node/cmd children) before removeWorktree, so
4888
+ // dispatch-end GC doesn't EBUSY/EPERM/ETIMEDOUT on a tree we spawned.
4889
+ reapHolders: (wt, o) => _reapWorktreeHolders(wt, o),
4671
4890
  log,
4672
4891
  });
4673
4892
  if (_gcResult.outcome === 'gc') {
@@ -8874,8 +9093,9 @@ async function tickInner() {
8874
9093
  const { runPeriodicWorktreeSweep } = require('./engine/cleanup');
8875
9094
  const stats = runPeriodicWorktreeSweep(config);
8876
9095
  const totalEvicted = (stats.evicted || 0) + (stats.outOfRootEvicted || 0);
8877
- if (totalEvicted > 0 || stats.failed > 0) {
8878
- log('info', `worktree-prune sweep: scanned=${stats.scanned} evicted=${stats.evicted} outOfRootEvicted=${stats.outOfRootEvicted} kept=${stats.kept} failed=${stats.failed} prunedRegistry=${stats.prunedRegistry}`);
9096
+ const reclaimed = stats.missingDirReclaimed || 0;
9097
+ if (totalEvicted > 0 || stats.failed > 0 || reclaimed > 0) {
9098
+ log('info', `worktree-prune sweep: scanned=${stats.scanned} evicted=${stats.evicted} outOfRootEvicted=${stats.outOfRootEvicted} kept=${stats.kept} failed=${stats.failed} prunedRegistry=${stats.prunedRegistry} missingDirReclaimed=${reclaimed} missingDirSkippedLive=${stats.missingDirSkippedLive || 0}`);
8879
9099
  }
8880
9100
  });
8881
9101
  if (_isTickStale(myGeneration)) return;
@@ -9758,6 +9978,7 @@ module.exports = {
9758
9978
  buildDepConflictFixItem, deriveConflictFixKey, // exported for testing (W-mpcwojgr000a0244)
9759
9979
  isWorktreeRetryableError, removeStaleIndexLock, syncReusedWorktree, assertCleanSharedWorktree, _quarantineDirtyWorktree, // exported for testing
9760
9980
  _renameWithRetry, _statusPorcelainCmd, _killGitDescendantsForWorktree, _bumpQuarantineOutcome, // exported for testing (W-mq5n1zx5)
9981
+ _reapWorktreeHolders, _findTerminalWorktreeOwners, // exported for testing (W-mqila0t5 — CWD-pinned holder reap)
9761
9982
  pruneStaleWorktreeForBranch, // exported for testing
9762
9983
  findExistingWorktree, // exported for testing
9763
9984
  probeBranchOnRemote, // exported for testing (W-mphnm6a1000281b8)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.2219",
3
+ "version": "0.1.2221",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"