npm - @yemi33/minions - Versions diffs - 0.1.1871 → 0.1.1873 - Mend

@yemi33/minions 0.1.1871 → 0.1.1873

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,10 @@
 # Changelog
+## 0.1.1873 (2026-05-11)
+### Other
+- Implement: Phantom completion work preservation (P-e0b4f7a5) (#2356)
 ## 0.1.1871 (2026-05-11)
 ### Features

package/engine/cleanup.js CHANGED Viewed

@@ -79,6 +79,26 @@ function localBranchWorktreeInUse(root, branch) {
   }
 }
+// P-e0b4f7a5 — collect branches of work items currently in the
+// phantom-completion retry state for a given project (items with
+// _phantomCompletion === true and a truthy _phantomBranch). Returns a Set of
+// branch strings — empty when the work-items file yields no array or reading
+// throws. Used by the worktree cleanup loop to protect in-flight phantom retries
+// from the 2-hour age sweep, which could otherwise destroy the worktree before the retry runs.
+function collectPhantomBranchesForProject(project) {
+  const branches = new Set();
+  try {
+    const items = safeJson(projectWorkItemsPath(project)) || [];
+    if (!Array.isArray(items)) return branches; // non-array payload: nothing to protect
+    for (const w of items) {
+      if (w && w._phantomCompletion === true && w._phantomBranch) {
+        branches.add(String(w._phantomBranch)); // coerce so Set members are always strings
+      }
+    }
+  } catch { /* best-effort — never let cleanup crash on a missing/corrupt WI file */ }
+  return branches;
+}
 function cleanupMergedPrLocalBranch(root, project, pr) {
   const branch = normalizeLocalBranchName(pr?.branch);
   const result = { deleted: false, forced: false, skipped: null };
@@ -451,6 +471,11 @@ async function runCleanup(config, verbose = false) {
       const wtEntries = []; // { dir, wtPath, mtime, shouldClean, isProtected }
       const dispatch = getDispatch();
       const activeDispatchIds = new Set((dispatch.active || []).map(d => d.id));
+      // P-e0b4f7a5 — branches whose work item is mid-phantom-retry. Their
+      // worktrees must survive the age/cap sweep until the retry completes
+      // (or exhausts its budget) so the agent's already-pushed branch ref
+      // isn't destroyed alongside the worktree.
+      const phantomBranches = collectPhantomBranchesForProject(project);
       // Probe `git branch --show-current` for every worktree in chunks of 5.
       // Sequential probing was the dominant cost in the cleanup phase
@@ -492,6 +517,20 @@ async function runCleanup(config, verbose = false) {
         });
         if (isReferenced) isProtected = true;
+        // P-e0b4f7a5 — protect worktrees whose branch matches a work item in
+        // the phantom-completion retry state. The dispatch may have already
+        // moved to dispatch.completed (so isReferenced is false) but the
+        // retry will re-dispatch on the same branch shortly.
+        if (!isProtected && phantomBranches.size > 0) {
+          for (const branch of phantomBranches) {
+            if (worktreeMatchesBranch(dirLower, branch, actualBranch)) {
+              isProtected = true;
+              if (verbose) console.log(`  Skipping worktree ${dir}: phantom-completion retry pending`);
+              break;
+            }
+          }
+        }
         // Also clean worktrees older than 2 hours with no active dispatch referencing them
         let mtime = Date.now();
         if (!shouldClean) {
@@ -499,7 +538,7 @@ async function runCleanup(config, verbose = false) {
             const stat = fs.statSync(wtPath);
             mtime = stat.mtimeMs;
             const ageMs = Date.now() - mtime;
-            if (ageMs > 7200000 && !isReferenced) { // 2 hours
+            if (ageMs > 7200000 && !isReferenced && !isProtected) { // 2 hours — P-e0b4f7a5: phantom-protected worktrees survive the age sweep too
               shouldClean = true;
             }
           } catch { /* optional */ }
@@ -1080,4 +1119,5 @@ module.exports = {
   worktreeMatchesBranch,     // exported for testing
   getWorktreeBranch,         // exported for lifecycle cleanup
   cleanupMergedPrLocalBranch, // exported for lifecycle cleanup and testing
+  collectPhantomBranchesForProject, // P-e0b4f7a5 — exported for testing
 };

package/engine/lifecycle.js CHANGED Viewed

@@ -543,6 +543,11 @@ function updateWorkItemStatus(meta, status, reason) {
         delete target.failReason;
         delete target.failedAt;
         delete target._retryCount;
+        // P-e0b4f7a5 — successful completion (including a phantom-retry
+        // succeeding) clears the phantom markers so cleanup can reap the
+        // worktree on the next sweep.
+        delete target._phantomCompletion;
+        delete target._phantomBranch;
         target.completedAt = ts();
         // Restore agent info from dispatch metadata (cleared on retry reset)
         if (meta._agentId && !target.dispatched_to) target.dispatched_to = meta._agentId;
@@ -1087,6 +1092,90 @@ async function findOpenPrForBranch(meta, config) {
   return null;
 }
+// P-e0b4f7a5 — quick "did the agent push the branch before the runtime
+// crashed?" probe. `git ls-remote origin <branch>` returns a non-empty
+// "<sha>\trefs/heads/<branch>" line when the branch exists on the remote and
+// nothing when it doesn't. Used by enforcePrAttachmentContract to gate the
+// phantom-recovery PR auto-link: if the branch isn't there, no PR can exist
+// either and there's no point burning another `gh pr list` round-trip.
+async function _phantomBranchExistsOnRemote(meta, config) {
+  if (!meta?.branch) return false;
+  const projectObj = resolvePrFallbackProject(meta, config);
+  // Fall back to the branch lookup from any cwd if no project root is known —
+  // git will use the ambient remote configuration. We prefer the project root
+  // because dispatch worktrees may not have origin wired yet.
+  const cwd = projectObj?.localPath || meta?.cwd || process.cwd();
+  try {
+    const out = await runFileCapture('git', ['ls-remote', '--heads', 'origin', String(meta.branch)], { cwd, timeout: 15000 });
+    // Any non-empty stdout line that ends in refs/heads/<branch> = branch exists.
+    return /\trefs\/heads\//.test(String(out || ''));
+  } catch (err) {
+    log('debug', `Phantom ls-remote probe failed for ${meta.branch}: ${err.message}`);
+    return false;
+  }
+}
+// P-e0b4f7a5 — extracted from enforcePrAttachmentContract so the phantom
+// recovery path can reuse the same canonical-attach upsert without
+// duplicating the entry construction. Upserts a PR record for `found` linked
+// to meta.item.id, then re-checks hasCanonicalPrAttachment. Returns a hard
+// contract-failure object (stateError: true) only if that re-check throws; otherwise returns null.
+function _attachFoundPrToWi(found, meta, agentId, resultSummary, config) {
+  const entry = {
+    id: shared.getCanonicalPrId(found.project, found.prNumber, found.url),
+    prNumber: found.prNumber,
+    title: meta.item?.title || `PR #${found.prNumber}`,
+    agent: agentId,
+    branch: meta.branch || '',
+    reviewStatus: 'pending',
+    status: PR_STATUS.ACTIVE,
+    created: ts(),
+    url: found.url,
+    prdItems: [meta.item.id], // unguarded access: callers must have verified meta.item.id
+    sourcePlan: meta.item?.sourcePlan || '',
+    itemType: meta.item?.itemType || '',
+  };
+  shared.upsertPullRequestRecord(shared.projectPrPath(found.project), entry, {
+    project: found.project,
+    itemId: meta.item.id,
+  });
+  try {
+    if (hasCanonicalPrAttachment(meta.item.id, config)) return null; // attachment confirmed after the upsert
+  } catch (err) {
+    const reason = `${meta.item.id} auto-linked a PR but PR attachment verification could not read PR tracking state: ${err.message}`;
+    markPrAttachmentVerificationError(meta, agentId, reason, resultSummary);
+    log('warn', reason);
+    return { reason, itemId: meta.item.id, severity: 'hard', stateError: true };
+  }
+  return null; // NOTE(review): also reached when hasCanonicalPrAttachment returned false without throwing — null does not prove the link was verified; confirm callers expect this
+}
+// P-e0b4f7a5 — phantom-completion recovery: when the runtime crashes before
+// emitting its terminating result event, the agent may still have pushed
+// the branch (and possibly opened the PR) seconds beforehand. Verify with
+// `git ls-remote origin <branch>` and, if the branch landed on the remote,
+// attempt one final canonical PR attachment via the existing
+// findOpenPrForBranch helper. Returns true if a PR was found and linked
+// (work is recoverable — caller should treat as success), false otherwise.
+async function _attemptPhantomPrRecovery(meta, agentId, resultSummary, config) {
+  if (!meta?.branch || !meta?.item?.id) return false; // nothing to probe without a branch and a work-item id
+  const branchOnRemote = await _phantomBranchExistsOnRemote(meta, config);
+  if (!branchOnRemote) return false;
+  const recovered = await findOpenPrForBranch(meta, config);
+  if (!recovered) {
+    log('info', `Phantom-completion: branch ${meta.branch} exists on remote for ${meta.item.id} but no open PR found — routing through phantom retry budget`);
+    return false;
+  }
+  const attachResult = _attachFoundPrToWi(recovered, meta, agentId, resultSummary, config);
+  log('info', `Phantom-completion recovery: auto-linked existing PR ${shared.getCanonicalPrId(recovered.project, recovered.prNumber, recovered.url)} on branch ${meta.branch} for ${meta.item.id} (runtime crashed but agent had pushed the PR)`); // emitted before attachResult is inspected, so it also appears when the state-error path is taken
+  // attachResult === null = no state error reported by _attachFoundPrToWi;
+  // non-null = verification could not read PR tracking state. Treat that as
+  // "not recovered" so the caller falls through to the normal failure path
+  // with the error already surfaced via markPrAttachmentVerificationError
+  // inside _attachFoundPrToWi. NOTE(review): null is also returned when the re-check found no attachment — confirm this should count as recovered.
+  return attachResult === null;
+}
 // Lightweight probe for "did the agent's output contain ANY PR URL?". Used by
 // the PR-attachment contract to distinguish silent-failure (no URL anywhere)
 // from auto-link-miss (URL present but engine couldn't canonically attach it).
@@ -1113,10 +1202,79 @@ function _outputHasRuntimeResultEvent(output) {
   return /"type":\s*"result"/.test(output);
 }
-function markMissingPrAttachment(meta, agentId, reason, resultSummary, severity) {
+function markMissingPrAttachment(meta, agentId, reason, resultSummary, severity, opts) {
   const noPrWiPath = resolveWorkItemPath(meta);
   const isHard = severity !== 'soft';
+  const isPhantom = !!(opts && opts.phantom);
   let syncFailedToPrd = false;
+  // Phantom branch: a runtime crash that hard-fails for "no PR attached" should
+  // not bypass the retry budget — the agent never got a chance to do the work.
+  // Track these separately on `_phantomRetryCount` so they don't pollute the
+  // PR-attachment retry counter (`_retryCount`). Cap at maxPhantomRetries; only
+  // hard-fail once the phantom budget is exhausted.
+  let phantomRetryDeferred = false;
+  let phantomRetryExhausted = false;
+  let phantomRetryCount = 0;
+  if (isHard && isPhantom && noPrWiPath) {
+    mutateJsonFileLocked(noPrWiPath, data => {
+      if (!Array.isArray(data)) return data;
+      const w = data.find(i => i.id === meta.item.id);
+      if (!w) return data;
+      const phantomRetries = w._phantomRetryCount || 0;
+      if (phantomRetries < ENGINE_DEFAULTS.maxPhantomRetries) {
+        w.status = WI_STATUS.PENDING;
+        w._phantomRetryCount = phantomRetries + 1;
+        w._lastRetryAt = ts();
+        w._lastRetryReason = reason;
+        w._pendingReason = 'phantom_completion';
+        // P-e0b4f7a5 — _phantomCompletion + _phantomBranch let cleanup.js
+        // protect the worktree of an in-flight phantom retry. Without these
+        // markers the 2-hour age sweep can wipe the worktree (and the agent's
+        // already-pushed branch reference) between phantom detection and
+        // re-dispatch.
+        w._phantomCompletion = true;
+        if (meta.branch) w._phantomBranch = meta.branch;
+        delete w.completedAt;
+        delete w.dispatched_at;
+        delete w.dispatched_to;
+        delete w.failReason;
+        delete w.failedAt;
+        delete w._missingPrAttachment;
+        phantomRetryDeferred = true;
+        phantomRetryCount = phantomRetries + 1;
+        log('warn', `Work item ${meta.item.id} hit phantom-completion path — retry ${phantomRetryCount}/${ENGINE_DEFAULTS.maxPhantomRetries} (runtime likely crashed before emitting result event)`);
+      } else {
+        phantomRetryExhausted = true;
+        phantomRetryCount = phantomRetries;
+      }
+      return data;
+    }, { skipWriteIfUnchanged: true });
+    if (phantomRetryDeferred) {
+      // Soft inbox note: the runtime crashed but we're retrying; surface the
+      // event without flagging the WI as silent failure.
+      shared.writeToInbox('engine', `phantom-completion-retry-${meta.item.id}`,
+        `# Phantom completion retry for ${meta.item.id}\n\n` +
+        `**Agent:** ${agentId}\n` +
+        `**Work item:** \`${meta.item.id}\` — ${meta.item.title || ''}\n` +
+        `**Type:** ${meta.item.type || 'unknown'}\n` +
+        `**Branch:** ${meta.branch || '(none)'}\n` +
+        `**Phantom retry:** ${phantomRetryCount}/${ENGINE_DEFAULTS.maxPhantomRetries}\n\n` +
+        `${reason}\n` +
+        (resultSummary ? `\n## Agent summary\n${resultSummary}\n` : ''),
+        null,
+        { sourceItem: meta.item.id, reason: 'phantom-completion-retry' });
+      // Sync PRD back to pending so dependent flow doesn't see it as failed.
+      if (meta.item?.sourcePlan) {
+        try { syncPrdItemStatus(meta.item.id, WI_STATUS.PENDING, meta.item.sourcePlan); } catch (e) { log('warn', 'phantom retry PRD sync: ' + e.message); }
+      }
+      return;
+    }
+    if (phantomRetryExhausted) {
+      // Fall through to the regular hard-fail path with augmented reason so
+      // operators see "phantom retries exhausted" instead of the generic msg.
+      reason = `${reason} — phantom retries exhausted (${phantomRetryCount}/${ENGINE_DEFAULTS.maxPhantomRetries})`;
+    }
+  }
   if (noPrWiPath) {
     mutateJsonFileLocked(noPrWiPath, data => {
       if (!Array.isArray(data)) return data;
@@ -1132,6 +1290,11 @@ function markMissingPrAttachment(meta, agentId, reason, resultSummary, severity)
         delete w.completedAt;
         delete w._noPr;
         delete w._noPrReason;
+        // P-e0b4f7a5 — terminal hard-fail (genuine missing PR or phantom
+        // retries exhausted) clears the in-flight phantom markers so cleanup
+        // can finally reap the worktree.
+        delete w._phantomCompletion;
+        delete w._phantomBranch;
       } else {
         // Soft: don't change status or failReason — the agent did the work,
         // we just couldn't auto-attach the PR. Surface a flag for the dashboard
@@ -1208,7 +1371,8 @@ function markPrAttachmentVerificationError(meta, agentId, reason, resultSummary)
     { sourceItem: meta.item.id, reason: 'pr-attachment-state-error' });
 }
-async function enforcePrAttachmentContract(type, meta, agentId, config, resultSummary, output) {
+async function enforcePrAttachmentContract(type, meta, agentId, config, resultSummary, output, opts) {
+  const detectPhantom = !!(opts && opts.detectPhantom);
   if (!isPrAttachmentRequired(type, meta?.item, meta)) return null;
   try {
     if (hasCanonicalPrAttachment(meta.item.id, config)) return null;
@@ -1221,39 +1385,35 @@ async function enforcePrAttachmentContract(type, meta, agentId, config, resultSu
   const found = await findOpenPrForBranch(meta, config);
   if (found) {
-    const entry = {
-      id: shared.getCanonicalPrId(found.project, found.prNumber, found.url),
-      prNumber: found.prNumber,
-      title: meta.item?.title || `PR #${found.prNumber}`,
-      agent: agentId,
-      branch: meta.branch || '',
-      reviewStatus: 'pending',
-      status: PR_STATUS.ACTIVE,
-      created: ts(),
-      url: found.url,
-      prdItems: [meta.item.id],
-      sourcePlan: meta.item?.sourcePlan || '',
-      itemType: meta.item?.itemType || '',
-    };
-    shared.upsertPullRequestRecord(shared.projectPrPath(found.project), entry, {
-      project: found.project,
-      itemId: meta.item.id,
-    });
-    log('info', `Auto-linked existing PR ${entry.id} on branch ${meta.branch} for ${meta.item.id}`);
-    try {
-      if (hasCanonicalPrAttachment(meta.item.id, config)) return null;
-    } catch (err) {
-      const reason = `${meta.item.id} auto-linked a PR but PR attachment verification could not read PR tracking state: ${err.message}`;
-      markPrAttachmentVerificationError(meta, agentId, reason, resultSummary);
-      log('warn', reason);
-      return { reason, itemId: meta.item.id, severity: 'hard', stateError: true };
-    }
+    const attachResult = _attachFoundPrToWi(found, meta, agentId, resultSummary, config);
+    log('info', `Auto-linked existing PR ${shared.getCanonicalPrId(found.project, found.prNumber, found.url)} on branch ${meta.branch} for ${meta.item.id}`);
+    if (attachResult === null) return null;
+    return attachResult;
   }
   // Distinguish "agent never claimed a PR" (hard — silent failure the contract
   // was designed to catch) from "agent claimed a PR but engine couldn't attach
   // it canonically" (soft — verification gap, not a failure).
   const severity = _outputContainsPrUrl(output) ? 'soft' : 'hard';
+  // Phantom completion = hard severity + opt-in detectPhantom + no terminating
+  // result event in stream. The runtime CLI crashed mid-conversation; the
+  // agent never got a chance to open a PR. Hard-failing here would bypass the
+  // retry budget for a runtime bug. Surface phantom: true to
+  // markMissingPrAttachment so it routes through the _phantomRetryCount path.
+  const isPhantom = severity === 'hard' && detectPhantom && !_outputHasRuntimeResultEvent(output);
+  // P-e0b4f7a5 — phantom-completion recovery: an agent may have pushed its
+  // branch (and even opened the PR) seconds before the runtime crashed.
+  // Verify with `git ls-remote origin <branch>` and, if the branch landed,
+  // make one final canonical-attach attempt before burning a phantom retry.
+  // This recovers work that would otherwise be lost — both the worktree
+  // (cleanup would reap it) and the orphan PR link (no WI ever points at it).
+  if (isPhantom) {
+    if (await _attemptPhantomPrRecovery(meta, agentId, resultSummary, config)) {
+      return null;
+    }
+  }
   // Hard-fail messaging: if the runtime never emitted its terminating result
   // event, the failure is a phantom completion (runtime CLI crashed), not the
   // agent silently skipping work. Surface that truthfully so operators don't
@@ -1268,9 +1428,9 @@ async function enforcePrAttachmentContract(type, meta, agentId, config, resultSu
   } else {
     reason = `${meta.item.id} completed and a PR URL was found in the agent's output, but it couldn't be canonically attached. The work likely succeeded — verify by checking the PR list. (Branch: ${meta.branch || '(none)'}, agent: ${agentId})`;
   }
-  markMissingPrAttachment(meta, agentId, reason, resultSummary, severity);
+  markMissingPrAttachment(meta, agentId, reason, resultSummary, severity, { phantom: isPhantom });
   log(severity === 'hard' ? 'warn' : 'info', reason);
-  return { reason, itemId: meta.item.id, severity };
+  return { reason, itemId: meta.item.id, severity, phantom: isPhantom };
 }
 // ─── Post-Completion Hooks ──────────────────────────────────────────────────
@@ -1592,7 +1752,7 @@ async function detectPrFixBranchChange(meta, config) {
   return { changed: null, beforeHead, afterHead: remoteHead || '', reason: 'unable to prove branch head after fix' };
 }
-function recordPrNoOpFixAttempt(target, cause, source, dispatchItem, branchChange, config) {
+function recordPrNoOpFixAttempt(target, cause, source, dispatchItem, branchChange, config, noopReason) {
   const evidenceFingerprint = shared.prFixEvidenceFingerprint(target, cause);
   const prior = shared.getPrNoOpFixRecord(target, cause);
   const sameEvidence = prior?.evidenceFingerprint === evidenceFingerprint;
@@ -1623,6 +1783,20 @@ function recordPrNoOpFixAttempt(target, cause, source, dispatchItem, branchChang
     afterHead: branchChange?.afterHead || '',
   };
+  // Record a same-SHA dispatch outcome on the PR record so the eligibility
+  // filter can short-circuit duplicate build-fix dispatches against an
+  // unchanged commit. Reset happens implicitly when headSha advances and the
+  // discovery filter compares lastDispatchHeadSha to the current head.
+  const headSha = getPrFixBaselineHead(target);
+  target.lastDispatchedAt = now;
+  target.lastDispatchOutcome = 'noop';
+  target.lastDispatchHeadSha = headSha;
+  target.lastDispatchReason = String(
+    noopReason
+      || branchChange?.reason
+      || 'fix completed without changing the PR branch'
+  ).slice(0, 500);
   if (cause === shared.PR_FIX_CAUSE.HUMAN_FEEDBACK && target.humanFeedback) {
     target.humanFeedback.pendingFix = !paused;
     if (paused) target.humanFeedback.noOpPaused = true;
@@ -1639,6 +1813,14 @@ function clearPrNoOpFixAttempt(target, cause) {
   if (Object.keys(target._noOpFixes).length === 0) delete target._noOpFixes;
   if (target._lastNoOpFix?.cause === cause) delete target._lastNoOpFix;
   if (target.humanFeedback) delete target.humanFeedback.noOpPaused;
+  // The lastDispatch* trackers exist to prevent duplicate noop dispatches at
+  // the same head; once the agent actually pushed a fix we no longer want them
+  // to suppress a fresh dispatch (the SHA may have moved or the next failure
+  // is genuinely new).
+  delete target.lastDispatchedAt;
+  delete target.lastDispatchOutcome;
+  delete target.lastDispatchHeadSha;
+  delete target.lastDispatchReason;
 }
 function updatePrAfterFix(pr, project, source, options = {}, legacyDispatchId = '') {
@@ -1666,7 +1848,7 @@ function updatePrAfterFix(pr, project, source, options = {}, legacyDispatchId =
       target.minionsReview = next;
     };
     if (explicitlyChangedBranch && options.branchChange?.changed === false) {
-      const record = recordPrNoOpFixAttempt(target, cause, source, options.dispatchItem, options.branchChange, options.config);
+      const record = recordPrNoOpFixAttempt(target, cause, source, options.dispatchItem, options.branchChange, options.config, options.noopReason);
       result = { noOp: true, cause, paused: !!record.paused, count: record.count };
       log('warn', `Updated ${pr.id} → recorded no-op ${cause} fix attempt ${record.count}${record.paused ? ' (paused)' : ''}; PR branch was unchanged`);
       return prs;
@@ -1678,7 +1860,7 @@ function updatePrAfterFix(pr, project, source, options = {}, legacyDispatchId =
     // automation cause handled — a future tick with working detection must
     // be free to re-dispatch.
     if (explicitlyChangedBranch && options.branchChange?.changed === null) {
-      const record = recordPrNoOpFixAttempt(target, cause, source, options.dispatchItem, options.branchChange, options.config);
+      const record = recordPrNoOpFixAttempt(target, cause, source, options.dispatchItem, options.branchChange, options.config, options.noopReason);
       result = { noOp: true, cause, paused: !!record.paused, count: record.count, indeterminate: true };
       log('warn', `Updated ${pr.id} → recorded indeterminate ${cause} fix attempt ${record.count}${record.paused ? ' (paused)' : ''}; PR branch advance could not be verified${options.branchChange?.reason ? ` (${options.branchChange.reason})` : ''}`);
       return prs;
@@ -2542,6 +2724,20 @@ function detectNonTerminalResultSummary(_resultSummary, structuredCompletion, co
 }
 function deferNonTerminalCompletion(meta, detection) {
+  return _deferRetryWithCounter(meta, detection, '_retryCount', ENGINE_DEFAULTS.maxRetries, 'nonterminal_completion'); // normal retry budget: _retryCount capped at ENGINE_DEFAULTS.maxRetries; returns the retry reason string
+}
+// Phantom-completion variant — uses _phantomRetryCount + maxPhantomRetries so
+// runtime-crash retries don't share a budget with the PR-attachment contract's
+// retries. Cap is independent (ENGINE_DEFAULTS.maxPhantomRetries) so the two
+// failure modes can be tuned separately. Failure mode triggered when the
+// runtime exits cleanly but emits no result event, no structured completion,
+// and no completion report — see detectNonTerminalResultSummary. Returns the retry reason string.
+function deferPhantomCompletion(meta, detection) {
+  return _deferRetryWithCounter(meta, detection, '_phantomRetryCount', ENGINE_DEFAULTS.maxPhantomRetries, 'phantom_completion'); // the '_phantomRetryCount' counter also makes the helper stamp/clear _phantomCompletion and _phantomBranch
+}
+function _deferRetryWithCounter(meta, detection, counterField, maxCount, pendingReason) {
   const itemId = meta?.item?.id;
   const reason = detection?.reason || 'Nonterminal completion summary';
   if (!itemId) return reason;
@@ -2554,35 +2750,49 @@ function deferNonTerminalCompletion(meta, detection) {
       if (!Array.isArray(data)) return data;
       const w = data.find(i => i.id === itemId);
       if (!w) return data;
-      const retries = w._retryCount || 0;
-      if (retries < ENGINE_DEFAULTS.maxRetries) {
+      const retries = w[counterField] || 0;
+      if (retries < maxCount) {
         w.status = WI_STATUS.PENDING;
-        w._retryCount = retries + 1;
+        w[counterField] = retries + 1;
         w._lastRetryAt = ts();
         w._lastRetryReason = reason;
-        w._pendingReason = 'nonterminal_completion';
+        w._pendingReason = pendingReason;
+        // P-e0b4f7a5 — phantom-retry path stamps _phantomCompletion +
+        // _phantomBranch so cleanup.js can preserve the worktree across the
+        // re-dispatch window. Only set for the phantom counter; nonterminal
+        // retries don't share this protection.
+        if (counterField === '_phantomRetryCount') {
+          w._phantomCompletion = true;
+          if (meta?.branch) w._phantomBranch = meta.branch;
+        }
         delete w.completedAt;
         delete w.dispatched_at;
         delete w.dispatched_to;
         delete w.failedAt;
         finalStatus = WI_STATUS.PENDING;
-        log('warn', `Work item ${itemId} reported nonterminal success — retry ${retries + 1}/${ENGINE_DEFAULTS.maxRetries}: ${reason}`);
+        log('warn', `Work item ${itemId} reported ${pendingReason} — retry ${retries + 1}/${maxCount} (${counterField}): ${reason}`);
       } else {
         w.status = WI_STATUS.FAILED;
-        w.failReason = `${reason} after ${ENGINE_DEFAULTS.maxRetries} attempts`;
+        w.failReason = `${reason} after ${maxCount} attempts`;
         w.failedAt = ts();
         delete w.completedAt;
         delete w.dispatched_at;
         delete w.dispatched_to;
         delete w._pendingReason;
+        // Exhausted phantom retries: clear the in-flight markers so cleanup
+        // can reap the worktree on the next sweep.
+        if (counterField === '_phantomRetryCount') {
+          delete w._phantomCompletion;
+          delete w._phantomBranch;
+        }
         finalStatus = WI_STATUS.FAILED;
-        log('warn', `Work item ${itemId} failed — repeated nonterminal completion summaries after ${ENGINE_DEFAULTS.maxRetries} attempts`);
+        log('warn', `Work item ${itemId} failed — repeated ${pendingReason} after ${maxCount} attempts`);
       }
       return data;
     }, { defaultValue: [], skipWriteIfUnchanged: true });
     syncPrdItemStatus(itemId, finalStatus, meta.item?.sourcePlan);
   } catch (err) {
-    log('warn', `nonterminal completion gate: ${err.message}`);
+    log('warn', `${pendingReason} gate: ${err.message}`);
   }
   return reason;
 }
@@ -2792,8 +3002,9 @@ function handleDecompositionResult(stdout, meta, config, runtimeName) {
   return 0;
 }
-async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, config) {
+async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, config, opts) {
+  const detectPhantom = !!(opts && opts.detectPhantom);
   const type = dispatchItem.type;
   const meta = dispatchItem.meta;
   const isSuccess = code === 0;
@@ -3033,13 +3244,27 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
   let completionContractFailure = null;
   if (effectiveSuccess && meta?.item?.id && !skipDoneStatus) {
-    const nonTerminalCompletion = detectNonTerminalResultSummary(completionGateSummary, structuredCompletion, reportCompletion);
+    const nonTerminalCompletion = detectNonTerminalResultSummary(completionGateSummary, structuredCompletion, reportCompletion, { detectPhantom });
     if (nonTerminalCompletion) {
-      skipDoneStatus = true;
-      const reason = deferNonTerminalCompletion(meta, nonTerminalCompletion);
-      completionContractFailure = { reason, itemId: meta.item.id, nonTerminal: true, processWorkItemFailure: false };
-      if (!nonCleanReportWritten) {
-        writeNonCleanAgentReport(dispatchItem, agentId, 'partial', structuredCompletion, completionGateSummary, code);
+      const isPhantomDetection = nonTerminalCompletion.phrase === 'phantom-completion';
+      // P-e0b4f7a5 — before deferring a phantom retry, attempt to recover
+      // the agent's work via the ls-remote + canonical-attach probe. If the
+      // agent had pushed its branch (and possibly opened the PR) seconds
+      // before the runtime crashed, link the PR and treat the WI as a
+      // normal successful completion. This preserves work that would
+      // otherwise be lost and avoids burning a phantom retry on something
+      // that already shipped.
+      if (isPhantomDetection && await _attemptPhantomPrRecovery(meta, agentId, resultSummary, config)) {
+        log('info', `Phantom-completion recovered for ${meta.item.id} via ls-remote + PR auto-link — no retry needed`);
+      } else {
+        skipDoneStatus = true;
+        const reason = isPhantomDetection
+          ? deferPhantomCompletion(meta, nonTerminalCompletion)
+          : deferNonTerminalCompletion(meta, nonTerminalCompletion);
+        completionContractFailure = { reason, itemId: meta.item.id, nonTerminal: true, processWorkItemFailure: false, phantom: isPhantomDetection };
+        if (!nonCleanReportWritten) {
+          writeNonCleanAgentReport(dispatchItem, agentId, 'partial', structuredCompletion, completionGateSummary, code);
+        }
       }
     }
   }
@@ -3055,7 +3280,7 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
   }
   if (effectiveSuccess && meta?.item?.id && !skipDoneStatus && !noopRationale) {
-    completionContractFailure = await enforcePrAttachmentContract(type, meta, agentId, config, resultSummary, stdout);
+    completionContractFailure = await enforcePrAttachmentContract(type, meta, agentId, config, resultSummary, stdout, { detectPhantom });
     if (completionContractFailure?.severity === 'hard' || completionContractFailure?.nonTerminal) {
       skipDoneStatus = true;
     }
@@ -3208,6 +3433,7 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
       dispatchItem,
       branchChange: prFixBranchChange,
       config,
+      noopReason: noopRationale || meta?._noopReason || '',
     });
     // (#984) Sync PRD status for PR-linked features: fix work items have a different ID
     // than the original PRD feature, so syncPrdItemStatus(fixWiId, ...) finds nothing.
@@ -3437,6 +3663,10 @@ module.exports = {
   parseCompletionFieldSummary,
   parseCompletionNoop,
   detectNonTerminalResultSummary,
+  deferNonTerminalCompletion,
+  deferPhantomCompletion,
+  enforcePrAttachmentContract,
+  markMissingPrAttachment,
   parseCompletionReportFile,
   persistCompletionReport,
   runPostCompletionHooks,

package/engine/shared.js CHANGED Viewed

@@ -1078,6 +1078,7 @@ const ENGINE_DEFAULTS = {
   evalMaxIterations: 3, // legacy UI/config field; engine discovery no longer enforces review→fix cycle caps
   evalMaxCost: null, // USD ceiling per work item across all eval iterations; null = no limit (gather baseline data first)
   maxRetries: 3, // max dispatch retries before marking work item as failed
+  maxPhantomRetries: 3, // max retries for "phantom completion" (runtime crashed before emitting type:"result"); tracked separately from _retryCount so phantom retries don't pollute the normal PR-attachment retry budget. See engine/lifecycle.markMissingPrAttachment + detectNonTerminalResultSummary.
   minRetryGapMs: 120000, // 2min — minimum gap between retry dispatches for the same work item; prevents tight retry loops when an idempotent agent (e.g. review bailing out on a duplicate) cannot produce the expected output (#1770)
   pipelineApiRetries: 2, // max attempts for pipeline API calls
   pipelineApiRetryDelay: 2000, // ms delay between pipeline API retries

package/engine/spawn-agent.js CHANGED Viewed

@@ -162,6 +162,81 @@ function formatProcessExitSentinel(exitCode, signal) {
   return `\n[process-exit] code=${exitCode}${signal ? ` signal=${signal}` : ''}\n`;
 }
+/**
+ * Pre-push stale-HEAD guard for fix-task dispatches (P-c8f2d5e3).
+ *
+ * When the engine reuses an existing worktree on a PR branch that was rebased
+ * upstream (force-push), the local HEAD can sit behind origin/<branch>. The
+ * first push from that worktree silently overwrites the rebased history — a
+ * confirmed silent-overwrite footgun captured in team memory.
+ *
+ * This helper runs:
+ *   git fetch origin <branch>
+ *   git rev-list --count HEAD..origin/<branch>
+ * inside the worktree. When the count is > 0 it throws a clear, actionable
+ * error so engine.spawnAgent can abort the dispatch before invoking the
+ * runtime CLI — i.e. before the agent has a chance to push.
+ *
+ * The fetch is best-effort: if origin doesn't have the ref yet (first push on
+ * a fresh branch, common for shared-branch plan items), the helper returns
+ * `{ ok: true, behindCount: 0, skipped: 'no-upstream' }` instead of failing —
+ * there's no rebased tip to overwrite. Any other fetch failure is also treated
+ * as a skip with `skipped: 'fetch-failed'` so transient network issues don't
+ * brick an otherwise-healthy dispatch.
+ *
+ * The rev-list step is equally lenient: if that command fails after a
+ * successful fetch, the helper returns
+ * `{ ok: true, behindCount: 0, skipped: 'rev-list-failed' }`.
+ *
+ * The rev-list result may be a plain string or an object exposing `stdout`;
+ * it is trimmed and parsed as a base-10 integer. A value that does not parse
+ * to a finite number is reported as `behindCount: 0`, and any count <= 0
+ * returns `ok: true` with no `skipped` field.
+ *
+ * NOTE(review): `branch` is wrapped in double quotes in the fetch command but
+ * interpolated unquoted into the rev-list command. Both rely on the caller
+ * having sanitized the branch name — confirm, or quote consistently.
+ *
+ * NOTE(review): a count > 0 presumably also occurs when origin simply gained
+ * new commits without a rebase, yet the error message states the branch was
+ * rebased — confirm the wording is acceptable for that case.
+ *
+ * @param {object} args
+ * @param {string} args.branch - PR branch name (already sanitized)
+ * @param {string} args.cwd    - Worktree path
+ * @param {function} [args.exec] - Async exec(cmd, opts) — injectable for tests;
+ *   when not a function, `execAsync` from `./shared` is required lazily
+ * @param {object}   [args.gitOpts] - Options passed through to exec; `cwd`
+ *   from `args.cwd` always overrides any `cwd` present here
+ * @returns {Promise<{ok: true, behindCount: number, skipped?: string}>}
+ *   `skipped` is one of 'no-upstream', 'fetch-failed' or 'rev-list-failed'
+ * @throws {Error} when `branch` or `cwd` is missing (no `code` is set)
+ * @throws {Error & {code: 'STALE_HEAD'}} when local HEAD is behind origin;
+ *   the error also carries `behindCount` and `branch`
+ */
+async function assertStaleHeadOk({ branch, cwd, exec, gitOpts } = {}) {
+  if (!branch) throw new Error('assertStaleHeadOk: branch is required');
+  if (!cwd) throw new Error('assertStaleHeadOk: cwd is required');
+  const execFn = typeof exec === 'function'
+    ? exec
+    : require('./shared').execAsync;
+  const opts = { ...(gitOpts || {}), cwd };
+  // Best-effort fetch. Branch-missing-on-origin is a legitimate state (first
+  // push on a freshly-cut feature branch) and must NOT block dispatch.
+  try {
+    await execFn(`git fetch origin "${branch}"`, opts);
+  } catch (err) {
+    const msg = (err && (err.stderr?.toString?.() || err.message || '')) + '';
+    if (/couldn'?t find remote ref|not found in upstream|unknown revision/i.test(msg)) {
+      return { ok: true, behindCount: 0, skipped: 'no-upstream' };
+    }
+    // Other failures (network/auth/timeout) — skip rather than block.
+    return { ok: true, behindCount: 0, skipped: 'fetch-failed' };
+  }
+  let countOut;
+  try {
+    countOut = await execFn(`git rev-list --count HEAD..origin/${branch}`, opts);
+  } catch (err) {
+    // rev-list failed even AFTER the fetch — skip (reported as 'rev-list-failed') rather than block.
+    return { ok: true, behindCount: 0, skipped: 'rev-list-failed' };
+  }
+  const raw = typeof countOut === 'string'
+    ? countOut
+    : (countOut?.stdout?.toString?.() ?? String(countOut ?? ''));
+  const behindCount = parseInt(String(raw).trim(), 10);
+  if (!Number.isFinite(behindCount) || behindCount <= 0) {
+    return { ok: true, behindCount: Number.isFinite(behindCount) ? behindCount : 0 };
+  }
+  const err = new Error(
+    `PR branch was rebased; local HEAD is stale (${behindCount} commits behind origin). ` +
+    `Run \`git pull --rebase origin ${branch}\` first.`
+  );
+  err.code = 'STALE_HEAD';
+  err.behindCount = behindCount;
+  err.branch = branch;
+  throw err;
+}
 // The orphan reaper recovers an agent's exit code by scanning live-output.log for
 // `[process-exit] code=N`. The previous design wrote the sentinel to stdout, hoping
 // the engine's stdout consumer (engine.js) would copy it into the file — but when
@@ -456,6 +531,6 @@ function main() {
   });
 }
-module.exports = { parseSpawnArgs, buildSpawnInvocation, normalizeRuntimeExit, shouldInjectAdoTokenEnv, injectAdoTokenEnv, injectAdoTokenEnvForRepoHost, writeProcessExitSentinel, computeAddDirs, createParentPipeForwarder };
+module.exports = { parseSpawnArgs, buildSpawnInvocation, normalizeRuntimeExit, shouldInjectAdoTokenEnv, injectAdoTokenEnv, injectAdoTokenEnvForRepoHost, writeProcessExitSentinel, computeAddDirs, createParentPipeForwarder, assertStaleHeadOk };
 if (require.main === module) main();

package/engine/timeout.js CHANGED Viewed

@@ -318,7 +318,13 @@ function checkTimeouts(config) {
     // Run post-completion hooks via shared helper (async — fire and forget in timeout context).
     // Pass the actual exit code so autoRecovery (PR-created-but-failed) still works correctly.
-    runPostCompletionHooks(item, item.agent, processExitCode, fullLogForHooks, config).catch(e => log('warn', 'post-completion hooks: ' + e.message));
+    // detectPhantom: true mirrors the line 310 detectNonTerminalResultSummary call —
+    // when the timeout path completes a dispatch via the [process-exit] sentinel,
+    // we have no guarantee the runtime emitted a result event. Propagating
+    // detectPhantom downstream lets enforcePrAttachmentContract route phantom
+    // hard-fails through the _phantomRetryCount budget instead of bypassing
+    // the retry counter entirely (P-d9a3e6f4).
+    runPostCompletionHooks(item, item.agent, processExitCode, fullLogForHooks, config, { detectPhantom: true }).catch(e => log('warn', 'post-completion hooks: ' + e.message));
     if (hasProcess) {
       shared.killImmediate(activeProcesses.get(item.id)?.proc);

package/engine.js CHANGED Viewed

@@ -28,6 +28,7 @@ const { exec, execAsync, execSilent, runFile, ts, ENGINE_DEFAULTS,
   WI_STATUS, DONE_STATUSES, WORK_TYPE, PLAN_STATUS, PRD_ITEM_STATUS, PRD_MATERIALIZABLE, PR_STATUS, DISPATCH_RESULT, AGENT_STATUS,
   FAILURE_CLASS } = shared;
 const { resolveRuntime } = require('./engine/runtimes');
+const { assertStaleHeadOk } = require('./engine/spawn-agent');
 const queries = require('./engine/queries');
 // ─── Paths ──────────────────────────────────────────────────────────────────
@@ -1114,6 +1115,41 @@ async function spawnAgent(dispatchItem, config) {
     log('warn', `Agent ${agentId} running ${type} task in main repo (no worktree) for ${id} — changes may land on master directly`);
   }
+  // ── Stale-HEAD guard for fix-task pushes (P-c8f2d5e3) ────────────────────
+  // When a PR branch is rebased upstream (force-push), a reused worktree can
+  // sit on local HEAD that's behind origin/<branch>. The first push from that
+  // worktree silently overwrites the rebased history. Fix-task dispatches are
+  // the canonical case: they always target an existing PR branch the engine
+  // already polled. Abort dispatch BEFORE invoking the runtime CLI so the
+  // agent never gets a chance to push over the rebased tip.
+  // Read-only and non-fix dispatches are out of scope — implement tasks cut
+  // their own branch from main, and review/verify don't push.
+  if (type === WORK_TYPE.FIX && branchName && worktreePath && cwd === worktreePath) {
+    try {
+      const guard = await assertStaleHeadOk({
+        branch: branchName,
+        cwd: worktreePath,
+        exec: execAsync,
+        gitOpts: { ..._gitOpts, timeout: 15000 },
+      });
+      if (guard.skipped) {
+        log('info', `Stale-HEAD guard skipped for ${id} (${branchName}): ${guard.skipped}`);
+      }
+    } catch (err) {
+      if (err && err.code === 'STALE_HEAD') {
+        log('error', `Stale-HEAD guard rejected fix dispatch ${id} on ${branchName}: ${err.message}`);
+        _cleanupPromptFiles();
+        completeDispatch(id, DISPATCH_RESULT.ERROR, err.message.slice(0, 300));
+        cleanupTempAgent(agentId);
+        return null;
+      }
+      // Non-STALE_HEAD failures from the guard itself shouldn't block dispatch
+      // (the guard is conservative by design — fetch/network issues fall through
+      // to skipped:'fetch-failed'). Log and continue.
+      log('warn', `Stale-HEAD guard error for ${id} (${branchName}): ${err.message}`);
+    }
+  }
   // ── Runtime + opts resolution (P-2a6d9c4f) ────────────────────────────────
   // Every CLI-specific knob flows through the runtime adapter resolved from
   // resolveAgentCli(agent, engine). Engine code MUST NOT branch on
@@ -2936,6 +2972,20 @@ async function discoverFromPrs(config, project) {
     const autoFixBuilds = config.engine?.autoFixBuilds ?? ENGINE_DEFAULTS.autoFixBuilds;
     if (pollEnabled && autoFixBuilds && pr.status === PR_STATUS.ACTIVE && pr.buildStatus === 'failing'
       && !isPrNoOpFixCauseSuppressed(pr, shared.PR_FIX_CAUSE.BUILD_FAILURE)) {
+      // P-b7e1c4d2: skip when the most recent dispatch already noop'd against
+      // the same head SHA — chronic across PRs #2315–#2323 where every fix
+      // agent rebutted "this is a pre-existing master baseline" but the
+      // cached buildStatus:failing kept re-triggering the loop. The check
+      // clears automatically once a new commit lands (lastDispatchHeadSha
+      // stops matching the current head).
+      const currentHeadSha = String(pr.headSha || pr._adoSourceCommit || pr._adoHeadCommit || '').trim();
+      if (pr.lastDispatchOutcome === 'noop'
+        && pr.lastDispatchHeadSha
+        && currentHeadSha
+        && pr.lastDispatchHeadSha === currentHeadSha) {
+        log('info', `Skipping build-fix for ${pr.id}: last dispatch was noop on the same head ${currentHeadSha.slice(0, 8)} (${(pr.lastDispatchReason || '').slice(0, 120)})`);
+        continue;
+      }
       const buildCauseKey = getPrAutomationCauseKey('build', pr);
       const key = getPrAutomationDispatchKey(`build-fix-${project?.name || 'default'}-${prDisplayId}`, buildCauseKey);
       if (isPrAutomationCauseHandledOrPending(project, pr, buildCauseKey)) continue;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@yemi33/minions",
-  "version": "0.1.1871",
+  "version": "0.1.1873",
   "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
   "bin": {
     "minions": "bin/minions.js"