@yemi33/minions 0.1.1872 → 0.1.1873

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,10 +1,13 @@
1
1
  # Changelog
2
2
 
3
- ## 0.1.1872 (2026-05-11)
3
+ ## 0.1.1873 (2026-05-11)
4
+
5
+ ### Other
6
+ - Implement: Phantom completion work preservation (P-e0b4f7a5) (#2356)
7
+
8
+ ## 0.1.1871 (2026-05-11)
4
9
 
5
10
  ### Features
6
- - Stale-HEAD guard on fix-task pushes (P-c8f2d5e3) (#2360)
7
- - Cached buildStatus invalidation on no-op completion (#2355)
8
11
  - per-agent memory file architecture (P-f1c5a8b6) (#2354)
9
12
  - Implement pre-dispatch acceptance criteria validation gate (P-a2d6b9c7) (#2352)
10
13
 
package/engine/cleanup.js CHANGED
@@ -79,6 +79,26 @@ function localBranchWorktreeInUse(root, branch) {
79
79
  }
80
80
  }
81
81
 
82
/**
 * P-e0b4f7a5 — gather the branches of a project's work items that are
 * currently flagged as being in the phantom-completion retry state.
 *
 * A work item contributes its branch only when `_phantomCompletion` is
 * strictly `true` and `_phantomBranch` is truthy; the branch is coerced to a
 * string before being stored. The worktree cleanup loop uses the result to
 * keep matching worktrees out of the 2-hour age sweep so an in-flight phantom
 * retry does not lose its worktree before it runs.
 *
 * @param {*} project - project descriptor, passed straight to projectWorkItemsPath.
 * @returns {Set<string>} branch names to protect; empty when the work-item
 *   file is missing, unreadable, or not an array.
 */
function collectPhantomBranchesForProject(project) {
  const protectedBranches = new Set();
  try {
    const workItems = safeJson(projectWorkItemsPath(project)) || [];
    if (Array.isArray(workItems)) {
      for (const item of workItems) {
        if (!item) continue;
        if (item._phantomCompletion !== true) continue;
        if (!item._phantomBranch) continue;
        protectedBranches.add(String(item._phantomBranch));
      }
    }
  } catch {
    // Best-effort: a missing or corrupt work-item file must never crash cleanup.
  }
  return protectedBranches;
}
101
+
82
102
  function cleanupMergedPrLocalBranch(root, project, pr) {
83
103
  const branch = normalizeLocalBranchName(pr?.branch);
84
104
  const result = { deleted: false, forced: false, skipped: null };
@@ -451,6 +471,11 @@ async function runCleanup(config, verbose = false) {
451
471
  const wtEntries = []; // { dir, wtPath, mtime, shouldClean, isProtected }
452
472
  const dispatch = getDispatch();
453
473
  const activeDispatchIds = new Set((dispatch.active || []).map(d => d.id));
474
+ // P-e0b4f7a5 — branches whose work item is mid-phantom-retry. Their
475
+ // worktrees must survive the age/cap sweep until the retry completes
476
+ // (or exhausts its budget) so the agent's already-pushed branch ref
477
+ // isn't destroyed alongside the worktree.
478
+ const phantomBranches = collectPhantomBranchesForProject(project);
454
479
 
455
480
  // Probe `git branch --show-current` for every worktree in chunks of 5.
456
481
  // Sequential probing was the dominant cost in the cleanup phase
@@ -492,6 +517,20 @@ async function runCleanup(config, verbose = false) {
492
517
  });
493
518
  if (isReferenced) isProtected = true;
494
519
 
520
+ // P-e0b4f7a5 — protect worktrees whose branch matches a work item in
521
+ // the phantom-completion retry state. The dispatch may have already
522
+ // moved to dispatch.completed (so isReferenced is false) but the
523
+ // retry will re-dispatch on the same branch shortly.
524
+ if (!isProtected && phantomBranches.size > 0) {
525
+ for (const branch of phantomBranches) {
526
+ if (worktreeMatchesBranch(dirLower, branch, actualBranch)) {
527
+ isProtected = true;
528
+ if (verbose) console.log(` Skipping worktree ${dir}: phantom-completion retry pending`);
529
+ break;
530
+ }
531
+ }
532
+ }
533
+
495
534
  // Also clean worktrees older than 2 hours with no active dispatch referencing them
496
535
  let mtime = Date.now();
497
536
  if (!shouldClean) {
@@ -499,7 +538,7 @@ async function runCleanup(config, verbose = false) {
499
538
  const stat = fs.statSync(wtPath);
500
539
  mtime = stat.mtimeMs;
501
540
  const ageMs = Date.now() - mtime;
502
- if (ageMs > 7200000 && !isReferenced) { // 2 hours
541
+ if (ageMs > 7200000 && !isReferenced && !isProtected) { // 2 hours — P-e0b4f7a5: phantom-protected worktrees survive the age sweep too
503
542
  shouldClean = true;
504
543
  }
505
544
  } catch { /* optional */ }
@@ -1080,4 +1119,5 @@ module.exports = {
1080
1119
  worktreeMatchesBranch, // exported for testing
1081
1120
  getWorktreeBranch, // exported for lifecycle cleanup
1082
1121
  cleanupMergedPrLocalBranch, // exported for lifecycle cleanup and testing
1122
+ collectPhantomBranchesForProject, // P-e0b4f7a5 — exported for testing
1083
1123
  };
@@ -543,6 +543,11 @@ function updateWorkItemStatus(meta, status, reason) {
543
543
  delete target.failReason;
544
544
  delete target.failedAt;
545
545
  delete target._retryCount;
546
+ // P-e0b4f7a5 — successful completion (including a phantom-retry
547
+ // succeeding) clears the phantom markers so cleanup can reap the
548
+ // worktree on the next sweep.
549
+ delete target._phantomCompletion;
550
+ delete target._phantomBranch;
546
551
  target.completedAt = ts();
547
552
  // Restore agent info from dispatch metadata (cleared on retry reset)
548
553
  if (meta._agentId && !target.dispatched_to) target.dispatched_to = meta._agentId;
@@ -1087,6 +1092,90 @@ async function findOpenPrForBranch(meta, config) {
1087
1092
  return null;
1088
1093
  }
1089
1094
 
1095
/**
 * P-e0b4f7a5 — quick "did the agent push the branch before the runtime
 * crashed?" probe.
 *
 * Runs `git ls-remote --heads origin <branch>` and reports whether the remote
 * advertises exactly `refs/heads/<branch>`. Used to gate the phantom-recovery
 * PR auto-link: if the branch is not on the remote, no PR can exist for it and
 * there is no point spending another PR lookup round-trip.
 *
 * ls-remote patterns are matched against the tail of a ref, so asking for
 * `fix` also returns `refs/heads/agent/fix`. Each returned ref is therefore
 * compared for exact equality instead of accepting any `refs/heads/` line.
 *
 * @param {{branch?: string, cwd?: string}} meta - dispatch metadata; `branch` is required.
 * @param {*} config - engine config, forwarded to resolvePrFallbackProject.
 * @returns {Promise<boolean>} true only when the exact branch ref is listed;
 *   false when the branch is missing, not listed, or the probe fails.
 */
async function _phantomBranchExistsOnRemote(meta, config) {
  if (!meta?.branch) return false;
  const branch = String(meta.branch);
  const projectObj = resolvePrFallbackProject(meta, config);
  // Prefer the project root (dispatch worktrees may not have origin wired
  // yet); otherwise fall back to the dispatch cwd, then the process cwd, and
  // let git use whatever remote configuration is ambient there.
  const cwd = projectObj?.localPath || meta?.cwd || process.cwd();
  // Accept both short names ("feat/x") and fully-qualified refs.
  const expectedRef = branch.startsWith('refs/heads/') ? branch : `refs/heads/${branch}`;
  try {
    const out = await runFileCapture('git', ['ls-remote', '--heads', 'origin', branch], { cwd, timeout: 15000 });
    // Output lines look like "<sha>\t<ref>"; only an exact ref match counts.
    return String(out || '')
      .split(/\r?\n/)
      .some((line) => {
        const tabAt = line.indexOf('\t');
        return tabAt > 0 && line.slice(tabAt + 1).trim() === expectedRef;
      });
  } catch (err) {
    log('debug', `Phantom ls-remote probe failed for ${meta.branch}: ${err.message}`);
    return false;
  }
}
1117
+
1118
/**
 * P-e0b4f7a5 — canonical-attach upsert shared by the PR-attachment contract
 * and the phantom-recovery path, so both build the PR tracking entry the same
 * way.
 *
 * Upserts a PR record for `found` into the project's PR tracking file, then
 * re-reads the attachment state via hasCanonicalPrAttachment.
 *
 * NOTE(review): the boolean returned by that verification read does not
 * influence the result — null is returned whether it reports attached or not.
 * Only a thrown error from the read produces a failure object. Confirm that
 * callers treating null as "verified" are happy with that.
 *
 * @param {{project: *, prNumber: *, url: string}} found - PR located for the branch.
 * @param {{item: {id: string, title?: string, sourcePlan?: string, itemType?: string}, branch?: string}} meta
 *   - dispatch metadata; `meta.item.id` must be present.
 * @param {string} agentId - agent recorded on the PR entry.
 * @param {*} resultSummary - forwarded to markPrAttachmentVerificationError on state errors.
 * @param {*} config - engine config, forwarded to hasCanonicalPrAttachment.
 * @returns {null | {reason: string, itemId: string, severity: 'hard', stateError: true}}
 *   null unless the verification read threw.
 */
function _attachFoundPrToWi(found, meta, agentId, resultSummary, config) {
  const item = meta.item;
  const prRecord = {
    id: shared.getCanonicalPrId(found.project, found.prNumber, found.url),
    prNumber: found.prNumber,
    title: item?.title || `PR #${found.prNumber}`,
    agent: agentId,
    branch: meta.branch || '',
    reviewStatus: 'pending',
    status: PR_STATUS.ACTIVE,
    created: ts(),
    url: found.url,
    prdItems: [item.id],
    sourcePlan: item?.sourcePlan || '',
    itemType: item?.itemType || '',
  };
  const upsertContext = { project: found.project, itemId: item.id };
  shared.upsertPullRequestRecord(shared.projectPrPath(found.project), prRecord, upsertContext);

  let verificationError = null;
  try {
    // Result intentionally unused: only a read failure changes the outcome.
    hasCanonicalPrAttachment(item.id, config);
  } catch (err) {
    verificationError = err;
  }
  if (verificationError === null) return null;

  const reason = `${item.id} auto-linked a PR but PR attachment verification could not read PR tracking state: ${verificationError.message}`;
  markPrAttachmentVerificationError(meta, agentId, reason, resultSummary);
  log('warn', reason);
  return { reason, itemId: item.id, severity: 'hard', stateError: true };
}
1152
+
1153
/**
 * P-e0b4f7a5 — phantom-completion recovery.
 *
 * When the runtime crashes before emitting its terminating result event, the
 * agent may still have pushed the branch (and possibly opened the PR) just
 * beforehand. This probes the remote for the branch and, if it is there,
 * makes one canonical PR attachment attempt through findOpenPrForBranch and
 * _attachFoundPrToWi.
 *
 * The success log is emitted only after the attach step reports no failure;
 * a failed attach is logged as a warning so the log never claims a recovery
 * that did not happen.
 *
 * @param {{branch?: string, item?: {id?: string}}} meta - dispatch metadata.
 * @param {string} agentId - agent that ran the dispatch.
 * @param {*} resultSummary - forwarded to the attach helper.
 * @param {*} config - engine config.
 * @returns {Promise<boolean>} true when a PR was found and the attach step
 *   returned no failure (caller may treat the work as recovered); false when
 *   the branch/item id is missing, the branch is not on the remote, no open PR
 *   exists, or the attach step returned a failure object.
 */
async function _attemptPhantomPrRecovery(meta, agentId, resultSummary, config) {
  if (!meta?.branch || !meta?.item?.id) return false;

  const branchOnRemote = await _phantomBranchExistsOnRemote(meta, config);
  if (!branchOnRemote) return false;

  const recovered = await findOpenPrForBranch(meta, config);
  if (!recovered) {
    log('info', `Phantom-completion: branch ${meta.branch} exists on remote for ${meta.item.id} but no open PR found — routing through phantom retry budget`);
    return false;
  }

  const attachResult = _attachFoundPrToWi(recovered, meta, agentId, resultSummary, config);
  const prId = shared.getCanonicalPrId(recovered.project, recovered.prNumber, recovered.url);
  if (attachResult !== null) {
    // The attach helper has already surfaced the state error; report the
    // outcome truthfully and let the caller fall through to its failure path.
    log('warn', `Phantom-completion recovery: found PR ${prId} on branch ${meta.branch} for ${meta.item.id} but the attachment could not be verified — not treating as recovered`);
    return false;
  }
  log('info', `Phantom-completion recovery: auto-linked existing PR ${prId} on branch ${meta.branch} for ${meta.item.id} (runtime crashed but agent had pushed the PR)`);
  return true;
}
1178
+
1090
1179
  // Lightweight probe for "did the agent's output contain ANY PR URL?". Used by
1091
1180
  // the PR-attachment contract to distinguish silent-failure (no URL anywhere)
1092
1181
  // from auto-link-miss (URL present but engine couldn't canonically attach it).
@@ -1113,10 +1202,79 @@ function _outputHasRuntimeResultEvent(output) {
1113
1202
  return /"type":\s*"result"/.test(output);
1114
1203
  }
1115
1204
 
1116
- function markMissingPrAttachment(meta, agentId, reason, resultSummary, severity) {
1205
+ function markMissingPrAttachment(meta, agentId, reason, resultSummary, severity, opts) {
1117
1206
  const noPrWiPath = resolveWorkItemPath(meta);
1118
1207
  const isHard = severity !== 'soft';
1208
+ const isPhantom = !!(opts && opts.phantom);
1119
1209
  let syncFailedToPrd = false;
1210
+ // Phantom branch: a runtime crash that hard-fails for "no PR attached" should
1211
+ // not bypass the retry budget — the agent never got a chance to do the work.
1212
+ // Track these separately on `_phantomRetryCount` so they don't pollute the
1213
+ // PR-attachment retry counter (`_retryCount`). Cap at maxPhantomRetries; only
1214
+ // hard-fail once the phantom budget is exhausted.
1215
+ let phantomRetryDeferred = false;
1216
+ let phantomRetryExhausted = false;
1217
+ let phantomRetryCount = 0;
1218
+ if (isHard && isPhantom && noPrWiPath) {
1219
+ mutateJsonFileLocked(noPrWiPath, data => {
1220
+ if (!Array.isArray(data)) return data;
1221
+ const w = data.find(i => i.id === meta.item.id);
1222
+ if (!w) return data;
1223
+ const phantomRetries = w._phantomRetryCount || 0;
1224
+ if (phantomRetries < ENGINE_DEFAULTS.maxPhantomRetries) {
1225
+ w.status = WI_STATUS.PENDING;
1226
+ w._phantomRetryCount = phantomRetries + 1;
1227
+ w._lastRetryAt = ts();
1228
+ w._lastRetryReason = reason;
1229
+ w._pendingReason = 'phantom_completion';
1230
+ // P-e0b4f7a5 — _phantomCompletion + _phantomBranch let cleanup.js
1231
+ // protect the worktree of an in-flight phantom retry. Without these
1232
+ // markers the 2-hour age sweep can wipe the worktree (and the agent's
1233
+ // already-pushed branch reference) between phantom detection and
1234
+ // re-dispatch.
1235
+ w._phantomCompletion = true;
1236
+ if (meta.branch) w._phantomBranch = meta.branch;
1237
+ delete w.completedAt;
1238
+ delete w.dispatched_at;
1239
+ delete w.dispatched_to;
1240
+ delete w.failReason;
1241
+ delete w.failedAt;
1242
+ delete w._missingPrAttachment;
1243
+ phantomRetryDeferred = true;
1244
+ phantomRetryCount = phantomRetries + 1;
1245
+ log('warn', `Work item ${meta.item.id} hit phantom-completion path — retry ${phantomRetryCount}/${ENGINE_DEFAULTS.maxPhantomRetries} (runtime likely crashed before emitting result event)`);
1246
+ } else {
1247
+ phantomRetryExhausted = true;
1248
+ phantomRetryCount = phantomRetries;
1249
+ }
1250
+ return data;
1251
+ }, { skipWriteIfUnchanged: true });
1252
+ if (phantomRetryDeferred) {
1253
+ // Soft inbox note: the runtime crashed but we're retrying; surface the
1254
+ // event without flagging the WI as silent failure.
1255
+ shared.writeToInbox('engine', `phantom-completion-retry-${meta.item.id}`,
1256
+ `# Phantom completion retry for ${meta.item.id}\n\n` +
1257
+ `**Agent:** ${agentId}\n` +
1258
+ `**Work item:** \`${meta.item.id}\` — ${meta.item.title || ''}\n` +
1259
+ `**Type:** ${meta.item.type || 'unknown'}\n` +
1260
+ `**Branch:** ${meta.branch || '(none)'}\n` +
1261
+ `**Phantom retry:** ${phantomRetryCount}/${ENGINE_DEFAULTS.maxPhantomRetries}\n\n` +
1262
+ `${reason}\n` +
1263
+ (resultSummary ? `\n## Agent summary\n${resultSummary}\n` : ''),
1264
+ null,
1265
+ { sourceItem: meta.item.id, reason: 'phantom-completion-retry' });
1266
+ // Sync PRD back to pending so dependent flow doesn't see it as failed.
1267
+ if (meta.item?.sourcePlan) {
1268
+ try { syncPrdItemStatus(meta.item.id, WI_STATUS.PENDING, meta.item.sourcePlan); } catch (e) { log('warn', 'phantom retry PRD sync: ' + e.message); }
1269
+ }
1270
+ return;
1271
+ }
1272
+ if (phantomRetryExhausted) {
1273
+ // Fall through to the regular hard-fail path with augmented reason so
1274
+ // operators see "phantom retries exhausted" instead of the generic msg.
1275
+ reason = `${reason} — phantom retries exhausted (${phantomRetryCount}/${ENGINE_DEFAULTS.maxPhantomRetries})`;
1276
+ }
1277
+ }
1120
1278
  if (noPrWiPath) {
1121
1279
  mutateJsonFileLocked(noPrWiPath, data => {
1122
1280
  if (!Array.isArray(data)) return data;
@@ -1132,6 +1290,11 @@ function markMissingPrAttachment(meta, agentId, reason, resultSummary, severity)
1132
1290
  delete w.completedAt;
1133
1291
  delete w._noPr;
1134
1292
  delete w._noPrReason;
1293
+ // P-e0b4f7a5 — terminal hard-fail (genuine missing PR or phantom
1294
+ // retries exhausted) clears the in-flight phantom markers so cleanup
1295
+ // can finally reap the worktree.
1296
+ delete w._phantomCompletion;
1297
+ delete w._phantomBranch;
1135
1298
  } else {
1136
1299
  // Soft: don't change status or failReason — the agent did the work,
1137
1300
  // we just couldn't auto-attach the PR. Surface a flag for the dashboard
@@ -1208,7 +1371,8 @@ function markPrAttachmentVerificationError(meta, agentId, reason, resultSummary)
1208
1371
  { sourceItem: meta.item.id, reason: 'pr-attachment-state-error' });
1209
1372
  }
1210
1373
 
1211
- async function enforcePrAttachmentContract(type, meta, agentId, config, resultSummary, output) {
1374
+ async function enforcePrAttachmentContract(type, meta, agentId, config, resultSummary, output, opts) {
1375
+ const detectPhantom = !!(opts && opts.detectPhantom);
1212
1376
  if (!isPrAttachmentRequired(type, meta?.item, meta)) return null;
1213
1377
  try {
1214
1378
  if (hasCanonicalPrAttachment(meta.item.id, config)) return null;
@@ -1221,39 +1385,35 @@ async function enforcePrAttachmentContract(type, meta, agentId, config, resultSu
1221
1385
 
1222
1386
  const found = await findOpenPrForBranch(meta, config);
1223
1387
  if (found) {
1224
- const entry = {
1225
- id: shared.getCanonicalPrId(found.project, found.prNumber, found.url),
1226
- prNumber: found.prNumber,
1227
- title: meta.item?.title || `PR #${found.prNumber}`,
1228
- agent: agentId,
1229
- branch: meta.branch || '',
1230
- reviewStatus: 'pending',
1231
- status: PR_STATUS.ACTIVE,
1232
- created: ts(),
1233
- url: found.url,
1234
- prdItems: [meta.item.id],
1235
- sourcePlan: meta.item?.sourcePlan || '',
1236
- itemType: meta.item?.itemType || '',
1237
- };
1238
- shared.upsertPullRequestRecord(shared.projectPrPath(found.project), entry, {
1239
- project: found.project,
1240
- itemId: meta.item.id,
1241
- });
1242
- log('info', `Auto-linked existing PR ${entry.id} on branch ${meta.branch} for ${meta.item.id}`);
1243
- try {
1244
- if (hasCanonicalPrAttachment(meta.item.id, config)) return null;
1245
- } catch (err) {
1246
- const reason = `${meta.item.id} auto-linked a PR but PR attachment verification could not read PR tracking state: ${err.message}`;
1247
- markPrAttachmentVerificationError(meta, agentId, reason, resultSummary);
1248
- log('warn', reason);
1249
- return { reason, itemId: meta.item.id, severity: 'hard', stateError: true };
1250
- }
1388
+ const attachResult = _attachFoundPrToWi(found, meta, agentId, resultSummary, config);
1389
+ log('info', `Auto-linked existing PR ${shared.getCanonicalPrId(found.project, found.prNumber, found.url)} on branch ${meta.branch} for ${meta.item.id}`);
1390
+ if (attachResult === null) return null;
1391
+ return attachResult;
1251
1392
  }
1252
1393
 
1253
1394
  // Distinguish "agent never claimed a PR" (hard — silent failure the contract
1254
1395
  // was designed to catch) from "agent claimed a PR but engine couldn't attach
1255
1396
  // it canonically" (soft — verification gap, not a failure).
1256
1397
  const severity = _outputContainsPrUrl(output) ? 'soft' : 'hard';
1398
+ // Phantom completion = hard severity + opt-in detectPhantom + no terminating
1399
+ // result event in stream. The runtime CLI crashed mid-conversation; the
1400
+ // agent never got a chance to open a PR. Hard-failing here would bypass the
1401
+ // retry budget for a runtime bug. Surface phantom: true to
1402
+ // markMissingPrAttachment so it routes through the _phantomRetryCount path.
1403
+ const isPhantom = severity === 'hard' && detectPhantom && !_outputHasRuntimeResultEvent(output);
1404
+
1405
+ // P-e0b4f7a5 — phantom-completion recovery: an agent may have pushed its
1406
+ // branch (and even opened the PR) seconds before the runtime crashed.
1407
+ // Verify with `git ls-remote origin <branch>` and, if the branch landed,
1408
+ // make one final canonical-attach attempt before burning a phantom retry.
1409
+ // This recovers work that would otherwise be lost — both the worktree
1410
+ // (cleanup would reap it) and the orphan PR link (no WI ever points at it).
1411
+ if (isPhantom) {
1412
+ if (await _attemptPhantomPrRecovery(meta, agentId, resultSummary, config)) {
1413
+ return null;
1414
+ }
1415
+ }
1416
+
1257
1417
  // Hard-fail messaging: if the runtime never emitted its terminating result
1258
1418
  // event, the failure is a phantom completion (runtime CLI crashed), not the
1259
1419
  // agent silently skipping work. Surface that truthfully so operators don't
@@ -1268,9 +1428,9 @@ async function enforcePrAttachmentContract(type, meta, agentId, config, resultSu
1268
1428
  } else {
1269
1429
  reason = `${meta.item.id} completed and a PR URL was found in the agent's output, but it couldn't be canonically attached. The work likely succeeded — verify by checking the PR list. (Branch: ${meta.branch || '(none)'}, agent: ${agentId})`;
1270
1430
  }
1271
- markMissingPrAttachment(meta, agentId, reason, resultSummary, severity);
1431
+ markMissingPrAttachment(meta, agentId, reason, resultSummary, severity, { phantom: isPhantom });
1272
1432
  log(severity === 'hard' ? 'warn' : 'info', reason);
1273
- return { reason, itemId: meta.item.id, severity };
1433
+ return { reason, itemId: meta.item.id, severity, phantom: isPhantom };
1274
1434
  }
1275
1435
 
1276
1436
  // ─── Post-Completion Hooks ──────────────────────────────────────────────────
@@ -2564,6 +2724,20 @@ function detectNonTerminalResultSummary(_resultSummary, structuredCompletion, co
2564
2724
  }
2565
2725
 
2566
2726
  function deferNonTerminalCompletion(meta, detection) {
2727
+ return _deferRetryWithCounter(meta, detection, '_retryCount', ENGINE_DEFAULTS.maxRetries, 'nonterminal_completion');
2728
+ }
2729
+
2730
/**
 * Phantom-completion variant of the deferral helper.
 *
 * Delegates to _deferRetryWithCounter with the `_phantomRetryCount` counter,
 * the ENGINE_DEFAULTS.maxPhantomRetries cap and the 'phantom_completion'
 * pending reason. Keeping a separate counter and cap means runtime-crash
 * retries do not share a budget with the `_retryCount`-based retries, so the
 * two failure modes can be tuned independently. The detection that leads here
 * comes from detectNonTerminalResultSummary.
 *
 * @param {object} meta - dispatch metadata for the work item.
 * @param {object} detection - detection result for the phantom completion.
 * @returns {*} whatever _deferRetryWithCounter returns.
 */
function deferPhantomCompletion(meta, detection) {
  const phantomBudget = ENGINE_DEFAULTS.maxPhantomRetries;
  return _deferRetryWithCounter(meta, detection, '_phantomRetryCount', phantomBudget, 'phantom_completion');
}
2739
+
2740
+ function _deferRetryWithCounter(meta, detection, counterField, maxCount, pendingReason) {
2567
2741
  const itemId = meta?.item?.id;
2568
2742
  const reason = detection?.reason || 'Nonterminal completion summary';
2569
2743
  if (!itemId) return reason;
@@ -2576,35 +2750,49 @@ function deferNonTerminalCompletion(meta, detection) {
2576
2750
  if (!Array.isArray(data)) return data;
2577
2751
  const w = data.find(i => i.id === itemId);
2578
2752
  if (!w) return data;
2579
- const retries = w._retryCount || 0;
2580
- if (retries < ENGINE_DEFAULTS.maxRetries) {
2753
+ const retries = w[counterField] || 0;
2754
+ if (retries < maxCount) {
2581
2755
  w.status = WI_STATUS.PENDING;
2582
- w._retryCount = retries + 1;
2756
+ w[counterField] = retries + 1;
2583
2757
  w._lastRetryAt = ts();
2584
2758
  w._lastRetryReason = reason;
2585
- w._pendingReason = 'nonterminal_completion';
2759
+ w._pendingReason = pendingReason;
2760
+ // P-e0b4f7a5 — phantom-retry path stamps _phantomCompletion +
2761
+ // _phantomBranch so cleanup.js can preserve the worktree across the
2762
+ // re-dispatch window. Only set for the phantom counter; nonterminal
2763
+ // retries don't share this protection.
2764
+ if (counterField === '_phantomRetryCount') {
2765
+ w._phantomCompletion = true;
2766
+ if (meta?.branch) w._phantomBranch = meta.branch;
2767
+ }
2586
2768
  delete w.completedAt;
2587
2769
  delete w.dispatched_at;
2588
2770
  delete w.dispatched_to;
2589
2771
  delete w.failedAt;
2590
2772
  finalStatus = WI_STATUS.PENDING;
2591
- log('warn', `Work item ${itemId} reported nonterminal success — retry ${retries + 1}/${ENGINE_DEFAULTS.maxRetries}: ${reason}`);
2773
+ log('warn', `Work item ${itemId} reported ${pendingReason} — retry ${retries + 1}/${maxCount} (${counterField}): ${reason}`);
2592
2774
  } else {
2593
2775
  w.status = WI_STATUS.FAILED;
2594
- w.failReason = `${reason} after ${ENGINE_DEFAULTS.maxRetries} attempts`;
2776
+ w.failReason = `${reason} after ${maxCount} attempts`;
2595
2777
  w.failedAt = ts();
2596
2778
  delete w.completedAt;
2597
2779
  delete w.dispatched_at;
2598
2780
  delete w.dispatched_to;
2599
2781
  delete w._pendingReason;
2782
+ // Exhausted phantom retries: clear the in-flight markers so cleanup
2783
+ // can reap the worktree on the next sweep.
2784
+ if (counterField === '_phantomRetryCount') {
2785
+ delete w._phantomCompletion;
2786
+ delete w._phantomBranch;
2787
+ }
2600
2788
  finalStatus = WI_STATUS.FAILED;
2601
- log('warn', `Work item ${itemId} failed — repeated nonterminal completion summaries after ${ENGINE_DEFAULTS.maxRetries} attempts`);
2789
+ log('warn', `Work item ${itemId} failed — repeated ${pendingReason} after ${maxCount} attempts`);
2602
2790
  }
2603
2791
  return data;
2604
2792
  }, { defaultValue: [], skipWriteIfUnchanged: true });
2605
2793
  syncPrdItemStatus(itemId, finalStatus, meta.item?.sourcePlan);
2606
2794
  } catch (err) {
2607
- log('warn', `nonterminal completion gate: ${err.message}`);
2795
+ log('warn', `${pendingReason} gate: ${err.message}`);
2608
2796
  }
2609
2797
  return reason;
2610
2798
  }
@@ -2814,8 +3002,9 @@ function handleDecompositionResult(stdout, meta, config, runtimeName) {
2814
3002
  return 0;
2815
3003
  }
2816
3004
 
2817
- async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, config) {
3005
+ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, config, opts) {
2818
3006
 
3007
+ const detectPhantom = !!(opts && opts.detectPhantom);
2819
3008
  const type = dispatchItem.type;
2820
3009
  const meta = dispatchItem.meta;
2821
3010
  const isSuccess = code === 0;
@@ -3055,13 +3244,27 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
3055
3244
 
3056
3245
  let completionContractFailure = null;
3057
3246
  if (effectiveSuccess && meta?.item?.id && !skipDoneStatus) {
3058
- const nonTerminalCompletion = detectNonTerminalResultSummary(completionGateSummary, structuredCompletion, reportCompletion);
3247
+ const nonTerminalCompletion = detectNonTerminalResultSummary(completionGateSummary, structuredCompletion, reportCompletion, { detectPhantom });
3059
3248
  if (nonTerminalCompletion) {
3060
- skipDoneStatus = true;
3061
- const reason = deferNonTerminalCompletion(meta, nonTerminalCompletion);
3062
- completionContractFailure = { reason, itemId: meta.item.id, nonTerminal: true, processWorkItemFailure: false };
3063
- if (!nonCleanReportWritten) {
3064
- writeNonCleanAgentReport(dispatchItem, agentId, 'partial', structuredCompletion, completionGateSummary, code);
3249
+ const isPhantomDetection = nonTerminalCompletion.phrase === 'phantom-completion';
3250
+ // P-e0b4f7a5 — before deferring a phantom retry, attempt to recover
3251
+ // the agent's work via the ls-remote + canonical-attach probe. If the
3252
+ // agent had pushed its branch (and possibly opened the PR) seconds
3253
+ // before the runtime crashed, link the PR and treat the WI as a
3254
+ // normal successful completion. This preserves work that would
3255
+ // otherwise be lost and avoids burning a phantom retry on something
3256
+ // that already shipped.
3257
+ if (isPhantomDetection && await _attemptPhantomPrRecovery(meta, agentId, resultSummary, config)) {
3258
+ log('info', `Phantom-completion recovered for ${meta.item.id} via ls-remote + PR auto-link — no retry needed`);
3259
+ } else {
3260
+ skipDoneStatus = true;
3261
+ const reason = isPhantomDetection
3262
+ ? deferPhantomCompletion(meta, nonTerminalCompletion)
3263
+ : deferNonTerminalCompletion(meta, nonTerminalCompletion);
3264
+ completionContractFailure = { reason, itemId: meta.item.id, nonTerminal: true, processWorkItemFailure: false, phantom: isPhantomDetection };
3265
+ if (!nonCleanReportWritten) {
3266
+ writeNonCleanAgentReport(dispatchItem, agentId, 'partial', structuredCompletion, completionGateSummary, code);
3267
+ }
3065
3268
  }
3066
3269
  }
3067
3270
  }
@@ -3077,7 +3280,7 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
3077
3280
  }
3078
3281
 
3079
3282
  if (effectiveSuccess && meta?.item?.id && !skipDoneStatus && !noopRationale) {
3080
- completionContractFailure = await enforcePrAttachmentContract(type, meta, agentId, config, resultSummary, stdout);
3283
+ completionContractFailure = await enforcePrAttachmentContract(type, meta, agentId, config, resultSummary, stdout, { detectPhantom });
3081
3284
  if (completionContractFailure?.severity === 'hard' || completionContractFailure?.nonTerminal) {
3082
3285
  skipDoneStatus = true;
3083
3286
  }
@@ -3460,6 +3663,10 @@ module.exports = {
3460
3663
  parseCompletionFieldSummary,
3461
3664
  parseCompletionNoop,
3462
3665
  detectNonTerminalResultSummary,
3666
+ deferNonTerminalCompletion,
3667
+ deferPhantomCompletion,
3668
+ enforcePrAttachmentContract,
3669
+ markMissingPrAttachment,
3463
3670
  parseCompletionReportFile,
3464
3671
  persistCompletionReport,
3465
3672
  runPostCompletionHooks,
package/engine/shared.js CHANGED
@@ -1078,6 +1078,7 @@ const ENGINE_DEFAULTS = {
1078
1078
  evalMaxIterations: 3, // legacy UI/config field; engine discovery no longer enforces review→fix cycle caps
1079
1079
  evalMaxCost: null, // USD ceiling per work item across all eval iterations; null = no limit (gather baseline data first)
1080
1080
  maxRetries: 3, // max dispatch retries before marking work item as failed
1081
+ maxPhantomRetries: 3, // max retries for "phantom completion" (runtime crashed before emitting type:"result"); tracked separately from _retryCount so phantom retries don't pollute the normal PR-attachment retry budget. See engine/lifecycle.markMissingPrAttachment + detectNonTerminalResultSummary.
1081
1082
  minRetryGapMs: 120000, // 2min — minimum gap between retry dispatches for the same work item; prevents tight retry loops when an idempotent agent (e.g. review bailing out on a duplicate) cannot produce the expected output (#1770)
1082
1083
  pipelineApiRetries: 2, // max attempts for pipeline API calls
1083
1084
  pipelineApiRetryDelay: 2000, // ms delay between pipeline API retries
package/engine/timeout.js CHANGED
@@ -318,7 +318,13 @@ function checkTimeouts(config) {
318
318
 
319
319
  // Run post-completion hooks via shared helper (async — fire and forget in timeout context).
320
320
  // Pass the actual exit code so autoRecovery (PR-created-but-failed) still works correctly.
321
- runPostCompletionHooks(item, item.agent, processExitCode, fullLogForHooks, config).catch(e => log('warn', 'post-completion hooks: ' + e.message));
321
+ // detectPhantom: true mirrors the line 310 detectNonTerminalResultSummary call —
322
+ // when the timeout path completes a dispatch via the [process-exit] sentinel,
323
+ // we have no guarantee the runtime emitted a result event. Propagating
324
+ // detectPhantom downstream lets enforcePrAttachmentContract route phantom
325
+ // hard-fails through the _phantomRetryCount budget instead of bypassing
326
+ // the retry counter entirely (P-d9a3e6f4).
327
+ runPostCompletionHooks(item, item.agent, processExitCode, fullLogForHooks, config, { detectPhantom: true }).catch(e => log('warn', 'post-completion hooks: ' + e.message));
322
328
 
323
329
  if (hasProcess) {
324
330
  shared.killImmediate(activeProcesses.get(item.id)?.proc);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.1872",
3
+ "version": "0.1.1873",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"