npm - @yemi33/minions - Versions diffs - 0.1.2120 → 0.1.2122 - Mend

@yemi33/minions 0.1.2120 → 0.1.2122

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dashboard/js/settings.js CHANGED Viewed

@@ -274,6 +274,8 @@ async function openSettings() {
       settingsField('Restart Grace Period', 'set-restartGracePeriod', e.restartGracePeriod || 1200000, 'ms', 'Grace period before orphan detection on restart') +
       settingsField('Shutdown Timeout', 'set-shutdownTimeout', e.shutdownTimeout || 300000, 'ms', 'Max wait for agents during graceful shutdown') +
       settingsField('Meeting Round Timeout', 'set-meetingRoundTimeout', e.meetingRoundTimeout || 900000, 'ms', 'Auto-advance meeting round after this') +
+      settingsField('Steering Deferred Max', 'set-steeringDeferredMaxMs', e.steeringDeferredMaxMs || 900000, 'ms', 'Max wait for a runtime to emit a resumable checkpoint before a deferred steering message is flagged stranded. After this, the engine warns to live-output, marks _steeringStranded on the dispatch, and (when the steering store is present) sets store status=stranded. Default 15min; range 60s–4h.') +
+      settingsField('Steering Max Kill Retries', 'set-steeringMaxKillRetries', e.steeringMaxKillRetries ?? 3, '', 'Cap on graceful+escalation kill attempts after a steering kill is issued. Ladder waits 30s → 60s → 120s between attempts (last interval reused). Attempt 1 is graceful; attempts 2..cap are platform hard kills (taskkill /F /T on Windows; descendant-tree SIGKILL + pkill on Unix). Past cap, the engine gives up with a [steering-stuck] log + inbox notice. Default 3; range 1–5.') +
     '</div>';
   const paneWorktree =
@@ -839,6 +841,8 @@ async function saveSettings() {
       shutdownTimeout: document.getElementById('set-shutdownTimeout').value,
       restartGracePeriod: document.getElementById('set-restartGracePeriod').value,
       meetingRoundTimeout: document.getElementById('set-meetingRoundTimeout').value,
+      steeringDeferredMaxMs: document.getElementById('set-steeringDeferredMaxMs').value,
+      steeringMaxKillRetries: document.getElementById('set-steeringMaxKillRetries').value,
       operatorLogin: (document.getElementById('set-operatorLogin')?.value ?? '').trim(),
       autoApprovePlans: document.getElementById('set-autoApprovePlans').checked,
       evalLoop: document.getElementById('set-evalLoop').checked,

package/dashboard.js CHANGED Viewed

@@ -9225,6 +9225,9 @@ What would you like to discuss or change? When you're happy, say "approve" and I
           worktreeCreateTimeout: [60000], worktreeCreateRetries: [0, 3],
           idleAlertMinutes: [1], shutdownTimeout: [30000], restartGracePeriod: [60000],
           meetingRoundTimeout: [60000],
+          // W-mq066js7000fff1f-c (Gap B/C): steering safety-net knobs.
+          steeringDeferredMaxMs: [60000, 14400000],
+          steeringMaxKillRetries: [1, 5],
           versionCheckInterval: [60000],
           prPollStatusEvery: [1], prPollCommentsEvery: [1],
           agentBusyReassignMs: [0],

package/engine/ado.js CHANGED Viewed

@@ -698,10 +698,68 @@ function _hasPendingReReviewWi(pr) {
 let _adoTokenCache = { token: null, expiresAt: 0 };
 let _adoTokenFailedUntil = 0; // backoff: skip token acquisition calls until this timestamp
-// ─── ADO Throttle State ─────────────────────────────────────────────────────
-// Tracks rate-limiting (HTTP 429/503) from ADO API responses.
-// Uses shared createThrottleTracker factory: backoffMs starts at 60s, doubles, caps at 32 min.
-const _adoThrottle = createThrottleTracker({ label: 'ado', baseBackoffMs: 60000, maxBackoffMs: 32 * 60000 });
+// ─── ADO Throttle State (per-org) ───────────────────────────────────────────
+// Tracks rate-limiting (HTTP 429/503) from ADO API responses, isolated per ADO
+// org so a throttle storm on org A doesn't stall PR polling for org B.
+// Each tracker uses createThrottleTracker: backoffMs starts at 60s, doubles,
+// caps at 32 min, with 20% jitter (silently ignored on older shared.js until
+// the jitter foundation lands as W-mq03l6zh0006f0a1-a).
+// W-mq03l6zh0006f0a1-b — Per-org ADO throttle isolation.
+const _adoThrottlesByOrg = new Map();
+/**
+ * Canonicalize an orgBase URL, a scheme-less host/path, or an already-canonical
+ * key into a stable Map key for the per-org throttle trackers.
+ *
+ * - Falsy or blank input → 'dev.azure.com/__unknown__'.
+ * - `http(s)://…` input is delegated to resolveAdoOrgBaseFromUrl().
+ * - Scheme-less input is trimmed, lowercased and stripped of trailing slashes.
+ *   A legacy `<org>.visualstudio.com[/…]` host is rewritten to
+ *   `dev.azure.com/<org>` so both spellings of one org share a single tracker
+ *   (previously only URL input got this treatment).
+ *
+ * @param {string} [orgBaseOrUrl] org base URL or canonical key
+ * @returns {string} canonical, never-empty key
+ */
+function canonicalAdoOrgKey(orgBaseOrUrl) {
+  if (!orgBaseOrUrl) return 'dev.azure.com/__unknown__';
+  const s = String(orgBaseOrUrl).trim();
+  if (/^https?:\/\//i.test(s)) return resolveAdoOrgBaseFromUrl(s);
+  // Trailing slashes would otherwise create a second key for the same org.
+  const key = s.toLowerCase().replace(/\/+$/, '');
+  const legacy = /^([^./]+)\.visualstudio\.com(?:\/|$)/.exec(key);
+  if (legacy) return `dev.azure.com/${legacy[1]}`;
+  return key || 'dev.azure.com/__unknown__';
+}
+/**
+ * Parse an ADO API URL down to a stable orgBase key.
+ *
+ * Examples:
+ *   https://dev.azure.com/Microsoft/...                      → dev.azure.com/microsoft
+ *   https://vsrm.dev.azure.com/Microsoft/...                 → dev.azure.com/microsoft
+ *   https://microsoft.visualstudio.com/...                   → dev.azure.com/microsoft
+ *   https://microsoft.visualstudio.com/DefaultCollection/... → dev.azure.com/microsoft
+ *   https://microsoft.vsrm.visualstudio.com/...              → dev.azure.com/microsoft
+ *
+ * Service subdomains (`<service>.dev.azure.com/<org>` and
+ * `<org>.<service>.visualstudio.com`) collapse onto the same key as the org's
+ * primary host, so a throttle recorded through one host is visible to
+ * per-org checks made with the org base URL.
+ *
+ * @param {string} url absolute URL
+ * @returns {string} canonical key; 'dev.azure.com/__unknown__' on empty input,
+ *   parse failure, or a recognised ADO host without an org, so the throttle map
+ *   always has a non-null key. Unrecognised hosts yield `<host>/<first-segment>`
+ *   (or just `<host>` when the path is empty).
+ */
+function resolveAdoOrgBaseFromUrl(url) {
+  const UNKNOWN = 'dev.azure.com/__unknown__';
+  if (!url) return UNKNOWN;
+  let u;
+  try {
+    u = new URL(url);
+  } catch {
+    return UNKNOWN;
+  }
+  const host = u.hostname.toLowerCase();
+  const firstSeg = (u.pathname.split('/').filter(Boolean)[0] || '').toLowerCase();
+  if (host === 'dev.azure.com' || host.endsWith('.dev.azure.com')) {
+    return firstSeg ? `dev.azure.com/${firstSeg}` : UNKNOWN;
+  }
+  if (host.endsWith('.visualstudio.com')) {
+    // The org is always the left-most label; anything between it and
+    // visualstudio.com is a service name (vsrm, vssps, ...).
+    const org = host.split('.')[0];
+    return org ? `dev.azure.com/${org}` : UNKNOWN;
+  }
+  // Unknown host shape — derive a stable key from host + first path segment.
+  return firstSeg ? `${host}/${firstSeg}` : host;
+}
+/**
+ * Return the throttle tracker for an org, building and caching it on first use.
+ * The cache key is the canonical org key, so different spellings of the same
+ * org base resolve to one tracker.
+ */
+function getAdoThrottleForOrg(orgBase) {
+  const orgKey = canonicalAdoOrgKey(orgBase);
+  const cached = _adoThrottlesByOrg.get(orgKey);
+  if (cached) return cached;
+  const created = createThrottleTracker({
+    label: `ado:${orgKey}`,
+    baseBackoffMs: 60000,
+    maxBackoffMs: 32 * 60000,
+    jitterRatio: 0.2,
+  });
+  _adoThrottlesByOrg.set(orgKey, created);
+  return created;
+}
 // ─── Auth Failure Tracking ──────────────────────────────────────────────────
 // Set when pollPrStatus encounters auth errors mid-loop. The engine checks this
@@ -742,6 +800,7 @@ async function adoFetch(url, token, opts = {}) {
   const body = (typeof opts === 'object' && opts.body) || undefined;
   const timeout = (typeof opts === 'object' && Number.isFinite(opts.timeout)) ? opts.timeout : 30000;
   const MAX_RETRIES = ADO_TOKEN_REFRESH_MAX_RETRIES;
+  const throttle = getAdoThrottleForOrg(resolveAdoOrgBaseFromUrl(url));
   const res = await fetch(url, {
     method,
     headers: { 'Authorization': `Bearer ${token}`, 'Content-Type': 'application/json' },
@@ -752,8 +811,8 @@ async function adoFetch(url, token, opts = {}) {
   if (res.status === 429 || res.status === 503) {
     const retryAfterSec = parseInt(res.headers.get('Retry-After'), 10);
     const retryAfterMs = (retryAfterSec > 0) ? retryAfterSec * 1000 : 0;
-    _adoThrottle.recordThrottle(retryAfterMs);
-    const state = _adoThrottle.getState();
+    throttle.recordThrottle(retryAfterMs);
+    const state = throttle.getState();
     throw new Error(`ADO API throttled (${res.status}): retry after ${Math.round((state.retryAfter - Date.now()) / 1000)}s`);
   }
   if (!res.ok) throw new Error(`ADO API ${method} ${res.status}: ${res.statusText}`);
@@ -771,12 +830,13 @@ async function adoFetch(url, token, opts = {}) {
   }
   const json = JSON.parse(text);
   // ── Success decay: decrement consecutiveHits, reset when fully recovered ──
-  _adoThrottle.recordSuccess();
+  throttle.recordSuccess();
   return json;
 }
 /** Fetch raw text from ADO API (for build logs which aren't JSON). */
 async function adoFetchText(url, token) {
+  const throttle = getAdoThrottleForOrg(resolveAdoOrgBaseFromUrl(url));
   const res = await fetch(url, {
     headers: { 'Authorization': `Bearer ${token}` },
     signal: AbortSignal.timeout(30000),
@@ -785,8 +845,8 @@ async function adoFetchText(url, token) {
   if (res.status === 429 || res.status === 503) {
     const retryAfterSec = parseInt(res.headers.get('Retry-After'), 10);
     const retryAfterMs = (retryAfterSec > 0) ? retryAfterSec * 1000 : 0;
-    _adoThrottle.recordThrottle(retryAfterMs);
-    const state = _adoThrottle.getState();
+    throttle.recordThrottle(retryAfterMs);
+    const state = throttle.getState();
     throw new Error(`ADO API throttled (${res.status}): retry after ${Math.round((state.retryAfter - Date.now()) / 1000)}s`);
   }
   if (!res.ok) throw new Error(`ADO API ${res.status}: ${res.statusText}`);
@@ -908,6 +968,12 @@ async function forEachActivePr(config, token, callback) {
     let projectUpdated = 0;
     const updatedRecords = [];
     const orgBase = getAdoOrgBase(project);
+    // W-mq03l6zh0006f0a1-b — Per-org throttle isolation: skip just this
+    // project when its org is rate-limited, keep iterating others.
+    if (isAdoThrottled(orgBase)) {
+      log('info', `[ado] PR polling skipped for ${project.name || project.repoName || orgBase} — ${orgBase} throttled`);
+      continue;
+    }
     // Parallelize PR polling within each project (max 5 concurrent to avoid rate limits)
     const CONCURRENCY = 5;
@@ -2241,11 +2307,53 @@ async function fetchSinglePrBuildStatus(project, prNumber) {
 // ─── ADO Throttle Queries ────────────────────────────────────────────────────
-/** Returns true if ADO is throttled and retryAfter hasn't elapsed. Auto-clears when retryAfter passes. */
-const isAdoThrottled = () => _adoThrottle.isThrottled();
+/**
+ * Report whether ADO is currently throttled.
+ * - With `orgBase`: consults only that org's tracker; an org that has never
+ *   been tracked is reported as not throttled.
+ * - Without `orgBase`: true as soon as any tracked org reports throttled
+ *   (back-compat OR semantics for existing call sites).
+ */
+const isAdoThrottled = (orgBase) => {
+  if (orgBase == null) {
+    return Array.from(_adoThrottlesByOrg.values()).some((t) => t.isThrottled());
+  }
+  const forOrg = _adoThrottlesByOrg.get(canonicalAdoOrgKey(orgBase));
+  return forOrg ? forOrg.isThrottled() : false;
+};
-/** Returns a snapshot of the current throttle state. Calls isAdoThrottled() for a fresh value. */
-const getAdoThrottleState = () => _adoThrottle.getState();
+/**
+ * Snapshot the throttle state.
+ *  - getAdoThrottleState(orgBase) → that org's tracker state, or a zero-state
+ *    `{ throttled: false, retryAfter: 0, consecutiveHits: 0 }` for an org with
+ *    no tracker yet.
+ *  - getAdoThrottleState() → aggregate with back-compat fields
+ *    (`throttled` = OR, `retryAfter` = max, `consecutiveHits` = sum) plus a
+ *    `perOrg` object keyed by canonical orgBase.
+ */
+const getAdoThrottleState = (orgBase) => {
+  if (orgBase != null) {
+    const forOrg = _adoThrottlesByOrg.get(canonicalAdoOrgKey(orgBase));
+    return forOrg ? forOrg.getState() : { throttled: false, retryAfter: 0, consecutiveHits: 0 };
+  }
+  const aggregate = { throttled: false, retryAfter: 0, consecutiveHits: 0, perOrg: {} };
+  _adoThrottlesByOrg.forEach((tracker, key) => {
+    const snapshot = tracker.getState();
+    aggregate.perOrg[key] = snapshot;
+    if (snapshot.throttled) aggregate.throttled = true;
+    if (snapshot.retryAfter > aggregate.retryAfter) aggregate.retryAfter = snapshot.retryAfter;
+    aggregate.consecutiveHits += snapshot.consecutiveHits;
+  });
+  return aggregate;
+};
+/** Collect every tracked org's state into a plain object keyed by canonical orgBase. */
+const getAdoThrottleStateAll = () => {
+  const byOrg = {};
+  _adoThrottlesByOrg.forEach((tracker, key) => {
+    byOrg[key] = tracker.getState();
+  });
+  return byOrg;
+};
 /**
  * Query ADO for an open PR on a specific branch.
@@ -2263,13 +2371,13 @@ async function findOpenPrOnBranch(project, branch) {
     logMissingAdoRepository(project, 'ADO branch PR lookup');
     return null;
   }
-  if (isAdoThrottled()) {
-    log('debug', `[ado] Skipping branch PR lookup for ${project.name || project.repoName || 'unknown project'}:${branch} — throttled`);
+  const orgBase = shared.getAdoOrgBase(project);
+  if (isAdoThrottled(orgBase)) {
+    log('debug', `[ado] Skipping branch PR lookup for ${project.name || project.repoName || 'unknown project'}:${branch} — ${orgBase} throttled`);
     return null;
   }
   const token = await getAdoToken();
   if (!token) return null;
-  const orgBase = shared.getAdoOrgBase(project);
   const sourceRef = encodeURIComponent(`refs/heads/${branch}`);
   const url = `${orgBase}/${project.adoProject}/_apis/git/repositories/${encodeURIComponent(adoRepositoryId)}/pullrequests?searchCriteria.status=active&searchCriteria.sourceRefName=${sourceRef}&api-version=7.1`;
   const data = await adoFetch(url, token);
@@ -2280,14 +2388,17 @@ async function findOpenPrOnBranch(project, branch) {
   return { prNumber, url: prUrl };
 }
-/** Reset throttle state — exported for testing only. */
+/** Reset throttle state — exported for testing only. Clears the entire per-org Map. */
 function _resetAdoThrottle() {
-  _adoThrottle._reset();
+  _adoThrottlesByOrg.clear();
 }
-/** Set throttle state directly — exported for testing only. */
-function _setAdoThrottleForTest(state) {
-  _adoThrottle._setForTest(state);
+/** Set throttle state directly — exported for testing only.
+ *  Default orgBase keeps back-compat with arg-less callers that just want
+ *  "some org is throttled" semantics through isAdoThrottled() / getAdoThrottleState(). */
+function _setAdoThrottleForTest(state, orgBase = 'dev.azure.com/__test__') {
+  const tracker = getAdoThrottleForOrg(orgBase);
+  tracker._setForTest(state);
 }
 /** Inject a token into the cache — exported for testing only.
@@ -2476,6 +2587,7 @@ module.exports = {
   isAdoAuthError, // exported for testing
   isAdoThrottled,
   getAdoThrottleState,
+  getAdoThrottleStateAll,
   fetchAdoPrMetadata,
   fetchSinglePrBuildStatus,
   findOpenPrOnBranch,

package/engine/shared.js CHANGED Viewed

@@ -2242,6 +2242,7 @@ const ENGINE_DEFAULTS = {
   autoFixHumanComments: true, // auto-dispatch fix agents for actionable human PR comments
   autoConsolidateMemory: false, // opt-in: periodically spawn engine/kb-sweep-runner.js from the tick loop (4h cadence). Inbox→notes consolidation already runs every tick via consolidateInbox; this flag only controls the KB sweep.
   prNoOpFixPauseAttempts: 2, // pause one PR automation cause after repeated no-op fixes for unchanged evidence
+  quarantineAutoRecoveryMax: 2, // #2996 follow-up: cap on auto-flipping WORKTREE_DIRTY/WORKTREE_DIVERGENT failures back to pending (the quarantine is self-healing so the next dispatch starts clean; the cap prevents infinite loops if quarantine itself keeps failing).
   completionReportRetentionDays: 90, // retain completion report sidecars beyond capped dispatch history
   completionReportMaxFiles: 5000, // hard cap for completion report sidecars during cleanup
   // P-bfa2c-cors-wildcard: extra Origins permitted to receive an
@@ -2254,6 +2255,22 @@ const ENGINE_DEFAULTS = {
   allowedDashboardOrigins: [],
   meetingRoundTimeout: 900000, // 15min per meeting round — soft signal; logs a "still waiting" warning each tick
   meetingRoundHardTimeout: 3600000, // 60min hard backstop — non-terminal participants are marked failed and the round advances. Prevents permanent stalls if an agent's dispatch never spawns or its completion gets dropped.
+  // W-mq066js7000fff1f-c (steering Gap B): max wall-clock a steering message may
+  // sit deferred (runtime hasn't emitted a resumable checkpoint yet — Copilot
+  // pre-first-checkpoint, etc.). Past this window the message is flagged
+  // stranded: `[steering-warn]` line on live-output, `_steeringStranded=true` on
+  // the active dispatch row, and the steering store (when present) marked
+  // `status='stranded'`. Default 15min; clamp 60_000..14_400_000 (1 min..4 h).
+  steeringDeferredMaxMs: 900000,
+  // W-mq066js7000fff1f-c (steering Gap C): cap on graceful+escalation kill
+  // attempts after a steering kill is issued. Ladder (between attempts): 30s →
+  // 60s → 120s, last interval reused. attempt 1 = killGracefully; attempts 2..cap
+  // = platform-specific hard kill (taskkill /F /T on Windows, descendant tree +
+  // pkill on Unix); after cap is reached and the process is still alive, the
+  // engine gives up with a `[steering-stuck]` log + non-actionable inbox notice
+  // so the agent surfaces in the dashboard for operator intervention. Default 3;
+  // clamp 1..5.
+  steeringMaxKillRetries: 3,
   evalLoop: true, // enable review→fix loop after implementation completes
   evalMaxCost: null, // USD ceiling per work item across all eval iterations; null = no limit (gather baseline data first)
   maxRetries: 3, // max dispatch retries before marking work item as failed

package/engine/timeout.js CHANGED Viewed

@@ -54,8 +54,87 @@ function checkIdleThreshold(config) {
 // ─── Steering Checker ────────────────────────────────────────────────────────
-// How long to wait for a steered agent to exit before retrying the kill
-const STEERING_KILL_RETRY_MS = 30000;
+// W-mq066js7000fff1f-c (Gap C): kill-retry escalation ladder. Intervals between
+// successive kill attempts after a steering kill issues. Attempt i (1-indexed)
+// uses STEERING_KILL_INTERVALS_MS[min(i-1, len-1)]; last value repeats so cap=5
+// keeps stepping every 120s. cap=1 = one graceful retry only (then give-up).
+// Values are milliseconds (30s, 60s, 120s) and are compared against
+// Date.now() deltas by the kill ladder.
+const STEERING_KILL_INTERVALS_MS = [30000, 60000, 120000];
+// Set of steering-store statuses still "in flight" — Gap G ignores entries that
+// have already terminated (delivered/dropped/etc). Membership is tested with
+// the entry's `status` coerced to a string.
+const STEERING_ACTIVE_STATUSES = new Set(['queued', 'live_kill', 'deferred', 're_spawning']);
+// W-mq066js7000fff1f-c (Gap G): lazy require for engine/steering-store.js.
+// Sibling branch -d ships the store; this branch ships independently and no-ops
+// when the module is absent. The outcome (module or null) is cached after the
+// first attempt. Only a MODULE_NOT_FOUND for steering-store itself is silent;
+// any other load failure (syntax/runtime error, or a missing dependency
+// *inside* the store) is logged at warn level and then treated as "store
+// absent" so the steering checker keeps running.
+let _steeringStoreLoaded = false;
+let _steeringStore = null;
+/**
+ * True only when `err` says that ./steering-store itself cannot be resolved.
+ * Node appends a "Require stack" to the message; when a dependency *of*
+ * steering-store.js is missing, that stack mentions steering-store.js, so only
+ * the first line of the message (the module that was not found) is inspected.
+ */
+function _isSteeringStoreMissingError(err) {
+  if (!err || err.code !== 'MODULE_NOT_FOUND') return false;
+  const firstLine = String(err.message || '').split('\n', 1)[0];
+  return /steering-store/.test(firstLine);
+}
+/**
+ * Load engine/steering-store.js on first call and cache the result.
+ * @returns {object|null} the store module, or null when it is absent or failed to load
+ */
+function getSteeringStore() {
+  if (_steeringStoreLoaded) return _steeringStore;
+  _steeringStoreLoaded = true;
+  try {
+    _steeringStore = require('./steering-store');
+  } catch (err) {
+    if (!_isSteeringStoreMissingError(err)) {
+      // Logging is best-effort: a logger failure must not break store loading.
+      try { log('warn', `Steering: failed to load engine/steering-store.js: ${err.message}`); } catch {}
+    }
+    _steeringStore = null;
+  }
+  return _steeringStore;
+}
+// Test-only: forget the cached load result so the next getSteeringStore()
+// call attempts the require again.
+function _resetSteeringStoreCacheForTest() {
+  _steeringStore = null;
+  _steeringStoreLoaded = false;
+}
+// Test-only: install `store` as the cached steering store and mark it loaded,
+// so getSteeringStore() returns it without attempting the lazy require. Pass
+// `null` to simulate the module-absent case without touching require.cache.
+function _setSteeringStoreForTest(store) {
+  _steeringStore = store;
+  _steeringStoreLoaded = true;
+}
+/**
+ * Derive a steering message id from a store entry or a file path.
+ * Prefers a truthy `entry.id` (stringified); otherwise takes the digits that
+ * follow the `steering-` prefix of the file's basename.
+ * @param {object|string} entry entry with `id`/`path`, or a path string
+ * @returns {string|null} the id, or null when none can be derived
+ */
+function _steeringIdForEntry(entry) {
+  if (!entry) return null;
+  if (entry.id) return String(entry.id);
+  const candidate = entry.path || entry;
+  if (typeof candidate !== 'string') return null;
+  const match = /^steering-(\d+)/.exec(path.basename(candidate));
+  return match === null ? null : match[1];
+}
+/**
+ * Invoke `store[fn](...args)` on the steering store, never throwing.
+ * @param {string} fn method name on the store
+ * @returns the method's return value, or null when the store is absent, the
+ *   method is not a function, or the call throws (the failure is logged at warn)
+ */
+function _safeStoreCall(fn, ...args) {
+  const store = getSteeringStore();
+  if (typeof store?.[fn] !== 'function') return null;
+  try {
+    return store[fn](...args);
+  } catch (err) {
+    try { log('warn', `Steering store ${fn} failed: ${err.message}`); } catch {}
+    return null;
+  }
+}
+/**
+ * Resolve `engine.steeringDeferredMaxMs` to a usable number of milliseconds.
+ * A missing, null or blank-string setting uses the ENGINE_DEFAULTS value; a
+ * non-numeric setting returns the default as-is; everything else is clamped
+ * to 60_000..14_400_000 (1 min..4 h).
+ * @param {object} [config] engine config; reads `config.engine.steeringDeferredMaxMs`
+ * @returns {number} max deferred wait in ms
+ */
+function _clampSteeringDeferredMaxMs(config) {
+  const configured = config?.engine?.steeringDeferredMaxMs;
+  // Number('') is 0, which would silently clamp a blank settings field to the
+  // 60s minimum; treat blank strings like an unset value instead.
+  const isBlank = configured == null || (typeof configured === 'string' && configured.trim() === '');
+  const raw = Number(isBlank ? ENGINE_DEFAULTS.steeringDeferredMaxMs : configured);
+  if (!Number.isFinite(raw)) return ENGINE_DEFAULTS.steeringDeferredMaxMs;
+  return Math.max(60000, Math.min(14400000, raw));
+}
+/**
+ * Resolve `engine.steeringMaxKillRetries` to an integer kill-attempt cap.
+ * A missing, null or blank-string setting uses the ENGINE_DEFAULTS value; a
+ * non-numeric setting returns the default as-is; everything else is rounded
+ * and clamped to 1..5.
+ * @param {object} [config] engine config; reads `config.engine.steeringMaxKillRetries`
+ * @returns {number} kill-attempt cap
+ */
+function _clampSteeringMaxKillRetries(config) {
+  const configured = config?.engine?.steeringMaxKillRetries;
+  // Number('') is 0, which would silently clamp a blank settings field to a
+  // single retry; treat blank strings like an unset value instead.
+  const isBlank = configured == null || (typeof configured === 'string' && configured.trim() === '');
+  const raw = Number(isBlank ? ENGINE_DEFAULTS.steeringMaxKillRetries : configured);
+  if (!Number.isFinite(raw)) return ENGINE_DEFAULTS.steeringMaxKillRetries;
+  return Math.max(1, Math.min(5, Math.round(raw)));
+}
+/**
+ * Wait interval (ms) for a 0-based rung of the kill ladder. The index is
+ * clamped into the bounds of STEERING_KILL_INTERVALS_MS, so negative values
+ * use the first interval and values past the end reuse the last one.
+ */
+function _steeringKillIntervalMs(attemptIdx) {
+  const lastIdx = STEERING_KILL_INTERVALS_MS.length - 1;
+  const rung = Math.max(0, Math.min(lastIdx, attemptIdx));
+  return STEERING_KILL_INTERVALS_MS[rung];
+}
+/**
+ * Best-effort append of one line to `<AGENTS_DIR>/<agentId>/live-output.log`.
+ * A trailing newline is added when `line` lacks one. Any failure (including a
+ * missing directory) is swallowed — live output is optional.
+ */
+function _appendLiveOutputLine(agentId, line) {
+  try {
+    const terminated = line.endsWith('\n') ? line : `${line}\n`;
+    fs.appendFileSync(path.join(AGENTS_DIR, agentId, 'live-output.log'), terminated);
+  } catch { /* optional */ }
+}
 function runtimeSupportsMidRunSessionId(info) {
   if (typeof info?.midRunSessionId === 'boolean') return info.midRunSessionId;
@@ -76,6 +155,16 @@ function rememberDeferredSteering(info, steerEntry) {
   const existing = new Set(Array.isArray(info._deferredSteeringFiles) ? info._deferredSteeringFiles : []);
   if (steerEntry?.path) existing.add(steerEntry.path);
   info._deferredSteeringFiles = Array.from(existing);
+  // Stamp per-entry deferred timestamp for the Gap B stranded-sweep. Map keyed
+  // by file path so multiple deferred messages each track their own clock.
+  if (steerEntry?.path) {
+    if (!info._deferredSteeringQueuedAt || typeof info._deferredSteeringQueuedAt !== 'object') {
+      info._deferredSteeringQueuedAt = {};
+    }
+    if (!info._deferredSteeringQueuedAt[steerEntry.path]) {
+      info._deferredSteeringQueuedAt[steerEntry.path] = Date.now();
+    }
+  }
 }
 function deferSteeringUntilCheckpoint(id, info, steerEntry) {
@@ -97,14 +186,157 @@ function deferSteeringUntilCheckpoint(id, info, steerEntry) {
   } catch { /* optional */ }
 }
+// W-mq066js7000fff1f-c (Gap B): per-tick sweep over `_deferredSteeringQueuedAt`
+// entries that are still in `_deferredSteeringFiles` (source-of-truth) and have
+// no sessionId after `engine.steeringDeferredMaxMs`. Each match: append
+// `[steering-warn]` to live-output, mark `_steeringStranded: true` on the matching
+// dispatch active row, and `store.updateStatus(id, 'stranded')` (no-op if store
+// absent). `info._deferredSteeringStrandedFiles` (Set) guards against re-firing
+// the same warning every tick while the file remains stranded.
+//
+// Params: `activeProcesses` is iterated as [dispatchId, info] pairs; `config`
+// supplies `engine.steeringDeferredMaxMs` (clamped). Returns nothing.
+function checkDeferredStranded(activeProcesses, config) {
+  const maxMs = _clampSteeringDeferredMaxMs(config);
+  const now = Date.now();
+  for (const [dispatchId, info] of activeProcesses) {
+    // A process that already has a sessionId is skipped by this sweep.
+    if (!info || info.sessionId) continue;
+    const queuedAt = info._deferredSteeringQueuedAt;
+    if (!queuedAt || typeof queuedAt !== 'object') continue;
+    const deferredSet = new Set(Array.isArray(info._deferredSteeringFiles) ? info._deferredSteeringFiles : []);
+    if (deferredSet.size === 0) continue;
+    if (!info._deferredSteeringStrandedFiles) info._deferredSteeringStrandedFiles = new Set();
+    const stranded = info._deferredSteeringStrandedFiles;
+    for (const filePath of deferredSet) {
+      if (stranded.has(filePath)) continue;
+      const ts = Number(queuedAt[filePath]);
+      // Files without a usable timestamp, or still inside the window, are left alone.
+      if (!Number.isFinite(ts) || now - ts <= maxMs) continue;
+      const entryId = _steeringIdForEntry({ path: filePath });
+      const ageMin = Math.round((now - ts) / 60000);
+      _appendLiveOutputLine(info.agentId,
+        `\n[steering-warn] Steering message ${entryId || path.basename(filePath)} has been queued for ${ageMin}m without a resumable checkpoint (no sessionId emitted). It will be delivered on the next dispatch.`);
+      log('warn', `Steering: ${info.agentId} (${dispatchId}) deferred steering ${entryId || filePath} stranded after ${ageMin}m — no sessionId`);
+      // Recorded before the dispatch/store updates, so a failure below does
+      // not cause the warning to repeat on the next tick.
+      stranded.add(filePath);
+      try {
+        dispatch().mutateDispatch((data) => {
+          for (const row of data.active || []) {
+            if (row && row.id === dispatchId) row._steeringStranded = true;
+          }
+          return data;
+        });
+      } catch (err) {
+        try { log('warn', `Steering: mutateDispatch _steeringStranded failed for ${dispatchId}: ${err.message}`); } catch {}
+      }
+      // The store update needs an id; paths whose basename lacks a
+      // `steering-<digits>` prefix yield none and are skipped here.
+      if (entryId) _safeStoreCall('updateStatus', entryId, 'stranded', { last_error: 'no-session-after-max-defer' });
+    }
+  }
+}
+// W-mq066js7000fff1f-c (Gap C): runs the kill-retry escalation ladder for an
+// already-steered process whose `_steeringAt` is set but which hasn't exited.
+// Caller is `checkSteering` per-tick loop; `info` is the `activeProcesses`
+// record. Returns true if any rung fired (caller should `continue` and not
+// scan for new steering messages for this dispatch).
+//
+// State kept on `info`: `_steeringKillAttempts` (rungs fired so far),
+// `_steeringLastRetryAt` (timestamp of the last rung) and `_steeringGaveUp`
+// (set once the give-up notice has been issued). Returns false when no kill is
+// in flight, the ladder already gave up, or the current wait has not elapsed.
+function _runSteeringKillLadder(id, info, config) {
+  if (!info._steeringAt || info._steeringGaveUp) return false;
+  const cap = _clampSteeringMaxKillRetries(config);
+  const attempts = Number(info._steeringKillAttempts) || 0;
+  // Reference time: anchor on `_steeringAt` for attempt 0 (the bookkeeping
+  // sequence), then on `_steeringLastRetryAt` for subsequent rungs.
+  const refTime = attempts === 0 ? info._steeringAt : (info._steeringLastRetryAt || info._steeringAt);
+  const wait = _steeringKillIntervalMs(attempts);
+  if (Date.now() - refTime <= wait) return false;
+  // attempts < cap → run kill attempt (i=0 graceful, else platform escalation).
+  if (attempts < cap) {
+    if (attempts === 0) {
+      log('warn', `Steering: ${info.agentId} (${id}) didn't exit ${Math.round(wait / 1000)}s after kill — retrying gracefully`);
+      try { shared.killGracefully(info.proc, 5000); }
+      catch (err) { try { log('warn', `Steering kill retry (graceful) failed for ${info.agentId}: ${err.message}`); } catch {} }
+    } else {
+      log('warn', `Steering: ${info.agentId} (${id}) survived attempt ${attempts} — escalating (attempt ${attempts + 1}/${cap})`);
+      _escalatePlatformKill(info);
+    }
+    // The rung counts as fired even when the kill call itself threw.
+    info._steeringKillAttempts = attempts + 1;
+    info._steeringLastRetryAt = Date.now();
+    return true;
+  }
+  // attempts === cap → give up: log + non-actionable inbox notice.
+  // Note the give-up only happens after one more wait interval has elapsed
+  // following the final kill attempt (the elapsed-time check above).
+  const minutes = Math.round((Date.now() - info._steeringAt) / 60000);
+  log('error', `Steering: ${info.agentId} (${id}) did not exit after ${cap} kill attempts (${minutes}m since kill) — giving up`);
+  _appendLiveOutputLine(info.agentId,
+    `\n[steering-stuck] Process did not exit after ${cap} kill attempts over ~${minutes}m. The engine is giving up automatic escalation; operator intervention may be required.`);
+  try {
+    // System notification body is framed as non-actionable: prefix `[engine-system]`
+    // so an agent that re-consumes it from the inbox treats it as metadata. The
+    // primary signal is the live-output line above (operator-facing) and the
+    // give-up log; this inbox write is a belt-and-suspenders notification.
+    steering.writeSteeringMessage(info.agentId,
+      `[engine-system] Steering kill escalation gave up after ${cap} attempts (~${minutes}m). This is an automated notification — do not act on it; the engine has surfaced this dispatch for operator attention.`,
+      { source: 'engine' });
+  } catch (err) {
+    try { log('warn', `Steering: writeSteeringMessage for give-up failed: ${err.message}`); } catch {}
+  }
+  // `_steeringGaveUp` makes every later call return false at the first guard.
+  info._steeringGaveUp = true;
+  info._steeringKillAttempts = attempts + 1;
+  info._steeringLastRetryAt = Date.now();
+  return true;
+}
+// Hard-kill the process tree rooted at `info.proc.pid`. No-op when there is no
+// pid. Every step is best-effort: failures are swallowed because the target
+// (or its children) may already be dead. Returns nothing.
+function _escalatePlatformKill(info) {
+  const pid = info?.proc?.pid;
+  if (!pid) return;
+  if (process.platform === 'win32') {
+    // Windows: one taskkill invocation with /F /T for the given PID.
+    try { shared.exec(`taskkill /F /T /PID ${pid}`, { timeout: 3000 }); }
+    catch { /* may already be dead */ }
+    return;
+  }
+  // Unix: collect descendant PIDs (deepest first), SIGKILL each, then a final
+  // `pkill -KILL -P <pid>` sweep for anything pgrep missed.
+  const descendants = _collectDescendantPids(pid);
+  for (const child of descendants) {
+    try { process.kill(child, 'SIGKILL'); } catch { /* gone */ }
+  }
+  try { shared.exec(`pkill -KILL -P ${pid}`, { timeout: 3000 }); }
+  catch { /* children may already be dead */ }
+  // The root process is killed last, after its descendants.
+  try { process.kill(pid, 'SIGKILL'); } catch { /* parent may already be dead */ }
+}
+// Enumerate the descendants of `rootPid` by repeatedly running `pgrep -P`.
+// The root itself is not included. Returns [] when the root has no children
+// or pgrep fails for it.
+function _collectDescendantPids(rootPid) {
+  // Returns descendant PIDs ordered deepest-first so leaves die before parents
+  // (otherwise grandchildren get re-parented to init and disappear from pgrep -P
+  // <pid>). BFS the tree, then reverse.
+  const visited = new Set();
+  const queue = [rootPid];
+  const order = [];
+  while (queue.length) {
+    const current = queue.shift();
+    if (visited.has(current)) continue;
+    visited.add(current);
+    let out = '';
+    try { out = String(shared.exec(`pgrep -P ${current}`, { timeout: 2000 }) || ''); }
+    catch { /* no children or pgrep missing */ continue; }
+    // Keep only positive finite numbers from the whitespace-separated output.
+    const children = out.split(/\s+/).map(s => Number(s)).filter(n => Number.isFinite(n) && n > 0);
+    for (const child of children) {
+      if (visited.has(child)) continue;
+      order.push(child);
+      queue.push(child);
+    }
+  }
+  return order.reverse();
+}
 function checkSteering(config) {
   const activeProcesses = engine().activeProcesses;
+  // Gap B: stranded-deferred sweep runs BEFORE the per-dispatch scan so a
+  // stranded info still records its warning even if it has no new inbox file.
+  try { checkDeferredStranded(activeProcesses, config); }
+  catch (err) { try { log('warn', `Steering: checkDeferredStranded failed: ${err.message}`); } catch {} }
   for (const [id, info] of activeProcesses) {
-    // Gap A (W-mq066js7000fff1f-b): scan agents/<id>/steering-ack/ for any
-    // ack files the agent has dropped since the last tick. Each <id>.ack
-    // removes its matching inbox file (lookup via frontmatter steerId), so
-    // unread/pending iteration below naturally skips messages already
-    // acknowledged via the explicit contract.
+    // Gap A (W-mq066js7000fff1f-b, master): scan agents/<id>/steering-ack/
+    // for any ack files the agent has dropped since the last tick. Each
+    // <id>.ack removes its matching inbox file (lookup via frontmatter
+    // steerId), so unread/pending iteration below naturally skips messages
+    // already acknowledged via the explicit contract.
     let ackedFromDir = [];
     try {
       ackedFromDir = steering.ackSteeringFromAckDir(info.agentId);
@@ -140,20 +372,12 @@ function checkSteering(config) {
       }
     }
-    // Recovery: if steering kill hasn't resulted in process exit within 30s, force-retry.
-    // This catches cases where killImmediate silently failed (e.g., orphaned subprocess
-    // on Unix where SIGKILL only hit spawn-agent.js, not the Claude CLI tree).
-    if (info._steeringAt && Date.now() - info._steeringAt > STEERING_KILL_RETRY_MS) {
-      if (!info._steeringRetried) {
-        log('warn', `Steering: ${info.agentId} (${id}) didn't exit ${STEERING_KILL_RETRY_MS / 1000}s after kill — retrying`);
-        shared.killImmediate(info.proc);
-        // On Unix, also try to kill children that may have been orphaned
-        if (process.platform !== 'win32' && info.proc?.pid) {
-          try { shared.exec(`pkill -KILL -P ${info.proc.pid}`, { timeout: 3000 }); } catch { /* children may already be dead */ }
-        }
-        info._steeringRetried = true;
-      }
-      continue;
+    // Gap C (this PR): if a steering kill is in-flight, run the escalation
+    // ladder. The ladder owns all retry side effects and the give-up exit,
+    // and replaces the old one-shot STEERING_KILL_RETRY_MS retry path.
+    if (info._steeringAt && !info._steeringGaveUp) {
+      const fired = _runSteeringKillLadder(id, info, config);
+      if (fired) continue;
     }
     // Skip if already being steered (prevents double-kill race)
@@ -217,6 +441,43 @@ function checkSteering(config) {
   }
 }
+// W-mq066js7000fff1f-c (Gap G): when engine.js#onAgentClose's "No conversation
+// found" branch is about to unlink the session.json, any steering-store entries
+// still in flight against that purged sessionId would silently strand. This
+// helper drops them with `status='dropped'` + a `[steering-failed]` live-output
+// line so the human knows to re-send. Caller MUST invoke this BEFORE the unlink.
+// No-ops cleanly when the steering-store module is absent (-d branch unmerged). Returns { dropped, skipped }: `dropped` lists the entry ids marked dropped; `skipped` is true when an argument is missing, the store is unusable, or listForAgent throws.
+function dropSteeringForPurgedSession(agentId, sessionId, liveOutputPath) {
+  if (!agentId || !sessionId) return { dropped: [], skipped: true }; // nothing to match without both ids
+  const store = getSteeringStore();
+  if (!store || typeof store.listForAgent !== 'function') return { dropped: [], skipped: true }; // store missing or lacks listForAgent
+  let entries = [];
+  try { entries = store.listForAgent(agentId) || []; } // a falsy listing is treated as empty
+  catch (err) {
+    try { log('warn', `Steering: listForAgent for ${agentId} failed: ${err.message}`); } catch {} // logging itself is best-effort here
+    return { dropped: [], skipped: true };
+  }
+  const dropped = [];
+  for (const entry of entries) {
+    if (!entry || !STEERING_ACTIVE_STATUSES.has(String(entry.status || ''))) continue; // only entries whose status is in STEERING_ACTIVE_STATUSES are candidates
+    const entrySessionId = entry._steeringSessionId || entry.sessionId || entry.session_id; // first truthy of the three session-id field spellings
+    if (entrySessionId !== sessionId) continue; // entry is bound to a different session
+    const entryId = entry.id || _steeringIdForEntry(entry); // falls back to _steeringIdForEntry when entry.id is falsy
+    if (!entryId) continue; // no id to update or report, so skip
+    _safeStoreCall('updateStatus', entryId, 'dropped', { last_error: 'session-purged' });
+    const line = `\n[steering-failed] Session ${sessionId} was purged by runtime; message ${entryId} dropped, please re-send.\n`;
+    if (liveOutputPath) { // explicit path supplied: append to that file directly
+      try { fs.appendFileSync(liveOutputPath, line); }
+      catch { /* optional */ }
+    } else { // no path supplied: delegate to _appendLiveOutputLine with the agent id
+      _appendLiveOutputLine(agentId, line);
+    }
+    log('warn', `Steering: dropped message ${entryId} for ${agentId} (session ${sessionId} purged)`);
+    dropped.push(entryId);
+  }
+  return { dropped, skipped: false }; // skipped:false means the listing was scanned, even if nothing matched
+}
 // ─── Timeout Checker ─────────────────────────────────────────────────────────
 function trackedProcessPid(procInfo) {
@@ -676,7 +937,11 @@ module.exports = {
   checkTimeouts,
   checkSteering,
   checkIdleThreshold,
+  dropSteeringForPurgedSession,
   isOsPidAliveForDispatch,
   parseProcessExitCode, terminalResultIndicatesError, parseTerminalResultFallbackExitCode, // exported for testing
   readFileTail, runtimeSupportsMidRunSessionId, // exported for testing
+  // exported for testing
+  rememberDeferredSteering, checkDeferredStranded, _runSteeringKillLadder, _collectDescendantPids,
+  _resetSteeringStoreCacheForTest, _setSteeringStoreForTest,
 };

package/engine.js CHANGED Viewed

@@ -123,7 +123,7 @@ const { mutateDispatch, addToDispatch, addToDispatchWithValidation, isRetryableF
 // ─── Timeout / Steering / Idle (extracted to engine/timeout.js) ──────────────
-const { checkTimeouts, checkSteering, checkIdleThreshold } = require('./engine/timeout');
+const { checkTimeouts, checkSteering, checkIdleThreshold, dropSteeringForPurgedSession } = require('./engine/timeout');
 const steering = require('./engine/steering');
 // ─── Cleanup (extracted to engine/cleanup.js) ────────────────────────────────
@@ -528,6 +528,11 @@ function promoteCheckpointSteeringForClose(agentId, procInfo, runtime, liveOutpu
   const checkpointEntries = mergePendingSteeringEntries(pendingDeferred, lateCheckpoint);
   if (checkpointEntries.length === 0) {
     delete procInfo._deferredSteeringFiles;
+    // Gap B housekeeping: drop the per-entry deferred timestamps + stranded
+    // guard so a future spawn under the same procInfo (test harnesses + engine
+    // re-attach) starts with a clean slate.
+    delete procInfo._deferredSteeringQueuedAt;
+    delete procInfo._deferredSteeringStrandedFiles;
     return { status: 'none', entries: [] };
   }
@@ -548,6 +553,8 @@ function promoteCheckpointSteeringForClose(agentId, procInfo, runtime, liveOutpu
   procInfo._steeringEntry = checkpointEntries;
   procInfo._steeringDeferredCheckpoint = true;
   delete procInfo._deferredSteeringFiles;
+  delete procInfo._deferredSteeringQueuedAt;
+  delete procInfo._deferredSteeringStrandedFiles;
   // W-mq066js7000fff1f-a (Gap D): transition each promoted entry to
   // 're_spawning' — captures that the engine has committed to deliver
   // these messages via session resume at the natural checkpoint.
@@ -2683,9 +2690,14 @@ async function spawnAgent(dispatchItem, config) {
     updateAgentStatus(id, code === 0 ? AGENT_STATUS.FINISHED : AGENT_STATUS.FAILED,
       code === 0 ? 'Agent completed successfully' : `Agent exited with code ${code}`);
-    // Clear stale session if resume failed — prevents burning all retries on the same bad session
+    // Clear stale session if resume failed — prevents burning all retries on the same bad session.
+    // W-mq066js7000fff1f-c (Gap G): drop any in-flight steering-store entries
+    // bound to the purged session BEFORE unlinking session.json, so the human
+    // sees a [steering-failed] line + can re-send instead of silently watching
+    // their message strand against a dead session.
     if (code !== 0 && cachedSessionId && stderr.includes('No conversation found')) {
       log('warn', `Stale session ${cachedSessionId} for ${agentId} — clearing session.json`);
+      try { dropSteeringForPurgedSession(agentId, cachedSessionId, liveOutputPath); } catch {}
       try { shared.safeUnlink(path.join(AGENTS_DIR, agentId, 'session.json')); } catch {}
     }
@@ -5807,6 +5819,52 @@ function discoverFromWorkItems(config, project) {
       }
     }
+    // #2996 follow-up: auto-recover from WORKTREE_DIRTY / WORKTREE_DIVERGENT quarantines.
+    // The quarantine is self-healing (dir renamed away, ref backed up, branch reset to
+    // origin), so the next dispatch starts from a clean worktree. Auto-flip to pending up
+    // to ENGINE_DEFAULTS.quarantineAutoRecoveryMax times so WIs without downstream deps and
+    // no human watching don't rot in `failed`. _failureClass is stamped on the WI only by
+    // the force-demote path (engine/dispatch.js); the standard non-retryable path leaves
+    // it on the dispatch record but embeds the class name in failReason — so detection
+    // covers both shapes.
+    if (item.status === WI_STATUS.FAILED && !isItemCompleted(item)) {
+      const fr = String(item.failReason || '');
+      const isQuarantineFail = item._failureClass === FAILURE_CLASS.WORKTREE_DIRTY
+        || item._failureClass === FAILURE_CLASS.WORKTREE_DIVERGENT
+        || /\bWORKTREE_DIRTY\b/.test(fr)
+        || /\bWORKTREE_DIVERGENT\b/.test(fr);
+      if (isQuarantineFail) {
+        item._quarantineRecoveryCount = (item._quarantineRecoveryCount || 0) + 1;
+        const cap = ENGINE_DEFAULTS.quarantineAutoRecoveryMax || 2;
+        if (item._quarantineRecoveryCount <= cap) {
+          const prevReason = item.failReason;
+          item.status = WI_STATUS.PENDING;
+          delete item.failReason;
+          delete item.failedAt;
+          delete item._failureClass;
+          delete item._lastDispatchResult;
+          delete item.dispatched_at;
+          delete item.dispatched_to;
+          delete item._pendingReason;
+          log('info', `Quarantine auto-recovery: ${item.id} → pending (attempt ${item._quarantineRecoveryCount}/${cap}, was: ${prevReason})`);
+          needsWrite = true;
+          // Intentionally do NOT `continue` — fall through to the normal dispatch path
+          // (self-heal of stale completed dedupe + cooldown + dispatch evaluation) just
+          // like the dependency-failed recovery block above. The dedupe + cooldown gates
+          // still throttle how often the next dispatch fires.
+        } else {
+          // Cap hit: leave failed, log once via the gave-up flag so the warn doesn't
+          // fire every tick. _retriesByAgent is untouched — quarantine failures are
+          // environmental, not agent failures.
+          if (!item._quarantineRecoveryGaveUp) {
+            log('warn', `Quarantine auto-recovery: ${item.id} hit cap (${cap}) — leaving failed for human intervention`);
+            item._quarantineRecoveryGaveUp = true;
+            needsWrite = true;
+          }
+        }
+      }
+    }
     if (item.status !== WI_STATUS.QUEUED && item.status !== WI_STATUS.PENDING) continue;
     // Dependency gate: skip items whose depends_on are not yet met; propagate failure

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@yemi33/minions",
-  "version": "0.1.2120",
+  "version": "0.1.2122",
   "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
   "bin": {
     "minions": "bin/minions.js"