@yemi33/minions 0.1.2120 → 0.1.2122

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -274,6 +274,8 @@ async function openSettings() {
274
274
  settingsField('Restart Grace Period', 'set-restartGracePeriod', e.restartGracePeriod || 1200000, 'ms', 'Grace period before orphan detection on restart') +
275
275
  settingsField('Shutdown Timeout', 'set-shutdownTimeout', e.shutdownTimeout || 300000, 'ms', 'Max wait for agents during graceful shutdown') +
276
276
  settingsField('Meeting Round Timeout', 'set-meetingRoundTimeout', e.meetingRoundTimeout || 900000, 'ms', 'Auto-advance meeting round after this') +
277
+ settingsField('Steering Deferred Max', 'set-steeringDeferredMaxMs', e.steeringDeferredMaxMs || 900000, 'ms', 'Max wait for a runtime to emit a resumable checkpoint before a deferred steering message is flagged stranded. After this, the engine warns to live-output, marks _steeringStranded on the dispatch, and (when the steering store is present) sets store status=stranded. Default 15min; range 60s–4h.') +
278
+ settingsField('Steering Max Kill Retries', 'set-steeringMaxKillRetries', e.steeringMaxKillRetries ?? 3, '', 'Cap on graceful+escalation kill attempts after a steering kill is issued. Ladder waits 30s → 60s → 120s between attempts (last interval reused). Attempt 1 is graceful; attempts 2..cap are platform hard kills (taskkill /F /T on Windows; descendant-tree SIGKILL + pkill on Unix). Past cap, the engine gives up with a [steering-stuck] log + inbox notice. Default 3; range 1–5.') +
277
279
  '</div>';
278
280
 
279
281
  const paneWorktree =
@@ -839,6 +841,8 @@ async function saveSettings() {
839
841
  shutdownTimeout: document.getElementById('set-shutdownTimeout').value,
840
842
  restartGracePeriod: document.getElementById('set-restartGracePeriod').value,
841
843
  meetingRoundTimeout: document.getElementById('set-meetingRoundTimeout').value,
844
+ steeringDeferredMaxMs: document.getElementById('set-steeringDeferredMaxMs').value,
845
+ steeringMaxKillRetries: document.getElementById('set-steeringMaxKillRetries').value,
842
846
  operatorLogin: (document.getElementById('set-operatorLogin')?.value ?? '').trim(),
843
847
  autoApprovePlans: document.getElementById('set-autoApprovePlans').checked,
844
848
  evalLoop: document.getElementById('set-evalLoop').checked,
package/dashboard.js CHANGED
@@ -9225,6 +9225,9 @@ What would you like to discuss or change? When you're happy, say "approve" and I
9225
9225
  worktreeCreateTimeout: [60000], worktreeCreateRetries: [0, 3],
9226
9226
  idleAlertMinutes: [1], shutdownTimeout: [30000], restartGracePeriod: [60000],
9227
9227
  meetingRoundTimeout: [60000],
9228
+ // W-mq066js7000fff1f-c (Gap B/C): steering safety-net knobs.
9229
+ steeringDeferredMaxMs: [60000, 14400000],
9230
+ steeringMaxKillRetries: [1, 5],
9228
9231
  versionCheckInterval: [60000],
9229
9232
  prPollStatusEvery: [1], prPollCommentsEvery: [1],
9230
9233
  agentBusyReassignMs: [0],
package/engine/ado.js CHANGED
@@ -698,10 +698,68 @@ function _hasPendingReReviewWi(pr) {
698
698
  let _adoTokenCache = { token: null, expiresAt: 0 };
699
699
  let _adoTokenFailedUntil = 0; // backoff: skip token acquisition calls until this timestamp
700
700
 
701
- // ─── ADO Throttle State ─────────────────────────────────────────────────────
702
- // Tracks rate-limiting (HTTP 429/503) from ADO API responses.
703
- // Uses shared createThrottleTracker factory: backoffMs starts at 60s, doubles, caps at 32 min.
704
- const _adoThrottle = createThrottleTracker({ label: 'ado', baseBackoffMs: 60000, maxBackoffMs: 32 * 60000 });
701
+ // ─── ADO Throttle State (per-org) ───────────────────────────────────────────
702
+ // Tracks rate-limiting (HTTP 429/503) from ADO API responses, isolated per ADO
703
+ // org so a throttle storm on org A doesn't stall PR polling for org B.
704
+ // Each tracker uses createThrottleTracker: backoffMs starts at 60s, doubles,
705
+ // caps at 32 min, with 20% jitter (silently ignored on older shared.js until
706
+ // the jitter foundation lands as W-mq03l6zh0006f0a1-a).
707
+ // W-mq03l6zh0006f0a1-b — Per-org ADO throttle isolation.
708
+ const _adoThrottlesByOrg = new Map();
709
+
710
/**
 * Normalize an org identifier into the key used by the per-org throttle map.
 *
 * - Falsy input yields the shared `dev.azure.com/__unknown__` placeholder.
 * - An http(s) URL is handed to resolveAdoOrgBaseFromUrl.
 * - Anything else is stringified and lowercased as-is (treated as an
 *   already-canonical key).
 *
 * @param {*} orgBaseOrUrl org base URL, API URL, or canonical key
 * @returns {string} map key
 */
function canonicalAdoOrgKey(orgBaseOrUrl) {
  if (!orgBaseOrUrl) {
    return 'dev.azure.com/__unknown__';
  }
  const text = String(orgBaseOrUrl);
  const looksLikeUrl = /^https?:\/\//i.test(text);
  return looksLikeUrl ? resolveAdoOrgBaseFromUrl(text) : text.toLowerCase();
}
719
+
720
/**
 * Reduce an ADO API URL to a stable per-org key.
 *
 * Examples:
 *   https://dev.azure.com/Microsoft/...                       → dev.azure.com/microsoft
 *   https://vsrm.dev.azure.com/Microsoft/...                  → dev.azure.com/microsoft
 *   https://microsoft.visualstudio.com/...                    → dev.azure.com/microsoft
 *   https://microsoft.visualstudio.com/DefaultCollection/...  → dev.azure.com/microsoft
 *   https://microsoft.vsrm.visualstudio.com/...               → dev.azure.com/microsoft
 *
 * Service hosts under `dev.azure.com` and multi-label legacy hosts are folded
 * onto the same key as the org's primary host, so a throttle recorded through
 * one of them is visible under the org's canonical key.
 *
 * @param {string} url absolute URL
 * @returns {string} `dev.azure.com/<org>` for recognized ADO hosts;
 *   `<host>/<first-path-segment>` (or just `<host>`) for any other host;
 *   `dev.azure.com/__unknown__` when the input is empty, unparseable, or has
 *   no org segment — the result is never null.
 */
function resolveAdoOrgBaseFromUrl(url) {
  const UNKNOWN_KEY = 'dev.azure.com/__unknown__';
  const LEGACY_SUFFIX = '.visualstudio.com';
  if (!url) return UNKNOWN_KEY;

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return UNKNOWN_KEY;
  }

  const host = parsed.hostname.toLowerCase();
  const firstSegment = (parsed.pathname.split('/').filter(Boolean)[0] || '').toLowerCase();

  // Modern hosts (including service hosts such as vsrm./vssps./feeds.) carry
  // the org as the first path segment.
  if (host === 'dev.azure.com' || host.endsWith('.dev.azure.com')) {
    return firstSegment ? `dev.azure.com/${firstSegment}` : UNKNOWN_KEY;
  }

  // Legacy hosts carry the org as the FIRST host label; any further labels
  // (e.g. `.vsrm`) name a service, not part of the org.
  if (host.endsWith(LEGACY_SUFFIX)) {
    const org = host.slice(0, -LEGACY_SUFFIX.length).split('.')[0];
    return org ? `dev.azure.com/${org}` : UNKNOWN_KEY;
  }

  // Unknown host shape — derive a stable key from host + first path segment.
  return firstSegment ? `${host}/${firstSegment}` : host;
}
747
+
748
/**
 * Return the throttle tracker for an org, creating and caching it on first use.
 *
 * The map key is canonicalAdoOrgKey(orgBase); a new tracker is built with a
 * 60s base backoff, a 32-minute cap and a 0.2 jitter ratio, labelled
 * `ado:<key>`.
 *
 * @param {*} orgBase org base URL or canonical key
 * @returns {object} the tracker stored in `_adoThrottlesByOrg`
 */
function getAdoThrottleForOrg(orgBase) {
  const mapKey = canonicalAdoOrgKey(orgBase);
  const cached = _adoThrottlesByOrg.get(mapKey);
  if (cached) {
    return cached;
  }
  const created = createThrottleTracker({
    label: `ado:${mapKey}`,
    baseBackoffMs: 60000,
    maxBackoffMs: 32 * 60000,
    jitterRatio: 0.2,
  });
  _adoThrottlesByOrg.set(mapKey, created);
  return created;
}
705
763
 
706
764
  // ─── Auth Failure Tracking ──────────────────────────────────────────────────
707
765
  // Set when pollPrStatus encounters auth errors mid-loop. The engine checks this
@@ -742,6 +800,7 @@ async function adoFetch(url, token, opts = {}) {
742
800
  const body = (typeof opts === 'object' && opts.body) || undefined;
743
801
  const timeout = (typeof opts === 'object' && Number.isFinite(opts.timeout)) ? opts.timeout : 30000;
744
802
  const MAX_RETRIES = ADO_TOKEN_REFRESH_MAX_RETRIES;
803
+ const throttle = getAdoThrottleForOrg(resolveAdoOrgBaseFromUrl(url));
745
804
  const res = await fetch(url, {
746
805
  method,
747
806
  headers: { 'Authorization': `Bearer ${token}`, 'Content-Type': 'application/json' },
@@ -752,8 +811,8 @@ async function adoFetch(url, token, opts = {}) {
752
811
  if (res.status === 429 || res.status === 503) {
753
812
  const retryAfterSec = parseInt(res.headers.get('Retry-After'), 10);
754
813
  const retryAfterMs = (retryAfterSec > 0) ? retryAfterSec * 1000 : 0;
755
- _adoThrottle.recordThrottle(retryAfterMs);
756
- const state = _adoThrottle.getState();
814
+ throttle.recordThrottle(retryAfterMs);
815
+ const state = throttle.getState();
757
816
  throw new Error(`ADO API throttled (${res.status}): retry after ${Math.round((state.retryAfter - Date.now()) / 1000)}s`);
758
817
  }
759
818
  if (!res.ok) throw new Error(`ADO API ${method} ${res.status}: ${res.statusText}`);
@@ -771,12 +830,13 @@ async function adoFetch(url, token, opts = {}) {
771
830
  }
772
831
  const json = JSON.parse(text);
773
832
  // ── Success decay: decrement consecutiveHits, reset when fully recovered ──
774
- _adoThrottle.recordSuccess();
833
+ throttle.recordSuccess();
775
834
  return json;
776
835
  }
777
836
 
778
837
  /** Fetch raw text from ADO API (for build logs which aren't JSON). */
779
838
  async function adoFetchText(url, token) {
839
+ const throttle = getAdoThrottleForOrg(resolveAdoOrgBaseFromUrl(url));
780
840
  const res = await fetch(url, {
781
841
  headers: { 'Authorization': `Bearer ${token}` },
782
842
  signal: AbortSignal.timeout(30000),
@@ -785,8 +845,8 @@ async function adoFetchText(url, token) {
785
845
  if (res.status === 429 || res.status === 503) {
786
846
  const retryAfterSec = parseInt(res.headers.get('Retry-After'), 10);
787
847
  const retryAfterMs = (retryAfterSec > 0) ? retryAfterSec * 1000 : 0;
788
- _adoThrottle.recordThrottle(retryAfterMs);
789
- const state = _adoThrottle.getState();
848
+ throttle.recordThrottle(retryAfterMs);
849
+ const state = throttle.getState();
790
850
  throw new Error(`ADO API throttled (${res.status}): retry after ${Math.round((state.retryAfter - Date.now()) / 1000)}s`);
791
851
  }
792
852
  if (!res.ok) throw new Error(`ADO API ${res.status}: ${res.statusText}`);
@@ -908,6 +968,12 @@ async function forEachActivePr(config, token, callback) {
908
968
  let projectUpdated = 0;
909
969
  const updatedRecords = [];
910
970
  const orgBase = getAdoOrgBase(project);
971
+ // W-mq03l6zh0006f0a1-b — Per-org throttle isolation: skip just this
972
+ // project when its org is rate-limited, keep iterating others.
973
+ if (isAdoThrottled(orgBase)) {
974
+ log('info', `[ado] PR polling skipped for ${project.name || project.repoName || orgBase} — ${orgBase} throttled`);
975
+ continue;
976
+ }
911
977
 
912
978
  // Parallelize PR polling within each project (max 5 concurrent to avoid rate limits)
913
979
  const CONCURRENCY = 5;
@@ -2241,11 +2307,53 @@ async function fetchSinglePrBuildStatus(project, prNumber) {
2241
2307
 
2242
2308
  // ─── ADO Throttle Queries ────────────────────────────────────────────────────
2243
2309
 
2244
- /** Returns true if ADO is throttled and retryAfter hasn't elapsed. Auto-clears when retryAfter passes. */
2245
- const isAdoThrottled = () => _adoThrottle.isThrottled();
2310
/**
 * Report whether ADO is currently throttled.
 *
 * - With `orgBase`: consults only that org's tracker; an org with no tracker
 *   is reported as not throttled.
 * - Without `orgBase` (null/undefined): true as soon as ANY tracked org's
 *   `isThrottled()` returns truthy (OR semantics for arg-less callers).
 *
 * @param {*} [orgBase] org base URL or canonical key
 * @returns {boolean}
 */
const isAdoThrottled = (orgBase) => {
  if (orgBase == null) {
    return Array.from(_adoThrottlesByOrg.values()).some((tracker) => tracker.isThrottled());
  }
  const orgTracker = _adoThrottlesByOrg.get(canonicalAdoOrgKey(orgBase));
  return orgTracker ? orgTracker.isThrottled() : false;
};
2246
2323
 
2247
- /** Returns a snapshot of the current throttle state. Calls isAdoThrottled() for a fresh value. */
2248
- const getAdoThrottleState = () => _adoThrottle.getState();
2324
/**
 * Snapshot the throttle state.
 *
 * - getAdoThrottleState(orgBase) → that org's tracker state. An org with no
 *   tracker yields `{ throttled: false, retryAfter: 0, consecutiveHits: 0 }`.
 * - getAdoThrottleState() → aggregate over every tracked org:
 *   `throttled` = OR, `retryAfter` = max, `consecutiveHits` = sum, plus a
 *   `perOrg` object keyed by canonical orgBase holding each tracker's state.
 *
 * @param {*} [orgBase] org base URL or canonical key
 * @returns {object} state snapshot
 */
const getAdoThrottleState = (orgBase) => {
  if (orgBase == null) {
    const aggregate = { throttled: false, retryAfter: 0, consecutiveHits: 0, perOrg: {} };
    _adoThrottlesByOrg.forEach((tracker, key) => {
      const snapshot = tracker.getState();
      aggregate.perOrg[key] = snapshot;
      if (snapshot.throttled) aggregate.throttled = true;
      if (snapshot.retryAfter > aggregate.retryAfter) aggregate.retryAfter = snapshot.retryAfter;
      aggregate.consecutiveHits += snapshot.consecutiveHits;
    });
    return aggregate;
  }
  const orgTracker = _adoThrottlesByOrg.get(canonicalAdoOrgKey(orgBase));
  if (!orgTracker) {
    return { throttled: false, retryAfter: 0, consecutiveHits: 0 };
  }
  return orgTracker.getState();
};
2348
+
2349
/**
 * Collect every tracked org's state into a plain object keyed by canonical
 * orgBase.
 *
 * @returns {object} `{ [orgKey]: tracker.getState() }`
 */
const getAdoThrottleStateAll = () => {
  const snapshot = {};
  _adoThrottlesByOrg.forEach((tracker, key) => {
    snapshot[key] = tracker.getState();
  });
  return snapshot;
};
2249
2357
 
2250
2358
  /**
2251
2359
  * Query ADO for an open PR on a specific branch.
@@ -2263,13 +2371,13 @@ async function findOpenPrOnBranch(project, branch) {
2263
2371
  logMissingAdoRepository(project, 'ADO branch PR lookup');
2264
2372
  return null;
2265
2373
  }
2266
- if (isAdoThrottled()) {
2267
- log('debug', `[ado] Skipping branch PR lookup for ${project.name || project.repoName || 'unknown project'}:${branch} — throttled`);
2374
+ const orgBase = shared.getAdoOrgBase(project);
2375
+ if (isAdoThrottled(orgBase)) {
2376
+ log('debug', `[ado] Skipping branch PR lookup for ${project.name || project.repoName || 'unknown project'}:${branch} — ${orgBase} throttled`);
2268
2377
  return null;
2269
2378
  }
2270
2379
  const token = await getAdoToken();
2271
2380
  if (!token) return null;
2272
- const orgBase = shared.getAdoOrgBase(project);
2273
2381
  const sourceRef = encodeURIComponent(`refs/heads/${branch}`);
2274
2382
  const url = `${orgBase}/${project.adoProject}/_apis/git/repositories/${encodeURIComponent(adoRepositoryId)}/pullrequests?searchCriteria.status=active&searchCriteria.sourceRefName=${sourceRef}&api-version=7.1`;
2275
2383
  const data = await adoFetch(url, token);
@@ -2280,14 +2388,17 @@ async function findOpenPrOnBranch(project, branch) {
2280
2388
  return { prNumber, url: prUrl };
2281
2389
  }
2282
2390
 
2283
- /** Reset throttle state — exported for testing only. */
2391
+ /** Reset throttle state — exported for testing only. Clears the entire per-org Map. */
2284
2392
  function _resetAdoThrottle() {
2285
- _adoThrottle._reset();
2393
+ _adoThrottlesByOrg.clear();
2286
2394
  }
2287
2395
 
2288
- /** Set throttle state directly — exported for testing only. */
2289
- function _setAdoThrottleForTest(state) {
2290
- _adoThrottle._setForTest(state);
2396
+ /** Set throttle state directly — exported for testing only.
2397
+ * Default orgBase keeps back-compat with arg-less callers that just want
2398
+ * "some org is throttled" semantics through isAdoThrottled() / getAdoThrottleState(). */
2399
+ function _setAdoThrottleForTest(state, orgBase = 'dev.azure.com/__test__') {
2400
+ const tracker = getAdoThrottleForOrg(orgBase);
2401
+ tracker._setForTest(state);
2291
2402
  }
2292
2403
 
2293
2404
  /** Inject a token into the cache — exported for testing only.
@@ -2476,6 +2587,7 @@ module.exports = {
2476
2587
  isAdoAuthError, // exported for testing
2477
2588
  isAdoThrottled,
2478
2589
  getAdoThrottleState,
2590
+ getAdoThrottleStateAll,
2479
2591
  fetchAdoPrMetadata,
2480
2592
  fetchSinglePrBuildStatus,
2481
2593
  findOpenPrOnBranch,
package/engine/shared.js CHANGED
@@ -2242,6 +2242,7 @@ const ENGINE_DEFAULTS = {
2242
2242
  autoFixHumanComments: true, // auto-dispatch fix agents for actionable human PR comments
2243
2243
  autoConsolidateMemory: false, // opt-in: periodically spawn engine/kb-sweep-runner.js from the tick loop (4h cadence). Inbox→notes consolidation already runs every tick via consolidateInbox; this flag only controls the KB sweep.
2244
2244
  prNoOpFixPauseAttempts: 2, // pause one PR automation cause after repeated no-op fixes for unchanged evidence
2245
+ quarantineAutoRecoveryMax: 2, // #2996 follow-up: cap on auto-flipping WORKTREE_DIRTY/WORKTREE_DIVERGENT failures back to pending (the quarantine is self-healing so the next dispatch starts clean; the cap prevents infinite loops if quarantine itself keeps failing).
2245
2246
  completionReportRetentionDays: 90, // retain completion report sidecars beyond capped dispatch history
2246
2247
  completionReportMaxFiles: 5000, // hard cap for completion report sidecars during cleanup
2247
2248
  // P-bfa2c-cors-wildcard: extra Origins permitted to receive an
@@ -2254,6 +2255,22 @@ const ENGINE_DEFAULTS = {
2254
2255
  allowedDashboardOrigins: [],
2255
2256
  meetingRoundTimeout: 900000, // 15min per meeting round — soft signal; logs a "still waiting" warning each tick
2256
2257
  meetingRoundHardTimeout: 3600000, // 60min hard backstop — non-terminal participants are marked failed and the round advances. Prevents permanent stalls if an agent's dispatch never spawns or its completion gets dropped.
2258
+ // W-mq066js7000fff1f-c (steering Gap B): max wall-clock a steering message may
2259
+ // sit deferred (runtime hasn't emitted a resumable checkpoint yet — Copilot
2260
+ // pre-first-checkpoint, etc.). Past this window the message is flagged
2261
+ // stranded: `[steering-warn]` line on live-output, `_steeringStranded=true` on
2262
+ // the active dispatch row, and the steering store (when present) marked
2263
+ // `status='stranded'`. Default 15min; clamp 60_000..14_400_000 (1 min..4 h).
2264
+ steeringDeferredMaxMs: 900000,
2265
+ // W-mq066js7000fff1f-c (steering Gap C): cap on graceful+escalation kill
2266
+ // attempts after a steering kill is issued. Ladder (between attempts): 30s →
2267
+ // 60s → 120s, last interval reused. attempt 1 = killGracefully; attempts 2..cap
2268
+ // = platform-specific hard kill (taskkill /F /T on Windows, descendant tree +
2269
+ // pkill on Unix); after cap is reached and the process is still alive, the
2270
+ // engine gives up with a `[steering-stuck]` log + non-actionable inbox notice
2271
+ // so the agent surfaces in the dashboard for operator intervention. Default 3;
2272
+ // clamp 1..5.
2273
+ steeringMaxKillRetries: 3,
2257
2274
  evalLoop: true, // enable review→fix loop after implementation completes
2258
2275
  evalMaxCost: null, // USD ceiling per work item across all eval iterations; null = no limit (gather baseline data first)
2259
2276
  maxRetries: 3, // max dispatch retries before marking work item as failed
package/engine/timeout.js CHANGED
@@ -54,8 +54,87 @@ function checkIdleThreshold(config) {
54
54
 
55
55
  // ─── Steering Checker ────────────────────────────────────────────────────────
56
56
 
57
- // How long to wait for a steered agent to exit before retrying the kill
58
- const STEERING_KILL_RETRY_MS = 30000;
57
// W-mq066js7000fff1f-c (Gap C): kill-retry escalation ladder — the waits, in
// milliseconds, between successive kill attempts after a steering kill is
// issued (30s, 60s, 120s). Lookups clamp the index to the array bounds, so the
// last interval is reused for any attempt beyond the end of the ladder.
const STEERING_KILL_INTERVALS_MS = [30, 60, 120].map((seconds) => seconds * 1000);
// Steering-store statuses that count as still "in flight"; any other status is
// treated as already terminated (delivered/dropped/etc).
const STEERING_ACTIVE_STATUSES = new Set([
  'queued',
  'live_kill',
  'deferred',
  're_spawning',
]);
65
+
66
+ // W-mq066js7000fff1f-c (Gap G): lazy require for engine/steering-store.js.
67
+ // Sibling branch -d ships the store; my branch ships independently and no-ops
68
+ // when the module is absent. Only swallows MODULE_NOT_FOUND for the exact
69
+ // resolution — real syntax/runtime errors propagate so a broken store fails
70
+ // visibly instead of silently degrading.
71
+ let _steeringStoreLoaded = false;
72
+ let _steeringStore = null;
73
+ function getSteeringStore() {
74
+ if (_steeringStoreLoaded) return _steeringStore;
75
+ _steeringStoreLoaded = true;
76
+ try {
77
+ _steeringStore = require('./steering-store');
78
+ } catch (err) {
79
+ const isMissing = err && err.code === 'MODULE_NOT_FOUND' && /steering-store/.test(String(err.message || ''));
80
+ if (!isMissing) {
81
+ try { log('warn', `Steering: failed to load engine/steering-store.js: ${err.message}`); } catch {}
82
+ }
83
+ _steeringStore = null;
84
+ }
85
+ return _steeringStore;
86
+ }
87
// Test-only: forget the cached steering-store lookup so the next
// getSteeringStore() call attempts the load again.
function _resetSteeringStoreCacheForTest() {
  _steeringStore = null;
  _steeringStoreLoaded = false;
}
89
// Test-only: install `store` as the cached steering store and mark the lookup
// as done, so the lazy require never runs. Passing `null` simulates the
// module-absent case without touching require.cache.
function _setSteeringStoreForTest(store) {
  _steeringStore = store;
  _steeringStoreLoaded = true;
}
95
+
96
/**
 * Derive a steering message id from an entry.
 *
 * A truthy `entry.id` wins (stringified). Otherwise the id is the leading
 * digit run of a `steering-<digits>…` file name, taken from `entry.path` or
 * from `entry` itself when it is a path string.
 *
 * @param {object|string|null} entry steering entry or file path
 * @returns {string|null} the id, or null when none can be derived
 */
function _steeringIdForEntry(entry) {
  if (!entry) {
    return null;
  }
  if (entry.id) {
    return String(entry.id);
  }
  const candidate = entry.path || entry;
  if (typeof candidate !== 'string') {
    return null;
  }
  const match = /^steering-(\d+)/.exec(path.basename(candidate));
  return match === null ? null : match[1];
}
104
+
105
/**
 * Invoke `store[fn](...args)` on the steering store without ever throwing.
 *
 * @param {string} fn method name on the steering store
 * @param {...*} args arguments forwarded to the method
 * @returns {*} the method's return value, or null when the store is
 *   unavailable, the method is missing, or the call threw (the failure is
 *   logged at warn level, best-effort).
 */
function _safeStoreCall(fn, ...args) {
  const store = getSteeringStore();
  const callable = Boolean(store) && typeof store[fn] === 'function';
  if (!callable) {
    return null;
  }
  try {
    return store[fn](...args);
  } catch (err) {
    try { log('warn', `Steering store ${fn} failed: ${err.message}`); } catch {}
    return null;
  }
}
114
+
115
/**
 * Resolve `engine.steeringDeferredMaxMs` to a usable millisecond value.
 *
 * @param {object} [config] engine configuration (reads `config.engine.steeringDeferredMaxMs`)
 * @returns {number} the configured value clamped to 60000..14400000
 *   (1 min..4 h); ENGINE_DEFAULTS.steeringDeferredMaxMs when the setting is
 *   missing, blank, or not a finite number.
 */
function _clampSteeringDeferredMaxMs(config) {
  const fallback = ENGINE_DEFAULTS.steeringDeferredMaxMs;
  const configured = config?.engine?.steeringDeferredMaxMs ?? fallback;
  // Number('') and Number('   ') are 0, which would silently clamp a blank
  // setting to the 60s minimum; a blank value means "unset", so use the default.
  if (typeof configured === 'string' && configured.trim() === '') return fallback;
  const raw = Number(configured);
  if (!Number.isFinite(raw)) return fallback;
  return Math.max(60000, Math.min(14400000, raw));
}
120
+
121
/**
 * Resolve `engine.steeringMaxKillRetries` to a usable attempt cap.
 *
 * @param {object} [config] engine configuration (reads `config.engine.steeringMaxKillRetries`)
 * @returns {number} the configured value rounded to an integer and clamped to
 *   1..5; ENGINE_DEFAULTS.steeringMaxKillRetries when the setting is missing,
 *   blank, or not a finite number.
 */
function _clampSteeringMaxKillRetries(config) {
  const fallback = ENGINE_DEFAULTS.steeringMaxKillRetries;
  const configured = config?.engine?.steeringMaxKillRetries ?? fallback;
  // Number('') is 0, which would silently clamp a blank setting down to a
  // single attempt; a blank value means "unset", so use the default.
  if (typeof configured === 'string' && configured.trim() === '') return fallback;
  const raw = Number(configured);
  if (!Number.isFinite(raw)) return fallback;
  return Math.max(1, Math.min(5, Math.round(raw)));
}
126
+
127
/**
 * Look up the wait before the kill attempt at `attemptIdx` (0-based).
 *
 * The index is clamped to the ladder bounds, so indexes past the end reuse the
 * last interval. A non-numeric index is treated as 0 and a fractional one is
 * floored, so the result is always a ladder entry — never `undefined`, which
 * would make a caller's `elapsed <= wait` comparison always false.
 *
 * @param {number} attemptIdx number of kill attempts already made
 * @returns {number} interval in milliseconds
 */
function _steeringKillIntervalMs(attemptIdx) {
  const lastIdx = STEERING_KILL_INTERVALS_MS.length - 1;
  const requested = Number(attemptIdx);
  const clamped = Number.isNaN(requested) ? 0 : Math.max(0, Math.min(lastIdx, requested));
  return STEERING_KILL_INTERVALS_MS[Math.floor(clamped)];
}
131
+
132
/**
 * Append one line to `<AGENTS_DIR>/<agentId>/live-output.log`, adding a
 * trailing newline when `line` lacks one. Best-effort: any failure (bad
 * arguments, missing directory, write error) is swallowed.
 *
 * @param {string} agentId agent directory name
 * @param {string} line text to append
 */
function _appendLiveOutputLine(agentId, line) {
  try {
    const text = line.endsWith('\n') ? line : `${line}\n`;
    fs.appendFileSync(path.join(AGENTS_DIR, agentId, 'live-output.log'), text);
  } catch { /* optional */ }
}
59
138
 
60
139
  function runtimeSupportsMidRunSessionId(info) {
61
140
  if (typeof info?.midRunSessionId === 'boolean') return info.midRunSessionId;
@@ -76,6 +155,16 @@ function rememberDeferredSteering(info, steerEntry) {
76
155
  const existing = new Set(Array.isArray(info._deferredSteeringFiles) ? info._deferredSteeringFiles : []);
77
156
  if (steerEntry?.path) existing.add(steerEntry.path);
78
157
  info._deferredSteeringFiles = Array.from(existing);
158
+ // Stamp per-entry deferred timestamp for the Gap B stranded-sweep. Map keyed
159
+ // by file path so multiple deferred messages each track their own clock.
160
+ if (steerEntry?.path) {
161
+ if (!info._deferredSteeringQueuedAt || typeof info._deferredSteeringQueuedAt !== 'object') {
162
+ info._deferredSteeringQueuedAt = {};
163
+ }
164
+ if (!info._deferredSteeringQueuedAt[steerEntry.path]) {
165
+ info._deferredSteeringQueuedAt[steerEntry.path] = Date.now();
166
+ }
167
+ }
79
168
  }
80
169
 
81
170
  function deferSteeringUntilCheckpoint(id, info, steerEntry) {
@@ -97,14 +186,157 @@ function deferSteeringUntilCheckpoint(id, info, steerEntry) {
97
186
  } catch { /* optional */ }
98
187
  }
99
188
 
189
// W-mq066js7000fff1f-c (Gap B): sweep for deferred steering messages that have
// waited longer than `engine.steeringDeferredMaxMs` on a process that still has
// no sessionId.
//
// Only paths still listed in `info._deferredSteeringFiles` are considered; the
// queue time comes from `info._deferredSteeringQueuedAt[path]`. For each newly
// stranded path, in order:
//   1. a `[steering-warn]` line is appended to the agent's live output,
//   2. a warn log is written,
//   3. the path is recorded in `info._deferredSteeringStrandedFiles` (a Set) so
//      later sweeps do not repeat the warning,
//   4. every active dispatch row with the matching id gets `_steeringStranded = true`,
//   5. when an id can be derived from the path, the steering store is asked to
//      mark it `stranded` (null result / no-op when the store is unavailable).
function checkDeferredStranded(activeProcesses, config) {
  const deferLimitMs = _clampSteeringDeferredMaxMs(config);
  const sweepStartedAt = Date.now();

  // Flag the matching active dispatch row(s); failures are logged, not thrown.
  const flagDispatchRow = (dispatchId) => {
    try {
      dispatch().mutateDispatch((data) => {
        for (const row of data.active || []) {
          if (row && row.id === dispatchId) row._steeringStranded = true;
        }
        return data;
      });
    } catch (err) {
      try { log('warn', `Steering: mutateDispatch _steeringStranded failed for ${dispatchId}: ${err.message}`); } catch {}
    }
  };

  for (const [dispatchId, info] of activeProcesses) {
    // A process that already has a sessionId is not considered stranded.
    const timestamps = info && !info.sessionId ? info._deferredSteeringQueuedAt : null;
    if (!timestamps || typeof timestamps !== 'object') continue;

    const pendingFiles = new Set(Array.isArray(info._deferredSteeringFiles) ? info._deferredSteeringFiles : []);
    if (pendingFiles.size === 0) continue;

    if (!info._deferredSteeringStrandedFiles) {
      info._deferredSteeringStrandedFiles = new Set();
    }
    const alreadyWarned = info._deferredSteeringStrandedFiles;

    for (const filePath of pendingFiles) {
      if (alreadyWarned.has(filePath)) continue;
      const queuedTs = Number(timestamps[filePath]);
      const expired = Number.isFinite(queuedTs) && sweepStartedAt - queuedTs > deferLimitMs;
      if (!expired) continue;

      const entryId = _steeringIdForEntry({ path: filePath });
      const ageMin = Math.round((sweepStartedAt - queuedTs) / 60000);
      _appendLiveOutputLine(
        info.agentId,
        `\n[steering-warn] Steering message ${entryId || path.basename(filePath)} has been queued for ${ageMin}m without a resumable checkpoint (no sessionId emitted). It will be delivered on the next dispatch.`,
      );
      log('warn', `Steering: ${info.agentId} (${dispatchId}) deferred steering ${entryId || filePath} stranded after ${ageMin}m — no sessionId`);
      alreadyWarned.add(filePath);
      flagDispatchRow(dispatchId);
      if (entryId) {
        _safeStoreCall('updateStatus', entryId, 'stranded', { last_error: 'no-session-after-max-defer' });
      }
    }
  }
}
231
+
232
// W-mq066js7000fff1f-c (Gap C): advance the kill-retry escalation ladder for a
// process that was steered (`info._steeringAt` set) but has not exited.
//
// State kept on `info`:
//   _steeringKillAttempts — rungs already fired (missing/invalid counts as 0)
//   _steeringLastRetryAt  — when the most recent rung fired
//   _steeringGaveUp       — set once the ladder has given up
//
// A rung is due once the interval for the current attempt count has elapsed,
// measured from `_steeringAt` before the first rung and from
// `_steeringLastRetryAt` afterwards. While attempts < cap, the first rung is a
// graceful kill and later rungs are platform escalations. Once attempts reach
// the cap, the ladder gives up: error log, `[steering-stuck]` live-output line
// and an `[engine-system]` steering message.
//
// Returns true when a rung (or the give-up) fired, false when nothing was due
// or the ladder is inactive.
function _runSteeringKillLadder(id, info, config) {
  const ladderActive = Boolean(info._steeringAt) && !info._steeringGaveUp;
  if (!ladderActive) return false;

  const maxAttempts = _clampSteeringMaxKillRetries(config);
  const attemptsSoFar = Number(info._steeringKillAttempts) || 0;
  const anchor = attemptsSoFar === 0
    ? info._steeringAt
    : (info._steeringLastRetryAt || info._steeringAt);
  const waitMs = _steeringKillIntervalMs(attemptsSoFar);
  if (Date.now() - anchor <= waitMs) return false;

  // Bookkeeping shared by every rung, including the give-up.
  const recordRung = () => {
    info._steeringKillAttempts = attemptsSoFar + 1;
    info._steeringLastRetryAt = Date.now();
  };

  if (attemptsSoFar >= maxAttempts) {
    // Cap reached: stop escalating and surface the stuck process.
    const minutes = Math.round((Date.now() - info._steeringAt) / 60000);
    log('error', `Steering: ${info.agentId} (${id}) did not exit after ${maxAttempts} kill attempts (${minutes}m since kill) — giving up`);
    _appendLiveOutputLine(
      info.agentId,
      `\n[steering-stuck] Process did not exit after ${maxAttempts} kill attempts over ~${minutes}m. The engine is giving up automatic escalation; operator intervention may be required.`,
    );
    try {
      // The body is prefixed `[engine-system]` and worded as non-actionable so
      // an agent reading it from its inbox treats it as metadata.
      steering.writeSteeringMessage(
        info.agentId,
        `[engine-system] Steering kill escalation gave up after ${maxAttempts} attempts (~${minutes}m). This is an automated notification — do not act on it; the engine has surfaced this dispatch for operator attention.`,
        { source: 'engine' },
      );
    } catch (err) {
      try { log('warn', `Steering: writeSteeringMessage for give-up failed: ${err.message}`); } catch {}
    }
    info._steeringGaveUp = true;
    recordRung();
    return true;
  }

  if (attemptsSoFar === 0) {
    log('warn', `Steering: ${info.agentId} (${id}) didn't exit ${Math.round(waitMs / 1000)}s after kill — retrying gracefully`);
    try {
      shared.killGracefully(info.proc, 5000);
    } catch (err) {
      try { log('warn', `Steering kill retry (graceful) failed for ${info.agentId}: ${err.message}`); } catch {}
    }
  } else {
    log('warn', `Steering: ${info.agentId} (${id}) survived attempt ${attemptsSoFar} — escalating (attempt ${attemptsSoFar + 1}/${maxAttempts})`);
    _escalatePlatformKill(info);
  }
  recordRung();
  return true;
}
283
+
284
/**
 * Hard-kill the process tree rooted at `info.proc.pid`. Does nothing when no
 * pid is available.
 *
 * - Windows: `taskkill /F /T /PID <pid>`.
 * - Elsewhere: SIGKILL each descendant reported by _collectDescendantPids,
 *   then `pkill -KILL -P <pid>` as a sweep for direct children, then SIGKILL
 *   the root pid itself.
 *
 * Each individual kill step is best-effort; its errors are ignored because
 * the target may already be gone.
 *
 * @param {object} info active-process record (reads `info.proc.pid`)
 */
function _escalatePlatformKill(info) {
  const pid = info?.proc?.pid;
  if (!pid) return;

  const bestEffort = (action) => {
    try { action(); } catch { /* target may already be dead */ }
  };

  if (process.platform === 'win32') {
    bestEffort(() => shared.exec(`taskkill /F /T /PID ${pid}`, { timeout: 3000 }));
    return;
  }

  // Descendants arrive in the order _collectDescendantPids returns them and
  // are killed before the final sweep and the root.
  _collectDescendantPids(pid).forEach((childPid) => {
    bestEffort(() => process.kill(childPid, 'SIGKILL'));
  });
  bestEffort(() => shared.exec(`pkill -KILL -P ${pid}`, { timeout: 3000 }));
  bestEffort(() => process.kill(pid, 'SIGKILL'));
}
302
+
303
/**
 * List the descendant PIDs of `rootPid` using `pgrep -P`.
 *
 * The tree is walked breadth-first and the discovery order is then reversed,
 * so deeper descendants come before their ancestors in the result. A pid whose
 * `pgrep` call throws (for example no children, or pgrep unavailable)
 * contributes no children. The root pid itself is not included.
 *
 * @param {number} rootPid pid whose descendants are wanted
 * @returns {number[]} descendant pids, deepest levels first
 */
function _collectDescendantPids(rootPid) {
  const seen = new Set();
  const pending = [rootPid];
  const discovered = [];

  // `pending` grows while being scanned; the cursor plays the role of a queue head.
  for (let cursor = 0; cursor < pending.length; cursor += 1) {
    const parent = pending[cursor];
    if (seen.has(parent)) continue;
    seen.add(parent);

    let listing;
    try {
      listing = String(shared.exec(`pgrep -P ${parent}`, { timeout: 2000 }) || '');
    } catch {
      /* no children or pgrep missing */
      continue;
    }

    for (const token of listing.split(/\s+/)) {
      const childPid = Number(token);
      if (!Number.isFinite(childPid) || childPid <= 0) continue;
      if (seen.has(childPid)) continue;
      discovered.push(childPid);
      pending.push(childPid);
    }
  }
  return discovered.reverse();
}
326
+
100
327
  function checkSteering(config) {
101
328
  const activeProcesses = engine().activeProcesses;
329
+ // Gap B: stranded-deferred sweep runs BEFORE the per-dispatch scan so a
330
+ // stranded info still records its warning even if it has no new inbox file.
331
+ try { checkDeferredStranded(activeProcesses, config); }
332
+ catch (err) { try { log('warn', `Steering: checkDeferredStranded failed: ${err.message}`); } catch {} }
333
+
102
334
  for (const [id, info] of activeProcesses) {
103
- // Gap A (W-mq066js7000fff1f-b): scan agents/<id>/steering-ack/ for any
104
- // ack files the agent has dropped since the last tick. Each <id>.ack
105
- // removes its matching inbox file (lookup via frontmatter steerId), so
106
- // unread/pending iteration below naturally skips messages already
107
- // acknowledged via the explicit contract.
335
+ // Gap A (W-mq066js7000fff1f-b, master): scan agents/<id>/steering-ack/
336
+ // for any ack files the agent has dropped since the last tick. Each
337
+ // <id>.ack removes its matching inbox file (lookup via frontmatter
338
+ // steerId), so unread/pending iteration below naturally skips messages
339
+ // already acknowledged via the explicit contract.
108
340
  let ackedFromDir = [];
109
341
  try {
110
342
  ackedFromDir = steering.ackSteeringFromAckDir(info.agentId);
@@ -140,20 +372,12 @@ function checkSteering(config) {
140
372
  }
141
373
  }
142
374
 
143
- // Recovery: if steering kill hasn't resulted in process exit within 30s, force-retry.
144
- // This catches cases where killImmediate silently failed (e.g., orphaned subprocess
145
- // on Unix where SIGKILL only hit spawn-agent.js, not the Claude CLI tree).
146
- if (info._steeringAt && Date.now() - info._steeringAt > STEERING_KILL_RETRY_MS) {
147
- if (!info._steeringRetried) {
148
- log('warn', `Steering: ${info.agentId} (${id}) didn't exit ${STEERING_KILL_RETRY_MS / 1000}s after kill — retrying`);
149
- shared.killImmediate(info.proc);
150
- // On Unix, also try to kill children that may have been orphaned
151
- if (process.platform !== 'win32' && info.proc?.pid) {
152
- try { shared.exec(`pkill -KILL -P ${info.proc.pid}`, { timeout: 3000 }); } catch { /* children may already be dead */ }
153
- }
154
- info._steeringRetried = true;
155
- }
156
- continue;
375
+ // Gap C (this PR): if a steering kill is in-flight, run the escalation
376
+ // ladder. The ladder owns all retry side effects and the give-up exit,
377
+ // and replaces the old one-shot STEERING_KILL_RETRY_MS retry path.
378
+ if (info._steeringAt && !info._steeringGaveUp) {
379
+ const fired = _runSteeringKillLadder(id, info, config);
380
+ if (fired) continue;
157
381
  }
158
382
 
159
383
  // Skip if already being steered (prevents double-kill race)
@@ -217,6 +441,43 @@ function checkSteering(config) {
217
441
  }
218
442
  }
219
443
 
444
+ // W-mq066js7000fff1f-c (Gap G): when engine.js#onAgentClose's "No conversation
445
+ // found" branch is about to unlink session.json, steering-store entries still in
446
+ // flight against that purged sessionId would silently strand. For every entry of agentId whose status is in STEERING_ACTIVE_STATUSES and whose session id equals sessionId, this
447
+ // helper requests `status='dropped'` (last_error 'session-purged'), writes a `[steering-failed]` live-output
448
+ // line (to liveOutputPath when given, else via _appendLiveOutputLine) so the human knows to re-send, and logs a warning. Caller MUST invoke this BEFORE the unlink.
449
+ // Returns { dropped: entryId[], skipped }. skipped is true (nothing touched) when agentId/sessionId is falsy, the steering store or its listForAgent is unavailable (-d branch unmerged), or listForAgent throws.
450
+ function dropSteeringForPurgedSession(agentId, sessionId, liveOutputPath) {
451
+ if (!agentId || !sessionId) return { dropped: [], skipped: true };
452
+ const store = getSteeringStore();
453
+ if (!store || typeof store.listForAgent !== 'function') return { dropped: [], skipped: true }; // store missing or lacks listForAgent: clean no-op
454
+ let entries = [];
455
+ try { entries = store.listForAgent(agentId) || []; }
456
+ catch (err) {
457
+ try { log('warn', `Steering: listForAgent for ${agentId} failed: ${err.message}`); } catch {}
458
+ return { dropped: [], skipped: true };
459
+ }
460
+ const dropped = [];
461
+ for (const entry of entries) {
462
+ if (!entry || !STEERING_ACTIVE_STATUSES.has(String(entry.status || ''))) continue; // only entries whose status is in the active set are candidates
463
+ const entrySessionId = entry._steeringSessionId || entry.sessionId || entry.session_id; // first truthy of the three field spellings an entry may carry
464
+ if (entrySessionId !== sessionId) continue;
465
+ const entryId = entry.id || _steeringIdForEntry(entry);
466
+ if (!entryId) continue;
467
+ _safeStoreCall('updateStatus', entryId, 'dropped', { last_error: 'session-purged' }); // return value is not checked here; NOTE(review): presumably swallows store errors — confirm
468
+ const line = `\n[steering-failed] Session ${sessionId} was purged by runtime; message ${entryId} dropped, please re-send.\n`;
469
+ if (liveOutputPath) {
470
+ try { fs.appendFileSync(liveOutputPath, line); }
471
+ catch { /* optional: a failed live-output write must not stop the drop */ }
472
+ } else {
473
+ _appendLiveOutputLine(agentId, line);
474
+ }
475
+ log('warn', `Steering: dropped message ${entryId} for ${agentId} (session ${sessionId} purged)`);
476
+ dropped.push(entryId);
477
+ }
478
+ return { dropped, skipped: false };
479
+ }
480
+
220
481
  // ─── Timeout Checker ─────────────────────────────────────────────────────────
221
482
 
222
483
  function trackedProcessPid(procInfo) {
@@ -676,7 +937,11 @@ module.exports = {
676
937
  checkTimeouts,
677
938
  checkSteering,
678
939
  checkIdleThreshold,
940
+ dropSteeringForPurgedSession,
679
941
  isOsPidAliveForDispatch,
680
942
  parseProcessExitCode, terminalResultIndicatesError, parseTerminalResultFallbackExitCode, // exported for testing
681
943
  readFileTail, runtimeSupportsMidRunSessionId, // exported for testing
944
+ // exported for testing
945
+ rememberDeferredSteering, checkDeferredStranded, _runSteeringKillLadder, _collectDescendantPids,
946
+ _resetSteeringStoreCacheForTest, _setSteeringStoreForTest,
682
947
  };
package/engine.js CHANGED
@@ -123,7 +123,7 @@ const { mutateDispatch, addToDispatch, addToDispatchWithValidation, isRetryableF
123
123
 
124
124
  // ─── Timeout / Steering / Idle (extracted to engine/timeout.js) ──────────────
125
125
 
126
- const { checkTimeouts, checkSteering, checkIdleThreshold } = require('./engine/timeout');
126
+ const { checkTimeouts, checkSteering, checkIdleThreshold, dropSteeringForPurgedSession } = require('./engine/timeout');
127
127
  const steering = require('./engine/steering');
128
128
 
129
129
  // ─── Cleanup (extracted to engine/cleanup.js) ────────────────────────────────
@@ -528,6 +528,11 @@ function promoteCheckpointSteeringForClose(agentId, procInfo, runtime, liveOutpu
528
528
  const checkpointEntries = mergePendingSteeringEntries(pendingDeferred, lateCheckpoint);
529
529
  if (checkpointEntries.length === 0) {
530
530
  delete procInfo._deferredSteeringFiles;
531
+ // Gap B housekeeping: drop the per-entry deferred timestamps + stranded
532
+ // guard so a future spawn under the same procInfo (test harnesses + engine
533
+ // re-attach) starts with a clean slate.
534
+ delete procInfo._deferredSteeringQueuedAt;
535
+ delete procInfo._deferredSteeringStrandedFiles;
531
536
  return { status: 'none', entries: [] };
532
537
  }
533
538
 
@@ -548,6 +553,8 @@ function promoteCheckpointSteeringForClose(agentId, procInfo, runtime, liveOutpu
548
553
  procInfo._steeringEntry = checkpointEntries;
549
554
  procInfo._steeringDeferredCheckpoint = true;
550
555
  delete procInfo._deferredSteeringFiles;
556
+ delete procInfo._deferredSteeringQueuedAt;
557
+ delete procInfo._deferredSteeringStrandedFiles;
551
558
  // W-mq066js7000fff1f-a (Gap D): transition each promoted entry to
552
559
  // 're_spawning' — captures that the engine has committed to deliver
553
560
  // these messages via session resume at the natural checkpoint.
@@ -2683,9 +2690,14 @@ async function spawnAgent(dispatchItem, config) {
2683
2690
  updateAgentStatus(id, code === 0 ? AGENT_STATUS.FINISHED : AGENT_STATUS.FAILED,
2684
2691
  code === 0 ? 'Agent completed successfully' : `Agent exited with code ${code}`);
2685
2692
 
2686
- // Clear stale session if resume failed — prevents burning all retries on the same bad session
2693
+ // Clear stale session if resume failed — prevents burning all retries on the same bad session.
2694
+ // W-mq066js7000fff1f-c (Gap G): drop any in-flight steering-store entries
2695
+ // bound to the purged session BEFORE unlinking session.json, so the human
2696
+ // sees a [steering-failed] line + can re-send instead of silently watching
2697
+ // their message strand against a dead session.
2687
2698
  if (code !== 0 && cachedSessionId && stderr.includes('No conversation found')) {
2688
2699
  log('warn', `Stale session ${cachedSessionId} for ${agentId} — clearing session.json`);
2700
+ try { dropSteeringForPurgedSession(agentId, cachedSessionId, liveOutputPath); } catch {}
2689
2701
  try { shared.safeUnlink(path.join(AGENTS_DIR, agentId, 'session.json')); } catch {}
2690
2702
  }
2691
2703
 
@@ -5807,6 +5819,52 @@ function discoverFromWorkItems(config, project) {
5807
5819
  }
5808
5820
  }
5809
5821
 
5822
+ // #2996 follow-up: auto-recover from WORKTREE_DIRTY / WORKTREE_DIVERGENT quarantines.
5823
+ // The quarantine is self-healing (dir renamed away, ref backed up, branch reset to
5824
+ // origin), so the next dispatch starts from a clean worktree. Auto-flip to pending up
5825
+ // to ENGINE_DEFAULTS.quarantineAutoRecoveryMax times so WIs without downstream deps and
5826
+ // no human watching don't rot in `failed`. _failureClass is stamped on the WI only by
5827
+ // the force-demote path (engine/dispatch.js); the standard non-retryable path leaves
5828
+ // it on the dispatch record but embeds the class name in failReason — so detection
5829
+ // covers both shapes.
5830
+ if (item.status === WI_STATUS.FAILED && !isItemCompleted(item)) {
5831
+ const fr = String(item.failReason || '');
5832
+ const isQuarantineFail = item._failureClass === FAILURE_CLASS.WORKTREE_DIRTY
5833
+ || item._failureClass === FAILURE_CLASS.WORKTREE_DIVERGENT
5834
+ || /\bWORKTREE_DIRTY\b/.test(fr)
5835
+ || /\bWORKTREE_DIVERGENT\b/.test(fr);
5836
+ if (isQuarantineFail) {
5837
+ item._quarantineRecoveryCount = (item._quarantineRecoveryCount || 0) + 1;
5838
+ const cap = ENGINE_DEFAULTS.quarantineAutoRecoveryMax || 2;
5839
+ if (item._quarantineRecoveryCount <= cap) {
5840
+ const prevReason = item.failReason;
5841
+ item.status = WI_STATUS.PENDING;
5842
+ delete item.failReason;
5843
+ delete item.failedAt;
5844
+ delete item._failureClass;
5845
+ delete item._lastDispatchResult;
5846
+ delete item.dispatched_at;
5847
+ delete item.dispatched_to;
5848
+ delete item._pendingReason;
5849
+ log('info', `Quarantine auto-recovery: ${item.id} → pending (attempt ${item._quarantineRecoveryCount}/${cap}, was: ${prevReason})`);
5850
+ needsWrite = true;
5851
+ // Intentionally do NOT `continue` — fall through to the normal dispatch path
5852
+ // (self-heal of stale completed dedupe + cooldown + dispatch evaluation) just
5853
+ // like the dependency-failed recovery block above. The dedupe + cooldown gates
5854
+ // still throttle how often the next dispatch fires.
5855
+ } else {
5856
+ // Cap hit: leave failed, log once via the gave-up flag so the warn doesn't
5857
+ // fire every tick. _retriesByAgent is untouched — quarantine failures are
5858
+ // environmental, not agent failures.
5859
+ if (!item._quarantineRecoveryGaveUp) {
5860
+ log('warn', `Quarantine auto-recovery: ${item.id} hit cap (${cap}) — leaving failed for human intervention`);
5861
+ item._quarantineRecoveryGaveUp = true;
5862
+ needsWrite = true;
5863
+ }
5864
+ }
5865
+ }
5866
+ }
5867
+
5810
5868
  if (item.status !== WI_STATUS.QUEUED && item.status !== WI_STATUS.PENDING) continue;
5811
5869
 
5812
5870
  // Dependency gate: skip items whose depends_on are not yet met; propagate failure
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.2120",
3
+ "version": "0.1.2122",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"