@yemi33/minions 0.1.2213 → 0.1.2215

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/minions.js CHANGED
@@ -58,6 +58,7 @@
58
58
  const fs = require('fs');
59
59
  const path = require('path');
60
60
  const os = require('os');
61
+ const net = require('net');
61
62
  const { spawn, spawnSync, execSync } = require('child_process');
62
63
 
63
64
  const PKG_ROOT = path.resolve(__dirname, '..');
@@ -136,6 +137,24 @@ function killByPort(port) {
136
137
 
137
138
  const isPortListening = (port) => getListeningPids(port).length > 0;
138
139
 
140
/**
 * Authoritative "is something accepting TCP connections on this port?" probe.
 *
 * Resolves true iff a connection to 127.0.0.1:port completes within timeoutMs.
 * Used by the watchdog to overturn a transient `down` verdict from the
 * netstat-based isPortListening before restarting a live dashboard.
 *
 * Never throws and never rejects: any error, timeout, or invalid argument
 * (e.g. an out-of-range port, which makes net.connect throw synchronously)
 * resolves false.
 *
 * @param {number|string} port - TCP port to probe on the loopback interface.
 * @param {number} [timeoutMs=1500] - Socket timeout before giving up.
 * @returns {Promise<boolean>} true when the connection was accepted.
 */
function tcpPortAccepts(port, timeoutMs = 1500) {
  return new Promise((resolve) => {
    let socket = null;
    let settled = false;
    const done = (val) => {
      if (settled) return;
      settled = true;
      // The socket may not exist yet if net.connect itself threw.
      if (socket) {
        try { socket.destroy(); } catch { /* best-effort cleanup */ }
      }
      resolve(val);
    };
    try {
      socket = net.connect({ host: '127.0.0.1', port: Number(port) });
      socket.once('connect', () => done(true));
      socket.once('error', () => done(false));
      socket.setTimeout(timeoutMs, () => done(false));
    } catch {
      // Synchronous argument validation failure (bad port / bad timeout):
      // honour the "resolves false on any error" contract instead of rejecting.
      done(false);
    }
  });
}
157
+
139
158
  /**
140
159
  * Wait until no process is listening on `port`, retrying a kill on each tick
141
160
  * for any stragglers that re-appeared (e.g. orphan child the original kill
@@ -1483,6 +1502,7 @@ ${fs.existsSync(path.join(PKG_ROOT, '.git')) ? `
1483
1502
  dashPort: DASHBOARD_PORT,
1484
1503
  readEnginePid,
1485
1504
  isPortListening,
1505
+ confirmPortUp: tcpPortAccepts,
1486
1506
  isStopIntentSet: shared.isStopIntentSet || (() => false),
1487
1507
  }).then(result => {
1488
1508
  // ALWAYS exit 0 — the scheduler must never see a failure for the
package/dashboard.js CHANGED
@@ -4672,6 +4672,7 @@ async function ccCallStreaming(message, { store = 'cc', sessionKey, extraContext
4672
4672
  engineConfig: CONFIG.engine,
4673
4673
  onChunk,
4674
4674
  onToolUse,
4675
+ onToolUpdate,
4675
4676
  });
4676
4677
  if (onAbortReady) onAbortReady(p1.abort);
4677
4678
  result = await p1;
@@ -4710,6 +4711,7 @@ async function ccCallStreaming(message, { store = 'cc', sessionKey, extraContext
4710
4711
  engineConfig: CONFIG.engine,
4711
4712
  onChunk,
4712
4713
  onToolUse,
4714
+ onToolUpdate,
4713
4715
  });
4714
4716
  if (onAbortReady) onAbortReady(p2.abort);
4715
4717
  result = await p2;
@@ -4730,6 +4732,7 @@ async function ccCallStreaming(message, { store = 'cc', sessionKey, extraContext
4730
4732
  engineConfig: CONFIG.engine,
4731
4733
  onChunk,
4732
4734
  onToolUse,
4735
+ onToolUpdate,
4733
4736
  });
4734
4737
  if (onAbortReady) onAbortReady(p3.abort);
4735
4738
  result = await p3;
@@ -9255,12 +9258,22 @@ What would you like to discuss or change? When you're happy, say "approve" and I
9255
9258
  liveState.text = text;
9256
9259
  if (liveState.writer) liveState.writer({ type: 'chunk', text, segmentId });
9257
9260
  },
9258
- onToolUse: (name, input) => {
9261
+ onToolUse: (name, input, id) => {
9259
9262
  _touchCcLiveStream(liveState);
9260
- const entry = { name, input: input || {}, id: null, status: null };
9263
+ // Claude (direct) now threads a tool_use id; with an id the chip starts
9264
+ // 'pending' and can be flipped to completed/failed by onToolUpdate.
9265
+ const entry = { name, input: input || {}, id: id || null, status: id ? 'pending' : null };
9261
9266
  toolUses.push(entry);
9262
9267
  liveState.tools.push(entry);
9263
- if (liveState.writer) liveState.writer({ type: 'tool', name, input: _lightToolInput(input), id: null });
9268
+ if (liveState.writer) liveState.writer({ type: 'tool', name, input: _lightToolInput(input), id: id || null });
9269
+ },
9270
+ onToolUpdate: (id, status) => {
9271
+ _touchCcLiveStream(liveState);
9272
+ // toolUses and liveState.tools share entry references — patch once so a
9273
+ // reconnect replays the resolved (green/red) state, not just pending.
9274
+ const entry = toolUses.find((e) => e && e.id === id);
9275
+ if (entry) entry.status = status;
9276
+ if (liveState.writer) liveState.writer({ type: 'tool-update', id, status });
9264
9277
  },
9265
9278
  });
9266
9279
  }
@@ -9673,7 +9686,12 @@ What would you like to discuss or change? When you're happy, say "approve" and I
9673
9686
  reconnectDone();
9674
9687
  });
9675
9688
  for (const tool of live.tools || []) {
9676
- writeCcEvent({ type: 'tool', name: tool.name, input: _lightToolInput(tool.input) });
9689
+ writeCcEvent({ type: 'tool', name: tool.name, input: _lightToolInput(tool.input), id: tool.id || null });
9690
+ // Replay the resolved chip state (green check / red X) for tools that
9691
+ // already finished, so a reconnect doesn't stick on the pending dot.
9692
+ if (tool.id && tool.status && tool.status !== 'pending') {
9693
+ writeCcEvent({ type: 'tool-update', id: tool.id, status: tool.status });
9694
+ }
9677
9695
  }
9678
9696
  if (live.text) writeCcEvent({ type: 'chunk', text: live.text });
9679
9697
  if (live.donePayload) {
@@ -10891,6 +10909,28 @@ What would you like to discuss or change? When you're happy, say "approve" and I
10891
10909
  } catch (e) { return jsonReply(res, e.statusCode || 500, { error: e.message }); }
10892
10910
  }
10893
10911
 
10912
+ // W-mqidhwcc000m897e — read-only ADO token-acquisition health snapshot.
10913
+ // Surfaces the graduated-backoff + transient/persistent classification state
10914
+ // so an operator can diagnose "ADO polling paused" from one place:
10915
+ // { token: { lastSuccessAt, lastFailureReason, classification,
10916
+ // backoffUntil, consecutiveFailures } }.
10917
/**
 * GET handler: replies 200 with `{ token }`, a normalized ADO token-health
 * snapshot. When `ado.getAdoTokenHealth` is not a function, every field
 * falls back to its zero/null default. Errors are reported as
 * `{ error }` with the error's statusCode (or 500).
 */
async function handleDiagnosticsAdoToken(req, res) {
  try {
    const hasProbe = typeof ado.getAdoTokenHealth === 'function';
    const health = hasProbe ? (ado.getAdoTokenHealth() || {}) : {};
    const asNumber = (value) => Number(value) || 0;
    const asNullable = (value) => value || null;
    const token = {
      lastSuccessAt: asNumber(health.lastSuccessAt),
      lastFailureReason: asNullable(health.lastFailureReason),
      classification: asNullable(health.classification),
      backoffUntil: asNumber(health.backoffUntil),
      consecutiveFailures: asNumber(health.consecutiveFailures),
    };
    return jsonReply(res, 200, { token });
  } catch (e) {
    return jsonReply(res, e.statusCode || 500, { error: e.message });
  }
}
10933
+
10894
10934
  // P-c3d4e5f6 — /api/diagnostics/memory.
10895
10935
  // Returns the latest in-process dashboard sample, the latest engine
10896
10936
  // sample (read fresh from engine/diagnostics-memory.json via safeJsonObj),
@@ -13633,6 +13673,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
13633
13673
  { method: 'POST', path: '/api/diagnostics/refresh', desc: 'Append a dashboard refresh-diagnostic ring buffer batch to engine/dashboard-diagnostics.log (rotated at 1 MB)', params: 'entries[]', handler: handleDiagnosticsRefresh },
13634
13674
  // Diagnostics — per-org ADO throttle state (W-mq03l6zh0006f0a1-d).
13635
13675
  { method: 'GET', path: '/api/diagnostics/ado-throttle', desc: 'Snapshot of per-org ADO throttle tracker state — { orgs: { [orgBase]: { throttled, retryAfter, consecutiveHits } } }. Falls back to a single `global` key when running against pre-per-org engines.', handler: handleDiagnosticsAdoThrottle },
13676
+ { method: 'GET', path: '/api/diagnostics/ado-token', desc: 'Read-only ADO token-acquisition health — { token: { lastSuccessAt, lastFailureReason, classification, backoffUntil, consecutiveFailures } }. Surfaces the graduated-backoff + transient/persistent classification state (W-mqidhwcc000m897e) so an operator can diagnose paused ADO polling from one line.', handler: handleDiagnosticsAdoToken },
13636
13677
  // Diagnostics — engine + dashboard memory baseline (P-c3d4e5f6).
13637
13678
  { method: 'GET', path: '/api/diagnostics/memory', desc: 'Latest in-process dashboard memory sample plus the most-recent engine sample read from engine/diagnostics-memory.json. engineStale=true when the sidecar is missing or its capturedAt is > 5 min old.', handler: handleDiagnosticsMemory },
13638
13679
  { method: 'GET', path: '/api/diagnostics/memory/history', desc: 'In-memory ring buffer of memory samples. process=dashboard returns the dashboard\'s own collector (populated by startPeriodicSampling on boot). process=engine returns the dashboard\'s polled accumulation of engine/diagnostics-memory.json — engine.js only persists the latest sample to the sidecar, so engine-side history is rebuilt by the dashboard poller (dedup by capturedAt). Optional limit caps returned newest-N samples.', params: 'process (engine|dashboard), limit?', handler: handleDiagnosticsMemoryHistory },
package/engine/ado.js CHANGED
@@ -6,7 +6,7 @@
6
6
  const path = require('path');
7
7
  const childProcess = require('child_process');
8
8
  const shared = require('./shared');
9
- const { exec, execAsync, getAdoOrgBase, log, ts, dateStamp, PR_STATUS, BUILD_STATUS, REVIEW_STATUS, FETCH_TIMEOUT_MS, ADO_TOKEN_REFRESH_MAX_RETRIES, createThrottleTracker } = shared;
9
+ const { exec, execAsync, getAdoOrgBase, log, ts, dateStamp, PR_STATUS, BUILD_STATUS, REVIEW_STATUS, FETCH_TIMEOUT_MS, ADO_TOKEN_REFRESH_MAX_RETRIES, createThrottleTracker, createBackoffTracker, writeToInbox } = shared;
10
10
  const { getPrs } = require('./queries');
11
11
  const { mutateJsonFileLocked } = shared;
12
12
  const { acquireAdoToken } = require('./ado-token');
@@ -714,7 +714,118 @@ function _hasPendingReReviewWi(pr) {
714
714
  // ─── ADO Token Cache ─────────────────────────────────────────────────────────
715
715
 
716
716
  let _adoTokenCache = { token: null, expiresAt: 0 };
717
- let _adoTokenFailedUntil = 0; // backoff: skip token acquisition calls until this timestamp
717
+
718
+ // ── ADO token acquisition backoff + health (W-mqidhwcc000m897e) ──────────────
719
+ // Graduated backoff replaces the old flat 10-min blackout: a single transient
720
+ // blip (broker hiccup, az cold-start, network) now backs off 30s and doubles
721
+ // per consecutive failure, capped at 10m, resetting to zero on the next
722
+ // success. Distinct from the per-org throttle tracker (server-driven 429/503).
723
+ const ADO_TOKEN_BACKOFF_BASE_MS = 30000;
724
+ const ADO_TOKEN_BACKOFF_MAX_MS = 10 * 60 * 1000;
725
+ let _adoTokenBackoff = createBackoffTracker({ baseMs: ADO_TOKEN_BACKOFF_BASE_MS, maxMs: ADO_TOKEN_BACKOFF_MAX_MS });
726
+
727
+ // Read-model for the token health diagnostic + reason-carrying skip warn.
728
+ // Only the three fields the backoff tracker does NOT own live here; backoffUntil
729
+ // and consecutiveFailures are read from `_adoTokenBackoff` on demand in
730
+ // getAdoTokenHealth() so there is a single source of truth.
731
+ let _adoTokenHealth = {
732
+ lastSuccessAt: 0,
733
+ lastFailureReason: null,
734
+ classification: null, // 'transient' | 'persistent' | null
735
+ };
736
+
737
+ // Dedup latch for the persistent-auth operator inbox note. Set when a note is
738
+ // emitted; cleared on the next successful acquisition so a fresh episode
739
+ // (after a human runs `az login`) can surface a new note.
740
+ let _adoTokenAuthNoteSent = false;
741
+ // Backoff-window the last no-token skip warn fired for. The skip warn logs on
742
+ // the STATE TRANSITION into a new no-token window (each real acquisition
743
+ // failure mints a new backoffUntil) and stays silent across the suppressed
744
+ // poll/reconcile cycles within that same window.
745
+ let _adoNoTokenWarnedBackoffUntil = -1;
746
+
747
+ // Persistent (human-actionable) acquisition errors: az not logged in, no
748
+ // subscription, azureauth interactive/broker consent required, missing tooling.
749
+ // Everything else (timeout, ECONN*, broker busy, generic failure) is transient.
750
const _ADO_TOKEN_PERSISTENT_RE = /\baz login\b|run ['"]?az login|az account set|no subscription(?:\s+found)?|not logged in|AADSTS\d+|interaction[_ ]required|consent[_ ]required|interactive authentication|reauthenticat|invalid_grant|is not recognized|command not found/i;

/**
 * Classify an acquisition error as 'persistent' (human-actionable, needs az
 * login / setup) or 'transient' (retry will likely self-heal).
 *
 * Scans the top-level error text plus every per-provider attempt error.
 * Accepts Error objects, plain `{ message }` objects, and bare strings at
 * both levels, so a thrown string or an object-shaped attempt error is still
 * inspected. Defaults to 'transient' so the operator is never nagged on a
 * recoverable blip.
 *
 * @param {*} err - The thrown value; may carry an `attempts` array of `{ error }`.
 * @returns {'persistent'|'transient'}
 */
function classifyAdoTokenError(err) {
  const parts = [typeof err === 'string' ? err : (err?.message || '')];
  if (Array.isArray(err?.attempts)) {
    for (const attempt of err.attempts) {
      const detail = attempt?.error;
      if (detail && typeof detail === 'object' && typeof detail.message === 'string') {
        parts.push(detail.message);
      } else {
        parts.push(String(detail || ''));
      }
    }
  }
  const haystack = parts.join(' | ');
  return _ADO_TOKEN_PERSISTENT_RE.test(haystack) ? 'persistent' : 'transient';
}
764
+
765
/**
 * Short, single-line reason for the health read-model + skip-warn log.
 * Uses the error's `message` (or the value itself when a bare string was
 * thrown), collapses all whitespace runs to single spaces, trims, and caps
 * the result at 240 characters. Falls back to 'unknown error'.
 *
 * @param {*} err - The thrown value.
 * @returns {string}
 */
function _summarizeAdoTokenError(err) {
  const raw = (typeof err === 'string' && err) ? err : (err?.message || 'unknown error');
  return String(raw).replace(/\s+/g, ' ').trim().slice(0, 240);
}
769
+
770
+ // Overridable note writer (test seam). Default routes through the shared
771
+ // date-deduped inbox writer so at most one note lands per day even if the
772
+ // in-memory latch is reset.
773
+ let _adoTokenAuthNoteWriter = null;
774
/**
 * Default persistent-auth note writer: builds the operator-facing markdown
 * note (heading, explanation, reason + remedy, dedup notice) and hands it to
 * writeToInbox under the 'engine' / 'ado-token-auth-paused' keys.
 *
 * @param {string} reason - Failure summary; rendered as 'unknown' when falsy.
 * @returns {*} Whatever writeToInbox returns.
 */
function _defaultAdoTokenAuthNoteWriter(reason) {
  const heading = '# ADO auth needs `az login` — PR polling/reconciliation paused';
  const explanation = [
    'ADO token acquisition is failing in a way that will NOT self-heal without a',
    'human action (e.g. `az login`, selecting a subscription, or completing an',
    'interactive azureauth consent). Engine PR status polling + reconciliation for',
    'ADO repos are paused on a graduated backoff until a token can be minted again.',
  ];
  const details = [
    `- reason: ${reason || 'unknown'}`,
    '- remedy: run `az login` (and `az account set --subscription <id>` if prompted)',
    ' on the engine host, then the next successful acquisition auto-resumes polling.',
  ];
  const dedupNotice = [
    'This note is deduped — it will not repeat every cycle. A fresh note is allowed',
    'after the next successful token acquisition.',
  ];
  // Sections are separated by a blank line, exactly one '' entry between each.
  const lines = [heading, '', ...explanation, '', ...details, '', ...dedupNotice];
  return writeToInbox('engine', 'ado-token-auth-paused', lines.join('\n'));
}
792
/**
 * Best-effort emit of the persistent-auth operator note via the overridden
 * writer (test seam) or the default one. Never throws: a synchronous failure
 * is logged and reported as false. If the writer returns a thenable, a
 * rejection handler is attached so a failing async writer cannot surface as
 * an unhandled rejection (the caller does not await this function).
 *
 * @param {string} reason - Failure summary forwarded to the writer.
 * @returns {*} The writer's result (or a promise of it); false on failure.
 */
function _emitAdoTokenAuthNote(reason) {
  const warnFailure = (e) => {
    try { log('warn', `Failed to write ADO token auth inbox note: ${e?.message ?? e}`); } catch { /* best-effort */ }
  };
  try {
    const result = (_adoTokenAuthNoteWriter || _defaultAdoTokenAuthNoteWriter)(reason);
    if (result && typeof result.then === 'function') {
      return result.then((value) => value, (e) => { warnFailure(e); return false; });
    }
    return result;
  } catch (e) {
    warnFailure(e);
    return false;
  }
}
800
+
801
/**
 * Read-only token-health snapshot for diagnostics. Merges the three fields
 * held in `_adoTokenHealth` with the backoff tracker's current
 * `backoffUntil` / `consecutiveFailures`.
 */
function getAdoTokenHealth() {
  const { backoffUntil, consecutiveFailures } = _adoTokenBackoff.getState();
  const { lastSuccessAt, lastFailureReason, classification } = _adoTokenHealth;
  return { lastSuccessAt, lastFailureReason, classification, backoffUntil, consecutiveFailures };
}
812
+
813
/**
 * Consume the one-shot no-token warning.
 *
 * `shouldWarn` is true only when the tracker's current backoffUntil differs
 * from the window last warned about; in that case the latch is advanced to
 * the current window so repeated calls within it stay silent. The result also
 * carries the last failure reason, its classification, and backoffUntil so
 * the caller can log one diagnosable line.
 */
function consumeNoAdoTokenWarning() {
  const { backoffUntil } = _adoTokenBackoff.getState();
  const isNewWindow = _adoNoTokenWarnedBackoffUntil !== backoffUntil;
  if (isNewWindow) {
    _adoNoTokenWarnedBackoffUntil = backoffUntil;
  }
  const { lastFailureReason: reason, classification } = _adoTokenHealth;
  return { shouldWarn: isNewWindow, reason, classification, backoffUntil };
}
718
829
 
719
830
  // ─── ADO Throttle State (per-org) ───────────────────────────────────────────
720
831
  // Tracks rate-limiting (HTTP 429/503) from ADO API responses, isolated per ADO
@@ -793,23 +904,36 @@ function isAdoAuthError(err) {
793
904
  return msg.includes('auth redirect') || msg.includes('HTML instead of JSON') || /ADO API (401|403)/.test(msg);
794
905
  }
795
906
 
796
- async function getAdoToken() {
907
/**
 * Return a usable ADO token, or null when none can be obtained right now.
 *
 * Order of checks: (1) serve the cached token while it has not expired;
 * (2) return null without attempting acquisition while the graduated backoff
 * window is open; (3) acquire a fresh token and cache it for 30 minutes.
 *
 * On failure the error is classified and summarized, the backoff tracker is
 * advanced, the health read-model is updated, and, for the first
 * 'persistent' failure of an episode, the operator inbox note is emitted.
 * An empty/falsy token from the provider is treated as a failure so it is
 * never cached or recorded as a success (which would bypass the backoff).
 *
 * @param {{ _acquire?: Function }} [opts] - `_acquire` is a test seam;
 *   production callers pass no args.
 * @returns {Promise<string|null>} The token, or null when unavailable.
 */
async function getAdoToken(opts = {}) {
  const acquire = opts?._acquire || acquireAdoToken;
  if (_adoTokenCache.token && Date.now() < _adoTokenCache.expiresAt) {
    return _adoTokenCache.token;
  }
  // Graduated backoff: skip acquisition (return null) while backing off, so a
  // transient blip doesn't black out polling/reconciliation for the full cap.
  if (_adoTokenBackoff.isBackingOff()) return null;
  try {
    const { token } = await acquire({ execAsync, timeout: 15000 });
    if (!token) throw new Error('token provider returned an empty token');
    _adoTokenCache = { token, expiresAt: Date.now() + 30 * 60 * 1000 };
    _adoTokenBackoff.recordSuccess();
    _adoTokenHealth = { lastSuccessAt: Date.now(), lastFailureReason: null, classification: null };
    _adoTokenAuthNoteSent = false; // allow a fresh persistent note next episode
    _adoNoTokenWarnedBackoffUntil = -1; // allow a fresh skip warn next episode
    return token;
  } catch (e) {
    const classification = classifyAdoTokenError(e);
    const reason = _summarizeAdoTokenError(e);
    const { delayMs, consecutiveFailures } = _adoTokenBackoff.recordFailure();
    _adoTokenHealth.lastFailureReason = reason;
    _adoTokenHealth.classification = classification;
    log('warn', `Failed to get ADO token (${classification}, attempt ${consecutiveFailures}): ${e.message} — backing off ${Math.round(delayMs / 1000)}s`);
    if (classification === 'persistent' && !_adoTokenAuthNoteSent) {
      // Latch before emitting so a failing writer cannot cause repeat notes.
      _adoTokenAuthNoteSent = true;
      _emitAdoTokenAuthNote(reason);
    }
    return null;
  }
}
814
938
 
815
939
  async function adoFetch(url, token, opts = {}) {
@@ -1188,8 +1312,11 @@ async function pollPrStatus(config) {
1188
1312
 
1189
1313
  const token = await getAdoToken();
1190
1314
  if (!token) {
1191
- log('warn', 'Skipping PR status poll — no ADO token available');
1192
- // Don't set _adoPollHadAuthFailure — getAdoToken() has its own 10-min backoff,
1315
+ const w = consumeNoAdoTokenWarning();
1316
+ if (w.shouldWarn) {
1317
+ log('warn', `Skipping PR status poll — no ADO token (${w.classification || 'unknown'}: ${w.reason || 'unknown'}); backing off until ${w.backoffUntil ? new Date(w.backoffUntil).toISOString() : 'unknown'}`);
1318
+ }
1319
+ // Don't set _adoPollHadAuthFailure — getAdoToken() has its own graduated backoff,
1193
1320
  // and setting the flag would hammer pollPrStatus() every tick with no useful work.
1194
1321
  return;
1195
1322
  }
@@ -1953,7 +2080,10 @@ async function pollPrHumanComments(config) {
1953
2080
  async function reconcilePrs(config) {
1954
2081
  const token = await getAdoToken();
1955
2082
  if (!token) {
1956
- log('warn', 'Skipping PR reconciliation — no ADO token available');
2083
+ const w = consumeNoAdoTokenWarning();
2084
+ if (w.shouldWarn) {
2085
+ log('warn', `Skipping PR reconciliation — no ADO token (${w.classification || 'unknown'}: ${w.reason || 'unknown'}); backing off until ${w.backoffUntil ? new Date(w.backoffUntil).toISOString() : 'unknown'}`);
2086
+ }
1957
2087
  return;
1958
2088
  }
1959
2089
 
@@ -2568,17 +2698,40 @@ function _setAdoThrottleForTest(state, orgBase = 'dev.azure.com/__test__') {
2568
2698
  * Pass null to force getAdoToken() to return null synchronously (no exec). */
2569
2699
  function _setAdoTokenForTest(token) {
2570
2700
  if (token == null) {
2571
- // Clear cache AND set a future failure backoff so getAdoToken short-circuits
2701
+ // Clear cache AND open a long backoff window so getAdoToken short-circuits
2572
2702
  // to null without spawning azureauth — otherwise tests would hang on the
2573
2703
  // 15s execAsync timeout or open a real auth popup.
2574
2704
  _adoTokenCache = { token: null, expiresAt: 0 };
2575
- _adoTokenFailedUntil = Date.now() + 60 * 60 * 1000;
2705
+ _adoTokenBackoff._setForTest({ consecutiveFailures: 1, backoffUntil: Date.now() + 60 * 60 * 1000 });
2576
2706
  } else {
2577
2707
  _adoTokenCache = { token, expiresAt: Date.now() + 30 * 60 * 1000 };
2578
- _adoTokenFailedUntil = 0;
2708
+ _adoTokenBackoff.recordSuccess();
2709
+ _adoTokenHealth = { lastSuccessAt: Date.now(), lastFailureReason: null, classification: null };
2710
+ _adoTokenAuthNoteSent = false;
2711
+ _adoNoTokenWarnedBackoffUntil = -1;
2579
2712
  }
2580
2713
  }
2581
2714
 
2715
/**
 * Test-only: return the token cache, backoff tracker, health read-model and
 * both dedup latches to their initial values.
 */
function _resetAdoTokenForTest() {
  _adoTokenBackoff._reset();
  _adoTokenCache = { token: null, expiresAt: 0 };
  _adoTokenHealth = { lastSuccessAt: 0, lastFailureReason: null, classification: null };
  _adoNoTokenWarnedBackoffUntil = -1;
  _adoTokenAuthNoteSent = false;
}
2723
+
2724
/**
 * Test-only: forward `overrides` (or an empty object when falsy) to the
 * token backoff tracker's `_setForTest`.
 */
function _setAdoTokenBackoffForTest(overrides) {
  const patch = overrides ? overrides : {};
  _adoTokenBackoff._setForTest(patch);
}
2728
+
2729
/**
 * Test-only: install `fn` as the persistent-auth inbox note writer. Any
 * non-function value (e.g. null) clears the override so the default writer
 * is used again.
 */
function _setAdoTokenAuthNoteWriterForTest(fn) {
  if (typeof fn === 'function') {
    _adoTokenAuthNoteWriter = fn;
  } else {
    _adoTokenAuthNoteWriter = null;
  }
}
2734
+
2582
2735
  // ─── One-Shot Startup Reconciliation for Abandoned PRs (W-mp60tw0u000j3931) ───
2583
2736
  //
2584
2737
  // ADO equivalent of engine/github.js reconcileAbandonedPrs. Same shape:
@@ -2748,6 +2901,13 @@ module.exports = {
2748
2901
  resetReviewerNegativeVote, // W-mp7b1g8q000fea45 — reset reviewer's prior negative vote on verdict flip
2749
2902
  needsAdoPollRetry,
2750
2903
  isAdoAuthError, // exported for testing
2904
+ // W-mqidhwcc000m897e — token graduated backoff + classification + health
2905
+ classifyAdoTokenError,
2906
+ getAdoTokenHealth,
2907
+ consumeNoAdoTokenWarning,
2908
+ _resetAdoTokenForTest, // exported for testing
2909
+ _setAdoTokenBackoffForTest, // exported for testing
2910
+ _setAdoTokenAuthNoteWriterForTest, // exported for testing
2751
2911
  isAdoThrottled,
2752
2912
  getAdoThrottleState,
2753
2913
  getAdoThrottleStateAll,
package/engine/llm.js CHANGED
@@ -507,6 +507,7 @@ function _createStreamAccumulator({
507
507
  maxTextLength = 0,
508
508
  onChunk = null,
509
509
  onToolUse = null,
510
+ onToolUpdate = null,
510
511
  onTaskComplete = null,
511
512
  onTerminalResult = null,
512
513
  onThinking = null,
@@ -574,12 +575,21 @@ function _createStreamAccumulator({
574
575
  onTerminalResult();
575
576
  }
576
577
  },
577
- pushToolUse(name, input) {
578
+ pushToolUse(name, input, id = null) {
578
579
  if (!name) return;
579
- const toolUse = { name, input: input || {} };
580
+ const toolUse = { name, input: input || {}, id: id || null };
580
581
  toolUses.push(toolUse);
581
582
  sawToolSinceText = true;
582
- if (onToolUse) onToolUse(toolUse.name, toolUse.input);
583
+ if (onToolUse) onToolUse(toolUse.name, toolUse.input, toolUse.id);
584
+ },
585
+ updateToolUse(id, status) {
586
+ // Runtime-agnostic tool-completion signal. Adapters whose stream carries
587
+ // a per-tool result event (Claude's tool_result, Copilot's
588
+ // tool_call_update) call this to flip a previously-pushed chip from the
589
+ // neutral pending dot to a green check / red X. Adapters without such a
590
+ // signal simply never call it and the chip stays pending.
591
+ if (!id || !onToolUpdate) return;
592
+ onToolUpdate(id, status);
583
593
  },
584
594
  toolUseAlreadySeen(name, input) {
585
595
  if (!name) return false;
@@ -869,7 +879,7 @@ function callLLM(promptText, sysPromptText, opts = {}) {
869
879
  function callLLMStreaming(promptText, sysPromptText, opts = {}) {
870
880
  const {
871
881
  timeout = 120000, label = 'llm', maxTurns = 1, allowedTools = '',
872
- sessionId = null, onChunk = () => {}, onToolUse = null,
882
+ sessionId = null, onChunk = () => {}, onToolUse = null, onToolUpdate = null,
873
883
  effort = null, direct = false,
874
884
  model: modelOverride, cli: cliOverride, engineConfig,
875
885
  maxBudget, bare, fallbackModel,
@@ -911,6 +921,7 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
911
921
  maxLineBufferBytes: ENGINE_DEFAULTS.maxLlmLineBufferBytes,
912
922
  onChunk,
913
923
  onToolUse,
924
+ onToolUpdate,
914
925
  // Terminal text from the runtime adapter signals the LLM has logically
915
926
  // completed — kick the drain timer so we don't block on a delayed
916
927
  // 'exit'/'close' when an inherited pipe keeps the parent's FDs open.
@@ -636,8 +636,9 @@ function parseError(rawOutput) {
636
636
  // tracking) and translates Claude event shapes into ctx callbacks.
637
637
  //
638
638
  // `ctx` shape (provided by accumulator):
639
- // maxTextLength, pushText(value), pushToolUse(name, input),
640
- // notifyThinking(), notifyTaskComplete(summary, success),
639
+ // maxTextLength, pushText(value), pushToolUse(name, input, id),
640
+ // updateToolUse(id, status), notifyThinking(),
641
+ // notifyTaskComplete(summary, success),
641
642
  // setUsage(usage), setSessionId(id), setText(value),
642
643
  // toolUseAlreadySeen(name, input)
643
644
 
@@ -743,11 +744,27 @@ function createStreamConsumer(ctx) {
743
744
  } else if (THINKING_BLOCK_TYPES.has(block?.type)) {
744
745
  ctx.notifyThinking();
745
746
  } else if (block?.type === 'tool_use' && block.name) {
746
- ctx.pushToolUse(block.name, block.input || {});
747
+ // Thread the tool_use id so the dashboard can flip this chip from
748
+ // pending → completed/failed when the matching tool_result lands.
749
+ ctx.pushToolUse(block.name, block.input || {}, block.id);
747
750
  }
748
751
  }
749
752
  if (assistantText) ctx.pushText(assistantText);
750
753
  }
754
+
755
+ if (obj.type === 'user' && Array.isArray(obj.message?.content)) {
756
+ // Claude reports tool execution results in a subsequent `user` turn:
757
+ // each block is {type:'tool_result', tool_use_id, is_error, content}.
758
+ // Flip the previously-pushed chip to completed (success) or failed
759
+ // (is_error). This is the Claude-direct analogue of Copilot's ACP
760
+ // tool_call_update; engine/llm.js stays runtime-agnostic — the chip
761
+ // turns green/red only because this adapter calls ctx.updateToolUse.
762
+ for (const block of obj.message.content) {
763
+ if (block?.type === 'tool_result' && block.tool_use_id) {
764
+ ctx.updateToolUse(block.tool_use_id, block.is_error ? 'failed' : 'completed');
765
+ }
766
+ }
767
+ }
751
768
  }
752
769
 
753
770
  function reset() {
package/engine/shared.js CHANGED
@@ -8294,6 +8294,50 @@ function createThrottleTracker({ label, baseBackoffMs = 60000, maxBackoffMs = 32
8294
8294
  return { recordThrottle, recordSuccess, isThrottled, getState, _reset, _setForTest };
8295
8295
  }
8296
8296
 
8297
+ // ── Backoff Tracker Factory ─────────────────────────────────────────────────
8298
+ // Generic graduated-backoff tracker for retryable *acquisition* failures (e.g.
8299
+ // ADO token minting via engine/ado-token.js). Distinct from
8300
+ // createThrottleTracker on three points so callers don't reach for the wrong
8301
+ // one: (1) the FIRST failure backs off `baseMs`, not `baseMs*2` — a single
8302
+ // transient blip must not blackout for the full cap; (2) recordSuccess RESETS
8303
+ // consecutiveFailures to zero rather than the throttle tracker's gradual
8304
+ // per-success decay; (3) there is no server Retry-After / jitter concept. The
8305
+ // delay grows baseMs, 2·baseMs, 4·baseMs, … doubling, capped at maxMs.
8306
+ // now is injectable so tests don't depend on Date.now().
8307
/**
 * Create a graduated-backoff tracker: the delay after the k-th consecutive
 * failure is baseMs * 2^(k-1), capped at maxMs; a success resets everything.
 *
 * Non-numeric or negative `baseMs` / `maxMs` fall back to the defaults rather
 * than producing a NaN delay (a NaN `backoffUntil` would make isBackingOff
 * always false and silently disable the backoff).
 *
 * @param {{ baseMs?: number, maxMs?: number }} [options]
 * @returns {{ recordFailure: Function, recordSuccess: Function,
 *   isBackingOff: Function, getState: Function, _reset: Function,
 *   _setForTest: Function }}
 */
function createBackoffTracker({ baseMs = 30000, maxMs = 10 * 60 * 1000 } = {}) {
  const sanitizeMs = (value, fallback) => {
    const n = Number(value);
    return Number.isFinite(n) && n >= 0 ? n : fallback;
  };
  const base = sanitizeMs(baseMs, 30000);
  const cap = sanitizeMs(maxMs, 10 * 60 * 1000);

  let consecutiveFailures = 0;
  let backoffUntil = 0;

  /** Record one failure at `now` (injectable for tests) and open a new window. */
  function recordFailure(now = Date.now()) {
    const at = Number(now);
    consecutiveFailures++;
    const delayMs = Math.min(base * Math.pow(2, consecutiveFailures - 1), cap);
    // A non-numeric `now` falls back to the wall clock instead of yielding NaN.
    backoffUntil = (Number.isFinite(at) ? at : Date.now()) + delayMs;
    return { consecutiveFailures, delayMs, backoffUntil };
  }

  /** Reset the failure streak and close any open backoff window. */
  function recordSuccess() {
    consecutiveFailures = 0;
    backoffUntil = 0;
  }

  /** True while `now` is still before the end of the current window. */
  function isBackingOff(now = Date.now()) {
    return now < backoffUntil;
  }

  /** Snapshot of the tracker's two state fields. */
  function getState() {
    return { consecutiveFailures, backoffUntil };
  }

  // Testing helpers
  function _reset() { consecutiveFailures = 0; backoffUntil = 0; }
  function _setForTest(overrides = {}) {
    if (Number.isFinite(overrides.consecutiveFailures)) consecutiveFailures = overrides.consecutiveFailures;
    if (Number.isFinite(overrides.backoffUntil)) backoffUntil = overrides.backoffUntil;
  }

  return { recordFailure, recordSuccess, isBackingOff, getState, _reset, _setForTest };
}
8340
+
8297
8341
  module.exports = {
8298
8342
  MINIONS_DIR,
8299
8343
  ENGINE_DIR,
@@ -8566,5 +8610,6 @@ module.exports = {
8566
8610
  getPinnedItems,
8567
8611
  _logBuffer, // exported for testing
8568
8612
  createThrottleTracker,
8613
+ createBackoffTracker,
8569
8614
  backfillPrPrdItems,
8570
8615
  };
@@ -90,6 +90,16 @@ async function tick(opts) {
90
90
  const dashPort = opts.dashPort || DEFAULT_DASH_PORT;
91
91
  const readEnginePid = opts.readEnginePid;
92
92
  const isPortListening = opts.isPortListening;
93
+ // Optional authoritative "is the port actually accepting connections?" probe
94
+ // (a direct TCP connect). Used ONLY to overturn a `down` verdict from the
95
+ // primary `isPortListening` probe — never to manufacture a `down`. Rationale:
96
+ // the primary probe on Windows shells out to `netstat -ano | findstr` with a
97
+ // 5s timeout; on a loaded box (e.g. a self-hosted CI runner mid-test-suite)
98
+ // netstat blows past the timeout, throws, and is swallowed as "port down",
99
+ // which would restart a perfectly live dashboard. A TCP handshake is
100
+ // completed by the kernel's listen backlog even when the dashboard's JS event
101
+ // loop is briefly blocked, so it stays accurate exactly when netstat fails.
102
+ const confirmPortUp = opts.confirmPortUp;
93
103
  const isStopIntentSet = opts.isStopIntentSet;
94
104
  const spawner = opts.spawner || spawn;
95
105
  const now = opts.now || (() => new Date());
@@ -125,6 +135,19 @@ async function tick(opts) {
125
135
  let dashUp = false;
126
136
  try { dashUp = !!isPortListening(dashPort); } catch { dashUp = false; }
127
137
 
138
+ // Confirm a `down` verdict with a direct TCP connect before acting on it.
139
+ // This converts the common false-negative (netstat timed out under load) back
140
+ // to the truth without ever masking a real outage: confirmPortUp can only
141
+ // flip down→up, and only when the kernel actually accepts a connection.
142
+ if (!dashUp && typeof confirmPortUp === 'function') {
143
+ let tcpUp = false;
144
+ try { tcpUp = !!(await confirmPortUp(dashPort)); } catch { tcpUp = false; }
145
+ if (tcpUp) {
146
+ logLine(minionsHome, `port=${dashPort} reported down by primary probe but TCP connect succeeded — treating as up (false-negative suppressed)`);
147
+ dashUp = true;
148
+ }
149
+ }
150
+
128
151
  if (engineAlive && dashUp) {
129
152
  logLine(minionsHome, `ok engine=${enginePid} port=${dashPort} ts=${now().toISOString()}`);
130
153
  return { healthy: true, action: 'none' };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.2213",
3
+ "version": "0.1.2215",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"