@yemi33/minions 0.1.2214 → 0.1.2215
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/minions.js +20 -0
- package/dashboard.js +23 -0
- package/engine/ado.js +177 -17
- package/engine/shared.js +45 -0
- package/engine/watchdog.js +23 -0
- package/package.json +1 -1
package/bin/minions.js
CHANGED
|
@@ -58,6 +58,7 @@
|
|
|
58
58
|
const fs = require('fs');
|
|
59
59
|
const path = require('path');
|
|
60
60
|
const os = require('os');
|
|
61
|
+
const net = require('net');
|
|
61
62
|
const { spawn, spawnSync, execSync } = require('child_process');
|
|
62
63
|
|
|
63
64
|
const PKG_ROOT = path.resolve(__dirname, '..');
|
|
@@ -136,6 +137,24 @@ function killByPort(port) {
|
|
|
136
137
|
|
|
137
138
|
const isPortListening = (port) => getListeningPids(port).length > 0;
|
|
138
139
|
|
|
140
|
+
/**
 * Authoritative "is something accepting TCP connections on this port?" probe.
 *
 * Resolves true iff a connection to 127.0.0.1:port completes within timeoutMs.
 * Unlike the netstat-based isPortListening, this does not shell out, so it is
 * not subject to that probe's command timeout. Used by the watchdog to
 * overturn a transient `down` verdict before restarting a live dashboard.
 *
 * Never throws and never rejects: every failure mode — refused connection,
 * timeout, an invalid/out-of-range port, or an invalid timeout value —
 * resolves false.
 *
 * @param {number|string} port - TCP port to probe (coerced with Number()).
 * @param {number} [timeoutMs=1500] - Max time to wait for the connection.
 * @returns {Promise<boolean>} true when the connection was established.
 */
function tcpPortAccepts(port, timeoutMs = 1500) {
  return new Promise((resolve) => {
    let socket = null;
    let settled = false;
    const done = (val) => {
      if (settled) return;
      settled = true;
      // Release the handle so the probe never keeps the process alive.
      try { if (socket) socket.destroy(); } catch { /* best-effort */ }
      resolve(val);
    };
    try {
      // net.connect validates the port synchronously and throws on NaN or an
      // out-of-range value; socket.setTimeout likewise throws on a bad
      // timeout. Both must map to `false` instead of rejecting the promise.
      socket = net.connect({ host: '127.0.0.1', port: Number(port) });
      socket.once('connect', () => done(true));
      socket.once('error', () => done(false));
      socket.setTimeout(timeoutMs, () => done(false));
    } catch {
      done(false);
    }
  });
}
|
|
157
|
+
|
|
139
158
|
/**
|
|
140
159
|
* Wait until no process is listening on `port`, retrying a kill on each tick
|
|
141
160
|
* for any stragglers that re-appeared (e.g. orphan child the original kill
|
|
@@ -1483,6 +1502,7 @@ ${fs.existsSync(path.join(PKG_ROOT, '.git')) ? `
|
|
|
1483
1502
|
dashPort: DASHBOARD_PORT,
|
|
1484
1503
|
readEnginePid,
|
|
1485
1504
|
isPortListening,
|
|
1505
|
+
confirmPortUp: tcpPortAccepts,
|
|
1486
1506
|
isStopIntentSet: shared.isStopIntentSet || (() => false),
|
|
1487
1507
|
}).then(result => {
|
|
1488
1508
|
// ALWAYS exit 0 — the scheduler must never see a failure for the
|
package/dashboard.js
CHANGED
|
@@ -10909,6 +10909,28 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
10909
10909
|
} catch (e) { return jsonReply(res, e.statusCode || 500, { error: e.message }); }
|
|
10910
10910
|
}
|
|
10911
10911
|
|
|
10912
|
+
// W-mqidhwcc000m897e — read-only ADO token-acquisition health snapshot.
// Replies 200 with
//   { token: { lastSuccessAt, lastFailureReason, classification,
//              backoffUntil, consecutiveFailures } }
// so an operator can diagnose "ADO polling paused" from one place. When the
// loaded `ado` module does not expose getAdoTokenHealth(), every field falls
// back to its zero/null default. Any thrown error is reported as JSON using
// the error's statusCode (500 when absent).
async function handleDiagnosticsAdoToken(req, res) {
  try {
    const hasHealthApi = typeof ado.getAdoTokenHealth === 'function';
    // An empty snapshot normalizes to exactly the default payload below.
    const snapshot = hasHealthApi ? (ado.getAdoTokenHealth() || {}) : {};
    const token = {
      lastSuccessAt: Number(snapshot.lastSuccessAt) || 0,
      lastFailureReason: snapshot.lastFailureReason || null,
      classification: snapshot.classification || null,
      backoffUntil: Number(snapshot.backoffUntil) || 0,
      consecutiveFailures: Number(snapshot.consecutiveFailures) || 0,
    };
    return jsonReply(res, 200, { token });
  } catch (e) {
    return jsonReply(res, e.statusCode || 500, { error: e.message });
  }
}
|
|
10933
|
+
|
|
10912
10934
|
// P-c3d4e5f6 — /api/diagnostics/memory.
|
|
10913
10935
|
// Returns the latest in-process dashboard sample, the latest engine
|
|
10914
10936
|
// sample (read fresh from engine/diagnostics-memory.json via safeJsonObj),
|
|
@@ -13651,6 +13673,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
13651
13673
|
{ method: 'POST', path: '/api/diagnostics/refresh', desc: 'Append a dashboard refresh-diagnostic ring buffer batch to engine/dashboard-diagnostics.log (rotated at 1 MB)', params: 'entries[]', handler: handleDiagnosticsRefresh },
|
|
13652
13674
|
// Diagnostics — per-org ADO throttle state (W-mq03l6zh0006f0a1-d).
|
|
13653
13675
|
{ method: 'GET', path: '/api/diagnostics/ado-throttle', desc: 'Snapshot of per-org ADO throttle tracker state — { orgs: { [orgBase]: { throttled, retryAfter, consecutiveHits } } }. Falls back to a single `global` key when running against pre-per-org engines.', handler: handleDiagnosticsAdoThrottle },
|
|
13676
|
+
{ method: 'GET', path: '/api/diagnostics/ado-token', desc: 'Read-only ADO token-acquisition health — { token: { lastSuccessAt, lastFailureReason, classification, backoffUntil, consecutiveFailures } }. Surfaces the graduated-backoff + transient/persistent classification state (W-mqidhwcc000m897e) so an operator can diagnose paused ADO polling from one line.', handler: handleDiagnosticsAdoToken },
|
|
13654
13677
|
// Diagnostics — engine + dashboard memory baseline (P-c3d4e5f6).
|
|
13655
13678
|
{ method: 'GET', path: '/api/diagnostics/memory', desc: 'Latest in-process dashboard memory sample plus the most-recent engine sample read from engine/diagnostics-memory.json. engineStale=true when the sidecar is missing or its capturedAt is > 5 min old.', handler: handleDiagnosticsMemory },
|
|
13656
13679
|
{ method: 'GET', path: '/api/diagnostics/memory/history', desc: 'In-memory ring buffer of memory samples. process=dashboard returns the dashboard\'s own collector (populated by startPeriodicSampling on boot). process=engine returns the dashboard\'s polled accumulation of engine/diagnostics-memory.json — engine.js only persists the latest sample to the sidecar, so engine-side history is rebuilt by the dashboard poller (dedup by capturedAt). Optional limit caps returned newest-N samples.', params: 'process (engine|dashboard), limit?', handler: handleDiagnosticsMemoryHistory },
|
package/engine/ado.js
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
const path = require('path');
|
|
7
7
|
const childProcess = require('child_process');
|
|
8
8
|
const shared = require('./shared');
|
|
9
|
-
const { exec, execAsync, getAdoOrgBase, log, ts, dateStamp, PR_STATUS, BUILD_STATUS, REVIEW_STATUS, FETCH_TIMEOUT_MS, ADO_TOKEN_REFRESH_MAX_RETRIES, createThrottleTracker } = shared;
|
|
9
|
+
const { exec, execAsync, getAdoOrgBase, log, ts, dateStamp, PR_STATUS, BUILD_STATUS, REVIEW_STATUS, FETCH_TIMEOUT_MS, ADO_TOKEN_REFRESH_MAX_RETRIES, createThrottleTracker, createBackoffTracker, writeToInbox } = shared;
|
|
10
10
|
const { getPrs } = require('./queries');
|
|
11
11
|
const { mutateJsonFileLocked } = shared;
|
|
12
12
|
const { acquireAdoToken } = require('./ado-token');
|
|
@@ -714,7 +714,118 @@ function _hasPendingReReviewWi(pr) {
|
|
|
714
714
|
// ─── ADO Token Cache ─────────────────────────────────────────────────────────
|
|
715
715
|
|
|
716
716
|
let _adoTokenCache = { token: null, expiresAt: 0 };
|
|
717
|
-
|
|
717
|
+
|
|
718
|
+
// ── ADO token acquisition backoff + health (W-mqidhwcc000m897e) ──────────────
|
|
719
|
+
// Graduated backoff replaces the old flat 10-min blackout: a single transient
|
|
720
|
+
// blip (broker hiccup, az cold-start, network) now backs off 30s and doubles
|
|
721
|
+
// per consecutive failure, capped at 10m, resetting to zero on the next
|
|
722
|
+
// success. Distinct from the per-org throttle tracker (server-driven 429/503).
|
|
723
|
+
const ADO_TOKEN_BACKOFF_BASE_MS = 30000;
|
|
724
|
+
const ADO_TOKEN_BACKOFF_MAX_MS = 10 * 60 * 1000;
|
|
725
|
+
let _adoTokenBackoff = createBackoffTracker({ baseMs: ADO_TOKEN_BACKOFF_BASE_MS, maxMs: ADO_TOKEN_BACKOFF_MAX_MS });
|
|
726
|
+
|
|
727
|
+
// Read-model for the token health diagnostic + reason-carrying skip warn.
|
|
728
|
+
// Only the three fields the backoff tracker does NOT own live here; backoffUntil
|
|
729
|
+
// and consecutiveFailures are read from `_adoTokenBackoff` on demand in
|
|
730
|
+
// getAdoTokenHealth() so there is a single source of truth.
|
|
731
|
+
let _adoTokenHealth = {
|
|
732
|
+
lastSuccessAt: 0,
|
|
733
|
+
lastFailureReason: null,
|
|
734
|
+
classification: null, // 'transient' | 'persistent' | null
|
|
735
|
+
};
|
|
736
|
+
|
|
737
|
+
// Dedup latch for the persistent-auth operator inbox note. Set when a note is
|
|
738
|
+
// emitted; cleared on the next successful acquisition so a fresh episode
|
|
739
|
+
// (after a human runs `az login`) can surface a new note.
|
|
740
|
+
let _adoTokenAuthNoteSent = false;
|
|
741
|
+
// Backoff-window the last no-token skip warn fired for. The skip warn logs on
|
|
742
|
+
// the STATE TRANSITION into a new no-token window (each real acquisition
|
|
743
|
+
// failure mints a new backoffUntil) and stays silent across the suppressed
|
|
744
|
+
// poll/reconcile cycles within that same window.
|
|
745
|
+
let _adoNoTokenWarnedBackoffUntil = -1;
|
|
746
|
+
|
|
747
|
+
// Persistent (human-actionable) acquisition errors: az not logged in, no
|
|
748
|
+
// subscription, azureauth interactive/broker consent required, missing tooling.
|
|
749
|
+
// Everything else (timeout, ECONN*, broker busy, generic failure) is transient.
|
|
750
|
+
const _ADO_TOKEN_PERSISTENT_RE = /\baz login\b|run ['"]?az login|az account set|no subscription(?:\s+found)?|not logged in|AADSTS\d+|interaction[_ ]required|consent[_ ]required|interactive authentication|reauthenticat|invalid_grant|is not recognized|command not found/i;
|
|
751
|
+
|
|
752
|
+
/**
 * Classify a token-acquisition error.
 *
 * The error's own message and the `error` of every entry in `err.attempts`
 * (when that is an array) are joined into one string and matched against
 * `_ADO_TOKEN_PERSISTENT_RE`.
 *
 * @param {{message?: string, attempts?: Array<{error?: *}>}} [err]
 * @returns {'persistent'|'transient'} 'persistent' when the pattern matches
 *   (human-actionable, e.g. needs `az login`); otherwise 'transient', which is
 *   also the result for a missing error or one with no recognizable text.
 */
function classifyAdoTokenError(err) {
  const attemptErrors = Array.isArray(err?.attempts)
    ? err.attempts.map((attempt) => String(attempt?.error || ''))
    : [];
  const haystack = [err?.message || '', ...attemptErrors].join(' | ');
  if (_ADO_TOKEN_PERSISTENT_RE.test(haystack)) {
    return 'persistent';
  }
  return 'transient';
}
|
|
764
|
+
|
|
765
|
+
/**
 * Produce a short, single-line reason for the health read-model and the
 * skip-warn log: whitespace runs collapse to one space, the ends are trimmed,
 * and the result is cut to at most 240 characters. A missing or empty message
 * becomes 'unknown error'.
 */
function _summarizeAdoTokenError(err) {
  const rawMessage = String(err?.message || 'unknown error');
  const singleLine = rawMessage.replace(/\s+/g, ' ').trim();
  return singleLine.slice(0, 240);
}
|
|
769
|
+
|
|
770
|
+
// Overridable note writer (test seam). Default routes through the shared
|
|
771
|
+
// date-deduped inbox writer so at most one note lands per day even if the
|
|
772
|
+
// in-memory latch is reset.
|
|
773
|
+
let _adoTokenAuthNoteWriter = null;
|
|
774
|
+
/**
 * Default writer for the persistent-auth operator note: renders the markdown
 * body (embedding `reason`, or 'unknown' when it is empty) and hands it to
 * writeToInbox under the 'engine' / 'ado-token-auth-paused' key.
 *
 * @param {string} [reason] - One-line failure reason shown in the note.
 * @returns {*} Whatever writeToInbox returns.
 */
function _defaultAdoTokenAuthNoteWriter(reason) {
  const reasonLine = `- reason: ${reason || 'unknown'}`;
  const noteLines = [
    '# ADO auth needs `az login` — PR polling/reconciliation paused',
    '',
    'ADO token acquisition is failing in a way that will NOT self-heal without a',
    'human action (e.g. `az login`, selecting a subscription, or completing an',
    'interactive azureauth consent). Engine PR status polling + reconciliation for',
    'ADO repos are paused on a graduated backoff until a token can be minted again.',
    '',
    reasonLine,
    '- remedy: run `az login` (and `az account set --subscription <id>` if prompted)',
    ' on the engine host, then the next successful acquisition auto-resumes polling.',
    '',
    'This note is deduped — it will not repeat every cycle. A fresh note is allowed',
    'after the next successful token acquisition.',
  ];
  const noteBody = noteLines.join('\n');
  return writeToInbox('engine', 'ado-token-auth-paused', noteBody);
}
|
|
792
|
+
/**
 * Emit the persistent-auth inbox note through the overridden writer when one
 * is set, otherwise through the default writer. A writer that throws
 * synchronously is logged at 'warn' (best-effort) and reported as `false`
 * instead of propagating.
 *
 * @param {string} reason - One-line failure reason passed to the writer.
 * @returns {*} The writer's return value, or false when it threw.
 */
function _emitAdoTokenAuthNote(reason) {
  try {
    const writeNote = _adoTokenAuthNoteWriter || _defaultAdoTokenAuthNoteWriter;
    return writeNote(reason);
  } catch (e) {
    try {
      log('warn', `Failed to write ADO token auth inbox note: ${e.message}`);
    } catch { /* best-effort */ }
    return false;
  }
}
|
|
800
|
+
|
|
801
|
+
/**
 * Read-only token-health snapshot for diagnostics. Merges the three fields
 * held in `_adoTokenHealth` with `backoffUntil` / `consecutiveFailures` read
 * live from the backoff tracker; a fresh object is returned on every call.
 */
function getAdoTokenHealth() {
  const { backoffUntil, consecutiveFailures } = _adoTokenBackoff.getState();
  const { lastSuccessAt, lastFailureReason, classification } = _adoTokenHealth;
  return {
    lastSuccessAt,
    lastFailureReason,
    classification,
    backoffUntil,
    consecutiveFailures,
  };
}
|
|
812
|
+
|
|
813
|
+
/**
 * Consume the one-shot "no ADO token" warning.
 *
 * `shouldWarn` is true only when the tracker's current `backoffUntil` differs
 * from the value remembered at the previous warning; in that case the new
 * value is remembered, so repeated calls against the same backoff window
 * return `shouldWarn: false`. The last failure reason, its classification and
 * the current `backoffUntil` are always returned so the caller can log a
 * single diagnosable line.
 *
 * @returns {{shouldWarn: boolean, reason: (string|null),
 *            classification: (string|null), backoffUntil: number}}
 */
function consumeNoAdoTokenWarning() {
  const { backoffUntil } = _adoTokenBackoff.getState();
  let shouldWarn = false;
  if (backoffUntil !== _adoNoTokenWarnedBackoffUntil) {
    // Entering a window we have not warned about yet — latch it.
    _adoNoTokenWarnedBackoffUntil = backoffUntil;
    shouldWarn = true;
  }
  const { lastFailureReason: reason, classification } = _adoTokenHealth;
  return { shouldWarn, reason, classification, backoffUntil };
}
|
|
718
829
|
|
|
719
830
|
// ─── ADO Throttle State (per-org) ───────────────────────────────────────────
|
|
720
831
|
// Tracks rate-limiting (HTTP 429/503) from ADO API responses, isolated per ADO
|
|
@@ -793,23 +904,36 @@ function isAdoAuthError(err) {
|
|
|
793
904
|
return msg.includes('auth redirect') || msg.includes('HTML instead of JSON') || /ADO API (401|403)/.test(msg);
|
|
794
905
|
}
|
|
795
906
|
|
|
796
|
-
async function getAdoToken() {
|
|
907
|
+
/**
 * Return a usable ADO token, or null when none can be obtained right now.
 *
 * Order of checks:
 *  1. a cached, unexpired token is returned as-is;
 *  2. while the backoff tracker is backing off, null is returned without
 *     attempting acquisition;
 *  3. otherwise the provider is invoked. Success caches the token for 30
 *     minutes and resets backoff, health and both dedup latches. Failure
 *     records a graduated backoff step, stores the reason/classification,
 *     logs a warning and — for the first 'persistent' failure of an episode —
 *     emits the operator inbox note.
 *
 * A provider that resolves with an empty or missing token is treated as a
 * failure: recording it as a success would leave no backoff armed, so callers
 * would see "no token" and re-run acquisition on every call.
 *
 * @param {{_acquire?: Function}} [opts] - `_acquire` is a test seam;
 *   production callers pass no args.
 * @returns {Promise<string|null>} The token, or null when unavailable.
 */
async function getAdoToken(opts = {}) {
  // _acquire is a test seam — production callers pass no args (unchanged).
  const acquire = opts?._acquire || acquireAdoToken;
  if (_adoTokenCache.token && Date.now() < _adoTokenCache.expiresAt) {
    return _adoTokenCache.token;
  }
  // Graduated backoff — skip acquisition (return null) while backing off so a
  // transient blip doesn't blackout polling/reconciliation for the full cap.
  if (_adoTokenBackoff.isBackingOff()) return null;
  try {
    const { token } = (await acquire({ execAsync, timeout: 15000 })) || {};
    // Route an empty token through the failure path so backoff is armed.
    if (!token) throw new Error('token provider returned an empty token');
    _adoTokenCache = { token, expiresAt: Date.now() + 30 * 60 * 1000 };
    _adoTokenBackoff.recordSuccess();
    _adoTokenHealth = { lastSuccessAt: Date.now(), lastFailureReason: null, classification: null };
    _adoTokenAuthNoteSent = false; // allow a fresh persistent note next episode
    _adoNoTokenWarnedBackoffUntil = -1; // allow a fresh skip warn next episode
    return token;
  } catch (e) {
    const classification = classifyAdoTokenError(e);
    const reason = _summarizeAdoTokenError(e);
    const { delayMs, consecutiveFailures } = _adoTokenBackoff.recordFailure();
    _adoTokenHealth.lastFailureReason = reason;
    _adoTokenHealth.classification = classification;
    // Guarded read: a non-Error throwable must not make the catch itself throw.
    const detail = e?.message || 'unknown error';
    log('warn', `Failed to get ADO token (${classification}, attempt ${consecutiveFailures}): ${detail} — backing off ${Math.round(delayMs / 1000)}s`);
    if (classification === 'persistent' && !_adoTokenAuthNoteSent) {
      _adoTokenAuthNoteSent = true;
      _emitAdoTokenAuthNote(reason);
    }
    return null;
  }
}
|
|
814
938
|
|
|
815
939
|
async function adoFetch(url, token, opts = {}) {
|
|
@@ -1188,8 +1312,11 @@ async function pollPrStatus(config) {
|
|
|
1188
1312
|
|
|
1189
1313
|
const token = await getAdoToken();
|
|
1190
1314
|
if (!token) {
|
|
1191
|
-
|
|
1192
|
-
|
|
1315
|
+
const w = consumeNoAdoTokenWarning();
|
|
1316
|
+
if (w.shouldWarn) {
|
|
1317
|
+
log('warn', `Skipping PR status poll — no ADO token (${w.classification || 'unknown'}: ${w.reason || 'unknown'}); backing off until ${w.backoffUntil ? new Date(w.backoffUntil).toISOString() : 'unknown'}`);
|
|
1318
|
+
}
|
|
1319
|
+
// Don't set _adoPollHadAuthFailure — getAdoToken() has its own graduated backoff,
|
|
1193
1320
|
// and setting the flag would hammer pollPrStatus() every tick with no useful work.
|
|
1194
1321
|
return;
|
|
1195
1322
|
}
|
|
@@ -1953,7 +2080,10 @@ async function pollPrHumanComments(config) {
|
|
|
1953
2080
|
async function reconcilePrs(config) {
|
|
1954
2081
|
const token = await getAdoToken();
|
|
1955
2082
|
if (!token) {
|
|
1956
|
-
|
|
2083
|
+
const w = consumeNoAdoTokenWarning();
|
|
2084
|
+
if (w.shouldWarn) {
|
|
2085
|
+
log('warn', `Skipping PR reconciliation — no ADO token (${w.classification || 'unknown'}: ${w.reason || 'unknown'}); backing off until ${w.backoffUntil ? new Date(w.backoffUntil).toISOString() : 'unknown'}`);
|
|
2086
|
+
}
|
|
1957
2087
|
return;
|
|
1958
2088
|
}
|
|
1959
2089
|
|
|
@@ -2568,17 +2698,40 @@ function _setAdoThrottleForTest(state, orgBase = 'dev.azure.com/__test__') {
|
|
|
2568
2698
|
* Pass null to force getAdoToken() to return null synchronously (no exec). */
|
|
2569
2699
|
function _setAdoTokenForTest(token) {
  if (token != null) {
    // Seed a 30-minute cache entry and put every tracker/latch into its
    // "healthy" state.
    _adoTokenCache = { token, expiresAt: Date.now() + 30 * 60 * 1000 };
    _adoTokenBackoff.recordSuccess();
    _adoTokenHealth = { lastSuccessAt: Date.now(), lastFailureReason: null, classification: null };
    _adoTokenAuthNoteSent = false;
    _adoNoTokenWarnedBackoffUntil = -1;
    return;
  }
  // null/undefined: clear the cache AND open a one-hour backoff window so
  // getAdoToken short-circuits to null without spawning azureauth — otherwise
  // tests would hang on the 15s execAsync timeout or open a real auth popup.
  _adoTokenCache = { token: null, expiresAt: 0 };
  const oneHourFromNow = Date.now() + 60 * 60 * 1000;
  _adoTokenBackoff._setForTest({ consecutiveFailures: 1, backoffUntil: oneHourFromNow });
}
|
|
2581
2714
|
|
|
2715
|
+
/**
 * Testing only: return every piece of token state in this module to its
 * initial value — cached token, backoff tracker, health read-model, and the
 * two dedup latches.
 */
function _resetAdoTokenForTest() {
  // Dedup latches.
  _adoNoTokenWarnedBackoffUntil = -1;
  _adoTokenAuthNoteSent = false;
  // Health read-model and cached token.
  _adoTokenHealth = { lastSuccessAt: 0, lastFailureReason: null, classification: null };
  _adoTokenCache = { token: null, expiresAt: 0 };
  // Backoff tracker.
  _adoTokenBackoff._reset();
}
|
|
2723
|
+
|
|
2724
|
+
/**
 * Testing only: push `overrides` into the token backoff tracker via its
 * `_setForTest` hook. A missing/falsy argument is forwarded as an empty object.
 */
function _setAdoTokenBackoffForTest(overrides) {
  const patch = overrides || {};
  _adoTokenBackoff._setForTest(patch);
}
|
|
2728
|
+
|
|
2729
|
+
/**
 * Testing only: install `fn` as the persistent-auth inbox note writer. Any
 * non-function argument (including null) clears the override, which restores
 * the default shared.writeToInbox-backed writer.
 */
function _setAdoTokenAuthNoteWriterForTest(fn) {
  if (typeof fn === 'function') {
    _adoTokenAuthNoteWriter = fn;
  } else {
    _adoTokenAuthNoteWriter = null;
  }
}
|
|
2734
|
+
|
|
2582
2735
|
// ─── One-Shot Startup Reconciliation for Abandoned PRs (W-mp60tw0u000j3931) ───
|
|
2583
2736
|
//
|
|
2584
2737
|
// ADO equivalent of engine/github.js reconcileAbandonedPrs. Same shape:
|
|
@@ -2748,6 +2901,13 @@ module.exports = {
|
|
|
2748
2901
|
resetReviewerNegativeVote, // W-mp7b1g8q000fea45 — reset reviewer's prior negative vote on verdict flip
|
|
2749
2902
|
needsAdoPollRetry,
|
|
2750
2903
|
isAdoAuthError, // exported for testing
|
|
2904
|
+
// W-mqidhwcc000m897e — token graduated backoff + classification + health
|
|
2905
|
+
classifyAdoTokenError,
|
|
2906
|
+
getAdoTokenHealth,
|
|
2907
|
+
consumeNoAdoTokenWarning,
|
|
2908
|
+
_resetAdoTokenForTest, // exported for testing
|
|
2909
|
+
_setAdoTokenBackoffForTest, // exported for testing
|
|
2910
|
+
_setAdoTokenAuthNoteWriterForTest, // exported for testing
|
|
2751
2911
|
isAdoThrottled,
|
|
2752
2912
|
getAdoThrottleState,
|
|
2753
2913
|
getAdoThrottleStateAll,
|
package/engine/shared.js
CHANGED
|
@@ -8294,6 +8294,50 @@ function createThrottleTracker({ label, baseBackoffMs = 60000, maxBackoffMs = 32
|
|
|
8294
8294
|
return { recordThrottle, recordSuccess, isThrottled, getState, _reset, _setForTest };
|
|
8295
8295
|
}
|
|
8296
8296
|
|
|
8297
|
+
// ── Backoff Tracker Factory ─────────────────────────────────────────────────
// Generic graduated-backoff tracker for retryable *acquisition* failures (e.g.
// ADO token minting via engine/ado-token.js).
//
// Behavior of this tracker:
//   • the FIRST failure backs off `baseMs`; each further consecutive failure
//     doubles the delay (baseMs, 2·baseMs, 4·baseMs, …), capped at `maxMs`;
//   • recordSuccess() resets consecutiveFailures and backoffUntil to zero;
//   • there is no server Retry-After / jitter concept.
// NOTE(review): the original author's note contrasts this with
// createThrottleTracker (first hit at baseMs*2, gradual per-success decay) —
// confirm against that factory before picking one.
//
// `now` is injectable on recordFailure/isBackingOff so tests don't depend on
// Date.now().
function createBackoffTracker({ baseMs = 30000, maxMs = 10 * 60 * 1000 } = {}) {
  const state = { consecutiveFailures: 0, backoffUntil: 0 };

  const clear = () => {
    state.consecutiveFailures = 0;
    state.backoffUntil = 0;
  };

  // Register one more consecutive failure and open the next backoff window.
  function recordFailure(now = Date.now()) {
    state.consecutiveFailures += 1;
    const uncappedMs = baseMs * 2 ** (state.consecutiveFailures - 1);
    const delayMs = Math.min(uncappedMs, maxMs);
    state.backoffUntil = now + delayMs;
    return {
      consecutiveFailures: state.consecutiveFailures,
      delayMs,
      backoffUntil: state.backoffUntil,
    };
  }

  // A success ends the episode: no failures, no window.
  function recordSuccess() {
    clear();
  }

  // True strictly before the window's end timestamp.
  function isBackingOff(now = Date.now()) {
    return now < state.backoffUntil;
  }

  // Fresh copy so callers cannot mutate the tracker through the snapshot.
  function getState() {
    return { consecutiveFailures: state.consecutiveFailures, backoffUntil: state.backoffUntil };
  }

  // Testing helpers
  function _reset() {
    clear();
  }

  function _setForTest(overrides = {}) {
    // Only finite numbers are applied; anything else leaves the field as-is.
    for (const field of ['consecutiveFailures', 'backoffUntil']) {
      if (Number.isFinite(overrides[field])) state[field] = overrides[field];
    }
  }

  return { recordFailure, recordSuccess, isBackingOff, getState, _reset, _setForTest };
}
|
|
8340
|
+
|
|
8297
8341
|
module.exports = {
|
|
8298
8342
|
MINIONS_DIR,
|
|
8299
8343
|
ENGINE_DIR,
|
|
@@ -8566,5 +8610,6 @@ module.exports = {
|
|
|
8566
8610
|
getPinnedItems,
|
|
8567
8611
|
_logBuffer, // exported for testing
|
|
8568
8612
|
createThrottleTracker,
|
|
8613
|
+
createBackoffTracker,
|
|
8569
8614
|
backfillPrPrdItems,
|
|
8570
8615
|
};
|
package/engine/watchdog.js
CHANGED
|
@@ -90,6 +90,16 @@ async function tick(opts) {
|
|
|
90
90
|
const dashPort = opts.dashPort || DEFAULT_DASH_PORT;
|
|
91
91
|
const readEnginePid = opts.readEnginePid;
|
|
92
92
|
const isPortListening = opts.isPortListening;
|
|
93
|
+
// Optional authoritative "is the port actually accepting connections?" probe
|
|
94
|
+
// (a direct TCP connect). Used ONLY to overturn a `down` verdict from the
|
|
95
|
+
// primary `isPortListening` probe — never to manufacture a `down`. Rationale:
|
|
96
|
+
// the primary probe on Windows shells out to `netstat -ano | findstr` with a
|
|
97
|
+
// 5s timeout; on a loaded box (e.g. a self-hosted CI runner mid-test-suite)
|
|
98
|
+
// netstat blows past the timeout, throws, and is swallowed as "port down",
|
|
99
|
+
// which would restart a perfectly live dashboard. A TCP handshake is
|
|
100
|
+
// completed by the kernel's listen backlog even when the dashboard's JS event
|
|
101
|
+
// loop is briefly blocked, so it stays accurate exactly when netstat fails.
|
|
102
|
+
const confirmPortUp = opts.confirmPortUp;
|
|
93
103
|
const isStopIntentSet = opts.isStopIntentSet;
|
|
94
104
|
const spawner = opts.spawner || spawn;
|
|
95
105
|
const now = opts.now || (() => new Date());
|
|
@@ -125,6 +135,19 @@ async function tick(opts) {
|
|
|
125
135
|
let dashUp = false;
|
|
126
136
|
try { dashUp = !!isPortListening(dashPort); } catch { dashUp = false; }
|
|
127
137
|
|
|
138
|
+
// Confirm a `down` verdict with a direct TCP connect before acting on it.
|
|
139
|
+
// This converts the common false-negative (netstat timed out under load) back
|
|
140
|
+
// to the truth without ever masking a real outage: confirmPortUp can only
|
|
141
|
+
// flip down→up, and only when the kernel actually accepts a connection.
|
|
142
|
+
if (!dashUp && typeof confirmPortUp === 'function') {
|
|
143
|
+
let tcpUp = false;
|
|
144
|
+
try { tcpUp = !!(await confirmPortUp(dashPort)); } catch { tcpUp = false; }
|
|
145
|
+
if (tcpUp) {
|
|
146
|
+
logLine(minionsHome, `port=${dashPort} reported down by primary probe but TCP connect succeeded — treating as up (false-negative suppressed)`);
|
|
147
|
+
dashUp = true;
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
128
151
|
if (engineAlive && dashUp) {
|
|
129
152
|
logLine(minionsHome, `ok engine=${enginePid} port=${dashPort} ts=${now().toISOString()}`);
|
|
130
153
|
return { healthy: true, action: 'none' };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yemi33/minions",
|
|
3
|
-
"version": "0.1.2214",
|
|
3
|
+
"version": "0.1.2215",
|
|
4
4
|
"description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
|
|
5
5
|
"bin": {
|
|
6
6
|
"minions": "bin/minions.js"
|