@yemi33/minions 0.1.1984 → 0.1.1986
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/minions.js +3 -1
- package/dashboard/js/qa.js +53 -0
- package/dashboard/js/refresh.js +4 -2
- package/dashboard/js/render-managed.js +43 -9
- package/dashboard/js/render-other.js +41 -11
- package/dashboard/layout.html +1 -0
- package/dashboard/pages/qa.html +23 -0
- package/dashboard-build.js +2 -2
- package/dashboard.js +135 -24
- package/docs/.nojekyll +0 -0
- package/docs/README.md +2 -0
- package/docs/constellation-bridge.md +94 -0
- package/docs/security.md +177 -0
- package/engine/ado-git-auth.js +206 -0
- package/engine/bridge.js +124 -0
- package/engine/cc-worker-pool.js +48 -1
- package/engine/cleanup.js +72 -23
- package/engine/cli.js +169 -12
- package/engine/dispatch.js +26 -11
- package/engine/github.js +79 -26
- package/engine/issues.js +14 -3
- package/engine/lifecycle.js +55 -14
- package/engine/llm.js +16 -9
- package/engine/meeting.js +16 -5
- package/engine/queries.js +123 -52
- package/engine/recovery.js +6 -0
- package/engine/shared.js +281 -9
- package/engine/spawn-agent.js +13 -5
- package/engine/timeout.js +4 -2
- package/engine.js +242 -52
- package/package.json +1 -1
package/engine/cc-worker-pool.js
CHANGED
|
@@ -136,6 +136,12 @@ class Worker {
|
|
|
136
136
|
this.killed = false;
|
|
137
137
|
this.spawnError = null;
|
|
138
138
|
this.firstSystemPromptSent = false;
|
|
139
|
+
// In-flight spawn+initialize+session/new promise. Set by getSession()
|
|
140
|
+
// before the worker is registered in _tabs, cleared after the handshake
|
|
141
|
+
// settles. Racing getSession() callers await this to avoid the
|
|
142
|
+
// "warm-reuse path returns sessionId=null while init is still pending"
|
|
143
|
+
// hang on first message of a freshly-warmed tab (W-mpd45blx00072f04).
|
|
144
|
+
this.initPromise = null;
|
|
139
145
|
}
|
|
140
146
|
|
|
141
147
|
// ── Spawn + initialize handshake ────────────────────────────────────────
|
|
@@ -499,6 +505,31 @@ async function getSession({ tabId, model, effort, mcpServers, systemPromptHash,
|
|
|
499
505
|
// 'cold-spawn' — fresh proc + initialize + session/new
|
|
500
506
|
let lifecycle = 'warm-reuse';
|
|
501
507
|
|
|
508
|
+
if (worker) {
|
|
509
|
+
// W-mpd45blx00072f04: if the existing worker is still mid-init (warm
|
|
510
|
+
// fired but session/new hasn't resolved yet), await the in-flight init
|
|
511
|
+
// BEFORE evaluating warm-reuse / newSession / cold-spawn — otherwise we
|
|
512
|
+
// return a SessionHandle with sessionId=null and the caller's first
|
|
513
|
+
// session/prompt fires with a null sessionId, causing every subsequent
|
|
514
|
+
// session/update notification to be dropped by _handleMessage's
|
|
515
|
+
// sessionId-match guard. User-visible symptom: first message on a
|
|
516
|
+
// freshly-warmed CC tab hangs (no chunks streamed, eventual onDone
|
|
517
|
+
// with empty text).
|
|
518
|
+
if (worker.initPromise) {
|
|
519
|
+
try {
|
|
520
|
+
await worker.initPromise;
|
|
521
|
+
} catch (err) {
|
|
522
|
+
// Warm init failed (e.g., auth). The originating call has already
|
|
523
|
+
// (or is about to) call _tabs.delete(tabId) and close the worker in its
|
|
524
|
+
// own catch handler. Surface the same error to this caller so the
|
|
525
|
+
// dashboard's spawn-failed path runs instead of hanging.
|
|
526
|
+
throw err;
|
|
527
|
+
}
|
|
528
|
+
// Re-read in case the failing initPromise's cleanup already ran.
|
|
529
|
+
worker = _tabs.get(tabId) || null;
|
|
530
|
+
}
|
|
531
|
+
}
|
|
532
|
+
|
|
502
533
|
if (worker) {
|
|
503
534
|
if (worker.killed) {
|
|
504
535
|
_tabs.delete(tabId);
|
|
@@ -533,8 +564,24 @@ async function getSession({ tabId, model, effort, mcpServers, systemPromptHash,
|
|
|
533
564
|
tabId, model, effort, mcpServers, mcpServersHash, systemPromptHash, cwd,
|
|
534
565
|
});
|
|
535
566
|
_tabs.set(tabId, worker);
|
|
567
|
+
// Set initPromise BEFORE awaiting so concurrent getSession() callers
|
|
568
|
+
// landing during the spawn+initialize+session/new round-trip can detect
|
|
569
|
+
// and await it (W-mpd45blx00072f04). Clear on settle so callers that
|
|
570
|
+
// arrive AFTER init succeeds skip the no-op await. Attach the clear
|
|
571
|
+
// handler as both success+failure listeners (not .finally()) so the
|
|
572
|
+
// chained promise has a rejection handler and doesn't surface as an
|
|
573
|
+
// unhandled rejection when init throws.
|
|
574
|
+
const initPromise = worker._spawnAndInit();
|
|
575
|
+
worker.initPromise = initPromise;
|
|
576
|
+
const clearInit = () => {
|
|
577
|
+
// Only clear if we're still the active promise — defensive against
|
|
578
|
+
// a future refactor that calls _spawnAndInit twice for the same
|
|
579
|
+
// Worker (current code path never does).
|
|
580
|
+
if (worker.initPromise === initPromise) worker.initPromise = null;
|
|
581
|
+
};
|
|
582
|
+
initPromise.then(clearInit, clearInit);
|
|
536
583
|
try {
|
|
537
|
-
await
|
|
584
|
+
await initPromise;
|
|
538
585
|
} catch (err) {
|
|
539
586
|
_tabs.delete(tabId);
|
|
540
587
|
try { worker.close(); } catch { /* already torn down */ }
|
package/engine/cleanup.js
CHANGED
|
@@ -273,35 +273,42 @@ function _killProcessInWorktree(dir, activeProcesses, activeIds) {
|
|
|
273
273
|
log('info', `Killed orphaned process for dispatch ${id} before worktree removal`);
|
|
274
274
|
}
|
|
275
275
|
|
|
276
|
-
// Check PID files in engine/tmp/ —
|
|
276
|
+
// Check PID files in engine/tmp/ — both legacy flat layout and per-dispatch
|
|
277
|
+
// dirs (P-f6-tmp-toctou). Only kill if no active dispatch matches.
|
|
277
278
|
try {
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
if (!
|
|
281
|
-
const pidFileName = f.replace(/^pid-/, '').replace(/\.pid$/, '');
|
|
282
|
-
if (!dirLower.includes(pidFileName.slice(-8))) continue;
|
|
279
|
+
shared.forEachPidFile((pidFilePath, fileName, layout) => {
|
|
280
|
+
const pidFileName = fileName.replace(/^pid-/, '').replace(/\.pid$/, '');
|
|
281
|
+
if (!dirLower.includes(pidFileName.slice(-8))) return;
|
|
283
282
|
// Verify this PID file's dispatch is not active
|
|
284
283
|
let isActive = false;
|
|
285
284
|
for (const id of activeIds) { if (pidFileName.includes(id.slice(-8))) { isActive = true; break; } }
|
|
286
|
-
if (isActive)
|
|
287
|
-
|
|
285
|
+
if (isActive) return; // still active — do not kill
|
|
286
|
+
let pid;
|
|
287
|
+
try { pid = parseInt(fs.readFileSync(pidFilePath, 'utf8').trim(), 10); }
|
|
288
|
+
catch { return; }
|
|
288
289
|
if (pid > 0) {
|
|
289
290
|
// Verify the PID still belongs to a Minions runtime process before killing.
|
|
290
291
|
// The shared helper inspects the PID's full command line for `claude` /
|
|
291
292
|
// `copilot` so a recycled PID running an unrelated process is skipped.
|
|
292
293
|
try {
|
|
293
294
|
if (process.platform === 'win32') {
|
|
294
|
-
if (!shared.isProcessCommandLineMatchingAgent(pid))
|
|
295
|
+
if (!shared.isProcessCommandLineMatchingAgent(pid)) return;
|
|
295
296
|
exec(`taskkill /F /T /PID ${pid}`, { stdio: 'pipe', timeout: 5000, windowsHide: true });
|
|
296
297
|
} else {
|
|
297
|
-
if (!shared.isProcessCommandLineMatchingAgent(pid))
|
|
298
|
+
if (!shared.isProcessCommandLineMatchingAgent(pid)) return;
|
|
298
299
|
try { process.kill(-pid, 'SIGKILL'); } catch { process.kill(pid, 'SIGKILL'); }
|
|
299
300
|
}
|
|
300
|
-
log('info', `Killed orphaned PID ${pid} (${
|
|
301
|
+
log('info', `Killed orphaned PID ${pid} (${fileName}, ${layout}) before worktree removal`);
|
|
301
302
|
} catch {} // process may already be dead
|
|
302
303
|
}
|
|
303
|
-
|
|
304
|
-
|
|
304
|
+
if (layout === 'dispatch-dir') {
|
|
305
|
+
// Remove the entire per-dispatch dir — its remaining sidecars are
|
|
306
|
+
// orphans of the same dead process.
|
|
307
|
+
try { shared.removeDispatchTmpDir(path.dirname(pidFilePath)); } catch {}
|
|
308
|
+
} else {
|
|
309
|
+
try { fs.unlinkSync(pidFilePath); } catch {}
|
|
310
|
+
}
|
|
311
|
+
});
|
|
305
312
|
} catch {} // tmp dir may not exist
|
|
306
313
|
}
|
|
307
314
|
|
|
@@ -313,9 +320,35 @@ async function runCleanup(config, verbose = false) {
|
|
|
313
320
|
let cleaned = { tempFiles: 0, liveOutputs: 0, worktrees: 0, zombies: 0 };
|
|
314
321
|
|
|
315
322
|
// 1. Clean stale temp prompt/sysprompt files and orphaned safeWrite .tmp.* files (older than 1 hour)
|
|
323
|
+
// P-f6-tmp-toctou: also sweep abandoned per-dispatch dirs (engine/tmp/dispatch-*),
|
|
324
|
+
// and recurse into them so leftover prompt/sysprompt sidecars from crashed
|
|
325
|
+
// dispatches don't accumulate.
|
|
316
326
|
const oneHourAgo = Date.now() - 3600000;
|
|
317
327
|
const tmpDir = path.join(ENGINE_DIR, 'tmp');
|
|
318
328
|
const scanDirs = [ENGINE_DIR, ...(fs.existsSync(tmpDir) ? [tmpDir] : [])];
|
|
329
|
+
// Discover dispatch-* dirs under engine/tmp/ and scan their contents too.
|
|
330
|
+
if (fs.existsSync(tmpDir)) {
|
|
331
|
+
try {
|
|
332
|
+
for (const entry of fs.readdirSync(tmpDir, { withFileTypes: true })) {
|
|
333
|
+
if (!entry.isDirectory()) continue;
|
|
334
|
+
if (!entry.name.startsWith('dispatch-')) continue;
|
|
335
|
+
const full = path.join(tmpDir, entry.name);
|
|
336
|
+
if (!shared.validateDispatchTmpDir(full)) continue;
|
|
337
|
+
scanDirs.push(full);
|
|
338
|
+
}
|
|
339
|
+
} catch { /* tmp dir may be empty/missing */ }
|
|
340
|
+
}
|
|
341
|
+
// Track which dispatch dirs we touch so we can rm empty ones whose owning
|
|
342
|
+
// dispatch is no longer in the active set.
|
|
343
|
+
const activeDispatchTmpDirs = new Set();
|
|
344
|
+
try {
|
|
345
|
+
const dispatch = getDispatch();
|
|
346
|
+
for (const queue of ['pending', 'active']) {
|
|
347
|
+
for (const e of dispatch[queue] || []) {
|
|
348
|
+
if (e?.tmpDir) activeDispatchTmpDirs.add(path.resolve(e.tmpDir));
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
} catch { /* dispatch.json may be empty */ }
|
|
319
352
|
for (const dir of scanDirs) {
|
|
320
353
|
// Each directory gets its own try-catch so one failure doesn't abort other directories (Bug #27)
|
|
321
354
|
let dirEntries;
|
|
@@ -341,6 +374,22 @@ async function runCleanup(config, verbose = false) {
|
|
|
341
374
|
}
|
|
342
375
|
}
|
|
343
376
|
}
|
|
377
|
+
// Reap empty/stale per-dispatch tmp dirs not referenced by an active entry.
|
|
378
|
+
cleaned.dispatchDirs = 0;
|
|
379
|
+
if (fs.existsSync(tmpDir)) {
|
|
380
|
+
try {
|
|
381
|
+
for (const entry of fs.readdirSync(tmpDir, { withFileTypes: true })) {
|
|
382
|
+
if (!entry.isDirectory() || !entry.name.startsWith('dispatch-')) continue;
|
|
383
|
+
const full = path.join(tmpDir, entry.name);
|
|
384
|
+
if (!shared.validateDispatchTmpDir(full)) continue;
|
|
385
|
+
if (activeDispatchTmpDirs.has(path.resolve(full))) continue;
|
|
386
|
+
let stat;
|
|
387
|
+
try { stat = fs.statSync(full); } catch { continue; }
|
|
388
|
+
if (stat.mtimeMs >= oneHourAgo) continue;
|
|
389
|
+
if (shared.removeDispatchTmpDir(full)) cleaned.dispatchDirs++;
|
|
390
|
+
}
|
|
391
|
+
} catch { /* sweep is best-effort */ }
|
|
392
|
+
}
|
|
344
393
|
|
|
345
394
|
// 2. Clean live-output.log and live-output-prev.log for idle agents (not currently working)
|
|
346
395
|
for (const [agentId] of Object.entries(config.agents || {})) {
|
|
@@ -1111,31 +1160,31 @@ async function runCleanup(config, verbose = false) {
|
|
|
1111
1160
|
} catch (e) { log('warn', 'cap cooldowns: ' + e.message); }
|
|
1112
1161
|
|
|
1113
1162
|
// 12. Clean stale PID files — remove PID files whose process is no longer running
|
|
1163
|
+
// P-f6-tmp-toctou: walks BOTH legacy flat layout and per-dispatch-dir layout
|
|
1164
|
+
// via shared.forEachPidFile.
|
|
1114
1165
|
cleaned.pidFiles = 0;
|
|
1115
1166
|
try {
|
|
1116
1167
|
const tmpDir = path.join(ENGINE_DIR, 'tmp');
|
|
1117
1168
|
if (fs.existsSync(tmpDir)) {
|
|
1118
|
-
let pidDirEntries;
|
|
1119
|
-
try { pidDirEntries = fs.readdirSync(tmpDir); } catch { pidDirEntries = []; }
|
|
1120
1169
|
const activePids = new Set();
|
|
1121
1170
|
for (const [, info] of activeProcesses) {
|
|
1122
1171
|
if (info.proc?.pid) activePids.add(String(info.proc.pid));
|
|
1123
1172
|
}
|
|
1124
|
-
|
|
1125
|
-
if (!f.startsWith('pid-') || !f.endsWith('.pid')) continue;
|
|
1126
|
-
const fp = path.join(tmpDir, f);
|
|
1173
|
+
shared.forEachPidFile((pidFilePath, fileName, layout) => {
|
|
1127
1174
|
try {
|
|
1128
|
-
const pidStr = fs.readFileSync(
|
|
1175
|
+
const pidStr = fs.readFileSync(pidFilePath, 'utf8').trim();
|
|
1129
1176
|
// Skip if actively tracked
|
|
1130
|
-
if (activePids.has(pidStr))
|
|
1177
|
+
if (activePids.has(pidStr)) return;
|
|
1131
1178
|
// Check if file is stale (>1 hour old)
|
|
1132
|
-
const stat = fs.statSync(
|
|
1179
|
+
const stat = fs.statSync(pidFilePath);
|
|
1133
1180
|
if (stat.mtimeMs < oneHourAgo) {
|
|
1134
|
-
fs.unlinkSync(
|
|
1181
|
+
fs.unlinkSync(pidFilePath);
|
|
1135
1182
|
cleaned.pidFiles++;
|
|
1183
|
+
// For dispatch-dir layout, the empty/stale dispatch dir gets reaped
|
|
1184
|
+
// by the stale-dispatch-dir sweep in step 1 of a later cleanup run.
|
|
1136
1185
|
}
|
|
1137
1186
|
} catch { /* cleanup */ }
|
|
1138
|
-
}
|
|
1187
|
+
});
|
|
1139
1188
|
}
|
|
1140
1189
|
} catch (e) { log('warn', 'clean stale PID files: ' + e.message); }
|
|
1141
1190
|
|
package/engine/cli.js
CHANGED
|
@@ -57,7 +57,9 @@ function dispatchSafeId(itemId) {
|
|
|
57
57
|
}
|
|
58
58
|
|
|
59
59
|
function readDispatchPid(itemId) {
|
|
60
|
-
|
|
60
|
+
// P-f6-tmp-toctou: prefer per-dispatch dir layout, fall back to legacy flat.
|
|
61
|
+
const pidFile = shared.findDispatchPidFile(itemId);
|
|
62
|
+
if (!pidFile) return null;
|
|
61
63
|
let raw;
|
|
62
64
|
try { raw = fs.readFileSync(pidFile, 'utf8').trim(); }
|
|
63
65
|
catch { return null; }
|
|
@@ -90,6 +92,32 @@ function summarizeActiveDispatchPids(activeItems = []) {
|
|
|
90
92
|
return summary;
|
|
91
93
|
}
|
|
92
94
|
|
|
95
|
+
// W-mpcyvff6000pf828 (#2653) — Heartbeat writer decoupled from tickInner.
|
|
96
|
+
// `writeHeartbeatNow` mirrors the legacy in-tick write (synchronous, fast,
|
|
97
|
+
// non-throwing) but is driven from the main start loop on a 15s setInterval
|
|
98
|
+
// instead. See engine.js#tickInner for the load-bearing rationale comment.
|
|
99
|
+
const HEARTBEAT_INTERVAL_MS = 15000;
|
|
100
|
+
|
|
101
|
+
function writeHeartbeatNow() {
|
|
102
|
+
try {
|
|
103
|
+
// Synchronous callback inside the lock — just `c.heartbeat = Date.now()`,
|
|
104
|
+
// no awaits, no other state mutation. Mirrors mutateControl's contract.
|
|
105
|
+
shared.mutateControl(c => { c.heartbeat = Date.now(); return c; });
|
|
106
|
+
} catch (err) {
|
|
107
|
+
try { engine().log('warn', `write heartbeat: ${err.message}`); }
|
|
108
|
+
catch { /* during shutdown logger can be torn down — silent */ }
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// Factory used by tests to drive the heartbeat interval without spinning up
|
|
113
|
+
// the full engine start loop. Returns the underlying timer so callers (the
|
|
114
|
+
// real start handler in production, tests in unit/engine-heartbeat.test.js)
|
|
115
|
+
// can clearInterval(...) on shutdown. Default to the production cadence.
|
|
116
|
+
function createHeartbeatInterval(intervalMs = HEARTBEAT_INTERVAL_MS, writer = writeHeartbeatNow) {
|
|
117
|
+
return setInterval(writer, intervalMs);
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
|
|
93
121
|
function createControlOwner(pid = process.pid) {
|
|
94
122
|
return { pid, ownerToken: `${pid}-${shared.uid()}` };
|
|
95
123
|
}
|
|
@@ -162,6 +190,7 @@ const CLI_COMMAND_DOCS = Object.freeze({
|
|
|
162
190
|
doctor: { args: '', summary: 'Check prerequisites and runtime health' },
|
|
163
191
|
config: { args: 'set-cli <R> [--model M]', summary: 'Persist defaultCli/defaultModel without starting' },
|
|
164
192
|
pr: { args: 'comment <repo> <prNumber> --agent <id> --kind <k> [--wi <id>] [--body-file <f>|--body <text>]', summary: 'Post a marker-prepended PR comment via gh' },
|
|
193
|
+
bridge: { args: 'status|health|enable|disable', summary: 'Constellation bridge: toggle and inspect the read-only cross-repo feed' },
|
|
165
194
|
});
|
|
166
195
|
|
|
167
196
|
function formatCliCommandHelpLines() {
|
|
@@ -872,6 +901,18 @@ const commands = {
|
|
|
872
901
|
// Start tick loop
|
|
873
902
|
const tickTimer = setInterval(() => e.tick(), interval);
|
|
874
903
|
|
|
904
|
+
// W-mpcyvff6000pf828 (#2653) — Heartbeat decoupled from tickInner.
|
|
905
|
+
// The dashboard flips the engine badge to STALE when
|
|
906
|
+
// `Date.now() - control.heartbeat > 120000`; tying the write to tickInner
|
|
907
|
+
// meant any legitimately slow tick (cold runtime spawn, sequential
|
|
908
|
+
// ADO/gh polls, slow worktree create) blocked the next heartbeat and
|
|
909
|
+
// surfaced a healthy engine as crashed. We now write every 15s on a
|
|
910
|
+
// dedicated interval — 8× headroom vs the 120s threshold even under
|
|
911
|
+
// event-loop pressure, and orders of magnitude under TICK_TIMEOUT_MS so
|
|
912
|
+
// a hung tick still looks distinct from a wedged event loop.
|
|
913
|
+
writeHeartbeatNow(); // prime control.heartbeat immediately
|
|
914
|
+
const heartbeatTimer = setInterval(writeHeartbeatNow, HEARTBEAT_INTERVAL_MS);
|
|
915
|
+
|
|
875
916
|
// Fast poll: check steering every 1s (lightweight — just fs.stat per agent)
|
|
876
917
|
// and wakeup signals every 1s (control.json read)
|
|
877
918
|
const { checkSteering } = require('./timeout');
|
|
@@ -948,6 +989,7 @@ const commands = {
|
|
|
948
989
|
console.log(`\n${signal} received — initiating graceful shutdown...`);
|
|
949
990
|
clearInterval(tickTimer);
|
|
950
991
|
clearInterval(fastPollTimer);
|
|
992
|
+
clearInterval(heartbeatTimer);
|
|
951
993
|
for (const f of _watchedFiles) { try { fs.unwatchFile(f); } catch { /* cleanup */ } }
|
|
952
994
|
const stoppingAt = e.ts();
|
|
953
995
|
const stoppingWrite = markControlStoppingForOwner(controlOwner, stoppingAt);
|
|
@@ -1528,13 +1570,10 @@ const commands = {
|
|
|
1528
1570
|
const shared = require('./shared');
|
|
1529
1571
|
|
|
1530
1572
|
// Kill processes via PID files (expensive — outside dispatch lock).
|
|
1531
|
-
// PID files live in engine/tmp/
|
|
1532
|
-
//
|
|
1533
|
-
//
|
|
1534
|
-
|
|
1535
|
-
const pidFiles = shared.safeReadDir(pidDir).filter(f => f.startsWith('pid-') && f.endsWith('.pid'));
|
|
1536
|
-
for (const f of pidFiles) {
|
|
1537
|
-
const pidPath = path.join(pidDir, f);
|
|
1573
|
+
// PID files live in engine/tmp/ — both legacy flat layout
|
|
1574
|
+
// (`pid-<id>.pid`) and per-dispatch dirs (`dispatch-<id>-XXX/pid-<id>.pid`)
|
|
1575
|
+
// post-P-f6-tmp-toctou. shared.forEachPidFile visits both.
|
|
1576
|
+
shared.forEachPidFile((pidPath, fileName, layout) => {
|
|
1538
1577
|
const raw = safeRead(pidPath).trim();
|
|
1539
1578
|
// Guard against falsy/zero/NaN PIDs. Empty pid files would resolve to
|
|
1540
1579
|
// Number('') === 0, and process.kill(0) on POSIX targets the entire
|
|
@@ -1542,13 +1581,17 @@ const commands = {
|
|
|
1542
1581
|
let pidNum = NaN;
|
|
1543
1582
|
try { pidNum = shared.validatePid(raw); } catch { /* invalid — skip */ }
|
|
1544
1583
|
if (pidNum > 0) {
|
|
1545
|
-
try { process.kill(pidNum); console.log(`Killed process ${pidNum} (${
|
|
1584
|
+
try { process.kill(pidNum); console.log(`Killed process ${pidNum} (${fileName})`); }
|
|
1546
1585
|
catch { console.log(`Process ${pidNum} already dead`); }
|
|
1547
1586
|
} else {
|
|
1548
|
-
console.log(`Skipping ${
|
|
1587
|
+
console.log(`Skipping ${fileName}: invalid or empty PID`);
|
|
1549
1588
|
}
|
|
1550
|
-
|
|
1551
|
-
|
|
1589
|
+
if (layout === 'dispatch-dir') {
|
|
1590
|
+
try { shared.removeDispatchTmpDir(path.dirname(pidPath)); } catch { /* may not exist */ }
|
|
1591
|
+
} else {
|
|
1592
|
+
try { fs.unlinkSync(pidPath); } catch { /* may not exist */ }
|
|
1593
|
+
}
|
|
1594
|
+
});
|
|
1552
1595
|
|
|
1553
1596
|
// Atomically read and clear dispatch.active (locked read-modify-write)
|
|
1554
1597
|
let killed = [];
|
|
@@ -1776,6 +1819,116 @@ const commands = {
|
|
|
1776
1819
|
console.error(`error: ${e.message}`);
|
|
1777
1820
|
process.exit(1);
|
|
1778
1821
|
}
|
|
1822
|
+
},
|
|
1823
|
+
|
|
1824
|
+
// `minions bridge <subcommand>` — Constellation read-only bridge surface.
|
|
1825
|
+
// Owns the on/off flag and the marker-file projection used by the
|
|
1826
|
+
// Constellation agent. Bridge polling logic itself lives in the
|
|
1827
|
+
// Constellation repo (P-wi1-bridge-readonly).
|
|
1828
|
+
//
|
|
1829
|
+
// Subcommands:
|
|
1830
|
+
// status Print enabled flag + last-seen Constellation agent timestamp.
|
|
1831
|
+
// health Probe http://127.0.0.1:7331/api/status and print the same
|
|
1832
|
+
// curated projection the Constellation bridge would consume.
|
|
1833
|
+
// enable Atomically set engine.constellationBridge.enabled = true.
|
|
1834
|
+
// disable Atomically set engine.constellationBridge.enabled = false.
|
|
1835
|
+
bridge(subcmd, ...rest) {
|
|
1836
|
+
const bridge = require('./bridge');
|
|
1837
|
+
const BRIDGE_USAGE = 'Usage: minions bridge <status|health|enable|disable>';
|
|
1838
|
+
|
|
1839
|
+
if (!subcmd || subcmd === 'help' || subcmd === '--help' || subcmd === '-h') {
|
|
1840
|
+
console.log(BRIDGE_USAGE);
|
|
1841
|
+
console.log('');
|
|
1842
|
+
console.log(' status Show enabled flag + last-seen Constellation agent timestamp');
|
|
1843
|
+
console.log(' health Probe http://127.0.0.1:7331/api/status and print bridge projection');
|
|
1844
|
+
console.log(' enable Set engine.constellationBridge.enabled = true');
|
|
1845
|
+
console.log(' disable Set engine.constellationBridge.enabled = false');
|
|
1846
|
+
return;
|
|
1847
|
+
}
|
|
1848
|
+
|
|
1849
|
+
if (rest.length > 0) {
|
|
1850
|
+
console.error(`error: unexpected arguments after bridge ${subcmd}: ${rest.join(' ')}`);
|
|
1851
|
+
process.exit(2);
|
|
1852
|
+
}
|
|
1853
|
+
|
|
1854
|
+
if (subcmd === 'enable' || subcmd === 'disable') {
|
|
1855
|
+
const enabled = subcmd === 'enable';
|
|
1856
|
+
const { previous, current } = bridge.setBridgeEnabled(enabled);
|
|
1857
|
+
const verb = previous === current ? 'already' : 'now';
|
|
1858
|
+
console.log(`bridge: ${verb} ${current ? 'enabled' : 'disabled'}`);
|
|
1859
|
+
console.log(' config: engine.constellationBridge.enabled = ' + current);
|
|
1860
|
+
if (!previous && current) {
|
|
1861
|
+
console.log(' note: Constellation-side bridge must also be enabled to project state.');
|
|
1862
|
+
}
|
|
1863
|
+
return;
|
|
1864
|
+
}
|
|
1865
|
+
|
|
1866
|
+
if (subcmd === 'status') {
|
|
1867
|
+
const config = getConfig();
|
|
1868
|
+
const enabled = bridge.isBridgeEnabled(config);
|
|
1869
|
+
console.log(`bridge: ${enabled ? 'enabled' : 'disabled'}`);
|
|
1870
|
+
console.log(' config: engine.constellationBridge.enabled = ' + enabled);
|
|
1871
|
+
const marker = bridge.readBridgeMarker();
|
|
1872
|
+
if (!marker) {
|
|
1873
|
+
console.log(' marker: no Constellation agent has registered yet');
|
|
1874
|
+
console.log(` (expected at ${bridge.CONSTELLATION_BRIDGE_MARKER_PATH})`);
|
|
1875
|
+
} else {
|
|
1876
|
+
console.log(` last seen: ${bridge.formatRelativeAge(marker.lastSeenAt)} (${marker.lastSeenAt})`);
|
|
1877
|
+
if (marker.agentVersion) console.log(` agent version: ${marker.agentVersion}`);
|
|
1878
|
+
if (marker.source) console.log(` source: ${marker.source}`);
|
|
1879
|
+
}
|
|
1880
|
+
return;
|
|
1881
|
+
}
|
|
1882
|
+
|
|
1883
|
+
if (subcmd === 'health') {
|
|
1884
|
+
const http = require('http');
|
|
1885
|
+
const req = http.get(
|
|
1886
|
+
{ hostname: '127.0.0.1', port: 7331, path: '/api/status', timeout: 5000 },
|
|
1887
|
+
(res) => {
|
|
1888
|
+
let body = '';
|
|
1889
|
+
res.setEncoding('utf8');
|
|
1890
|
+
res.on('data', (chunk) => { body += chunk; });
|
|
1891
|
+
res.on('end', () => {
|
|
1892
|
+
if (res.statusCode !== 200) {
|
|
1893
|
+
console.error(`error: dashboard returned HTTP ${res.statusCode}`);
|
|
1894
|
+
process.exit(1);
|
|
1895
|
+
}
|
|
1896
|
+
let parsed;
|
|
1897
|
+
try { parsed = JSON.parse(body); }
|
|
1898
|
+
catch (e) {
|
|
1899
|
+
console.error(`error: dashboard response was not JSON: ${e.message}`);
|
|
1900
|
+
process.exit(1);
|
|
1901
|
+
}
|
|
1902
|
+
const projection = bridge.projectStatusForBridge(parsed);
|
|
1903
|
+
if (!projection) {
|
|
1904
|
+
console.error('error: dashboard /api/status returned an unexpected shape');
|
|
1905
|
+
process.exit(1);
|
|
1906
|
+
}
|
|
1907
|
+
console.log('bridge: dashboard reachable on http://127.0.0.1:7331');
|
|
1908
|
+
console.log(' projection (same fields the Constellation bridge would read):');
|
|
1909
|
+
for (const [k, v] of Object.entries(projection)) {
|
|
1910
|
+
console.log(` ${k}: ${v === null ? '(unknown)' : v}`);
|
|
1911
|
+
}
|
|
1912
|
+
});
|
|
1913
|
+
}
|
|
1914
|
+
);
|
|
1915
|
+
req.on('timeout', () => {
|
|
1916
|
+
req.destroy(new Error('connect timeout'));
|
|
1917
|
+
});
|
|
1918
|
+
req.on('error', (err) => {
|
|
1919
|
+
if (err.code === 'ECONNREFUSED') {
|
|
1920
|
+
console.error('error: dashboard not running on :7331 — start it with `minions dash`');
|
|
1921
|
+
} else {
|
|
1922
|
+
console.error(`error: ${err.message}`);
|
|
1923
|
+
}
|
|
1924
|
+
process.exit(1);
|
|
1925
|
+
});
|
|
1926
|
+
return;
|
|
1927
|
+
}
|
|
1928
|
+
|
|
1929
|
+
console.error(`Unknown bridge subcommand: ${subcmd}`);
|
|
1930
|
+
console.error(BRIDGE_USAGE);
|
|
1931
|
+
process.exit(2);
|
|
1779
1932
|
}
|
|
1780
1933
|
};
|
|
1781
1934
|
|
|
@@ -1799,4 +1952,8 @@ module.exports = {
|
|
|
1799
1952
|
_readDispatchPid: readDispatchPid,
|
|
1800
1953
|
_normalizeSessionBranch: normalizeSessionBranch,
|
|
1801
1954
|
_dispatchSessionBranch: dispatchSessionBranch,
|
|
1955
|
+
// W-mpcyvff6000pf828 (#2653) — heartbeat writer + factory exported for tests
|
|
1956
|
+
_writeHeartbeatNow: writeHeartbeatNow,
|
|
1957
|
+
_createHeartbeatInterval: createHeartbeatInterval,
|
|
1958
|
+
_HEARTBEAT_INTERVAL_MS: HEARTBEAT_INTERVAL_MS,
|
|
1802
1959
|
};
|
package/engine/dispatch.js
CHANGED
|
@@ -343,6 +343,7 @@ function isRetryableFailureReason(reason = '', failureClass = '') {
|
|
|
343
343
|
const neverRetry = new Set([
|
|
344
344
|
FAILURE_CLASS.CONFIG_ERROR,
|
|
345
345
|
FAILURE_CLASS.PERMISSION_BLOCKED,
|
|
346
|
+
FAILURE_CLASS.AUTH, // W-mpcuc8i80003a7b3 — git/network credential failure; mechanical retry won't fix missing az / GCM creds
|
|
346
347
|
FAILURE_CLASS.WORKTREE_PREFLIGHT, // pre-spawn worktree validation — recompute will produce the same failure
|
|
347
348
|
FAILURE_CLASS.INVALID_KEEP_PROCESSES_WORKDIR, // W-mp6k7ywi000fa33c — keep-pids cwd is not a real git worktree; re-running won't fix the structural issue
|
|
348
349
|
FAILURE_CLASS.INVALID_KEEP_PROCESSES_SCHEMA, // W-mp7i902u000l991f — keep-pids.json failed shape validation; re-running with the same wrong file won't fix it
|
|
@@ -653,6 +654,7 @@ function completeDispatch(id, result = DISPATCH_RESULT.SUCCESS, reason = '', res
|
|
|
653
654
|
[FAILURE_CLASS.OUT_OF_CONTEXT]: 'context window exhausted',
|
|
654
655
|
[FAILURE_CLASS.CONFIG_ERROR]: 'configuration error',
|
|
655
656
|
[FAILURE_CLASS.PERMISSION_BLOCKED]: 'permission or auth failure',
|
|
657
|
+
[FAILURE_CLASS.AUTH]: 'ADO/git authentication failed (missing or expired credentials)',
|
|
656
658
|
[FAILURE_CLASS.WORKTREE_PREFLIGHT]: 'worktree preflight rejected (nested in project root or rootDir collapsed to drive root)',
|
|
657
659
|
[FAILURE_CLASS.INVALID_KEEP_PROCESSES_WORKDIR]: 'keep_processes cwd is not a real git worktree (rerun in a `git worktree add` directory)',
|
|
658
660
|
[FAILURE_CLASS.INVALID_KEEP_PROCESSES_SCHEMA]: 'keep-pids.json failed shape validation (wrong keys/types/values — see inbox alert for the canonical shape)',
|
|
@@ -821,6 +823,7 @@ function cleanDispatchEntries(matchFn) {
|
|
|
821
823
|
let removed = 0;
|
|
822
824
|
const pidsToKill = [];
|
|
823
825
|
const filesToDelete = [];
|
|
826
|
+
const dispatchDirsToRemove = [];
|
|
824
827
|
try {
|
|
825
828
|
mutateJsonFileLocked(dispatchPath, (dispatch) => {
|
|
826
829
|
for (const queue of ['pending', 'active', 'completed']) {
|
|
@@ -829,17 +832,26 @@ function cleanDispatchEntries(matchFn) {
|
|
|
829
832
|
if (queue === 'active') {
|
|
830
833
|
for (const d of dispatch[queue]) {
|
|
831
834
|
if (!matchFn(d)) continue;
|
|
832
|
-
//
|
|
833
|
-
//
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
835
|
+
// P-f6-tmp-toctou: prefer the dispatch's recorded tmpDir. Fall
|
|
836
|
+
// back to legacy flat layout for active dispatches that pre-date
|
|
837
|
+
// the per-dispatch-dir layout. shared.findDispatchPidFile honors
|
|
838
|
+
// the same resolution order so we agree on which PID to kill.
|
|
839
|
+
const pidPath = shared.findDispatchPidFile(d);
|
|
840
|
+
if (pidPath) {
|
|
841
|
+
try {
|
|
842
|
+
const pid = parseInt(fs.readFileSync(pidPath, 'utf8').trim());
|
|
843
|
+
if (pid) pidsToKill.push(pid);
|
|
844
|
+
} catch { /* PID file may not exist */ }
|
|
845
|
+
}
|
|
846
|
+
if (d.tmpDir && shared.validateDispatchTmpDir(d.tmpDir)) {
|
|
847
|
+
dispatchDirsToRemove.push(d.tmpDir);
|
|
848
|
+
} else {
|
|
849
|
+
// Legacy individual-file cleanup for pre-migration entries.
|
|
850
|
+
filesToDelete.push(path.join(tmpDir, `pid-${d.id}.pid`));
|
|
851
|
+
filesToDelete.push(path.join(tmpDir, `prompt-${d.id}.md`));
|
|
852
|
+
filesToDelete.push(path.join(tmpDir, `sysprompt-${d.id}.md`));
|
|
853
|
+
filesToDelete.push(path.join(tmpDir, `sysprompt-${d.id}.md.tmp`));
|
|
854
|
+
}
|
|
843
855
|
}
|
|
844
856
|
}
|
|
845
857
|
dispatch[queue] = dispatch[queue].filter(d => !matchFn(d));
|
|
@@ -863,6 +875,9 @@ function cleanDispatchEntries(matchFn) {
|
|
|
863
875
|
for (const fp of filesToDelete) {
|
|
864
876
|
try { fs.unlinkSync(fp); } catch { /* may not exist */ }
|
|
865
877
|
}
|
|
878
|
+
for (const dir of dispatchDirsToRemove) {
|
|
879
|
+
shared.removeDispatchTmpDir(dir);
|
|
880
|
+
}
|
|
866
881
|
return removed;
|
|
867
882
|
}
|
|
868
883
|
|