@yemi33/minions 0.1.1984 → 0.1.1986

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -136,6 +136,12 @@ class Worker {
136
136
  this.killed = false;
137
137
  this.spawnError = null;
138
138
  this.firstSystemPromptSent = false;
139
+ // In-flight spawn+initialize+session/new promise. Set by getSession()
140
+ // before the worker is registered in _tabs, cleared after the handshake
141
+ // settles. Racing getSession() callers await this to avoid the
142
+ // "warm-reuse path returns sessionId=null while init is still pending"
143
+ // hang on first message of a freshly-warmed tab (W-mpd45blx00072f04).
144
+ this.initPromise = null;
139
145
  }
140
146
 
141
147
  // ── Spawn + initialize handshake ────────────────────────────────────────
@@ -499,6 +505,31 @@ async function getSession({ tabId, model, effort, mcpServers, systemPromptHash,
499
505
  // 'cold-spawn' — fresh proc + initialize + session/new
500
506
  let lifecycle = 'warm-reuse';
501
507
 
508
+ if (worker) {
509
+ // W-mpd45blx00072f04: if the existing worker is still mid-init (warm
510
+ // fired but session/new hasn't resolved yet), await the in-flight init
511
+ // BEFORE evaluating warm-reuse / newSession / cold-spawn — otherwise we
512
+ // return a SessionHandle with sessionId=null and the caller's first
513
+ // session/prompt fires with a null sessionId, causing every subsequent
514
+ // session/update notification to be dropped by _handleMessage's
515
+ // sessionId-match guard. User-visible symptom: first message on a
516
+ // freshly-warmed CC tab hangs (no chunks streamed, eventual onDone
517
+ // with empty text).
518
+ if (worker.initPromise) {
519
+ try {
520
+ await worker.initPromise;
521
+ } catch (err) {
522
+ // Warm init failed (e.g., auth). The originating call deletes the _tabs
523
+ // entry for tabId and closes the worker in its own catch handler, which may
524
+ // not have run yet. Surface the same error to this caller so the
525
+ // dashboard's spawn-failed path runs instead of hanging.
526
+ throw err;
527
+ }
528
+ // Init succeeded (a failure rethrows above). Re-read in case the tab's entry was removed or replaced while we were awaiting.
529
+ worker = _tabs.get(tabId) || null;
530
+ }
531
+ }
532
+
502
533
  if (worker) {
503
534
  if (worker.killed) {
504
535
  _tabs.delete(tabId);
@@ -533,8 +564,24 @@ async function getSession({ tabId, model, effort, mcpServers, systemPromptHash,
533
564
  tabId, model, effort, mcpServers, mcpServersHash, systemPromptHash, cwd,
534
565
  });
535
566
  _tabs.set(tabId, worker);
567
+ // Set initPromise BEFORE awaiting so concurrent getSession() callers
568
+ // landing during the spawn+initialize+session/new round-trip can detect
569
+ // and await it (W-mpd45blx00072f04). Clear on settle so callers that
570
+ // arrive AFTER init succeeds skip the no-op await. Attach the clear
571
+ // handler as both success+failure listeners (not .finally()) so the
572
+ // chained promise has a rejection handler and doesn't surface as an
573
+ // unhandled rejection when init throws.
574
+ const initPromise = worker._spawnAndInit();
575
+ worker.initPromise = initPromise;
576
+ const clearInit = () => {
577
+ // Only clear if we're still the active promise — defensive against
578
+ // a future refactor that calls _spawnAndInit twice for the same
579
+ // Worker (current code path never does).
580
+ if (worker.initPromise === initPromise) worker.initPromise = null;
581
+ };
582
+ initPromise.then(clearInit, clearInit);
536
583
  try {
537
- await worker._spawnAndInit();
584
+ await initPromise;
538
585
  } catch (err) {
539
586
  _tabs.delete(tabId);
540
587
  try { worker.close(); } catch { /* already torn down */ }
package/engine/cleanup.js CHANGED
@@ -273,35 +273,42 @@ function _killProcessInWorktree(dir, activeProcesses, activeIds) {
273
273
  log('info', `Killed orphaned process for dispatch ${id} before worktree removal`);
274
274
  }
275
275
 
276
- // Check PID files in engine/tmp/ — only kill if no active dispatch matches
276
+ // Check PID files in engine/tmp/ — both legacy flat layout and per-dispatch
277
+ // dirs (P-f6-tmp-toctou). Only kill if no active dispatch matches.
277
278
  try {
278
- const tmpDir = path.join(ENGINE_DIR, 'tmp');
279
- for (const f of fs.readdirSync(tmpDir)) {
280
- if (!f.startsWith('pid-') || !f.endsWith('.pid')) continue;
281
- const pidFileName = f.replace(/^pid-/, '').replace(/\.pid$/, '');
282
- if (!dirLower.includes(pidFileName.slice(-8))) continue;
279
+ shared.forEachPidFile((pidFilePath, fileName, layout) => {
280
+ const pidFileName = fileName.replace(/^pid-/, '').replace(/\.pid$/, '');
281
+ if (!dirLower.includes(pidFileName.slice(-8))) return;
283
282
  // Verify this PID file's dispatch is not active
284
283
  let isActive = false;
285
284
  for (const id of activeIds) { if (pidFileName.includes(id.slice(-8))) { isActive = true; break; } }
286
- if (isActive) continue; // still active — do not kill
287
- const pid = parseInt(fs.readFileSync(path.join(tmpDir, f), 'utf8').trim(), 10);
285
+ if (isActive) return; // still active — do not kill
286
+ let pid;
287
+ try { pid = parseInt(fs.readFileSync(pidFilePath, 'utf8').trim(), 10); }
288
+ catch { return; }
288
289
  if (pid > 0) {
289
290
  // Verify the PID still belongs to a Minions runtime process before killing.
290
291
  // The shared helper inspects the PID's full command line for `claude` /
291
292
  // `copilot` so a recycled PID running an unrelated process is skipped.
292
293
  try {
293
294
  if (process.platform === 'win32') {
294
- if (!shared.isProcessCommandLineMatchingAgent(pid)) continue;
295
+ if (!shared.isProcessCommandLineMatchingAgent(pid)) return;
295
296
  exec(`taskkill /F /T /PID ${pid}`, { stdio: 'pipe', timeout: 5000, windowsHide: true });
296
297
  } else {
297
- if (!shared.isProcessCommandLineMatchingAgent(pid)) continue;
298
+ if (!shared.isProcessCommandLineMatchingAgent(pid)) return;
298
299
  try { process.kill(-pid, 'SIGKILL'); } catch { process.kill(pid, 'SIGKILL'); }
299
300
  }
300
- log('info', `Killed orphaned PID ${pid} (${f}) before worktree removal`);
301
+ log('info', `Killed orphaned PID ${pid} (${fileName}, ${layout}) before worktree removal`);
301
302
  } catch {} // process may already be dead
302
303
  }
303
- try { fs.unlinkSync(path.join(tmpDir, f)); } catch {}
304
- }
304
+ if (layout === 'dispatch-dir') {
305
+ // Remove the entire per-dispatch dir — its remaining sidecars are
306
+ // orphans of the same dead process.
307
+ try { shared.removeDispatchTmpDir(path.dirname(pidFilePath)); } catch {}
308
+ } else {
309
+ try { fs.unlinkSync(pidFilePath); } catch {}
310
+ }
311
+ });
305
312
  } catch {} // tmp dir may not exist
306
313
  }
307
314
 
@@ -313,9 +320,35 @@ async function runCleanup(config, verbose = false) {
313
320
  let cleaned = { tempFiles: 0, liveOutputs: 0, worktrees: 0, zombies: 0 };
314
321
 
315
322
  // 1. Clean stale temp prompt/sysprompt files and orphaned safeWrite .tmp.* files (older than 1 hour)
323
+ // P-f6-tmp-toctou: also sweep abandoned per-dispatch dirs (engine/tmp/dispatch-*),
324
+ // and recurse into them so leftover prompt/sysprompt sidecars from crashed
325
+ // dispatches don't accumulate.
316
326
  const oneHourAgo = Date.now() - 3600000;
317
327
  const tmpDir = path.join(ENGINE_DIR, 'tmp');
318
328
  const scanDirs = [ENGINE_DIR, ...(fs.existsSync(tmpDir) ? [tmpDir] : [])];
329
+ // Discover dispatch-* dirs under engine/tmp/ and scan their contents too.
330
+ if (fs.existsSync(tmpDir)) {
331
+ try {
332
+ for (const entry of fs.readdirSync(tmpDir, { withFileTypes: true })) {
333
+ if (!entry.isDirectory()) continue;
334
+ if (!entry.name.startsWith('dispatch-')) continue;
335
+ const full = path.join(tmpDir, entry.name);
336
+ if (!shared.validateDispatchTmpDir(full)) continue;
337
+ scanDirs.push(full);
338
+ }
339
+ } catch { /* tmp dir may be empty/missing */ }
340
+ }
341
+ // Collect the tmp dirs referenced by pending/active dispatch entries so the
342
+ // stale-dispatch-dir sweep below can skip them.
343
+ const activeDispatchTmpDirs = new Set();
344
+ try {
345
+ const dispatch = getDispatch();
346
+ for (const queue of ['pending', 'active']) {
347
+ for (const e of dispatch[queue] || []) {
348
+ if (e?.tmpDir) activeDispatchTmpDirs.add(path.resolve(e.tmpDir));
349
+ }
350
+ }
351
+ } catch { /* dispatch.json may be empty */ }
319
352
  for (const dir of scanDirs) {
320
353
  // Each directory gets its own try-catch so one failure doesn't abort other directories (Bug #27)
321
354
  let dirEntries;
@@ -341,6 +374,22 @@ async function runCleanup(config, verbose = false) {
341
374
  }
342
375
  }
343
376
  }
377
+ // Reap per-dispatch tmp dirs older than one hour that no pending/active entry references.
378
+ cleaned.dispatchDirs = 0;
379
+ if (fs.existsSync(tmpDir)) {
380
+ try {
381
+ for (const entry of fs.readdirSync(tmpDir, { withFileTypes: true })) {
382
+ if (!entry.isDirectory() || !entry.name.startsWith('dispatch-')) continue;
383
+ const full = path.join(tmpDir, entry.name);
384
+ if (!shared.validateDispatchTmpDir(full)) continue;
385
+ if (activeDispatchTmpDirs.has(path.resolve(full))) continue;
386
+ let stat;
387
+ try { stat = fs.statSync(full); } catch { continue; }
388
+ if (stat.mtimeMs >= oneHourAgo) continue;
389
+ if (shared.removeDispatchTmpDir(full)) cleaned.dispatchDirs++;
390
+ }
391
+ } catch { /* sweep is best-effort */ }
392
+ }
344
393
 
345
394
  // 2. Clean live-output.log and live-output-prev.log for idle agents (not currently working)
346
395
  for (const [agentId] of Object.entries(config.agents || {})) {
@@ -1111,31 +1160,31 @@ async function runCleanup(config, verbose = false) {
1111
1160
  } catch (e) { log('warn', 'cap cooldowns: ' + e.message); }
1112
1161
 
1113
1162
  // 12. Clean stale PID files — remove PID files whose process is no longer running
1163
+ // P-f6-tmp-toctou: walks BOTH legacy flat layout and per-dispatch-dir layout
1164
+ // via shared.forEachPidFile.
1114
1165
  cleaned.pidFiles = 0;
1115
1166
  try {
1116
1167
  const tmpDir = path.join(ENGINE_DIR, 'tmp');
1117
1168
  if (fs.existsSync(tmpDir)) {
1118
- let pidDirEntries;
1119
- try { pidDirEntries = fs.readdirSync(tmpDir); } catch { pidDirEntries = []; }
1120
1169
  const activePids = new Set();
1121
1170
  for (const [, info] of activeProcesses) {
1122
1171
  if (info.proc?.pid) activePids.add(String(info.proc.pid));
1123
1172
  }
1124
- for (const f of pidDirEntries) {
1125
- if (!f.startsWith('pid-') || !f.endsWith('.pid')) continue;
1126
- const fp = path.join(tmpDir, f);
1173
+ shared.forEachPidFile((pidFilePath, fileName, layout) => {
1127
1174
  try {
1128
- const pidStr = fs.readFileSync(fp, 'utf8').trim();
1175
+ const pidStr = fs.readFileSync(pidFilePath, 'utf8').trim();
1129
1176
  // Skip if actively tracked
1130
- if (activePids.has(pidStr)) continue;
1177
+ if (activePids.has(pidStr)) return;
1131
1178
  // Check if file is stale (>1 hour old)
1132
- const stat = fs.statSync(fp);
1179
+ const stat = fs.statSync(pidFilePath);
1133
1180
  if (stat.mtimeMs < oneHourAgo) {
1134
- fs.unlinkSync(fp);
1181
+ fs.unlinkSync(pidFilePath);
1135
1182
  cleaned.pidFiles++;
1183
+ // For dispatch-dir layout, the empty/stale dispatch dir gets reaped
1184
+ // by the stale-dispatch-dir sweep in step 1 on a later cleanup run.
1136
1185
  }
1137
1186
  } catch { /* cleanup */ }
1138
- }
1187
+ });
1139
1188
  }
1140
1189
  } catch (e) { log('warn', 'clean stale PID files: ' + e.message); }
1141
1190
 
package/engine/cli.js CHANGED
@@ -57,7 +57,9 @@ function dispatchSafeId(itemId) {
57
57
  }
58
58
 
59
59
  function readDispatchPid(itemId) {
60
- const pidFile = path.join(ENGINE_DIR, 'tmp', `pid-${dispatchSafeId(itemId)}.pid`);
60
+ // P-f6-tmp-toctou: prefer per-dispatch dir layout, fall back to legacy flat.
61
+ const pidFile = shared.findDispatchPidFile(itemId);
62
+ if (!pidFile) return null;
61
63
  let raw;
62
64
  try { raw = fs.readFileSync(pidFile, 'utf8').trim(); }
63
65
  catch { return null; }
@@ -90,6 +92,32 @@ function summarizeActiveDispatchPids(activeItems = []) {
90
92
  return summary;
91
93
  }
92
94
 
95
+ // W-mpcyvff6000pf828 (#2653) — Heartbeat writer decoupled from tickInner.
96
+ // `writeHeartbeatNow` mirrors the legacy in-tick write (synchronous, fast,
97
+ // non-throwing) but is driven from the main start loop on a 15s setInterval
98
+ // instead. See engine.js#tickInner for the load-bearing rationale comment.
99
+ const HEARTBEAT_INTERVAL_MS = 15000;
100
+
101
+ function writeHeartbeatNow() {
102
+ try {
103
+ // Synchronous callback inside the lock — just `c.heartbeat = Date.now()`,
104
+ // no awaits, no other state mutation. Mirrors mutateControl's contract.
105
+ shared.mutateControl(c => { c.heartbeat = Date.now(); return c; });
106
+ } catch (err) {
107
+ try { engine().log('warn', `write heartbeat: ${err.message}`); }
108
+ catch { /* during shutdown logger can be torn down — silent */ }
109
+ }
110
+ }
111
+
112
+ // Factory used by tests to drive the heartbeat interval without spinning up
113
+ // the full engine start loop. Returns the underlying timer so callers (the
114
+ // real start handler in production, tests in unit/engine-heartbeat.test.js)
115
+ // can clearInterval(...) on shutdown. Defaults to the production cadence.
116
+ function createHeartbeatInterval(intervalMs = HEARTBEAT_INTERVAL_MS, writer = writeHeartbeatNow) {
117
+ return setInterval(writer, intervalMs);
118
+ }
119
+
120
+
93
121
  function createControlOwner(pid = process.pid) {
94
122
  return { pid, ownerToken: `${pid}-${shared.uid()}` };
95
123
  }
@@ -162,6 +190,7 @@ const CLI_COMMAND_DOCS = Object.freeze({
162
190
  doctor: { args: '', summary: 'Check prerequisites and runtime health' },
163
191
  config: { args: 'set-cli <R> [--model M]', summary: 'Persist defaultCli/defaultModel without starting' },
164
192
  pr: { args: 'comment <repo> <prNumber> --agent <id> --kind <k> [--wi <id>] [--body-file <f>|--body <text>]', summary: 'Post a marker-prepended PR comment via gh' },
193
+ bridge: { args: 'status|health|enable|disable', summary: 'Constellation bridge: toggle and inspect the read-only cross-repo feed' },
165
194
  });
166
195
 
167
196
  function formatCliCommandHelpLines() {
@@ -872,6 +901,18 @@ const commands = {
872
901
  // Start tick loop
873
902
  const tickTimer = setInterval(() => e.tick(), interval);
874
903
 
904
+ // W-mpcyvff6000pf828 (#2653) — Heartbeat decoupled from tickInner.
905
+ // The dashboard flips the engine badge to STALE when
906
+ // `Date.now() - control.heartbeat > 120000`; tying the write to tickInner
907
+ // meant any legitimately slow tick (cold runtime spawn, sequential
908
+ // ADO/gh polls, slow worktree create) blocked the next heartbeat and
909
+ // surfaced a healthy engine as crashed. We now write every 15s on a
910
+ // dedicated interval — 8× headroom vs the 120s threshold even under
911
+ // event-loop pressure, and orders of magnitude under TICK_TIMEOUT_MS so
912
+ // a hung tick still looks distinct from a wedged event loop.
913
+ writeHeartbeatNow(); // prime control.heartbeat immediately
914
+ const heartbeatTimer = setInterval(writeHeartbeatNow, HEARTBEAT_INTERVAL_MS);
915
+
875
916
  // Fast poll: check steering every 1s (lightweight — just fs.stat per agent)
876
917
  // and wakeup signals every 1s (control.json read)
877
918
  const { checkSteering } = require('./timeout');
@@ -948,6 +989,7 @@ const commands = {
948
989
  console.log(`\n${signal} received — initiating graceful shutdown...`);
949
990
  clearInterval(tickTimer);
950
991
  clearInterval(fastPollTimer);
992
+ clearInterval(heartbeatTimer);
951
993
  for (const f of _watchedFiles) { try { fs.unwatchFile(f); } catch { /* cleanup */ } }
952
994
  const stoppingAt = e.ts();
953
995
  const stoppingWrite = markControlStoppingForOwner(controlOwner, stoppingAt);
@@ -1528,13 +1570,10 @@ const commands = {
1528
1570
  const shared = require('./shared');
1529
1571
 
1530
1572
  // Kill processes via PID files (expensive — outside dispatch lock).
1531
- // PID files live in engine/tmp/ (see engine/spawn-agent.js:220 derived from
1532
- // the prompt-<id>.md sidecar path that engine.js builds in engine/tmp/).
1533
- // Reading from ENGINE_DIR directly is a no-op: spawn-agent never writes there.
1534
- const pidDir = path.join(ENGINE_DIR, 'tmp');
1535
- const pidFiles = shared.safeReadDir(pidDir).filter(f => f.startsWith('pid-') && f.endsWith('.pid'));
1536
- for (const f of pidFiles) {
1537
- const pidPath = path.join(pidDir, f);
1573
+ // PID files live in engine/tmp/ in both the legacy flat layout
1574
+ // (`pid-<id>.pid`) and per-dispatch dirs (`dispatch-<id>-XXX/pid-<id>.pid`)
1575
+ // post-P-f6-tmp-toctou. shared.forEachPidFile visits both.
1576
+ shared.forEachPidFile((pidPath, fileName, layout) => {
1538
1577
  const raw = safeRead(pidPath).trim();
1539
1578
  // Guard against falsy/zero/NaN PIDs. Empty pid files would resolve to
1540
1579
  // Number('') === 0, and process.kill(0) on POSIX targets the entire
@@ -1542,13 +1581,17 @@ const commands = {
1542
1581
  let pidNum = NaN;
1543
1582
  try { pidNum = shared.validatePid(raw); } catch { /* invalid — skip */ }
1544
1583
  if (pidNum > 0) {
1545
- try { process.kill(pidNum); console.log(`Killed process ${pidNum} (${f})`); }
1584
+ try { process.kill(pidNum); console.log(`Killed process ${pidNum} (${fileName})`); }
1546
1585
  catch { console.log(`Process ${pidNum} already dead`); }
1547
1586
  } else {
1548
- console.log(`Skipping ${f}: invalid or empty PID`);
1587
+ console.log(`Skipping ${fileName}: invalid or empty PID`);
1549
1588
  }
1550
- try { fs.unlinkSync(pidPath); } catch { /* may not exist */ }
1551
- }
1589
+ if (layout === 'dispatch-dir') {
1590
+ try { shared.removeDispatchTmpDir(path.dirname(pidPath)); } catch { /* may not exist */ }
1591
+ } else {
1592
+ try { fs.unlinkSync(pidPath); } catch { /* may not exist */ }
1593
+ }
1594
+ });
1552
1595
 
1553
1596
  // Atomically read and clear dispatch.active (locked read-modify-write)
1554
1597
  let killed = [];
@@ -1776,6 +1819,116 @@ const commands = {
1776
1819
  console.error(`error: ${e.message}`);
1777
1820
  process.exit(1);
1778
1821
  }
1822
+ },
1823
+
1824
+ // `minions bridge <subcommand>` — Constellation read-only bridge surface.
1825
+ // Owns the on/off flag and the marker-file projection used by the
1826
+ // Constellation agent. Bridge polling logic itself lives in the
1827
+ // Constellation repo (P-wi1-bridge-readonly).
1828
+ //
1829
+ // Subcommands:
1830
+ // status Print enabled flag + last-seen Constellation agent timestamp.
1831
+ // health Probe http://127.0.0.1:7331/api/status and print the same
1832
+ // curated projection the Constellation bridge would consume.
1833
+ // enable Atomically set engine.constellationBridge.enabled = true.
1834
+ // disable Atomically set engine.constellationBridge.enabled = false.
1835
+ bridge(subcmd, ...rest) {
1836
+ const bridge = require('./bridge');
1837
+ const BRIDGE_USAGE = 'Usage: minions bridge <status|health|enable|disable>';
1838
+
1839
+ if (!subcmd || subcmd === 'help' || subcmd === '--help' || subcmd === '-h') {
1840
+ console.log(BRIDGE_USAGE);
1841
+ console.log('');
1842
+ console.log(' status Show enabled flag + last-seen Constellation agent timestamp');
1843
+ console.log(' health Probe http://127.0.0.1:7331/api/status and print bridge projection');
1844
+ console.log(' enable Set engine.constellationBridge.enabled = true');
1845
+ console.log(' disable Set engine.constellationBridge.enabled = false');
1846
+ return;
1847
+ }
1848
+
1849
+ if (rest.length > 0) {
1850
+ console.error(`error: unexpected arguments after bridge ${subcmd}: ${rest.join(' ')}`);
1851
+ process.exit(2);
1852
+ }
1853
+
1854
+ if (subcmd === 'enable' || subcmd === 'disable') {
1855
+ const enabled = subcmd === 'enable';
1856
+ const { previous, current } = bridge.setBridgeEnabled(enabled);
1857
+ const verb = previous === current ? 'already' : 'now';
1858
+ console.log(`bridge: ${verb} ${current ? 'enabled' : 'disabled'}`);
1859
+ console.log(' config: engine.constellationBridge.enabled = ' + current);
1860
+ if (!previous && current) {
1861
+ console.log(' note: Constellation-side bridge must also be enabled to project state.');
1862
+ }
1863
+ return;
1864
+ }
1865
+
1866
+ if (subcmd === 'status') {
1867
+ const config = getConfig();
1868
+ const enabled = bridge.isBridgeEnabled(config);
1869
+ console.log(`bridge: ${enabled ? 'enabled' : 'disabled'}`);
1870
+ console.log(' config: engine.constellationBridge.enabled = ' + enabled);
1871
+ const marker = bridge.readBridgeMarker();
1872
+ if (!marker) {
1873
+ console.log(' marker: no Constellation agent has registered yet');
1874
+ console.log(` (expected at ${bridge.CONSTELLATION_BRIDGE_MARKER_PATH})`);
1875
+ } else {
1876
+ console.log(` last seen: ${bridge.formatRelativeAge(marker.lastSeenAt)} (${marker.lastSeenAt})`);
1877
+ if (marker.agentVersion) console.log(` agent version: ${marker.agentVersion}`);
1878
+ if (marker.source) console.log(` source: ${marker.source}`);
1879
+ }
1880
+ return;
1881
+ }
1882
+
1883
+ if (subcmd === 'health') {
1884
+ const http = require('http');
1885
+ const req = http.get(
1886
+ { hostname: '127.0.0.1', port: 7331, path: '/api/status', timeout: 5000 },
1887
+ (res) => {
1888
+ let body = '';
1889
+ res.setEncoding('utf8');
1890
+ res.on('data', (chunk) => { body += chunk; });
1891
+ res.on('end', () => {
1892
+ if (res.statusCode !== 200) {
1893
+ console.error(`error: dashboard returned HTTP ${res.statusCode}`);
1894
+ process.exit(1);
1895
+ }
1896
+ let parsed;
1897
+ try { parsed = JSON.parse(body); }
1898
+ catch (e) {
1899
+ console.error(`error: dashboard response was not JSON: ${e.message}`);
1900
+ process.exit(1);
1901
+ }
1902
+ const projection = bridge.projectStatusForBridge(parsed);
1903
+ if (!projection) {
1904
+ console.error('error: dashboard /api/status returned an unexpected shape');
1905
+ process.exit(1);
1906
+ }
1907
+ console.log('bridge: dashboard reachable on http://127.0.0.1:7331');
1908
+ console.log(' projection (same fields the Constellation bridge would read):');
1909
+ for (const [k, v] of Object.entries(projection)) {
1910
+ console.log(` ${k}: ${v === null ? '(unknown)' : v}`);
1911
+ }
1912
+ });
1913
+ }
1914
+ );
1915
+ req.on('timeout', () => {
1916
+ req.destroy(new Error('connect timeout'));
1917
+ });
1918
+ req.on('error', (err) => {
1919
+ if (err.code === 'ECONNREFUSED') {
1920
+ console.error('error: dashboard not running on :7331 — start it with `minions dash`');
1921
+ } else {
1922
+ console.error(`error: ${err.message}`);
1923
+ }
1924
+ process.exit(1);
1925
+ });
1926
+ return;
1927
+ }
1928
+
1929
+ console.error(`Unknown bridge subcommand: ${subcmd}`);
1930
+ console.error(BRIDGE_USAGE);
1931
+ process.exit(2);
1779
1932
  }
1780
1933
  };
1781
1934
 
@@ -1799,4 +1952,8 @@ module.exports = {
1799
1952
  _readDispatchPid: readDispatchPid,
1800
1953
  _normalizeSessionBranch: normalizeSessionBranch,
1801
1954
  _dispatchSessionBranch: dispatchSessionBranch,
1955
+ // W-mpcyvff6000pf828 (#2653) — heartbeat writer + factory exported for tests
1956
+ _writeHeartbeatNow: writeHeartbeatNow,
1957
+ _createHeartbeatInterval: createHeartbeatInterval,
1958
+ _HEARTBEAT_INTERVAL_MS: HEARTBEAT_INTERVAL_MS,
1802
1959
  };
@@ -343,6 +343,7 @@ function isRetryableFailureReason(reason = '', failureClass = '') {
343
343
  const neverRetry = new Set([
344
344
  FAILURE_CLASS.CONFIG_ERROR,
345
345
  FAILURE_CLASS.PERMISSION_BLOCKED,
346
+ FAILURE_CLASS.AUTH, // W-mpcuc8i80003a7b3 — git/network credential failure; mechanical retry won't fix missing az / GCM creds
346
347
  FAILURE_CLASS.WORKTREE_PREFLIGHT, // pre-spawn worktree validation — recompute will produce the same failure
347
348
  FAILURE_CLASS.INVALID_KEEP_PROCESSES_WORKDIR, // W-mp6k7ywi000fa33c — keep-pids cwd is not a real git worktree; re-running won't fix the structural issue
348
349
  FAILURE_CLASS.INVALID_KEEP_PROCESSES_SCHEMA, // W-mp7i902u000l991f — keep-pids.json failed shape validation; re-running with the same wrong file won't fix it
@@ -653,6 +654,7 @@ function completeDispatch(id, result = DISPATCH_RESULT.SUCCESS, reason = '', res
653
654
  [FAILURE_CLASS.OUT_OF_CONTEXT]: 'context window exhausted',
654
655
  [FAILURE_CLASS.CONFIG_ERROR]: 'configuration error',
655
656
  [FAILURE_CLASS.PERMISSION_BLOCKED]: 'permission or auth failure',
657
+ [FAILURE_CLASS.AUTH]: 'ADO/git authentication failed (missing or expired credentials)',
656
658
  [FAILURE_CLASS.WORKTREE_PREFLIGHT]: 'worktree preflight rejected (nested in project root or rootDir collapsed to drive root)',
657
659
  [FAILURE_CLASS.INVALID_KEEP_PROCESSES_WORKDIR]: 'keep_processes cwd is not a real git worktree (rerun in a `git worktree add` directory)',
658
660
  [FAILURE_CLASS.INVALID_KEEP_PROCESSES_SCHEMA]: 'keep-pids.json failed shape validation (wrong keys/types/values — see inbox alert for the canonical shape)',
@@ -821,6 +823,7 @@ function cleanDispatchEntries(matchFn) {
821
823
  let removed = 0;
822
824
  const pidsToKill = [];
823
825
  const filesToDelete = [];
826
+ const dispatchDirsToRemove = [];
824
827
  try {
825
828
  mutateJsonFileLocked(dispatchPath, (dispatch) => {
826
829
  for (const queue of ['pending', 'active', 'completed']) {
@@ -829,17 +832,26 @@ function cleanDispatchEntries(matchFn) {
829
832
  if (queue === 'active') {
830
833
  for (const d of dispatch[queue]) {
831
834
  if (!matchFn(d)) continue;
832
- // PID files live in engine/tmp/ (see engine/spawn-agent.js:220 — derived
833
- // from the prompt-<id>.md path that engine.js builds in engine/tmp/).
834
- const pidFile = path.join(tmpDir, `pid-${d.id}.pid`);
835
- try {
836
- const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim());
837
- if (pid) pidsToKill.push(pid);
838
- } catch { /* PID file may not exist */ }
839
- filesToDelete.push(pidFile);
840
- filesToDelete.push(path.join(tmpDir, `prompt-${d.id}.md`));
841
- filesToDelete.push(path.join(tmpDir, `sysprompt-${d.id}.md`));
842
- filesToDelete.push(path.join(tmpDir, `sysprompt-${d.id}.md.tmp`));
835
+ // P-f6-tmp-toctou: prefer the dispatch's recorded tmpDir. Fall
836
+ // back to legacy flat layout for active dispatches that pre-date
837
+ // the per-dispatch-dir layout. shared.findDispatchPidFile honors
838
+ // the same resolution order so we agree on which PID to kill.
839
+ const pidPath = shared.findDispatchPidFile(d);
840
+ if (pidPath) {
841
+ try {
842
+ const pid = parseInt(fs.readFileSync(pidPath, 'utf8').trim());
843
+ if (pid) pidsToKill.push(pid);
844
+ } catch { /* PID file may not exist */ }
845
+ }
846
+ if (d.tmpDir && shared.validateDispatchTmpDir(d.tmpDir)) {
847
+ dispatchDirsToRemove.push(d.tmpDir);
848
+ } else {
849
+ // Legacy individual-file cleanup for pre-migration entries.
850
+ filesToDelete.push(path.join(tmpDir, `pid-${d.id}.pid`));
851
+ filesToDelete.push(path.join(tmpDir, `prompt-${d.id}.md`));
852
+ filesToDelete.push(path.join(tmpDir, `sysprompt-${d.id}.md`));
853
+ filesToDelete.push(path.join(tmpDir, `sysprompt-${d.id}.md.tmp`));
854
+ }
843
855
  }
844
856
  }
845
857
  dispatch[queue] = dispatch[queue].filter(d => !matchFn(d));
@@ -863,6 +875,9 @@ function cleanDispatchEntries(matchFn) {
863
875
  for (const fp of filesToDelete) {
864
876
  try { fs.unlinkSync(fp); } catch { /* may not exist */ }
865
877
  }
878
+ for (const dir of dispatchDirsToRemove) {
879
+ shared.removeDispatchTmpDir(dir);
880
+ }
866
881
  return removed;
867
882
  }
868
883