@yemi33/minions 0.1.1658 → 0.1.1660

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.1.1660 (2026-05-01)
4
+
5
+ ### Fixes
6
+ - less rigid agent orphan detection
7
+
8
+ ## 0.1.1659 (2026-05-01)
9
+
10
+ ### Other
11
+ - Harden runtime state writes
12
+
3
13
  ## 0.1.1658 (2026-05-01)
4
14
 
5
15
  ### Fixes
package/dashboard.js CHANGED
@@ -31,7 +31,7 @@ const dispatchMod = require('./engine/dispatch');
31
31
  const steering = require('./engine/steering');
32
32
  const os = require('os');
33
33
 
34
- const { safeRead, safeReadDir, safeWrite, safeJson, safeJsonObj, safeJsonArr, safeUnlink, mutateJsonFileLocked, mutateWorkItems, getProjects: _getProjects, DONE_STATUSES, WI_STATUS, WORK_TYPE, reopenWorkItem } = shared;
34
+ const { safeRead, safeReadDir, safeWrite, safeJson, safeJsonObj, safeJsonArr, safeUnlink, mutateJsonFileLocked, mutateControl, mutateCooldowns, mutateWorkItems, getProjects: _getProjects, DONE_STATUSES, WI_STATUS, WORK_TYPE, reopenWorkItem } = shared;
35
35
  const { getAgents, getAgentDetail, getPrdInfo, getWorkItems, getDispatchQueue,
36
36
  getSkills, getInbox, getNotesWithMeta, getPullRequests,
37
37
  getEngineLog, getMetrics, getKnowledgeBaseEntries, timeSince,
@@ -2116,12 +2116,10 @@ const { cleanDispatchEntries } = require('./engine/dispatch');
2116
2116
  // ── Engine Restart Helpers (used by watchdog + API) ─────────────────────────
2117
2117
 
2118
2118
  function spawnEngine() {
2119
- const controlPath = path.join(ENGINE_DIR, 'control.json');
2120
2119
  // Don't pre-write 'stopped' — let the new engine process own its state transition.
2121
2120
  // The engine start code already handles state:'running' with a dead PID gracefully.
2122
2121
  // Only set restarted_at + clear stale pid so dashboard shows the restart timestamp.
2123
- const control = safeJson(controlPath) || {};
2124
- safeWrite(controlPath, { ...control, pid: null, restarted_at: new Date().toISOString() });
2122
+ mutateControl(control => ({ ...control, pid: null, restarted_at: new Date().toISOString() }));
2125
2123
  const { spawn: cpSpawn } = require('child_process');
2126
2124
  const childEnv = { ...process.env };
2127
2125
  for (const key of Object.keys(childEnv)) {
@@ -2413,12 +2411,10 @@ const server = http.createServer(async (req, res) => {
2413
2411
  }, { defaultValue: { pending: [], active: [], completed: [] } });
2414
2412
  } catch (e) { console.error('dispatch cleanup:', e.message); }
2415
2413
  try {
2416
- const cooldownPath = path.join(MINIONS_DIR, 'engine', 'cooldowns.json');
2417
- const cooldowns = safeJsonObj(cooldownPath);
2418
- if (cooldowns[dispatchKey]) {
2414
+ mutateCooldowns(cooldowns => {
2419
2415
  delete cooldowns[dispatchKey];
2420
- safeWrite(cooldownPath, cooldowns);
2421
- }
2416
+ return cooldowns;
2417
+ });
2422
2418
  } catch (e) { console.error('cooldown cleanup:', e.message); }
2423
2419
 
2424
2420
  return jsonReply(res, 200, { ok: true, id, rematerialized: true });
@@ -2463,12 +2459,10 @@ const server = http.createServer(async (req, res) => {
2463
2459
 
2464
2460
  // Clear cooldown so item isn't blocked by exponential backoff
2465
2461
  try {
2466
- const cooldownPath = path.join(MINIONS_DIR, 'engine', 'cooldowns.json');
2467
- const cooldowns = safeJsonObj(cooldownPath);
2468
- if (cooldowns[dispatchKey]) {
2462
+ mutateCooldowns(cooldowns => {
2469
2463
  delete cooldowns[dispatchKey];
2470
- safeWrite(cooldownPath, cooldowns);
2471
- }
2464
+ return cooldowns;
2465
+ });
2472
2466
  } catch (e) { console.error('cooldown cleanup:', e.message); }
2473
2467
 
2474
2468
  return jsonReply(res, 200, { ok: true, id });
@@ -2514,13 +2508,12 @@ const server = http.createServer(async (req, res) => {
2514
2508
 
2515
2509
  // Clean cooldown entries so item can be re-created immediately
2516
2510
  try {
2517
- const cooldownPath = path.join(MINIONS_DIR, 'engine', 'cooldowns.json');
2518
- const cooldowns = safeJsonObj(cooldownPath);
2519
- let cleaned = false;
2520
- for (const key of Object.keys(cooldowns)) {
2521
- if (key.includes(id)) { delete cooldowns[key]; cleaned = true; }
2522
- }
2523
- if (cleaned) safeWrite(cooldownPath, cooldowns);
2511
+ mutateCooldowns(cooldowns => {
2512
+ for (const key of Object.keys(cooldowns)) {
2513
+ if (key.includes(id)) delete cooldowns[key];
2514
+ }
2515
+ return cooldowns;
2516
+ });
2524
2517
  } catch (e) { console.error('cooldown cleanup:', e.message); }
2525
2518
 
2526
2519
  // Reset PRD item status so it doesn't stay 'dispatched' with no work item (#779)
@@ -2579,13 +2572,12 @@ const server = http.createServer(async (req, res) => {
2579
2572
 
2580
2573
  // Clean cooldown entries
2581
2574
  try {
2582
- const cooldownPath = path.join(MINIONS_DIR, 'engine', 'cooldowns.json');
2583
- const cooldowns = safeJsonObj(cooldownPath);
2584
- let cleaned = false;
2585
- for (const key of Object.keys(cooldowns)) {
2586
- if (key.includes(id)) { delete cooldowns[key]; cleaned = true; }
2587
- }
2588
- if (cleaned) safeWrite(cooldownPath, cooldowns);
2575
+ mutateCooldowns(cooldowns => {
2576
+ for (const key of Object.keys(cooldowns)) {
2577
+ if (key.includes(id)) delete cooldowns[key];
2578
+ }
2579
+ return cooldowns;
2580
+ });
2589
2581
  } catch (e) { console.error('cooldown cleanup on cancel:', e.message); }
2590
2582
 
2591
2583
  invalidateStatusCache();
@@ -2704,12 +2696,10 @@ const server = http.createServer(async (req, res) => {
2704
2696
  }, { defaultValue: { pending: [], active: [], completed: [] } });
2705
2697
  } catch (e) { console.error('dispatch cleanup on reopen:', e.message); }
2706
2698
  try {
2707
- const cooldownPath = path.join(MINIONS_DIR, 'engine', 'cooldowns.json');
2708
- const cooldowns = safeJsonObj(cooldownPath);
2709
- if (cooldowns[dispatchKey]) {
2699
+ mutateCooldowns(cooldowns => {
2710
2700
  delete cooldowns[dispatchKey];
2711
- safeWrite(cooldownPath, cooldowns);
2712
- }
2701
+ return cooldowns;
2702
+ });
2713
2703
  } catch (e) { console.error('cooldown cleanup on reopen:', e.message); }
2714
2704
 
2715
2705
  invalidateStatusCache();
@@ -6446,10 +6436,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6446
6436
 
6447
6437
  // Engine
6448
6438
  { method: 'POST', path: '/api/engine/wakeup', desc: 'Trigger immediate engine tick via control.json signal', handler: async (req, res) => {
6449
- const controlPath = path.join(MINIONS_DIR, 'engine', 'control.json');
6450
- const control = shared.safeJson(controlPath) || {};
6451
- control._wakeupAt = Date.now();
6452
- shared.safeWrite(controlPath, control);
6439
+ shared.mutateControl(control => ({ ...control, _wakeupAt: Date.now() }));
6453
6440
  return jsonReply(res, 200, { ok: true, message: 'Wakeup signal sent' });
6454
6441
  }},
6455
6442
  { method: 'POST', path: '/api/engine/restart', desc: 'Force-kill engine and restart immediately', handler: handleEngineRestart },
package/engine/cleanup.js CHANGED
@@ -9,7 +9,7 @@ const shared = require('./shared');
9
9
  const queries = require('./queries');
10
10
 
11
11
  const { exec, execSilent, log, ts, ENGINE_DEFAULTS } = shared;
12
- const { safeJson, safeWrite, safeReadDir, mutateWorkItems, mutateJsonFileLocked, getProjects, projectWorkItemsPath, projectPrPath,
12
+ const { safeJson, safeWrite, safeReadDir, mutateCooldowns, mutateWorkItems, mutateJsonFileLocked, getProjects, projectWorkItemsPath, projectPrPath,
13
13
  sanitizeBranch, KB_CATEGORIES } = shared;
14
14
  const { getDispatch, getAgentStatus } = queries;
15
15
 
@@ -675,9 +675,9 @@ function runCleanup(config, verbose = false) {
675
675
  entries.sort((a, b) => (b[1].timestamp || 0) - (a[1].timestamp || 0));
676
676
  const keep = Object.fromEntries(entries.slice(0, COOLDOWN_CAP));
677
677
  cleaned.cooldowns = entries.length - COOLDOWN_CAP;
678
- safeWrite(cooldownPath, keep);
678
+ mutateCooldowns(() => keep);
679
679
  } else if (dirty) {
680
- safeWrite(cooldownPath, cooldowns);
680
+ mutateCooldowns(() => cooldowns);
681
681
  }
682
682
  }
683
683
  } catch (e) { log('warn', 'cap cooldowns: ' + e.message); }
package/engine/cli.js CHANGED
@@ -6,7 +6,7 @@
6
6
  const fs = require('fs');
7
7
  const path = require('path');
8
8
  const shared = require('./shared');
9
- const { safeRead, safeJson, safeWrite, mutateWorkItems, ts, WI_STATUS, WORK_TYPE, PLAN_STATUS, PR_STATUS, DISPATCH_RESULT } = shared;
9
+ const { safeRead, safeJson, safeWrite, mutateControl, mutateWorkItems, ts, WI_STATUS, WORK_TYPE, PLAN_STATUS, PR_STATUS, DISPATCH_RESULT } = shared;
10
10
  const queries = require('./queries');
11
11
  const { getConfig, getControl, getDispatch, getAgentStatus,
12
12
  MINIONS_DIR, ENGINE_DIR, AGENTS_DIR, PLANS_DIR, PRD_DIR, CONTROL_PATH, DISPATCH_PATH } = queries;
@@ -312,7 +312,7 @@ const commands = {
312
312
  }
313
313
  let codeCommit = null;
314
314
  try { codeCommit = require('child_process').execSync('git rev-parse --short HEAD', { cwd: path.resolve(__dirname, '..'), encoding: 'utf8', timeout: 5000, windowsHide: true }).trim(); } catch {}
315
- safeWrite(CONTROL_PATH, { state: 'running', pid: process.pid, started_at: e.ts(), codeVersion, codeCommit });
315
+ mutateControl(() => ({ state: 'running', pid: process.pid, started_at: e.ts(), codeVersion, codeCommit }));
316
316
  // Keep .minions-version in sync so `minions version` stays accurate after git pulls
317
317
  if (codeVersion) {
318
318
  try { fs.writeFileSync(path.join(shared.MINIONS_DIR, '.minions-version'), codeVersion); } catch {}
@@ -599,7 +599,10 @@ const commands = {
599
599
  const ctrl = getControl();
600
600
  if (ctrl._wakeupAt && Date.now() - ctrl._wakeupAt < 5000) {
601
601
  delete ctrl._wakeupAt;
602
- safeWrite(CONTROL_PATH, ctrl);
602
+ mutateControl((control) => {
603
+ delete control._wakeupAt;
604
+ return control;
605
+ });
603
606
  e.tick();
604
607
  }
605
608
  }, 1000);
@@ -669,11 +672,11 @@ const commands = {
669
672
  clearInterval(fastPollTimer);
670
673
  if (teamsInboxTimer) clearInterval(teamsInboxTimer);
671
674
  for (const f of _watchedFiles) { try { fs.unwatchFile(f); } catch { /* cleanup */ } }
672
- safeWrite(CONTROL_PATH, { state: 'stopping', pid: process.pid, stopping_at: e.ts() });
675
+ mutateControl(() => ({ state: 'stopping', pid: process.pid, stopping_at: e.ts() }));
673
676
  e.log('info', `Graceful shutdown initiated (${signal})`);
674
677
 
675
678
  if (e.activeProcesses.size === 0) {
676
- safeWrite(CONTROL_PATH, { state: 'stopped', stopped_at: e.ts() });
679
+ mutateControl(() => ({ state: 'stopped', stopped_at: e.ts() }));
677
680
  e.log('info', 'Graceful shutdown complete (no active agents)');
678
681
  shared.flushLogs(); // drain buffered log entries before exit
679
682
  console.log('No active agents — stopped.');
@@ -687,7 +690,7 @@ const commands = {
687
690
  const poll = setInterval(() => {
688
691
  if (e.activeProcesses.size === 0) {
689
692
  clearInterval(poll);
690
- safeWrite(CONTROL_PATH, { state: 'stopped', stopped_at: e.ts() });
693
+ mutateControl(() => ({ state: 'stopped', stopped_at: e.ts() }));
691
694
  e.log('info', 'Graceful shutdown complete (all agents finished)');
692
695
  shared.flushLogs(); // drain buffered log entries before exit
693
696
  console.log('All agents finished — stopped.');
@@ -695,7 +698,7 @@ const commands = {
695
698
  }
696
699
  if (Date.now() >= deadline) {
697
700
  clearInterval(poll);
698
- safeWrite(CONTROL_PATH, { state: 'stopped', stopped_at: e.ts() });
701
+ mutateControl(() => ({ state: 'stopped', stopped_at: e.ts() }));
699
702
  e.log('warn', `Graceful shutdown timed out after ${timeout / 1000}s with ${e.activeProcesses.size} agent(s) still active`);
700
703
  shared.flushLogs(); // drain buffered log entries before exit
701
704
  console.log(`Shutdown timeout (${timeout / 1000}s) — force exiting with ${e.activeProcesses.size} agent(s) still running.`);
@@ -742,14 +745,14 @@ const commands = {
742
745
  if (control.pid && control.pid !== process.pid) {
743
746
  try { process.kill(control.pid); } catch { /* process may be dead */ }
744
747
  }
745
- safeWrite(CONTROL_PATH, { state: 'stopped', stopped_at: e.ts() });
748
+ mutateControl(() => ({ state: 'stopped', stopped_at: e.ts() }));
746
749
  e.log('info', 'Engine stopped');
747
750
  console.log('Engine stopped.');
748
751
  },
749
752
 
750
753
  pause() {
751
754
  const e = engine();
752
- safeWrite(CONTROL_PATH, { state: 'paused', paused_at: e.ts() });
755
+ mutateControl(() => ({ state: 'paused', paused_at: e.ts() }));
753
756
  e.log('info', 'Engine paused');
754
757
  console.log('Engine paused. Run `node .minions/engine.js resume` to resume.');
755
758
  },
@@ -761,7 +764,7 @@ const commands = {
761
764
  console.log('Engine is already running.');
762
765
  return;
763
766
  }
764
- safeWrite(CONTROL_PATH, { state: 'running', resumed_at: e.ts() });
767
+ mutateControl(() => ({ state: 'running', resumed_at: e.ts() }));
765
768
  e.log('info', 'Engine resumed');
766
769
  console.log('Engine resumed.');
767
770
  },
@@ -932,7 +935,7 @@ const commands = {
932
935
  dispatch() {
933
936
  const control = getControl();
934
937
  if (control.state === 'running' && isEngineProcessAlive(control)) {
935
- safeWrite(CONTROL_PATH, { ...control, _wakeupAt: Date.now() });
938
+ mutateControl((c) => ({ ...c, _wakeupAt: Date.now() }));
936
939
  console.log(`Dispatch wakeup requested from running engine (PID ${control.pid}).`);
937
940
  return;
938
941
  }
@@ -7,7 +7,7 @@ const path = require('path');
7
7
  const shared = require('./shared');
8
8
  const queries = require('./queries');
9
9
 
10
- const { safeJson, safeWrite, log, ENGINE_DEFAULTS } = shared;
10
+ const { safeJson, mutateCooldowns, log, ENGINE_DEFAULTS } = shared;
11
11
  const { ENGINE_DIR } = queries;
12
12
 
13
13
  /**
@@ -37,6 +37,7 @@ function _truncateContextEntry(entry, maxBytes) {
37
37
 
38
38
  const COOLDOWN_PATH = path.join(ENGINE_DIR, 'cooldowns.json');
39
39
  const dispatchCooldowns = new Map(); // key → { timestamp, failures }
40
+ let _lastDiskCooldownKeys = new Set();
40
41
 
41
42
  function loadCooldowns() {
42
43
  const saved = safeJson(COOLDOWN_PATH);
@@ -48,6 +49,7 @@ function loadCooldowns() {
48
49
  dispatchCooldowns.set(k, v);
49
50
  }
50
51
  }
52
+ _lastDiskCooldownKeys = new Set(dispatchCooldowns.keys());
51
53
  log('info', `Loaded ${dispatchCooldowns.size} cooldowns from disk`);
52
54
  }
53
55
 
@@ -57,27 +59,35 @@ function saveCooldowns() {
57
59
  if (_cooldownWriteTimer) clearTimeout(_cooldownWriteTimer);
58
60
  _cooldownWriteTimer = setTimeout(() => {
59
61
  _cooldownWriteTimer = null;
60
- // Prune expired entries (>24h) before saving
61
- const now = Date.now();
62
- for (const [k, v] of dispatchCooldowns) {
63
- if (now - v.timestamp > 24 * 60 * 60 * 1000) dispatchCooldowns.delete(k);
64
- }
65
- // Trim pendingContexts arrays before writing to prevent bloat
66
- const cap = ENGINE_DEFAULTS.maxPendingContexts;
67
- const entryLimit = ENGINE_DEFAULTS.maxPendingContextEntryBytes;
68
- for (const [, v] of dispatchCooldowns) {
69
- if (Array.isArray(v.pendingContexts)) {
70
- if (v.pendingContexts.length > cap) {
71
- v.pendingContexts = v.pendingContexts.slice(-cap);
72
- }
73
- // Also truncate oversized individual entries — #1167 showed
74
- // 20 entries × 25 MB each still produced a 500 MB cooldowns.json.
75
- v.pendingContexts = v.pendingContexts.map(e => _truncateContextEntry(e, entryLimit));
76
- }
77
- }
78
- const obj = Object.fromEntries(dispatchCooldowns);
79
62
  try {
80
- safeWrite(COOLDOWN_PATH, obj);
63
+ mutateCooldowns((diskCooldowns) => {
64
+ for (const key of Array.from(dispatchCooldowns.keys())) {
65
+ if (_lastDiskCooldownKeys.has(key) && !Object.prototype.hasOwnProperty.call(diskCooldowns, key)) {
66
+ dispatchCooldowns.delete(key);
67
+ }
68
+ }
69
+ // Prune expired entries (>24h) before saving
70
+ const now = Date.now();
71
+ for (const [k, v] of dispatchCooldowns) {
72
+ if (now - v.timestamp > 24 * 60 * 60 * 1000) dispatchCooldowns.delete(k);
73
+ }
74
+ // Trim pendingContexts arrays before writing to prevent bloat
75
+ const cap = ENGINE_DEFAULTS.maxPendingContexts;
76
+ const entryLimit = ENGINE_DEFAULTS.maxPendingContextEntryBytes;
77
+ for (const [, v] of dispatchCooldowns) {
78
+ if (Array.isArray(v.pendingContexts)) {
79
+ if (v.pendingContexts.length > cap) {
80
+ v.pendingContexts = v.pendingContexts.slice(-cap);
81
+ }
82
+ // Also truncate oversized individual entries — #1167 showed
83
+ // 20 entries × 25 MB each still produced a 500 MB cooldowns.json.
84
+ v.pendingContexts = v.pendingContexts.map(e => _truncateContextEntry(e, entryLimit));
85
+ }
86
+ }
87
+ const obj = Object.fromEntries(dispatchCooldowns);
88
+ _lastDiskCooldownKeys = new Set(Object.keys(obj));
89
+ return obj;
90
+ });
81
91
  } catch (err) {
82
92
  log('warn', `saveCooldowns failed writing ${COOLDOWN_PATH}: ${err.message}`);
83
93
  }
@@ -1,5 +1,5 @@
1
1
  {
2
2
  "runtime": "copilot",
3
3
  "models": null,
4
- "cachedAt": "2026-05-01T05:22:39.907Z"
4
+ "cachedAt": "2026-05-01T05:58:00.505Z"
5
5
  }
package/engine/queries.js CHANGED
@@ -101,7 +101,21 @@ function timeSince(ms) {
101
101
  }
102
102
 
103
103
  function readJsonNoRestore(filePath) {
104
- try { return JSON.parse(fs.readFileSync(filePath, 'utf8')); } catch { return null; }
104
+ let raw;
105
+ try {
106
+ raw = fs.readFileSync(filePath, 'utf8');
107
+ } catch (e) {
108
+ if (e && e.code !== 'ENOENT') {
109
+ console.warn(`[queries] failed to read ${_relativeStatePath(filePath)}: ${e.message}`);
110
+ }
111
+ return null;
112
+ }
113
+ try {
114
+ return JSON.parse(raw);
115
+ } catch (e) {
116
+ console.warn(`[queries] corrupt JSON in ${_relativeStatePath(filePath)}: ${e.message}`);
117
+ return null;
118
+ }
105
119
  }
106
120
 
107
121
  // ── Core State Readers ──────────────────────────────────────────────────────
package/engine/shared.js CHANGED
@@ -8,6 +8,9 @@ const path = require('path');
8
8
  const crypto = require('crypto');
9
9
 
10
10
  const MINIONS_DIR = process.env.MINIONS_TEST_DIR || path.resolve(__dirname, '..');
11
+ const ENGINE_DIR = path.join(MINIONS_DIR, 'engine');
12
+ const CONTROL_PATH = path.join(ENGINE_DIR, 'control.json');
13
+ const COOLDOWNS_PATH = path.join(ENGINE_DIR, 'cooldowns.json');
11
14
  const PR_LINKS_PATH = path.join(MINIONS_DIR, 'engine', 'pr-links.json');
12
15
  const PINNED_ITEMS_PATH = path.join(MINIONS_DIR, 'engine', 'kb-pins.json');
13
16
  const LOG_PATH = path.join(MINIONS_DIR, 'engine', 'log.json');
@@ -428,6 +431,20 @@ function mutateJsonFileLocked(filePath, mutateFn, {
428
431
  }, { retries, retryBackoffMs });
429
432
  }
430
433
 
434
+ function mutateControl(mutator) {
435
+ return mutateJsonFileLocked(CONTROL_PATH, (data) => {
436
+ if (!data || typeof data !== 'object' || Array.isArray(data)) data = {};
437
+ return mutator(data) || data;
438
+ }, { defaultValue: { state: 'stopped', pid: null }, skipWriteIfUnchanged: true });
439
+ }
440
+
441
+ function mutateCooldowns(mutator) {
442
+ return mutateJsonFileLocked(COOLDOWNS_PATH, (data) => {
443
+ if (!data || typeof data !== 'object' || Array.isArray(data)) data = {};
444
+ return mutator(data) || data;
445
+ }, { defaultValue: {}, skipWriteIfUnchanged: true });
446
+ }
447
+
431
448
  /**
432
449
  * Generate a unique ID suffix: timestamp + 4 random chars.
433
450
  * Use for filenames that could collide (dispatch IDs, temp files, etc.)
@@ -704,7 +721,19 @@ const ENGINE_DEFAULTS = {
704
721
  inboxConsolidateThreshold: 5,
705
722
  agentTimeout: 18000000, // 5h
706
723
  heartbeatTimeout: 300000, // 5min — stale-orphan grace after process tracking is lost
707
- heartbeatTimeouts: {}, // optional per-type stale-orphan overrides; merged at runtime (see timeout.js)
724
+ // Per-type stale-orphan overrides (merged with config.engine.heartbeatTimeouts at runtime; see timeout.js).
725
+ // Heavy work types (multi-file edits, builds, test suites, full verify cycles) routinely go quiet for
726
+ // longer than the 5-min default when the engine has lost their tracked handle (e.g. across an engine
727
+ // restart). We give them headroom up to a typical build+tests cycle. Short-running types
728
+ // (decompose / meeting / etc.) keep the 5-min default by simply not appearing here.
729
+ heartbeatTimeouts: {
730
+ implement: 900000, // 15min — refactors, multi-file edits, builds
731
+ 'implement:large': 900000, // 15min — same class of work, larger scope
732
+ fix: 900000, // 15min — fix runs often include builds + retries
733
+ test: 900000, // 15min — build-and-test against existing PR
734
+ verify: 900000, // 15min — full project verification cycle
735
+ plan: 600000, // 10min — research-heavy
736
+ },
708
737
  maxTurns: 100,
709
738
  worktreeCreateTimeout: 300000, // 5min for git worktree add on large Windows repos
710
739
  worktreeCreateRetries: 1, // retry once on transient timeout/lock races
@@ -768,7 +797,6 @@ const ENGINE_DEFAULTS = {
768
797
  copilotReasoningSummaries: false, // Copilot --enable-reasoning-summaries (Anthropic-family models only)
769
798
  maxBudgetUsd: undefined, // fleet USD ceiling for --max-budget-usd (per-agent override: agents.<id>.maxBudgetUsd). Honors 0 via ?? so a literal cap of $0 works
770
799
  disableModelDiscovery: false, // skip runtime.listModels() REST calls fleet-wide (settings UI falls back to free-text)
771
- heartbeatTimeouts: {},
772
800
  maxPendingContexts: 20, // cap pendingContexts arrays in cooldowns.json to prevent unbounded growth
773
801
  maxPendingContextEntryBytes: 256 * 1024, // 256 KB — cap each pendingContexts entry to prevent huge PR comments from bloating cooldowns.json
774
802
  maxDispatchPromptBytes: 1024 * 1024, // 1 MB — dispatch items with prompts larger than this sidecar to engine/contexts/ to prevent dispatch.json OOM (#1167)
@@ -2304,6 +2332,9 @@ function createThrottleTracker({ label, baseBackoffMs = 60000, maxBackoffMs = 32
2304
2332
 
2305
2333
  module.exports = {
2306
2334
  MINIONS_DIR,
2335
+ ENGINE_DIR,
2336
+ CONTROL_PATH,
2337
+ COOLDOWNS_PATH,
2307
2338
  PR_LINKS_PATH,
2308
2339
  PINNED_ITEMS_PATH,
2309
2340
  LOG_PATH,
@@ -2325,6 +2356,8 @@ module.exports = {
2325
2356
  assertStateFileSize,
2326
2357
  withFileLock,
2327
2358
  mutateJsonFileLocked,
2359
+ mutateControl,
2360
+ mutateCooldowns,
2328
2361
  mutateWorkItems,
2329
2362
  reopenWorkItem,
2330
2363
  mutatePullRequests,
package/engine/timeout.js CHANGED
@@ -9,7 +9,7 @@ const queries = require('./queries');
9
9
  const steering = require('./steering');
10
10
 
11
11
  const { safeRead, safeWrite, safeJson, mutateJsonFileLocked, getProjects, projectWorkItemsPath, log, ts,
12
- ENGINE_DEFAULTS, WI_STATUS, WORK_TYPE, DISPATCH_RESULT, AGENT_STATUS } = shared;
12
+ ENGINE_DEFAULTS, ENGINE_DIR, WI_STATUS, WORK_TYPE, DISPATCH_RESULT, AGENT_STATUS } = shared;
13
13
  const { getDispatch, getAgentStatus } = queries;
14
14
  const AGENTS_DIR = queries.AGENTS_DIR;
15
15
  const MINIONS_DIR = shared.MINIONS_DIR;
@@ -142,6 +142,23 @@ function isTrackedProcessAlive(procInfo) {
142
142
  }
143
143
  }
144
144
 
145
+ // Last-resort liveness check via the on-disk PID file (engine/tmp/pid-<safeId>.pid).
146
+ // Used by orphan detection to avoid false-positive kills when the engine has lost the
147
+ // tracked process handle (engine restart, never-tracked spawn, etc.) but the OS-level
148
+ // child process is still alive and healthy. The safeId here mirrors engine.js spawn
149
+ // (id.replace(/[:\\/*?"<>|]/g, '-')) — same pattern engine/cli.js uses to re-attach.
150
+ function isOsPidAliveForDispatch(itemId) {
151
+ const safeId = String(itemId || '').replace(/[:\\/*?"<>|]/g, '-');
152
+ const pidPath = path.join(ENGINE_DIR, 'tmp', `pid-${safeId}.pid`);
153
+ let raw;
154
+ try { raw = fs.readFileSync(pidPath, 'utf8'); }
155
+ catch { return false; }
156
+ const pid = parseInt(String(raw).trim(), 10);
157
+ if (!Number.isFinite(pid) || pid <= 0) return false;
158
+ try { process.kill(pid, 0); return true; }
159
+ catch { return false; }
160
+ }
161
+
145
162
  function checkTimeouts(config) {
146
163
  const activeProcesses = engine().activeProcesses;
147
164
  const engineRestartGraceUntil = engine().engineRestartGraceUntil;
@@ -335,6 +352,11 @@ function checkTimeouts(config) {
335
352
  } catch { /* ENOENT — keep default */ }
336
353
 
337
354
  if (!processAlive && silentMs > staleOrphanTimeout && (Date.now() > engineRestartGraceUntil || engineRestartGraceExempt?.has(item.id))) {
355
+ // Last-resort PID check: lost tracked handle but OS process may still be alive.
356
+ if (isOsPidAliveForDispatch(item.id)) {
357
+ log('info', `Orphan check: ${item.agent} (${item.id}) silent ${silentSec}s but OS PID is alive — keeping [${_logState}]`);
358
+ continue;
359
+ }
338
360
  // No tracked process AND no recent output past stale-orphan timeout AND (grace period expired OR confirmed-dead at restart) → orphaned
339
361
  log('warn', `Orphan detected: ${item.agent} (${item.id}) — no live process tracked, silent for ${silentSec}s [${_logState}]`);
340
362
  dispatch().updateAgentStatus(item.id, AGENT_STATUS.TIMED_OUT, `Orphaned — no process, silent for ${silentSec}s`);
@@ -424,4 +446,5 @@ module.exports = {
424
446
  checkTimeouts,
425
447
  checkSteering,
426
448
  checkIdleThreshold,
449
+ isOsPidAliveForDispatch,
427
450
  };
package/engine.js CHANGED
@@ -95,6 +95,7 @@ const safeRead = shared.safeRead;
95
95
  const safeWrite = shared.safeWrite;
96
96
  const safeUnlink = shared.safeUnlink;
97
97
  const mutateJsonFileLocked = shared.mutateJsonFileLocked;
98
+ const mutateControl = shared.mutateControl;
98
99
  const mutateWorkItems = shared.mutateWorkItems;
99
100
  const mutatePullRequests = shared.mutatePullRequests;
100
101
  const withFileLock = shared.withFileLock;
@@ -3557,7 +3558,7 @@ async function tickInner() {
3557
3558
  }
3558
3559
 
3559
3560
  // Write heartbeat so dashboard can detect stale engine
3560
- try { safeWrite(CONTROL_PATH, { ...control, heartbeat: Date.now() }); } catch (e) { log('warn', 'write heartbeat: ' + e.message); }
3561
+ try { mutateControl(c => ({ ...c, heartbeat: Date.now() })); } catch (e) { log('warn', 'write heartbeat: ' + e.message); }
3561
3562
 
3562
3563
  const config = getConfig();
3563
3564
  tickCount++;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.1658",
3
+ "version": "0.1.1660",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"