@yemi33/minions 0.1.1634 → 0.1.1635

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/engine/timeout.js CHANGED
@@ -1,6 +1,5 @@
1
1
  /**
2
- * engine/timeout.js — Timeout detection, steering, and idle threshold checks.
3
- * Extracted from engine.js for modularity. No logic changes.
2
+ * engine/timeout.js — Runtime timeout, stale-orphan cleanup, steering, and idle checks.
4
3
  */
5
4
 
6
5
  const fs = require('fs');
@@ -124,6 +123,28 @@ function checkSteering(config) {
124
123
 
125
124
  // ─── Timeout Checker ─────────────────────────────────────────────────────────
126
125
 
126
/**
 * Resolve the OS process id recorded for a tracked-process entry.
 *
 * Looks at `procInfo.proc.pid` first and falls back to `procInfo.pid`.
 * The value is coerced with `Number()`, so numeric strings are accepted.
 *
 * @param {object|null|undefined} procInfo - Entry from the active-process map.
 * @returns {number|null} A finite, positive pid, or `null` when none is recorded.
 */
function trackedProcessPid(procInfo) {
  const pid = Number(procInfo?.proc?.pid || procInfo?.pid || 0);
  return Number.isFinite(pid) && pid > 0 ? pid : null;
}

/**
 * Decide whether the process behind a tracked-process entry is still running.
 *
 * Checks, in order:
 *   1. No entry at all                          → not alive.
 *   2. Child handle reports an exit code        → not alive.
 *   3. Child handle reports a terminating signal → not alive (a signal-killed
 *      child keeps `exitCode === null`, so the exit-code check alone misses it).
 *   4. No usable pid                            → alive only if a handle exists
 *      and it has not been flagged `killed`.
 *   5. Otherwise probe the pid with signal 0.
 *
 * @param {object|null|undefined} procInfo - Entry from the active-process map.
 * @returns {boolean} `true` when the process appears to be running.
 */
function isTrackedProcessAlive(procInfo) {
  if (!procInfo) return false;

  const proc = procInfo.proc;
  if (proc) {
    const hasExitCode = Object.prototype.hasOwnProperty.call(proc, 'exitCode') && proc.exitCode !== null;
    if (hasExitCode) return false;
    // Terminated by a signal: exitCode stays null, signalCode carries the signal name.
    if (proc.signalCode !== null && proc.signalCode !== undefined) return false;
  }

  const pid = trackedProcessPid(procInfo);
  if (!pid) return !!proc && proc.killed !== true;

  try {
    // Signal 0 performs the existence/permission check without delivering a signal.
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM: the process exists but we are not allowed to signal it — still alive.
    // Anything else (ESRCH, invalid pid, ...) is treated as "not running".
    return err?.code === 'EPERM';
  }
}
147
+
127
148
  function checkTimeouts(config) {
128
149
  const activeProcesses = engine().activeProcesses;
129
150
  const engineRestartGraceUntil = engine().engineRestartGraceUntil;
@@ -132,10 +153,10 @@ function checkTimeouts(config) {
132
153
  const { runPostCompletionHooks } = require('./lifecycle');
133
154
 
134
155
  const timeout = config.engine?.agentTimeout || ENGINE_DEFAULTS.agentTimeout;
135
- const defaultHeartbeatTimeout = config.engine?.heartbeatTimeout || ENGINE_DEFAULTS.heartbeatTimeout;
156
+ const defaultStaleOrphanTimeout = config.engine?.heartbeatTimeout || ENGINE_DEFAULTS.heartbeatTimeout;
136
157
 
137
- // Per-type heartbeat timeouts: merge ENGINE_DEFAULTS ← config overrides
138
- const perTypeTimeouts = { ...ENGINE_DEFAULTS.heartbeatTimeouts, ...(config.engine?.heartbeatTimeouts || {}) };
158
+ // Optional per-type stale-orphan timeouts: merge ENGINE_DEFAULTS ← config overrides.
159
+ const perTypeStaleOrphanTimeouts = { ...ENGINE_DEFAULTS.heartbeatTimeouts, ...(config.engine?.heartbeatTimeouts || {}) };
139
160
 
140
161
  // 1. Check tracked processes for hard timeout (supports per-item deadline from fan-out)
141
162
  for (const [id, info] of activeProcesses.entries()) {
@@ -148,37 +169,32 @@ function checkTimeouts(config) {
148
169
  }
149
170
  }
150
171
 
151
- // 2. Heartbeat check — for ALL active dispatch items (catches orphans after engine restart)
152
- // Uses live-output.log mtime as heartbeat. If no output for heartbeatTimeout, agent is dead.
172
+ // 2. Stale-orphan check — for ALL active dispatch items (catches lost process handles after restart).
173
+ // Silence is not a failure for tracked live processes: long CLI commands can legitimately
174
+ // produce no stdout/stderr for extended periods.
153
175
  const dispatchData = getDispatch();
154
176
  const deadItems = [];
155
- const blockingAnnotations = new Map(); // id → { tool, silentMs, remainingMs } or null (clear)
177
+ const legacyAnnotationClears = new Set();
156
178
 
157
179
  for (const item of (dispatchData.active || [])) {
158
180
  if (!item.agent) continue;
159
181
 
160
- // Per-type heartbeat: look up work type from dispatch item, fall back to default
182
+ // Per-type stale-orphan timeout: look up work type from dispatch item, fall back to default.
161
183
  const workType = item.workType || item.meta?.item?.type;
162
- const heartbeatTimeout = (workType && perTypeTimeouts[workType]) || defaultHeartbeatTimeout;
184
+ const staleOrphanTimeout = (workType && perTypeStaleOrphanTimeouts[workType]) || defaultStaleOrphanTimeout;
163
185
 
164
- const hasProcess = activeProcesses.has(item.id);
186
+ const procInfo = activeProcesses.get(item.id);
187
+ const hasProcess = !!procInfo;
188
+ const processAlive = isTrackedProcessAlive(procInfo);
165
189
  const liveLogPath = path.join(AGENTS_DIR, item.agent, 'live-output.log');
166
190
  let lastActivity = item.started_at ? new Date(item.started_at).getTime() : 0;
167
191
 
168
- // For tracked processes, use realActivityMap (tracks actual agent stdout/stderr only,
169
- // NOT engine heartbeat writes). This prevents the feedback loop where engine heartbeat
170
- // writes to live-output.log reset the mtime that the timeout check reads (#724).
171
- const realActivityMap = engine().realActivityMap;
172
- if (hasProcess && realActivityMap?.has(item.id)) {
173
- lastActivity = Math.max(lastActivity, realActivityMap.get(item.id));
174
- } else {
175
- // Orphan case (no tracked process): use live-output.log mtime as fallback.
176
- // No heartbeat timer is running for orphans, so mtime is accurate.
177
- try {
178
- const stat = fs.statSync(liveLogPath);
179
- lastActivity = Math.max(lastActivity, stat.mtimeMs);
180
- } catch { /* optional */ }
181
- }
192
+ // live-output.log mtime is only used for stale-orphan cleanup and completion recovery.
193
+ // It is not used as an output-silence timeout for live tracked processes.
194
+ try {
195
+ const stat = fs.statSync(liveLogPath);
196
+ lastActivity = Math.max(lastActivity, stat.mtimeMs);
197
+ } catch { /* optional */ }
182
198
 
183
199
  const silentMs = Date.now() - lastActivity;
184
200
  const silentSec = Math.round(silentMs / 1000);
@@ -266,126 +282,26 @@ function checkTimeouts(config) {
266
282
  // code is known (#1792).
267
283
  } catch (e) { log('warn', 'output completion detection: ' + e.message); }
268
284
 
269
- // Resolve per-type heartbeat timeout: per-type map base heartbeatTimeout fallback
270
- const itemHeartbeat = perTypeTimeouts[item.type] || heartbeatTimeout;
271
-
272
- // Check if agent is in a blocking tool call (TaskOutput block:true, Bash with long timeout, etc.)
273
- // These tools produce no stdout for extended periods — don't kill them prematurely
274
- // Check for BOTH tracked and untracked processes (orphan case after engine restart)
275
- // Skip if agent already completed — blocking tool detection on stale tool calls
276
- // would extend the timeout indefinitely for dead agents (#716).
277
- let isBlocking = false;
278
- let blockingTimeout = itemHeartbeat;
279
- let blockingTool = '';
280
- if (silentMs > itemHeartbeat) {
281
- try {
282
- const liveLog = safeRead(liveLogPath);
283
- if (liveLog) {
284
- // If the output contains a result event or process-exit sentinel, the agent is done.
285
- // Don't extend timeout for stale blocking tool calls from before the result (#716).
286
- if (liveLog.includes('"type":"result"') || liveLog.includes('\n[process-exit]')) {
287
- // Agent completed but close event didn't fire — let orphan/hung detection handle it.
288
- // Don't set isBlocking — use base heartbeat timeout.
289
- } else {
290
- // Find the last tool_use call in the output — check if it's a known blocking tool.
291
- //
292
- // Lookback depth (1000 lines) is sized for the heartbeat-noise scenario from #1792:
293
- // a long-running Monitor / Bash / PowerShell call goes silent for 15+ minutes while
294
- // a cold Gradle build runs. During that silence the ENGINE writes a heartbeat line
295
- // every 30s (engine.js heartbeatTimer), so the live log accumulates ~120 heartbeat
296
- // lines per hour AFTER the original tool_use line. A 30-line lookback misses the
297
- // tool_use entirely, the detector treats the silence as non-blocking, and the
298
- // agent gets killed at heartbeatTimeout despite legitimately waiting on a
299
- // background process. 1000 lines covers ~8 hours of pure heartbeat noise — well
300
- // beyond Monitor's 30 min effective timeout floor.
301
- const lines = liveLog.split('\n');
302
- const TOOL_USE_LOOKBACK = 1000;
303
- for (let i = lines.length - 1; i >= Math.max(0, lines.length - TOOL_USE_LOOKBACK); i--) {
304
- const line = lines[i];
305
- if (!line.includes('"tool_use"')) continue;
306
- try {
307
- const parsed = JSON.parse(line);
308
- const toolUse = parsed?.message?.content?.find?.(c => c.type === 'tool_use');
309
- if (!toolUse) continue;
310
- const input = toolUse.input || {};
311
- const name = toolUse.name || '';
312
- // TaskOutput with block:true — waiting for a background task
313
- if (name === 'TaskOutput' && input.block === true) {
314
- const taskTimeout = input.timeout || 600000; // default 10min
315
- blockingTimeout = Math.max(itemHeartbeat, taskTimeout + 60000); // task timeout + 1min grace
316
- isBlocking = true;
317
- blockingTool = 'TaskOutput';
318
- }
319
- // Bash tool call — may be running a long build/install with no stdout
320
- if (name === 'Bash') {
321
- // Use explicit timeout if set, otherwise match Claude Code's actual Bash default (120s)
322
- const bashTimeout = input.timeout || 120000;
323
- blockingTimeout = Math.max(itemHeartbeat, bashTimeout + 60000);
324
- isBlocking = true;
325
- blockingTool = 'Bash';
326
- }
327
- // PowerShell tool call — Windows-native shell with same explicit-timeout
328
- // semantics as Bash (input.timeout, max 600s). Required for projects that
329
- // build via PowerShell on Windows (gradlew.bat, MSBuild, dotnet test) where
330
- // the cold-start phase produces no stdout for several minutes (#1786).
331
- if (name === 'PowerShell') {
332
- const psTimeout = input.timeout || 120000;
333
- blockingTimeout = Math.max(itemHeartbeat, psTimeout + 60000);
334
- isBlocking = true;
335
- blockingTool = 'PowerShell';
336
- }
337
- // Monitor tool call — blocks waiting for stdout-line notifications from a
338
- // background process started via Bash with run_in_background. Between
339
- // notifications the call produces no output, so the heartbeat monitor
340
- // must extend timeout. No fixed timeout on Monitor — match Agent (30min)
341
- // since both are inherently long-running waits (#1786).
342
- if (name === 'Monitor') {
343
- blockingTimeout = Math.max(itemHeartbeat, 1800000); // 30min for background process waits
344
- isBlocking = true;
345
- blockingTool = 'Monitor';
346
- }
347
- // Agent (subagent) tool call — parent waits silently for child to complete
348
- if (name === 'Agent') {
349
- blockingTimeout = Math.max(itemHeartbeat, 1800000); // 30min for subagents
350
- isBlocking = true;
351
- blockingTool = 'Agent';
352
- }
353
- break; // only check the most recent tool_use
354
- } catch { /* JSON parse — line may not be valid JSON */ }
355
- }
356
- if (isBlocking) {
357
- // Only log on transition — avoid spamming every tick while blocking persists
358
- if (!item._blockingToolCall) {
359
- log('info', `Agent ${item.agent} (${item.id}) is in a blocking tool call (${blockingTool}) — extended timeout to ${Math.round(blockingTimeout / 1000)}s (silent for ${silentSec}s)`, { event: 'blocking_tool_call_detected' });
360
- }
361
- blockingAnnotations.set(item.id, {
362
- tool: blockingTool,
363
- silentMs,
364
- remainingMs: Math.max(0, blockingTimeout - silentMs),
365
- });
366
- }
367
- } // close else
368
- } // close if (liveLog)
369
- } catch (e) { log('warn', 'blocking tool detection: ' + e.message); }
370
- }
371
- // Agent recovered from blocking state — clear annotation
372
- if (!isBlocking && item._blockingToolCall) {
373
- blockingAnnotations.set(item.id, null);
285
+ // Blocking tool annotations are no longer needed: live tracked processes are allowed to
286
+ // be quiet regardless of which command/tool is running.
287
+ if (item._blockingToolCall) {
288
+ legacyAnnotationClears.add(item.id);
374
289
  }
375
290
 
376
- const effectiveTimeout = isBlocking ? blockingTimeout : itemHeartbeat;
377
-
378
291
  // Skip recently-steered agents — they're being killed and re-spawned
379
- const procInfo = activeProcesses.get(item.id);
380
292
  if (procInfo?._steeringAt && Date.now() - procInfo._steeringAt < 60000) continue;
381
293
 
382
- // Capture live-output.log file state for orphan/hung diagnostics
294
+ if (processAlive) {
295
+ continue;
296
+ }
297
+
298
+ // Capture live-output.log file state for orphan diagnostics
383
299
  // (#W-mo248lkjwgsu original, #W-mo25loq8kjer pid annotation).
384
300
  // Four distinguishable failure modes:
385
301
  // logExists=false → spawn call itself threw, no log ever written
386
302
  // logExists=true pidPresent=false → engine stub written but spawn died before emitting pid line
387
- // logExists=true pidPresent=true silent → process spawned (pid recorded) but never produced stdout
388
- // logExists=true pidPresent=true size>pid → genuine hang (process wrote output then stopped)
303
+ // logExists=true pidPresent=true silent → process spawned (pid recorded) but no recent output
304
+ // logExists=true pidPresent=true size>pid → process handle was lost after output was written
389
305
  //
390
306
  // The pid line `[<iso>] pid: <N>` is stamped by engine.js immediately after runFile() returns.
391
307
  // Its presence → the child process was actually spawned; absence → spawn itself failed or the
@@ -408,33 +324,15 @@ function checkTimeouts(config) {
408
324
  _logState = `logExists=true logSize=${lst.size} pidPresent=${pidPresent}`;
409
325
  } catch { /* ENOENT — keep default */ }
410
326
 
411
- if (!hasProcess && silentMs > effectiveTimeout && (Date.now() > engineRestartGraceUntil || engineRestartGraceExempt?.has(item.id))) {
412
- // No tracked process AND no recent output past effective timeout AND (grace period expired OR confirmed-dead at restart) → orphaned
413
- log('warn', `Orphan detected: ${item.agent} (${item.id}) — no process tracked, silent for ${silentSec}s${isBlocking ? ' (blocking timeout exceeded)' : ''} [${_logState}]`);
327
+ if (!processAlive && silentMs > staleOrphanTimeout && (Date.now() > engineRestartGraceUntil || engineRestartGraceExempt?.has(item.id))) {
328
+ // No tracked process AND no recent output past stale-orphan timeout AND (grace period expired OR confirmed-dead at restart) → orphaned
329
+ log('warn', `Orphan detected: ${item.agent} (${item.id}) — no live process tracked, silent for ${silentSec}s [${_logState}]`);
414
330
  dispatch().updateAgentStatus(item.id, AGENT_STATUS.TIMED_OUT, `Orphaned — no process, silent for ${silentSec}s`);
415
331
  // Clear session so retry starts fresh
416
332
  try { shared.safeUnlink(path.join(AGENTS_DIR, item.agent, 'session.json')); } catch {}
417
333
  deadItems.push({ item, reason: `Orphaned — no process, silent for ${silentSec}s` });
418
- } else if (hasProcess && silentMs > effectiveTimeout) {
419
- // Has process but no output past effective timeout → hung
420
- log('warn', `Hung agent: ${item.agent} (${item.id}) — process exists but no output for ${silentSec}s${isBlocking ? ' (blocking timeout exceeded)' : ''} [${_logState}]`);
421
- dispatch().updateAgentStatus(item.id, AGENT_STATUS.TIMED_OUT, `Hung — no output for ${silentSec}s`);
422
- const procInfo = activeProcesses.get(item.id);
423
- if (procInfo) {
424
- shared.killGracefully(procInfo.proc, 5000);
425
- // On Unix, also kill child process tree (killGracefully only hits parent PID)
426
- if (process.platform !== 'win32' && procInfo.proc?.pid) {
427
- setTimeout(() => {
428
- try { shared.exec(`pkill -KILL -P ${procInfo.proc.pid}`, { timeout: 3000 }); } catch { /* children may already be dead */ }
429
- }, 6000); // after grace period
430
- }
431
- activeProcesses.delete(item.id);
432
- }
433
- // Clear session so retry starts fresh instead of resuming the killed session
434
- try { shared.safeUnlink(path.join(AGENTS_DIR, item.agent, 'session.json')); } catch {}
435
- deadItems.push({ item, reason: `Hung — no output for ${silentSec}s` });
334
+ activeProcesses.delete(item.id);
436
335
  }
437
- // If has process and recent output → healthy, let it run
438
336
  }
439
337
 
440
338
  // Clean up dead items
@@ -442,19 +340,12 @@ function checkTimeouts(config) {
442
340
  completeDispatch(item.id, DISPATCH_RESULT.ERROR, reason);
443
341
  }
444
342
 
445
- // Batch-write blocking tool call annotations to dispatch entries.
446
- // This surfaces blocking state via GET /api/status → dashboard badges.
447
- if (blockingAnnotations.size > 0) {
343
+ // Clear legacy blocking-tool annotations; process liveness no longer depends on tool parsing.
344
+ if (legacyAnnotationClears.size > 0) {
448
345
  const { mutateDispatch: mutateDispatchFn } = dispatch();
449
346
  mutateDispatchFn((dp) => {
450
347
  for (const activeItem of dp.active) {
451
- if (!blockingAnnotations.has(activeItem.id)) continue;
452
- const ann = blockingAnnotations.get(activeItem.id);
453
- if (ann) {
454
- activeItem._blockingToolCall = ann;
455
- } else {
456
- delete activeItem._blockingToolCall;
457
- }
348
+ if (legacyAnnotationClears.has(activeItem.id)) delete activeItem._blockingToolCall;
458
349
  }
459
350
  });
460
351
  }
package/engine.js CHANGED
@@ -145,7 +145,7 @@ const { runPostCompletionHooks, updateWorkItemStatus, syncPrdItemStatus, reconci
145
145
  // ─── Agent Spawner ──────────────────────────────────────────────────────────
146
146
 
147
147
  const activeProcesses = new Map(); // dispatchId → { proc, agentId, startedAt }
148
- const realActivityMap = new Map(); // dispatchId → timestamp of last REAL agent output (not engine heartbeat)
148
+ const realActivityMap = new Map(); // dispatchId → timestamp of last agent stdout/stderr
149
149
  // tempAgents imported from engine/routing.js
150
150
  let engineRestartGraceUntil = 0; // timestamp — suppress orphan detection until this time
151
151
  const engineRestartGraceExempt = new Set(); // dispatch IDs with confirmed-dead PIDs at restart — bypass grace period
@@ -983,17 +983,12 @@ async function spawnAgent(dispatchItem, config) {
983
983
  throw spawnErr;
984
984
  }
985
985
 
986
- // Seed realActivityMap and stamp PID immediately — BEFORE any handlers or timers (#W-mo25loq8kjer).
986
+ // Seed realActivityMap and stamp PID immediately — BEFORE any handlers (#W-mo25loq8kjer).
987
987
  // Why NOW, not later in the function:
988
- // 1. Heartbeat clock anchoring. timeout.js uses realActivityMap as the last-activity timestamp for
989
- // tracked processes; when the map has no entry, it falls back to item.started_at (dispatch time,
990
- // which is 20-60s before actual spawn for write tasks doing worktree setup). Read-only tasks
991
- // that produce no stdout for minutes (explore, security audit, large scans) were hitting
992
- // heartbeatTimeout prematurely — clock had already been running since dispatch.
993
- // 2. Error-handler race. The `proc.on('error', ...)` handler below calls realActivityMap.delete(id)
988
+ // 1. Error-handler race. The `proc.on('error', ...)` handler below calls realActivityMap.delete(id)
994
989
  // on synchronous spawn failures. Seeding before registering handlers ensures delete sees a value
995
990
  // to clear rather than leaving an absent-then-absent no-op that downstream code must guard.
996
- // 3. Orphan diagnostics. The PID line gives timeout.js a deterministic way to tell "spawn died
991
+ // 2. Orphan diagnostics. The PID line gives timeout.js a deterministic way to tell "spawn died
997
992
  // before first write" (stub-only log) from "process started and is hung" (stub + pid line).
998
993
  realActivityMap.set(id, Date.now());
999
994
  try {
@@ -1003,24 +998,12 @@ async function spawnAgent(dispatchItem, config) {
1003
998
  const MAX_OUTPUT = 1024 * 1024; // 1MB
1004
999
  let stdout = '';
1005
1000
  let stderr = '';
1006
- let lastOutputAt = Date.now();
1007
- let heartbeatTimer = null;
1008
1001
  let _trustCheckDone = false;
1009
1002
  const _spawnTime = Date.now();
1010
1003
 
1011
- // Keep live log active even when the agent produces no stdout/stderr for long stretches.
1012
- // This makes "silent but running" states visible in the dashboard tail view.
1013
- heartbeatTimer = setInterval(() => {
1014
- const silentMs = Date.now() - lastOutputAt;
1015
- if (silentMs < 30000) return;
1016
- const silentSec = Math.round(silentMs / 1000);
1017
- try { fs.appendFileSync(liveOutputPath, `[heartbeat] running — no output for ${silentSec}s\n`); } catch { /* optional */ }
1018
- }, 30000);
1019
-
1020
1004
  proc.stdout.on('data', (data) => {
1021
1005
  const chunk = data.toString();
1022
- lastOutputAt = Date.now();
1023
- realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
1006
+ realActivityMap.set(id, Date.now());
1024
1007
  if (stdout.length < MAX_OUTPUT) stdout += chunk.slice(0, MAX_OUTPUT - stdout.length);
1025
1008
  try { fs.appendFileSync(liveOutputPath, chunk); } catch { /* optional */ }
1026
1009
 
@@ -1057,14 +1040,12 @@ async function spawnAgent(dispatchItem, config) {
1057
1040
 
1058
1041
  proc.stderr.on('data', (data) => {
1059
1042
  const chunk = data.toString();
1060
- lastOutputAt = Date.now();
1061
- realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
1043
+ realActivityMap.set(id, Date.now());
1062
1044
  if (stderr.length < MAX_OUTPUT) stderr += chunk.slice(0, MAX_OUTPUT - stderr.length);
1063
1045
  try { fs.appendFileSync(liveOutputPath, '[stderr] ' + chunk); } catch { /* optional */ }
1064
1046
  });
1065
1047
 
1066
1048
  async function onAgentClose(code) {
1067
- if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
1068
1049
  log('info', `Agent ${agentId} (${id}) exited with code ${code}`);
1069
1050
 
1070
1051
  // Emit worker-state transition: FINISHED or FAILED
@@ -1180,33 +1161,22 @@ async function spawnAgent(dispatchItem, config) {
1180
1161
  // Reset output buffers so post-completion parsing only sees the resumed session
1181
1162
  stdout = '';
1182
1163
  stderr = '';
1183
- lastOutputAt = Date.now();
1184
-
1185
- // Restart heartbeat for the resumed process
1186
- if (heartbeatTimer) clearInterval(heartbeatTimer);
1187
- heartbeatTimer = setInterval(() => {
1188
- try { fs.appendFileSync(liveOutputPath, `\n[heartbeat] running — no output for ${Math.round((Date.now() - lastOutputAt) / 1000)}s\n`); } catch {}
1189
- }, 30000);
1190
-
1191
1164
  // Re-wire stdout/stderr handlers (same as original)
1192
1165
  resumeProc.stdout.on('data', (data) => {
1193
1166
  const chunk = data.toString();
1194
- lastOutputAt = Date.now();
1195
- realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
1167
+ realActivityMap.set(id, Date.now());
1196
1168
  if (stdout.length < MAX_OUTPUT) stdout += chunk.slice(0, MAX_OUTPUT - stdout.length);
1197
1169
  try { fs.appendFileSync(liveOutputPath, chunk); } catch { /* optional */ }
1198
1170
  });
1199
1171
  resumeProc.stderr.on('data', (data) => {
1200
1172
  const chunk = data.toString();
1201
- lastOutputAt = Date.now();
1202
- realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
1173
+ realActivityMap.set(id, Date.now());
1203
1174
  if (stderr.length < MAX_OUTPUT) stderr += chunk.slice(0, MAX_OUTPUT - stderr.length);
1204
1175
  try { fs.appendFileSync(liveOutputPath, '[stderr] ' + chunk); } catch { /* optional */ }
1205
1176
  });
1206
1177
 
1207
1178
  // Re-wire close handler for the resumed process
1208
1179
  resumeProc.on('close', (resumeCode) => {
1209
- if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
1210
1180
  try { fs.unlinkSync(steerPromptPath); } catch { /* cleanup */ }
1211
1181
  if (resumeCode !== 0) {
1212
1182
  log('warn', `Steering resume for ${agentId} exited with code ${resumeCode} | stderr: ${stderr.slice(-300).replace(/\n/g, ' ')}`);
@@ -1262,7 +1232,7 @@ async function spawnAgent(dispatchItem, config) {
1262
1232
  }
1263
1233
 
1264
1234
  activeProcesses.delete(id);
1265
- realActivityMap.delete(id); // Clean up real activity tracking
1235
+ realActivityMap.delete(id);
1266
1236
 
1267
1237
  // If timeout checker already finalized this dispatch, don't overwrite work-item status again.
1268
1238
  // This avoids races where close-handler marks an auto-retried item as failed.
@@ -1301,7 +1271,7 @@ async function spawnAgent(dispatchItem, config) {
1301
1271
  const { resultSummary, autoRecovered } = await runPostCompletionHooks(dispatchItem, agentId, code, stdout, config);
1302
1272
 
1303
1273
  // Move from active to completed in dispatch (single source of truth for agent status)
1304
- // autoRecovered: agent failed (e.g. heartbeat timeout) but created PRs — treat as success
1274
+ // autoRecovered: agent failed after creating PRs — treat as success
1305
1275
  const effectiveResult = (code === 0 || autoRecovered) ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR;
1306
1276
  const completeOpts = effectiveResult === DISPATCH_RESULT.ERROR && failureClass ? { failureClass } : {};
1307
1277
  // Extract last 5 non-empty stderr lines as error context when exit code is non-zero
@@ -1379,10 +1349,9 @@ async function spawnAgent(dispatchItem, config) {
1379
1349
  proc.on('close', onAgentClose);
1380
1350
 
1381
1351
  proc.on('error', (err) => {
1382
- if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
1383
1352
  log('error', `Failed to spawn agent ${agentId}: ${err.message}`);
1384
1353
  activeProcesses.delete(id);
1385
- realActivityMap.delete(id); // Clean up real activity tracking
1354
+ realActivityMap.delete(id);
1386
1355
  completeDispatch(id, DISPATCH_RESULT.ERROR, `Spawn error: ${err.message}`);
1387
1356
  });
1388
1357
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.1634",
3
+ "version": "0.1.1635",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"
@@ -8,156 +8,39 @@ Repo: {{repo_name}} | Org: {{ado_org}} | Project: {{ado_project}}
8
8
  Team root: {{team_root}}
9
9
  Project path: {{project_path}}
10
10
 
11
- ## Your Task
11
+ ## Mission
12
12
 
13
- A new PR has been created: **{{pr_id}}** "{{pr_title}}"
13
+ A new PR has been created: **{{pr_id}}** - "{{pr_title}}"
14
14
  Branch: `{{pr_branch}}` | Author: {{pr_author}}
15
15
 
16
- Your job is to **check out the branch, build it, run tests, and if it's a webapp, start a local dev server** so the human reviewer can see it running.
16
+ Run the project's normal build/test verification for this PR and report whether it is ready for human review. If it is a runnable app, identify the local URL and the exact command needed to run it.
17
17
 
18
- ## Instructions
18
+ ## Long-Running Commands
19
19
 
20
- ### 1. Set up a worktree for the PR branch
20
+ Builds, dependency installs, tests, and dev servers can be quiet for a long time. Let normal CLI commands run naturally; do not add artificial heartbeat output or split commands just to show progress.
21
21
 
22
- You are already in the correct working directory on branch `{{pr_branch}}`. Do NOT create additional worktrees.
22
+ ## Approach
23
23
 
24
- ### 2. Install dependencies
24
+ Work from the current checkout prepared by the engine. Read the repo's own instructions first (`CLAUDE.md`, README, package files, Makefiles, project scripts) and adapt to the build system you find.
25
25
 
26
- Look at the project's build system (package.json, CLAUDE.md, README, Makefile, etc.) and install:
27
- ```bash
28
- # Examples — use whatever the project needs:
29
- yarn install # or npm install
30
- pip install -r requirements.txt
31
- dotnet restore
32
- ```
26
+ If the build or tests fail, report the relevant errors clearly in your final response and stop. Do not fix code, push commits, or create PRs from this task.
33
27
 
34
- ### 3. Build the project
28
+ If a server/app should be run for review, include the URL and a copy-pasteable run command with absolute paths. If the server must survive after the agent exits, start it detached and record the PID, restart command, and stop command; otherwise just provide the command for the user.
35
29
 
36
- Run the project's build command:
37
- ```bash
38
- # Examples:
39
- yarn build # or npm run build
40
- dotnet build
41
- cargo build
42
- ```
30
+ ## Findings
43
31
 
44
- If the build **fails**, report the errors clearly and stop. Do NOT attempt to fix the code.
32
+ Write findings to `{{team_root}}/notes/inbox/{{agent_id}}-bt-{{pr_number}}-{{date}}.md` only after successful verification.
45
33
 
46
- > ⚠️ **Cold builds are silent for minutes** (Gradle daemon spin-up, dotnet restore, fresh `npm install`). Run them via `Bash(run_in_background: true)` then `Monitor` to stream stdout, OR pass an explicit `timeout` on the Bash call (max 600000 ms). Without one of these, the heartbeat monitor will kill the agent at ~5 min of silence. See **Long-Running Build / Test Commands** below.
34
+ Include:
35
+ - Branch, author, and project
36
+ - Build status and important warnings/errors
37
+ - Test status and failed test names if any
38
+ - Local server status, URL, run command, PID, restart command, and stop command if applicable
39
+ - A short summary of whether the PR is ready to review
47
40
 
48
- ### 4. Run tests
41
+ ## Constraints
49
42
 
50
- ```bash
51
- # Examples:
52
- yarn test # or npm test
53
- pytest
54
- dotnet test
55
- ```
56
-
57
- Report test results: how many passed, failed, skipped.
58
-
59
- ### 5. Start a local dev server (if applicable)
60
-
61
- Determine if this project is a **webapp** (has a dev server, serves HTTP, has a UI):
62
- - Check package.json for `dev`, `start`, `serve` scripts
63
- - Check for frameworks: Next.js, React, Angular, Vue, Express, Flask, ASP.NET
64
- - Check CLAUDE.md for run instructions
65
-
66
- If it IS a webapp:
67
- 1. Start the dev server **detached from your process** so it survives after you exit.
68
- - If the repo docs provide a local run or background-start command, use that.
69
- - Otherwise, use the detached-process mechanism that fits the current environment. Do not assume Bash, PowerShell, or any specific shell unless the repo or runtime clearly provides it.
70
- 2. Wait a few seconds, then verify it using the repo's documented smoke test, health check, startup output, or the lightest project-appropriate manual check.
71
- 3. Note the localhost URL, port, process identifier/PID, or equivalent runtime details the repo exposes.
72
- 4. Output the exact restart command with **absolute worktree paths**.
73
- 5. Include the stop command or shutdown procedure that matches how you started it.
74
-
75
- If it is NOT a webapp (library, CLI tool, backend service without UI), skip this step.
76
-
77
- ## Output Format
78
-
79
- Write your findings to `{{team_root}}/notes/inbox/{{agent_id}}-bt-{{pr_number}}-{{date}}.md` **only after a successful verification run**: the build passed, required tests passed, and any applicable local server is running or not applicable.
80
-
81
- If the build fails, tests fail, dependency setup fails, or a required local server cannot start, do **not** write an inbox note. Follow the failure handling below and report the failure in your final response instead.
82
-
83
- Structure your report exactly like this:
84
-
85
- ```markdown
86
- ## Build & Test Report: {{pr_id}}
87
-
88
- **Branch:** {{pr_branch}}
89
- **Author:** {{pr_author}}
90
- **Project:** {{project_name}}
91
-
92
- ### Build
93
- - Status: PASS
94
- - Notes: (any warnings or issues)
95
-
96
- ### Tests
97
- - Status: PASS / SKIPPED
98
- - Results: X passed, 0 failed, Z skipped
99
- - Failed tests: none
100
-
101
- ### Local Server
102
- - Status: RUNNING / NOT_APPLICABLE
103
- - URL: http://localhost:XXXX (if running)
104
- - PID / Process: <pid or equivalent identifier, if running>
105
- - Restart Command: `cd <absolute-path-to-worktree> && <exact start command>`
106
- - Stop Command: `<exact stop command or shutdown procedure>`
107
-
108
- ### Summary
109
- (1-2 sentence overall assessment — is this PR safe to review?)
110
- ```
111
-
112
- ## Auto-file Work Items on Failure
113
-
114
- If the build or tests fail, create a work item so another agent can fix it. Write a JSON entry to the project's work queue:
115
-
116
- ```bash
117
- # Read existing items, append new one, write back
118
- node -e "
119
- const fs = require('fs');
120
- const p = '{{project_path}}/.minions/work-items.json';
121
- const items = JSON.parse(fs.readFileSync(p, 'utf8') || '[]');
122
- const id = 'W' + String(items.reduce((m,i) => Math.max(m, parseInt((i.id||'').match(/(\d+)$/)?.[1]||0)), 0) + 1).padStart(3, '0');
123
- items.push({
124
- id,
125
- title: 'Fix build/test failure on PR {{pr_id}}: <SHORT DESCRIPTION OF FAILURE>',
126
- type: 'fix',
127
- priority: 'high',
128
- description: '<PASTE THE BUILD/TEST ERROR OUTPUT HERE — keep it under 2000 chars>',
129
- status: 'pending',
130
- created: new Date().toISOString(),
131
- createdBy: '{{agent_id}}',
132
- pr: '{{pr_id}}',
133
- branch: '{{pr_branch}}'
134
- });
135
- fs.writeFileSync(p, JSON.stringify(items, null, 2));
136
- console.log('Filed work item:', id);
137
- "
138
- ```
139
-
140
- Replace `<SHORT DESCRIPTION OF FAILURE>` and `<PASTE THE BUILD/TEST ERROR OUTPUT HERE>` with the actual error details. The engine will pick this up on the next tick and dispatch a fix agent.
141
-
142
- ## Rules
143
-
144
- - **Do NOT create pull requests** — this is a build/test task only
145
- - **Do NOT push commits** or modify code
146
- - **Do NOT attempt to fix build/test failures** — report them and file a work item
147
- - If starting a dev server, output the **exact restart command with absolute paths** so the user can restart it:
148
- ```
149
- ## Restart Command
150
- cd <absolute-path-to-worktree> && <exact start command>
151
- ```
152
- - Also include the server URL, PID/process identifier, and matching stop command.
153
- - Use the worktree path, NOT the main project path, for all commands
154
- - The worktree will persist after your process ends so the user can inspect it
155
-
156
- ## Do not clean up the worktree
157
-
158
- Leave the worktree in place at `{{project_path}}/../worktrees/bt-{{pr_number}}` — the user needs it to review the running app. The engine will clean it up automatically after the PR is merged or closed.
159
-
160
-
161
- ## When to Stop
162
-
163
- Your task is complete once you have: (1) built the project, (2) run tests, (3) started the app if applicable, and (4) written the success findings to the inbox file. If verification failed, stop after filing the failure work item when applicable and reporting the failure in your final response; do not write an inbox file.
43
+ - Do not create pull requests or push commits.
44
+ - Do not modify code unless the task explicitly changes into a fix task.
45
+ - Use the current checkout/worktree prepared by the engine.
46
+ - Do not remove worktrees; the engine handles cleanup automatically.
package/playbooks/fix.md CHANGED
@@ -45,7 +45,7 @@ Before pushing, prove the review fix did not break the branch:
45
45
  - Fix regressions you introduced. If failures are pre-existing or unrelated, capture the evidence and include it in the PR comment.
46
46
  - Do not push code that breaks existing tests or the build because of your changes.
47
47
 
48
- > ⚠️ **Long builds (Gradle, MSBuild, dotnet, fresh `npm install`)**: any command that may stay silent for more than ~4 minutes will be killed by the heartbeat monitor. Run it via `Bash(run_in_background: true)` then `Monitor` to stream stdout, OR pass an explicit `timeout` (max 600000 ms). See **Long-Running Build / Test Commands** below for the full pattern.
48
+ Long builds, dependency installs, and tests may be quiet for several minutes. Let the normal CLI command run naturally; do not add artificial heartbeat output or split commands just to show progress.
49
49
 
50
50
  ## Publish & Comment on PR
51
51