npm - @yemi33/minions - Versions diffs - 0.1.1634 → 0.1.1635 - Mend

@yemi33/minions 0.1.1634 → 0.1.1635

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/CHANGELOG.md +5 -0
package/README.md +11 -11
package/docs/auto-discovery.md +17 -15
package/docs/blog-first-successful-dispatch.md +7 -10
package/docs/engine-restart.md +8 -11
package/docs/human-vs-automated.md +3 -4
package/docs/pr-review-fix-loop.md +1 -1
package/docs/rfc-completion-json.md +5 -5
package/engine/copilot-models.json +1 -1
package/engine/lifecycle.js +1 -1
package/engine/queries.js +4 -4
package/engine/shared.js +4 -12
package/engine/timeout.js +59 -168
package/engine.js +11 -42
package/package.json +1 -1
package/playbooks/build-and-test.md +22 -139
package/playbooks/fix.md +1 -1
package/playbooks/implement-shared.md +1 -1
package/playbooks/implement.md +3 -7
package/playbooks/shared-rules.md +4 -45
package/playbooks/test.md +17 -40
package/playbooks/verify.md +29 -141
package/playbooks/work-item.md +1 -0

package/engine/timeout.js CHANGED Viewed

@@ -1,6 +1,5 @@
 /**
- * engine/timeout.js — Timeout detection, steering, and idle threshold checks.
- * Extracted from engine.js for modularity. No logic changes.
+ * engine/timeout.js — Runtime timeout, stale-orphan cleanup, steering, and idle checks.
  */
 const fs = require('fs');
@@ -124,6 +123,28 @@ function checkSteering(config) {
 // ─── Timeout Checker ─────────────────────────────────────────────────────────
+/**
+ * Resolve the numeric pid recorded for a tracked process entry.
+ *
+ * Prefers the pid on the attached `proc` handle and falls back to a top-level `pid`
+ * field on the entry itself.
+ *
+ * @param {object|null|undefined} procInfo - entry that may carry `proc.pid` and/or `pid`.
+ * @returns {number|null} a finite, positive pid, or null when none is available.
+ */
+function trackedProcessPid(procInfo) {
+  const rawPid = procInfo?.proc?.pid || procInfo?.pid || 0;
+  const numericPid = Number(rawPid);
+  // Reject NaN/Infinity and non-positive values.
+  if (!Number.isFinite(numericPid) || numericPid <= 0) return null;
+  return numericPid;
+}
+/**
+ * Report whether the process behind a tracked entry still appears to be running.
+ *
+ * @param {object|null|undefined} procInfo - entry that may carry a `proc` handle
+ *   (read for `exitCode`, `killed`, `pid`) and/or a top-level `pid`.
+ * @returns {boolean} true when the process is believed to be alive, false otherwise.
+ */
+function isTrackedProcessAlive(procInfo) {
+  if (!procInfo) return false;
+  const proc = procInfo.proc;
+  // A non-null own `exitCode` means the handle has already recorded an exit.
+  if (proc && Object.prototype.hasOwnProperty.call(proc, 'exitCode') && proc.exitCode !== null) {
+    return false;
+  }
+  const pid = trackedProcessPid(procInfo);
+  // No usable pid: fall back to the handle's own `killed` flag.
+  if (!pid) return !!proc && proc.killed !== true;
+  try {
+    // Signal 0 only tests whether the pid can be signalled; nothing is delivered.
+    process.kill(pid, 0);
+    return true;
+  } catch (err) {
+    // EPERM means the pid exists but this process is not permitted to signal it,
+    // so the target is still alive. Any other error (e.g. ESRCH) is treated as not running.
+    return err?.code === 'EPERM';
+  }
+}
 function checkTimeouts(config) {
   const activeProcesses = engine().activeProcesses;
   const engineRestartGraceUntil = engine().engineRestartGraceUntil;
@@ -132,10 +153,10 @@ function checkTimeouts(config) {
   const { runPostCompletionHooks } = require('./lifecycle');
   const timeout = config.engine?.agentTimeout || ENGINE_DEFAULTS.agentTimeout;
-  const defaultHeartbeatTimeout = config.engine?.heartbeatTimeout || ENGINE_DEFAULTS.heartbeatTimeout;
+  const defaultStaleOrphanTimeout = config.engine?.heartbeatTimeout || ENGINE_DEFAULTS.heartbeatTimeout;
-  // Per-type heartbeat timeouts: merge ENGINE_DEFAULTS ← config overrides
-  const perTypeTimeouts = { ...ENGINE_DEFAULTS.heartbeatTimeouts, ...(config.engine?.heartbeatTimeouts || {}) };
+  // Optional per-type stale-orphan timeouts: merge ENGINE_DEFAULTS ← config overrides.
+  const perTypeStaleOrphanTimeouts = { ...ENGINE_DEFAULTS.heartbeatTimeouts, ...(config.engine?.heartbeatTimeouts || {}) };
   // 1. Check tracked processes for hard timeout (supports per-item deadline from fan-out)
   for (const [id, info] of activeProcesses.entries()) {
@@ -148,37 +169,32 @@ function checkTimeouts(config) {
     }
   }
-  // 2. Heartbeat check — for ALL active dispatch items (catches orphans after engine restart)
-  //    Uses live-output.log mtime as heartbeat. If no output for heartbeatTimeout, agent is dead.
+  // 2. Stale-orphan check — for ALL active dispatch items (catches lost process handles after restart).
+  //    Silence is not a failure for tracked live processes: long CLI commands can legitimately
+  //    produce no stdout/stderr for extended periods.
   const dispatchData = getDispatch();
   const deadItems = [];
-  const blockingAnnotations = new Map(); // id → { tool, silentMs, remainingMs } or null (clear)
+  const legacyAnnotationClears = new Set();
   for (const item of (dispatchData.active || [])) {
     if (!item.agent) continue;
-    // Per-type heartbeat: look up work type from dispatch item, fall back to default
+    // Per-type stale-orphan timeout: look up work type from dispatch item, fall back to default.
     const workType = item.workType || item.meta?.item?.type;
-    const heartbeatTimeout = (workType && perTypeTimeouts[workType]) || defaultHeartbeatTimeout;
+    const staleOrphanTimeout = (workType && perTypeStaleOrphanTimeouts[workType]) || defaultStaleOrphanTimeout;
-    const hasProcess = activeProcesses.has(item.id);
+    const procInfo = activeProcesses.get(item.id);
+    const hasProcess = !!procInfo;
+    const processAlive = isTrackedProcessAlive(procInfo);
     const liveLogPath = path.join(AGENTS_DIR, item.agent, 'live-output.log');
     let lastActivity = item.started_at ? new Date(item.started_at).getTime() : 0;
-    // For tracked processes, use realActivityMap (tracks actual agent stdout/stderr only,
-    // NOT engine heartbeat writes). This prevents the feedback loop where engine heartbeat
-    // writes to live-output.log reset the mtime that the timeout check reads (#724).
-    const realActivityMap = engine().realActivityMap;
-    if (hasProcess && realActivityMap?.has(item.id)) {
-      lastActivity = Math.max(lastActivity, realActivityMap.get(item.id));
-    } else {
-      // Orphan case (no tracked process): use live-output.log mtime as fallback.
-      // No heartbeat timer is running for orphans, so mtime is accurate.
-      try {
-        const stat = fs.statSync(liveLogPath);
-        lastActivity = Math.max(lastActivity, stat.mtimeMs);
-      } catch { /* optional */ }
-    }
+    // live-output.log mtime is only used for stale-orphan cleanup and completion recovery.
+    // It is not used as an output-silence timeout for live tracked processes.
+    try {
+      const stat = fs.statSync(liveLogPath);
+      lastActivity = Math.max(lastActivity, stat.mtimeMs);
+    } catch { /* optional */ }
     const silentMs = Date.now() - lastActivity;
     const silentSec = Math.round(silentMs / 1000);
@@ -266,126 +282,26 @@ function checkTimeouts(config) {
       // code is known (#1792).
     } catch (e) { log('warn', 'output completion detection: ' + e.message); }
-    // Resolve per-type heartbeat timeout: per-type map → base heartbeatTimeout fallback
-    const itemHeartbeat = perTypeTimeouts[item.type] || heartbeatTimeout;
-    // Check if agent is in a blocking tool call (TaskOutput block:true, Bash with long timeout, etc.)
-    // These tools produce no stdout for extended periods — don't kill them prematurely
-    // Check for BOTH tracked and untracked processes (orphan case after engine restart)
-    // Skip if agent already completed — blocking tool detection on stale tool calls
-    // would extend the timeout indefinitely for dead agents (#716).
-    let isBlocking = false;
-    let blockingTimeout = itemHeartbeat;
-    let blockingTool = '';
-    if (silentMs > itemHeartbeat) {
-      try {
-        const liveLog = safeRead(liveLogPath);
-        if (liveLog) {
-          // If the output contains a result event or process-exit sentinel, the agent is done.
-          // Don't extend timeout for stale blocking tool calls from before the result (#716).
-          if (liveLog.includes('"type":"result"') || liveLog.includes('\n[process-exit]')) {
-            // Agent completed but close event didn't fire — let orphan/hung detection handle it.
-            // Don't set isBlocking — use base heartbeat timeout.
-          } else {
-          // Find the last tool_use call in the output — check if it's a known blocking tool.
-          //
-          // Lookback depth (1000 lines) is sized for the heartbeat-noise scenario from #1792:
-          // a long-running Monitor / Bash / PowerShell call goes silent for 15+ minutes while
-          // a cold Gradle build runs. During that silence the ENGINE writes a heartbeat line
-          // every 30s (engine.js heartbeatTimer), so the live log accumulates ~120 heartbeat
-          // lines per hour AFTER the original tool_use line. A 30-line lookback misses the
-          // tool_use entirely, the detector treats the silence as non-blocking, and the
-          // agent gets killed at heartbeatTimeout despite legitimately waiting on a
-          // background process. 1000 lines covers ~8 hours of pure heartbeat noise — well
-          // beyond Monitor's 30 min effective timeout floor.
-          const lines = liveLog.split('\n');
-          const TOOL_USE_LOOKBACK = 1000;
-          for (let i = lines.length - 1; i >= Math.max(0, lines.length - TOOL_USE_LOOKBACK); i--) {
-            const line = lines[i];
-            if (!line.includes('"tool_use"')) continue;
-            try {
-              const parsed = JSON.parse(line);
-              const toolUse = parsed?.message?.content?.find?.(c => c.type === 'tool_use');
-              if (!toolUse) continue;
-              const input = toolUse.input || {};
-              const name = toolUse.name || '';
-              // TaskOutput with block:true — waiting for a background task
-              if (name === 'TaskOutput' && input.block === true) {
-                const taskTimeout = input.timeout || 600000; // default 10min
-                blockingTimeout = Math.max(itemHeartbeat, taskTimeout + 60000); // task timeout + 1min grace
-                isBlocking = true;
-                blockingTool = 'TaskOutput';
-              }
-              // Bash tool call — may be running a long build/install with no stdout
-              if (name === 'Bash') {
-                // Use explicit timeout if set, otherwise match Claude Code's actual Bash default (120s)
-                const bashTimeout = input.timeout || 120000;
-                blockingTimeout = Math.max(itemHeartbeat, bashTimeout + 60000);
-                isBlocking = true;
-                blockingTool = 'Bash';
-              }
-              // PowerShell tool call — Windows-native shell with same explicit-timeout
-              // semantics as Bash (input.timeout, max 600s). Required for projects that
-              // build via PowerShell on Windows (gradlew.bat, MSBuild, dotnet test) where
-              // the cold-start phase produces no stdout for several minutes (#1786).
-              if (name === 'PowerShell') {
-                const psTimeout = input.timeout || 120000;
-                blockingTimeout = Math.max(itemHeartbeat, psTimeout + 60000);
-                isBlocking = true;
-                blockingTool = 'PowerShell';
-              }
-              // Monitor tool call — blocks waiting for stdout-line notifications from a
-              // background process started via Bash with run_in_background. Between
-              // notifications the call produces no output, so the heartbeat monitor
-              // must extend timeout. No fixed timeout on Monitor — match Agent (30min)
-              // since both are inherently long-running waits (#1786).
-              if (name === 'Monitor') {
-                blockingTimeout = Math.max(itemHeartbeat, 1800000); // 30min for background process waits
-                isBlocking = true;
-                blockingTool = 'Monitor';
-              }
-              // Agent (subagent) tool call — parent waits silently for child to complete
-              if (name === 'Agent') {
-                blockingTimeout = Math.max(itemHeartbeat, 1800000); // 30min for subagents
-                isBlocking = true;
-                blockingTool = 'Agent';
-              }
-              break; // only check the most recent tool_use
-            } catch { /* JSON parse — line may not be valid JSON */ }
-          }
-          if (isBlocking) {
-            // Only log on transition — avoid spamming every tick while blocking persists
-            if (!item._blockingToolCall) {
-              log('info', `Agent ${item.agent} (${item.id}) is in a blocking tool call (${blockingTool}) — extended timeout to ${Math.round(blockingTimeout / 1000)}s (silent for ${silentSec}s)`, { event: 'blocking_tool_call_detected' });
-            }
-            blockingAnnotations.set(item.id, {
-              tool: blockingTool,
-              silentMs,
-              remainingMs: Math.max(0, blockingTimeout - silentMs),
-            });
-          }
-          } // close else
-        } // close if (liveLog)
-      } catch (e) { log('warn', 'blocking tool detection: ' + e.message); }
-    }
-    // Agent recovered from blocking state — clear annotation
-    if (!isBlocking && item._blockingToolCall) {
-      blockingAnnotations.set(item.id, null);
+    // Blocking tool annotations are no longer needed: live tracked processes are allowed to
+    // be quiet regardless of which command/tool is running.
+    if (item._blockingToolCall) {
+      legacyAnnotationClears.add(item.id);
     }
-    const effectiveTimeout = isBlocking ? blockingTimeout : itemHeartbeat;
     // Skip recently-steered agents — they're being killed and re-spawned
-    const procInfo = activeProcesses.get(item.id);
     if (procInfo?._steeringAt && Date.now() - procInfo._steeringAt < 60000) continue;
-    // Capture live-output.log file state for orphan/hung diagnostics
+    if (processAlive) {
+      continue;
+    }
+    // Capture live-output.log file state for orphan diagnostics
     // (#W-mo248lkjwgsu original, #W-mo25loq8kjer pid annotation).
     // Four distinguishable failure modes:
     //   logExists=false                         → spawn call itself threw, no log ever written
     //   logExists=true pidPresent=false         → engine stub written but spawn died before emitting pid line
-    //   logExists=true pidPresent=true silent   → process spawned (pid recorded) but never produced stdout
-    //   logExists=true pidPresent=true size>pid → genuine hang (process wrote output then stopped)
+    //   logExists=true pidPresent=true silent   → process spawned (pid recorded) but no recent output
+    //   logExists=true pidPresent=true size>pid → process handle was lost after output was written
     //
     // The pid line `[<iso>] pid: <N>` is stamped by engine.js immediately after runFile() returns.
     // Its presence → the child process was actually spawned; absence → spawn itself failed or the
@@ -408,33 +324,15 @@ function checkTimeouts(config) {
       _logState = `logExists=true logSize=${lst.size} pidPresent=${pidPresent}`;
     } catch { /* ENOENT — keep default */ }
-    if (!hasProcess && silentMs > effectiveTimeout && (Date.now() > engineRestartGraceUntil || engineRestartGraceExempt?.has(item.id))) {
-      // No tracked process AND no recent output past effective timeout AND (grace period expired OR confirmed-dead at restart) → orphaned
-      log('warn', `Orphan detected: ${item.agent} (${item.id}) — no process tracked, silent for ${silentSec}s${isBlocking ? ' (blocking timeout exceeded)' : ''} [${_logState}]`);
+    if (!processAlive && silentMs > staleOrphanTimeout && (Date.now() > engineRestartGraceUntil || engineRestartGraceExempt?.has(item.id))) {
+      // No live tracked process (handle missing or process dead) AND no recent output past stale-orphan timeout AND (grace period expired OR confirmed-dead at restart) → orphaned
+      log('warn', `Orphan detected: ${item.agent} (${item.id}) — no live process tracked, silent for ${silentSec}s [${_logState}]`);
       dispatch().updateAgentStatus(item.id, AGENT_STATUS.TIMED_OUT, `Orphaned — no process, silent for ${silentSec}s`);
       // Clear session so retry starts fresh
       try { shared.safeUnlink(path.join(AGENTS_DIR, item.agent, 'session.json')); } catch {}
       deadItems.push({ item, reason: `Orphaned — no process, silent for ${silentSec}s` });
-    } else if (hasProcess && silentMs > effectiveTimeout) {
-      // Has process but no output past effective timeout → hung
-      log('warn', `Hung agent: ${item.agent} (${item.id}) — process exists but no output for ${silentSec}s${isBlocking ? ' (blocking timeout exceeded)' : ''} [${_logState}]`);
-      dispatch().updateAgentStatus(item.id, AGENT_STATUS.TIMED_OUT, `Hung — no output for ${silentSec}s`);
-      const procInfo = activeProcesses.get(item.id);
-      if (procInfo) {
-        shared.killGracefully(procInfo.proc, 5000);
-        // On Unix, also kill child process tree (killGracefully only hits parent PID)
-        if (process.platform !== 'win32' && procInfo.proc?.pid) {
-          setTimeout(() => {
-            try { shared.exec(`pkill -KILL -P ${procInfo.proc.pid}`, { timeout: 3000 }); } catch { /* children may already be dead */ }
-          }, 6000); // after grace period
-        }
-        activeProcesses.delete(item.id);
-      }
-      // Clear session so retry starts fresh instead of resuming the killed session
-      try { shared.safeUnlink(path.join(AGENTS_DIR, item.agent, 'session.json')); } catch {}
-      deadItems.push({ item, reason: `Hung — no output for ${silentSec}s` });
+      activeProcesses.delete(item.id);
     }
-    // If has process and recent output → healthy, let it run
   }
   // Clean up dead items
@@ -442,19 +340,12 @@ function checkTimeouts(config) {
     completeDispatch(item.id, DISPATCH_RESULT.ERROR, reason);
   }
-  // Batch-write blocking tool call annotations to dispatch entries.
-  // This surfaces blocking state via GET /api/status → dashboard badges.
-  if (blockingAnnotations.size > 0) {
+  // Clear legacy blocking-tool annotations; process liveness no longer depends on tool parsing.
+  if (legacyAnnotationClears.size > 0) {
     const { mutateDispatch: mutateDispatchFn } = dispatch();
     mutateDispatchFn((dp) => {
       for (const activeItem of dp.active) {
-        if (!blockingAnnotations.has(activeItem.id)) continue;
-        const ann = blockingAnnotations.get(activeItem.id);
-        if (ann) {
-          activeItem._blockingToolCall = ann;
-        } else {
-          delete activeItem._blockingToolCall;
-        }
+        if (legacyAnnotationClears.has(activeItem.id)) delete activeItem._blockingToolCall;
       }
     });
   }

package/engine.js CHANGED Viewed

@@ -145,7 +145,7 @@ const { runPostCompletionHooks, updateWorkItemStatus, syncPrdItemStatus, reconci
 // ─── Agent Spawner ──────────────────────────────────────────────────────────
 const activeProcesses = new Map(); // dispatchId → { proc, agentId, startedAt }
-const realActivityMap = new Map(); // dispatchId → timestamp of last REAL agent output (not engine heartbeat)
+const realActivityMap = new Map(); // dispatchId → timestamp of last agent stdout/stderr
 // tempAgents imported from engine/routing.js
 let engineRestartGraceUntil = 0; // timestamp — suppress orphan detection until this time
 const engineRestartGraceExempt = new Set(); // dispatch IDs with confirmed-dead PIDs at restart — bypass grace period
@@ -983,17 +983,12 @@ async function spawnAgent(dispatchItem, config) {
     throw spawnErr;
   }
-  // Seed realActivityMap and stamp PID immediately — BEFORE any handlers or timers (#W-mo25loq8kjer).
+  // Seed realActivityMap and stamp PID immediately — BEFORE any handlers (#W-mo25loq8kjer).
   // Why NOW, not later in the function:
-  //  1. Heartbeat clock anchoring. timeout.js uses realActivityMap as the last-activity timestamp for
-  //     tracked processes; when the map has no entry, it falls back to item.started_at (dispatch time,
-  //     which is 20-60s before actual spawn for write tasks doing worktree setup). Read-only tasks
-  //     that produce no stdout for minutes (explore, security audit, large scans) were hitting
-  //     heartbeatTimeout prematurely — clock had already been running since dispatch.
-  //  2. Error-handler race. The `proc.on('error', ...)` handler below calls realActivityMap.delete(id)
+  //  1. Error-handler race. The `proc.on('error', ...)` handler below calls realActivityMap.delete(id)
   //     on synchronous spawn failures. Seeding before registering handlers ensures delete sees a value
   //     to clear rather than leaving an absent-then-absent no-op that downstream code must guard.
-  //  3. Orphan diagnostics. The PID line gives timeout.js a deterministic way to tell "spawn died
+  //  2. Orphan diagnostics. The PID line gives timeout.js a deterministic way to tell "spawn died
   //     before first write" (stub-only log) from "process started and is hung" (stub + pid line).
   realActivityMap.set(id, Date.now());
   try {
@@ -1003,24 +998,12 @@ async function spawnAgent(dispatchItem, config) {
   const MAX_OUTPUT = 1024 * 1024; // 1MB
   let stdout = '';
   let stderr = '';
-  let lastOutputAt = Date.now();
-  let heartbeatTimer = null;
   let _trustCheckDone = false;
   const _spawnTime = Date.now();
-  // Keep live log active even when the agent produces no stdout/stderr for long stretches.
-  // This makes "silent but running" states visible in the dashboard tail view.
-  heartbeatTimer = setInterval(() => {
-    const silentMs = Date.now() - lastOutputAt;
-    if (silentMs < 30000) return;
-    const silentSec = Math.round(silentMs / 1000);
-    try { fs.appendFileSync(liveOutputPath, `[heartbeat] running — no output for ${silentSec}s\n`); } catch { /* optional */ }
-  }, 30000);
   proc.stdout.on('data', (data) => {
     const chunk = data.toString();
-    lastOutputAt = Date.now();
-    realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
+    realActivityMap.set(id, Date.now());
     if (stdout.length < MAX_OUTPUT) stdout += chunk.slice(0, MAX_OUTPUT - stdout.length);
     try { fs.appendFileSync(liveOutputPath, chunk); } catch { /* optional */ }
@@ -1057,14 +1040,12 @@ async function spawnAgent(dispatchItem, config) {
   proc.stderr.on('data', (data) => {
     const chunk = data.toString();
-    lastOutputAt = Date.now();
-    realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
+    realActivityMap.set(id, Date.now());
     if (stderr.length < MAX_OUTPUT) stderr += chunk.slice(0, MAX_OUTPUT - stderr.length);
     try { fs.appendFileSync(liveOutputPath, '[stderr] ' + chunk); } catch { /* optional */ }
   });
   async function onAgentClose(code) {
-    if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
     log('info', `Agent ${agentId} (${id}) exited with code ${code}`);
     // Emit worker-state transition: FINISHED or FAILED
@@ -1180,33 +1161,22 @@ async function spawnAgent(dispatchItem, config) {
       // Reset output buffers so post-completion parsing only sees the resumed session
       stdout = '';
       stderr = '';
-      lastOutputAt = Date.now();
-      // Restart heartbeat for the resumed process
-      if (heartbeatTimer) clearInterval(heartbeatTimer);
-      heartbeatTimer = setInterval(() => {
-        try { fs.appendFileSync(liveOutputPath, `\n[heartbeat] running — no output for ${Math.round((Date.now() - lastOutputAt) / 1000)}s\n`); } catch {}
-      }, 30000);
       // Re-wire stdout/stderr handlers (same as original)
       resumeProc.stdout.on('data', (data) => {
         const chunk = data.toString();
-        lastOutputAt = Date.now();
-        realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
+        realActivityMap.set(id, Date.now());
         if (stdout.length < MAX_OUTPUT) stdout += chunk.slice(0, MAX_OUTPUT - stdout.length);
         try { fs.appendFileSync(liveOutputPath, chunk); } catch { /* optional */ }
       });
       resumeProc.stderr.on('data', (data) => {
         const chunk = data.toString();
-        lastOutputAt = Date.now();
-        realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
+        realActivityMap.set(id, Date.now());
         if (stderr.length < MAX_OUTPUT) stderr += chunk.slice(0, MAX_OUTPUT - stderr.length);
         try { fs.appendFileSync(liveOutputPath, '[stderr] ' + chunk); } catch { /* optional */ }
       });
       // Re-wire close handler for the resumed process
       resumeProc.on('close', (resumeCode) => {
-        if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
         try { fs.unlinkSync(steerPromptPath); } catch { /* cleanup */ }
         if (resumeCode !== 0) {
           log('warn', `Steering resume for ${agentId} exited with code ${resumeCode} | stderr: ${stderr.slice(-300).replace(/\n/g, ' ')}`);
@@ -1262,7 +1232,7 @@ async function spawnAgent(dispatchItem, config) {
     }
     activeProcesses.delete(id);
-    realActivityMap.delete(id); // Clean up real activity tracking
+    realActivityMap.delete(id);
     // If timeout checker already finalized this dispatch, don't overwrite work-item status again.
     // This avoids races where close-handler marks an auto-retried item as failed.
@@ -1301,7 +1271,7 @@ async function spawnAgent(dispatchItem, config) {
     const { resultSummary, autoRecovered } = await runPostCompletionHooks(dispatchItem, agentId, code, stdout, config);
     // Move from active to completed in dispatch (single source of truth for agent status)
-    // autoRecovered: agent failed (e.g. heartbeat timeout) but created PRs — treat as success
+    // autoRecovered: agent failed after creating PRs — treat as success
     const effectiveResult = (code === 0 || autoRecovered) ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR;
     const completeOpts = effectiveResult === DISPATCH_RESULT.ERROR && failureClass ? { failureClass } : {};
     // Extract last 5 non-empty stderr lines as error context when exit code is non-zero
@@ -1379,10 +1349,9 @@ async function spawnAgent(dispatchItem, config) {
   proc.on('close', onAgentClose);
   proc.on('error', (err) => {
-    if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
     log('error', `Failed to spawn agent ${agentId}: ${err.message}`);
     activeProcesses.delete(id);
-    realActivityMap.delete(id); // Clean up real activity tracking
+    realActivityMap.delete(id);
     completeDispatch(id, DISPATCH_RESULT.ERROR, `Spawn error: ${err.message}`);
   });

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@yemi33/minions",
-  "version": "0.1.1634",
+  "version": "0.1.1635",
   "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
   "bin": {
     "minions": "bin/minions.js"

package/playbooks/build-and-test.md CHANGED Viewed

@@ -8,156 +8,39 @@ Repo: {{repo_name}} | Org: {{ado_org}} | Project: {{ado_project}}
 Team root: {{team_root}}
 Project path: {{project_path}}
-## Your Task
+## Mission
-A new PR has been created: **{{pr_id}}** — "{{pr_title}}"
+A new PR has been created: **{{pr_id}}** - "{{pr_title}}"
 Branch: `{{pr_branch}}` | Author: {{pr_author}}
-Your job is to **check out the branch, build it, run tests, and if it's a webapp, start a local dev server** so the human reviewer can see it running.
+Run the project's normal build/test verification for this PR and report whether it is ready for human review. If it is a runnable app, identify the local URL and the exact command needed to run it.
-## Instructions
+## Long-Running Commands
-### 1. Set up a worktree for the PR branch
+Builds, dependency installs, tests, and dev servers can be quiet for a long time. Let normal CLI commands run naturally; do not add artificial heartbeat output or split commands just to show progress.
-You are already in the correct working directory on branch `{{pr_branch}}`. Do NOT create additional worktrees.
+## Approach
-### 2. Install dependencies
+Work from the current checkout prepared by the engine. Read the repo's own instructions first (`CLAUDE.md`, README, package files, Makefiles, project scripts) and adapt to the build system you find.
-Look at the project's build system (package.json, CLAUDE.md, README, Makefile, etc.) and install:
-```bash
-# Examples — use whatever the project needs:
-yarn install   # or npm install
-pip install -r requirements.txt
-dotnet restore
-```
+If build or tests fail, report the relevant errors clearly and stop. Do not fix code, push commits, or create PRs from this task.
-### 3. Build the project
+If a server/app should be run for review, include the URL and a copy-pasteable run command with absolute paths. If the server must survive after the agent exits, start it detached and record the PID, restart command, and stop command; otherwise just provide the command for the user.
-Run the project's build command:
-```bash
-# Examples:
-yarn build   # or npm run build
-dotnet build
-cargo build
-```
+## Findings
-If the build **fails**, report the errors clearly and stop. Do NOT attempt to fix the code.
+Write findings to `{{team_root}}/notes/inbox/{{agent_id}}-bt-{{pr_number}}-{{date}}.md` only after successful verification.
-> ⚠️ **Cold builds are silent for minutes** (Gradle daemon spin-up, dotnet restore, fresh `npm install`). Run them via `Bash(run_in_background: true)` then `Monitor` to stream stdout, OR pass an explicit `timeout` on the Bash call (max 600000 ms). Without one of these, the heartbeat monitor will kill the agent at ~5 min of silence. See **Long-Running Build / Test Commands** below.
+Include:
+- Branch, author, and project
+- Build status and important warnings/errors
+- Test status and failed test names if any
+- Local server status, URL, run command, PID, restart command, and stop command if applicable
+- A short summary of whether the PR is ready to review
-### 4. Run tests
+## Constraints
-```bash
-# Examples:
-yarn test   # or npm test
-pytest
-dotnet test
-```
-Report test results: how many passed, failed, skipped.
-### 5. Start a local dev server (if applicable)
-Determine if this project is a **webapp** (has a dev server, serves HTTP, has a UI):
-- Check package.json for `dev`, `start`, `serve` scripts
-- Check for frameworks: Next.js, React, Angular, Vue, Express, Flask, ASP.NET
-- Check CLAUDE.md for run instructions
-If it IS a webapp:
-1. Start the dev server **detached from your process** so it survives after you exit.
-   - If the repo docs provide a local run or background-start command, use that.
-   - Otherwise, use the detached-process mechanism that fits the current environment. Do not assume Bash, PowerShell, or any specific shell unless the repo or runtime clearly provides it.
-2. Wait a few seconds, then verify it using the repo's documented smoke test, health check, startup output, or the lightest project-appropriate manual check.
-3. Note the localhost URL, port, process identifier/PID, or equivalent runtime details the repo exposes.
-4. Output the exact restart command with **absolute worktree paths**.
-5. Include the stop command or shutdown procedure that matches how you started it.
-If it is NOT a webapp (library, CLI tool, backend service without UI), skip this step.
-## Output Format
-Write your findings to `{{team_root}}/notes/inbox/{{agent_id}}-bt-{{pr_number}}-{{date}}.md` **only after a successful verification run**: the build passed, required tests passed, and any applicable local server is running or not applicable.
-If the build fails, tests fail, dependency setup fails, or a required local server cannot start, do **not** write an inbox note. Follow the failure handling below and report the failure in your final response instead.
-Structure your report exactly like this:
-```markdown
-## Build & Test Report: {{pr_id}}
-**Branch:** {{pr_branch}}
-**Author:** {{pr_author}}
-**Project:** {{project_name}}
-### Build
-- Status: PASS
-- Notes: (any warnings or issues)
-### Tests
-- Status: PASS / SKIPPED
-- Results: X passed, 0 failed, Z skipped
-- Failed tests: none
-### Local Server
-- Status: RUNNING / NOT_APPLICABLE
-- URL: http://localhost:XXXX (if running)
-- PID / Process: <pid or equivalent identifier, if running>
-- Restart Command: `cd <absolute-path-to-worktree> && <exact start command>`
-- Stop Command: `<exact stop command or shutdown procedure>`
-### Summary
-(1-2 sentence overall assessment — is this PR safe to review?)
-```
-## Auto-file Work Items on Failure
-If the build or tests fail, create a work item so another agent can fix it. Write a JSON entry to the project's work queue:
-```bash
-# Read existing items, append new one, write back
-node -e "
-const fs = require('fs');
-const p = '{{project_path}}/.minions/work-items.json';
-const items = JSON.parse(fs.readFileSync(p, 'utf8') || '[]');
-const id = 'W' + String(items.reduce((m,i) => Math.max(m, parseInt((i.id||'').match(/(\d+)$/)?.[1]||0)), 0) + 1).padStart(3, '0');
-items.push({
-  id,
-  title: 'Fix build/test failure on PR {{pr_id}}: <SHORT DESCRIPTION OF FAILURE>',
-  type: 'fix',
-  priority: 'high',
-  description: '<PASTE THE BUILD/TEST ERROR OUTPUT HERE — keep it under 2000 chars>',
-  status: 'pending',
-  created: new Date().toISOString(),
-  createdBy: '{{agent_id}}',
-  pr: '{{pr_id}}',
-  branch: '{{pr_branch}}'
-});
-fs.writeFileSync(p, JSON.stringify(items, null, 2));
-console.log('Filed work item:', id);
-"
-```
-Replace `<SHORT DESCRIPTION OF FAILURE>` and `<PASTE THE BUILD/TEST ERROR OUTPUT HERE>` with the actual error details. The engine will pick this up on the next tick and dispatch a fix agent.
-## Rules
-- **Do NOT create pull requests** — this is a build/test task only
-- **Do NOT push commits** or modify code
-- **Do NOT attempt to fix build/test failures** — report them and file a work item
-- If starting a dev server, output the **exact restart command with absolute paths** so the user can restart it:
-  ```
-  ## Restart Command
-  cd <absolute-path-to-worktree> && <exact start command>
-  ```
-- Also include the server URL, PID/process identifier, and matching stop command.
-- Use the worktree path, NOT the main project path, for all commands
-- The worktree will persist after your process ends so the user can inspect it
-## Do not clean up the worktree
-Leave the worktree in place at `{{project_path}}/../worktrees/bt-{{pr_number}}` — the user needs it to review the running app. The engine will clean it up automatically after the PR is merged or closed.
-## When to Stop
-Your task is complete once you have: (1) built the project, (2) run tests, (3) started the app if applicable, and (4) written the success findings to the inbox file. If verification failed, stop after filing the failure work item when applicable and reporting the failure in your final response; do not write an inbox file.
+- Do not create pull requests or push commits.
+- Do not modify code unless the task explicitly changes into a fix task.
+- Use the current checkout/worktree prepared by the engine.
+- Do not remove worktrees; the engine handles cleanup automatically.

package/playbooks/fix.md CHANGED Viewed

@@ -45,7 +45,7 @@ Before pushing, prove the review fix did not break the branch:
 - Fix regressions you introduced. If failures are pre-existing or unrelated, capture the evidence and include it in the PR comment.
 - Do not push code that breaks existing tests or the build because of your changes.
-> ⚠️ **Long builds (Gradle, MSBuild, dotnet, fresh `npm install`)**: any command that may stay silent for more than ~4 minutes will be killed by the heartbeat monitor. Run it via `Bash(run_in_background: true)` then `Monitor` to stream stdout, OR pass an explicit `timeout` (max 600000 ms). See **Long-Running Build / Test Commands** below for the full pattern.
+Long builds, dependency installs, and tests may be quiet for several minutes. Let the normal CLI command run naturally; do not add artificial heartbeat output or split commands just to show progress.
 ## Publish & Comment on PR