npm - @yemi33/minions - Versions diffs - 0.1.1580 → 0.1.1582 - Mend

@yemi33/minions 0.1.1580 → 0.1.1582

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/CHANGELOG.md CHANGED Viewed

@@ -1,11 +1,19 @@
 # Changelog
-## 0.1.1580 (2026-04-28)
+## 0.1.1582 (2026-04-28)
+### Fixes
+- heartbeat kills agents on silent Monitor + false-positive success (#1792) (#1798)
+## 0.1.1581 (2026-04-28)
 ### Features
 -  stream doc chat progress
 - hash-dedup, compress+normalize pass, dynamic stale-guard, rich result
+### Fixes
+- prohibit grep-filtered Monitor for long builds (#1794) (#1797)
 ### Other
 - Keep CC streams reconnectable

package/engine/timeout.js CHANGED Viewed

@@ -183,40 +183,74 @@ function checkTimeouts(config) {
     const silentMs = Date.now() - lastActivity;
     const silentSec = Math.round(silentMs / 1000);
-    // Check if the agent actually completed (result event in live output).
-    // Read the tail of the log (last 64KB) for efficiency — result JSON is always near the end.
-    // No time cap: a stuck dispatch that produced a result must always be detected (#716).
+    // Check if the agent actually completed by looking for the [process-exit] sentinel.
+    //
+    // The sentinel is written synchronously by spawn-agent.js's proc.on('close') handler
+    // BEFORE spawn-agent itself exits, in the form:
+    //   "\n[process-exit] code=<N>\n"        — normal exit (any exit code)
+    //   "\n[process-exit] spawn-failed\n"    — synchronous spawn() throw before runFile returned
+    //
+    // This sentinel is the single source of truth for "process is gone" + "what was the
+    // exit code". We rely on the actual exit code — NOT a "subtype":"success" substring
+    // match — to decide success/error. Substring-matching `subtype:"success"` was the
+    // false-positive vector for #1792: a turn resumed via --resume emits subtype:"success"
+    // even when the agent did no real work, while the OS exit code can still be 1, so
+    // the dispatch was being marked SUCCESS for a no-op resumed session. Exit code from
+    // the [process-exit] sentinel reflects what the OS actually reported.
+    //
+    // We read only the last 64KB — the [process-exit] sentinel is always the last non-empty line of the file.
+    // No time cap: a stuck dispatch whose process has exited must always be detected (#716).
     let completedViaOutput = false;
     try {
-      let liveLog;
+      let liveLogTail;
       try {
         const fd = fs.openSync(liveLogPath, 'r');
-        const stat = fs.fstatSync(fd);
-        const TAIL_SIZE = 65536; // 64KB
-        const tailSize = Math.min(stat.size, TAIL_SIZE);
-        const buf = Buffer.alloc(tailSize);
-        fs.readSync(fd, buf, 0, tailSize, Math.max(0, stat.size - tailSize));
-        fs.closeSync(fd);
-        liveLog = buf.toString('utf8');
-      } catch { /* ENOENT or read failure — liveLog stays undefined */ }
-      if (liveLog && (liveLog.includes('"type":"result"') || liveLog.includes('\n[process-exit]'))) {
+        try {
+          const stat = fs.fstatSync(fd);
+          const TAIL_SIZE = 65536; // 64KB
+          const tailSize = Math.min(stat.size, TAIL_SIZE);
+          const buf = Buffer.alloc(tailSize);
+          fs.readSync(fd, buf, 0, tailSize, Math.max(0, stat.size - tailSize));
+          liveLogTail = buf.toString('utf8');
+        } finally { fs.closeSync(fd); }
+      } catch { /* ENOENT or read failure — liveLogTail stays undefined */ }
+      // Parse the LAST [process-exit] sentinel — code=N or "spawn-failed".
+      // Use the global regex with a manual loop so we always pick up the latest occurrence,
+      // not the first (defends against logs that somehow contain stale sentinel lines).
+      let processExited = false;
+      let processExitCode = null;
+      if (liveLogTail) {
+        const exitPattern = /\n\[process-exit\]\s+(?:code=)?(-?\d+|spawn-failed)/g;
+        let lastMatch = null;
+        let m;
+        while ((m = exitPattern.exec(liveLogTail)) !== null) lastMatch = m;
+        if (lastMatch) {
+          processExited = true;
+          processExitCode = lastMatch[1] === 'spawn-failed' ? -1 : parseInt(lastMatch[1], 10);
+        }
+      }
+      if (processExited) {
         completedViaOutput = true;
-        const isSuccess = liveLog.includes('"subtype":"success"');
-        log('info', `Agent ${item.agent} (${item.id}) completed via output detection (${isSuccess ? 'success' : 'error'})`);
+        const isSuccess = processExitCode === 0;
+        log('info', `Agent ${item.agent} (${item.id}) completed via output detection (exit code ${processExitCode}, ${isSuccess ? 'success' : 'error'})`);
         // Extract output text for the output.log — read full file for complete parsing
         const outputLogPath = path.join(AGENTS_DIR, item.agent, 'output.log');
         try {
-          const fullLog = safeRead(liveLogPath) || liveLog;
+          const fullLog = safeRead(liveLogPath) || liveLogTail;
           const { text } = shared.parseStreamJsonOutput(fullLog);
-          safeWrite(outputLogPath, `# Output for dispatch ${item.id}\n# Exit code: ${isSuccess ? 0 : 1}\n# Completed: ${ts()}\n# Detected via output scan\n\n## Result\n${text || '(no text)'}\n`);
+          safeWrite(outputLogPath, `# Output for dispatch ${item.id}\n# Exit code: ${processExitCode}\n# Completed: ${ts()}\n# Detected via output scan\n\n## Result\n${text || '(no text)'}\n`);
         } catch (e) { log('warn', 'parse output result: ' + e.message); }
-        completeDispatch(item.id, isSuccess ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR, isSuccess ? 'Completed (detected from output)' : 'Exited with error (detected from output)');
+        completeDispatch(item.id, isSuccess ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR,
+          isSuccess ? 'Completed (detected from output)' : `Exited with code ${processExitCode} (detected from output)`);
-        // Run post-completion hooks via shared helper (async — fire and forget in timeout context)
-        const fullLogForHooks = safeRead(liveLogPath) || liveLog;
-        runPostCompletionHooks(item, item.agent, isSuccess ? 0 : 1, fullLogForHooks, config).catch(e => log('warn', 'post-completion hooks: ' + e.message));
+        // Run post-completion hooks via shared helper (async — fire and forget in timeout context).
+        // Pass the actual exit code so autoRecovery (PR-created-but-failed) still works correctly.
+        const fullLogForHooks = safeRead(liveLogPath) || liveLogTail;
+        runPostCompletionHooks(item, item.agent, processExitCode, fullLogForHooks, config).catch(e => log('warn', 'post-completion hooks: ' + e.message));
         if (hasProcess) {
           shared.killImmediate(activeProcesses.get(item.id)?.proc);
@@ -224,6 +258,12 @@ function checkTimeouts(config) {
         }
         continue; // Skip orphan/hung detection — we handled it
       }
+      // Note: we DO NOT trigger on `"type":"result"` alone. There is a ~1s race between
+      // claude CLI emitting the result event and spawn-agent.js writing [process-exit] —
+      // engine.js's onAgentClose handler fires within that window for tracked processes
+      // and handles completion correctly. Triggering on result-event here would race the
+      // close handler and risk marking SUCCESS based on subtype before the actual exit
+      // code is known (#1792).
     } catch (e) { log('warn', 'output completion detection: ' + e.message); }
     // Resolve per-type heartbeat timeout: per-type map → base heartbeatTimeout fallback
@@ -247,9 +287,20 @@ function checkTimeouts(config) {
             // Agent completed but close event didn't fire — let orphan/hung detection handle it.
             // Don't set isBlocking — use base heartbeat timeout.
           } else {
-          // Find the last tool_use call in the output — check if it's a known blocking tool
+          // Find the last tool_use call in the output — check if it's a known blocking tool.
+          //
+          // Lookback depth (1000 lines) is sized for the heartbeat-noise scenario from #1792:
+          // a long-running Monitor / Bash / PowerShell call goes silent for 15+ minutes while
+          // a cold Gradle build runs. During that silence the ENGINE writes a heartbeat line
+          // every 30s (engine.js heartbeatTimer), so the live log accumulates ~120 heartbeat
+          // lines per hour AFTER the original tool_use line. A 30-line lookback misses the
+          // tool_use entirely, the detector treats the silence as non-blocking, and the
+          // agent gets killed at heartbeatTimeout despite legitimately waiting on a
+          // background process. 1000 lines covers ~8 hours of pure heartbeat noise — well
+          // beyond Monitor's 30 min effective timeout floor.
           const lines = liveLog.split('\n');
-          for (let i = lines.length - 1; i >= Math.max(0, lines.length - 30); i--) {
+          const TOOL_USE_LOOKBACK = 1000;
+          for (let i = lines.length - 1; i >= Math.max(0, lines.length - TOOL_USE_LOOKBACK); i--) {
             const line = lines[i];
             if (!line.includes('"tool_use"')) continue;
             try {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@yemi33/minions",
-  "version": "0.1.1580",
+  "version": "0.1.1582",
   "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
   "bin": {
     "minions": "bin/minions.js"

package/playbooks/shared-rules.md CHANGED Viewed

@@ -64,6 +64,8 @@ The engine kills agents that produce no stdout for `heartbeatTimeout` (default *
 Why: each line that the build emits arrives as a notification, which resets the heartbeat. You see live progress in the dashboard. The Monitor call itself is recognised by the engine as a blocking tool (heartbeat extended ~30 min).
+> ⚠️ **Never use `Monitor({ command: "tail -F <file> | grep ..." })` for long builds.** It looks tidy — only the lines you care about — but it is a heartbeat trap. Cold Gradle / MSBuild / `cargo build` spend 3–8 minutes in a startup + dependency-resolution phase that produces output that **does not match** typical filter terms (`BUILD SUCCESSFUL`, `BUILD FAILED`, `error:`). The grep filter swallows every line, Monitor emits zero notifications, the heartbeat fires at 300s, and the engine kills the agent mid-build. Always pass `bash_id` directly — every output line resets the heartbeat, and noisy output is the *whole point* of the pattern.
 ### Pattern B — Single Bash call with explicit `timeout`
 ```
@@ -75,6 +77,7 @@ The engine reads `input.timeout` from the tool call and extends the heartbeat to
 ### What NOT to do
 - Do NOT run `./gradlew`, `mvn`, `dotnet test`, or any cold-cache build as a default `Bash` call (no `timeout`, no `run_in_background`). It will hit the 120s Bash default, then the 300s heartbeat, and the engine will kill you.
+- Do NOT use `Monitor({ command: "tail | grep ..." })` for any build with a long startup phase whose output does not match the filter (cold Gradle, MSBuild, fresh `npm install`, `cargo build`). The grep filter suppresses the build's startup output, Monitor emits nothing, the heartbeat fires at 300s, and the agent is killed. Use `Monitor({ bash_id })` instead — noisy output is better than a dead agent.
 - Do NOT loop `sleep` to "wait it out" — sleep produces no stdout and looks identical to a hang.
 - Do NOT pipe through `tee` thinking that helps — heartbeat reads agent stdout, not the underlying file.