@yemi33/minions 0.1.1581 → 0.1.1582

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.1.1582 (2026-04-28)
4
+
5
+ ### Fixes
6
+ - heartbeat kills agents on silent Monitor + false-positive success (#1792) (#1798)
7
+
3
8
  ## 0.1.1581 (2026-04-28)
4
9
 
5
10
  ### Features
package/engine/timeout.js CHANGED
@@ -183,40 +183,74 @@ function checkTimeouts(config) {
183
183
  const silentMs = Date.now() - lastActivity;
184
184
  const silentSec = Math.round(silentMs / 1000);
185
185
 
186
- // Check if the agent actually completed (result event in live output).
187
- // Read the tail of the log (last 64KB) for efficiency — result JSON is always near the end.
188
- // No time cap: a stuck dispatch that produced a result must always be detected (#716).
186
+ // Check if the agent actually completed by looking for the [process-exit] sentinel.
187
+ //
188
+ // The sentinel is written synchronously by spawn-agent.js's proc.on('close') handler
189
+ // BEFORE spawn-agent itself exits, in the form:
190
+ // "\n[process-exit] code=<N>\n" — normal exit (any exit code)
191
+ // "\n[process-exit] spawn-failed\n" — synchronous spawn() throw before runFile returned
192
+ //
193
+ // This sentinel is the single source of truth for "process is gone" + "what was the
194
+ // exit code". We rely on the actual exit code — NOT a "subtype":"success" substring
195
+ // match — to decide success/error. Substring-matching `subtype:"success"` was the
196
+ // false-positive vector for #1792: a resumed --resume turn emits subtype:"success"
197
+ // even when the agent did no real work, while the OS exit code can still be 1, so
198
+ // the dispatch was being marked SUCCESS for a no-op resumed session. Exit code from
199
+ // the [process-exit] sentinel reflects what the OS actually reported.
200
+ //
201
+ // We tail 64KB — the [process-exit] sentinel is always the last non-empty line of the file.
202
+ // No time cap: a stuck dispatch whose process has exited must always be detected (#716).
189
203
  let completedViaOutput = false;
190
204
  try {
191
- let liveLog;
205
+ let liveLogTail;
192
206
  try {
193
207
  const fd = fs.openSync(liveLogPath, 'r');
194
- const stat = fs.fstatSync(fd);
195
- const TAIL_SIZE = 65536; // 64KB
196
- const tailSize = Math.min(stat.size, TAIL_SIZE);
197
- const buf = Buffer.alloc(tailSize);
198
- fs.readSync(fd, buf, 0, tailSize, Math.max(0, stat.size - tailSize));
199
- fs.closeSync(fd);
200
- liveLog = buf.toString('utf8');
201
- } catch { /* ENOENT or read failure — liveLog stays undefined */ }
202
- if (liveLog && (liveLog.includes('"type":"result"') || liveLog.includes('\n[process-exit]'))) {
208
+ try {
209
+ const stat = fs.fstatSync(fd);
210
+ const TAIL_SIZE = 65536; // 64KB
211
+ const tailSize = Math.min(stat.size, TAIL_SIZE);
212
+ const buf = Buffer.alloc(tailSize);
213
+ fs.readSync(fd, buf, 0, tailSize, Math.max(0, stat.size - tailSize));
214
+ liveLogTail = buf.toString('utf8');
215
+ } finally { fs.closeSync(fd); }
216
+ } catch { /* ENOENT or read failure — liveLogTail stays undefined */ }
217
+
218
+ // Parse the LAST [process-exit] sentinel — code=N or "spawn-failed".
219
+ // Use a global (/g) regex with a manual exec() loop so we always pick up the latest occurrence,
220
+ // not the first (defends against logs that somehow contain stale sentinel lines).
221
+ let processExited = false;
222
+ let processExitCode = null;
223
+ if (liveLogTail) {
224
+ const exitPattern = /\n\[process-exit\]\s+(?:code=)?(-?\d+|spawn-failed)/g;
225
+ let lastMatch = null;
226
+ let m;
227
+ while ((m = exitPattern.exec(liveLogTail)) !== null) lastMatch = m;
228
+ if (lastMatch) {
229
+ processExited = true;
230
+ processExitCode = lastMatch[1] === 'spawn-failed' ? -1 : parseInt(lastMatch[1], 10);
231
+ }
232
+ }
233
+
234
+ if (processExited) {
203
235
  completedViaOutput = true;
204
- const isSuccess = liveLog.includes('"subtype":"success"');
205
- log('info', `Agent ${item.agent} (${item.id}) completed via output detection (${isSuccess ? 'success' : 'error'})`);
236
+ const isSuccess = processExitCode === 0;
237
+ log('info', `Agent ${item.agent} (${item.id}) completed via output detection (exit code ${processExitCode}, ${isSuccess ? 'success' : 'error'})`);
206
238
 
207
239
  // Extract output text for the output.log — read full file for complete parsing
208
240
  const outputLogPath = path.join(AGENTS_DIR, item.agent, 'output.log');
209
241
  try {
210
- const fullLog = safeRead(liveLogPath) || liveLog;
242
+ const fullLog = safeRead(liveLogPath) || liveLogTail;
211
243
  const { text } = shared.parseStreamJsonOutput(fullLog);
212
- safeWrite(outputLogPath, `# Output for dispatch ${item.id}\n# Exit code: ${isSuccess ? 0 : 1}\n# Completed: ${ts()}\n# Detected via output scan\n\n## Result\n${text || '(no text)'}\n`);
244
+ safeWrite(outputLogPath, `# Output for dispatch ${item.id}\n# Exit code: ${processExitCode}\n# Completed: ${ts()}\n# Detected via output scan\n\n## Result\n${text || '(no text)'}\n`);
213
245
  } catch (e) { log('warn', 'parse output result: ' + e.message); }
214
246
 
215
- completeDispatch(item.id, isSuccess ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR, isSuccess ? 'Completed (detected from output)' : 'Exited with error (detected from output)');
247
+ completeDispatch(item.id, isSuccess ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR,
248
+ isSuccess ? 'Completed (detected from output)' : `Exited with code ${processExitCode} (detected from output)`);
216
249
 
217
- // Run post-completion hooks via shared helper (async — fire and forget in timeout context)
218
- const fullLogForHooks = safeRead(liveLogPath) || liveLog;
219
- runPostCompletionHooks(item, item.agent, isSuccess ? 0 : 1, fullLogForHooks, config).catch(e => log('warn', 'post-completion hooks: ' + e.message));
250
+ // Run post-completion hooks via shared helper (async — fire and forget in timeout context).
251
+ // Pass the actual exit code so autoRecovery (PR-created-but-failed) still works correctly.
252
+ const fullLogForHooks = safeRead(liveLogPath) || liveLogTail;
253
+ runPostCompletionHooks(item, item.agent, processExitCode, fullLogForHooks, config).catch(e => log('warn', 'post-completion hooks: ' + e.message));
220
254
 
221
255
  if (hasProcess) {
222
256
  shared.killImmediate(activeProcesses.get(item.id)?.proc);
@@ -224,6 +258,12 @@ function checkTimeouts(config) {
224
258
  }
225
259
  continue; // Skip orphan/hung detection — we handled it
226
260
  }
261
+ // Note: we DO NOT trigger on `"type":"result"` alone. There is a ~1s race between
262
+ // claude CLI emitting the result event and spawn-agent.js writing [process-exit] —
263
+ // engine.js's onAgentClose handler fires within that window for tracked processes
264
+ // and handles completion correctly. Triggering on result-event here would race the
265
+ // close handler and risk marking SUCCESS based on subtype before the actual exit
266
+ // code is known (#1792).
227
267
  } catch (e) { log('warn', 'output completion detection: ' + e.message); }
228
268
 
229
269
  // Resolve per-type heartbeat timeout: per-type map → base heartbeatTimeout fallback
@@ -247,9 +287,20 @@ function checkTimeouts(config) {
247
287
  // Agent completed but close event didn't fire — let orphan/hung detection handle it.
248
288
  // Don't set isBlocking — use base heartbeat timeout.
249
289
  } else {
250
- // Find the last tool_use call in the output — check if it's a known blocking tool
290
+ // Find the last tool_use call in the output — check if it's a known blocking tool.
291
+ //
292
+ // Lookback depth (1000 lines) is sized for the heartbeat-noise scenario from #1792:
293
+ // a long-running Monitor / Bash / PowerShell call goes silent for 15+ minutes while
294
+ // a cold Gradle build runs. During that silence the ENGINE writes a heartbeat line
295
+ // every 30s (engine.js heartbeatTimer), so the live log accumulates ~120 heartbeat
296
+ // lines per hour AFTER the original tool_use line. A 30-line lookback misses the
297
+ // tool_use entirely, the detector treats the silence as non-blocking, and the
298
+ // agent gets killed at heartbeatTimeout despite legitimately waiting on a
299
+ // background process. 1000 lines covers ~8 hours of pure heartbeat noise — well
300
+ // beyond Monitor's 30 min effective timeout floor.
251
301
  const lines = liveLog.split('\n');
252
- for (let i = lines.length - 1; i >= Math.max(0, lines.length - 30); i--) {
302
+ const TOOL_USE_LOOKBACK = 1000;
303
+ for (let i = lines.length - 1; i >= Math.max(0, lines.length - TOOL_USE_LOOKBACK); i--) {
253
304
  const line = lines[i];
254
305
  if (!line.includes('"tool_use"')) continue;
255
306
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.1581",
3
+ "version": "0.1.1582",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"