@yemi33/minions 0.1.1580 → 0.1.1582
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -1
- package/engine/timeout.js +74 -23
- package/package.json +1 -1
- package/playbooks/shared-rules.md +3 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,11 +1,19 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
-
## 0.1.
|
|
3
|
+
## 0.1.1582 (2026-04-28)
|
|
4
|
+
|
|
5
|
+
### Fixes
|
|
6
|
+
- heartbeat kills agents on silent Monitor + false-positive success (#1792) (#1798)
|
|
7
|
+
|
|
8
|
+
## 0.1.1581 (2026-04-28)
|
|
4
9
|
|
|
5
10
|
### Features
|
|
6
11
|
- stream doc chat progress
|
|
7
12
|
- hash-dedup, compress+normalize pass, dynamic stale-guard, rich result
|
|
8
13
|
|
|
14
|
+
### Fixes
|
|
15
|
+
- prohibit grep-filtered Monitor for long builds (#1794) (#1797)
|
|
16
|
+
|
|
9
17
|
### Other
|
|
10
18
|
- Keep CC streams reconnectable
|
|
11
19
|
|
package/engine/timeout.js
CHANGED
|
@@ -183,40 +183,74 @@ function checkTimeouts(config) {
|
|
|
183
183
|
const silentMs = Date.now() - lastActivity;
|
|
184
184
|
const silentSec = Math.round(silentMs / 1000);
|
|
185
185
|
|
|
186
|
-
// Check if the agent actually completed
|
|
187
|
-
//
|
|
188
|
-
//
|
|
186
|
+
// Check if the agent actually completed by looking for the [process-exit] sentinel.
|
|
187
|
+
//
|
|
188
|
+
// The sentinel is written synchronously by spawn-agent.js's proc.on('close') handler
|
|
189
|
+
// BEFORE spawn-agent itself exits, in the form:
|
|
190
|
+
// "\n[process-exit] code=<N>\n" — normal exit (any exit code)
|
|
191
|
+
// "\n[process-exit] spawn-failed\n" — synchronous spawn() throw before runFile returned
|
|
192
|
+
//
|
|
193
|
+
// This sentinel is the single source of truth for "process is gone" + "what was the
|
|
194
|
+
// exit code". We rely on the actual exit code — NOT a "subtype":"success" substring
|
|
195
|
+
// match — to decide success/error. Substring-matching `subtype:"success"` was the
|
|
196
|
+
// false-positive vector for #1792: a turn resumed via --resume emits subtype:"success"
|
|
197
|
+
// even when the agent did no real work, while the OS exit code can still be 1, so
|
|
198
|
+
// the dispatch was being marked SUCCESS for a no-op resumed session. Exit code from
|
|
199
|
+
// the [process-exit] sentinel reflects what the OS actually reported.
|
|
200
|
+
//
|
|
201
|
+
// We tail 64KB — process-exit is always the last non-empty line of the file.
|
|
202
|
+
// No time cap: a stuck dispatch whose process has exited must always be detected (#716).
|
|
189
203
|
let completedViaOutput = false;
|
|
190
204
|
try {
|
|
191
|
-
let
|
|
205
|
+
let liveLogTail;
|
|
192
206
|
try {
|
|
193
207
|
const fd = fs.openSync(liveLogPath, 'r');
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
208
|
+
try {
|
|
209
|
+
const stat = fs.fstatSync(fd);
|
|
210
|
+
const TAIL_SIZE = 65536; // 64KB
|
|
211
|
+
const tailSize = Math.min(stat.size, TAIL_SIZE);
|
|
212
|
+
const buf = Buffer.alloc(tailSize);
|
|
213
|
+
fs.readSync(fd, buf, 0, tailSize, Math.max(0, stat.size - tailSize));
|
|
214
|
+
liveLogTail = buf.toString('utf8');
|
|
215
|
+
} finally { fs.closeSync(fd); }
|
|
216
|
+
} catch { /* ENOENT or read failure — liveLogTail stays undefined */ }
|
|
217
|
+
|
|
218
|
+
// Parse the LAST [process-exit] sentinel — code=N or "spawn-failed".
|
|
219
|
+
// Use the global regex with a manual loop so we always pick up the latest occurrence,
|
|
220
|
+
// not the first (defends against logs that somehow contain stale sentinel lines).
|
|
221
|
+
let processExited = false;
|
|
222
|
+
let processExitCode = null;
|
|
223
|
+
if (liveLogTail) {
|
|
224
|
+
const exitPattern = /\n\[process-exit\]\s+(?:code=)?(-?\d+|spawn-failed)/g;
|
|
225
|
+
let lastMatch = null;
|
|
226
|
+
let m;
|
|
227
|
+
while ((m = exitPattern.exec(liveLogTail)) !== null) lastMatch = m;
|
|
228
|
+
if (lastMatch) {
|
|
229
|
+
processExited = true;
|
|
230
|
+
processExitCode = lastMatch[1] === 'spawn-failed' ? -1 : parseInt(lastMatch[1], 10);
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
if (processExited) {
|
|
203
235
|
completedViaOutput = true;
|
|
204
|
-
const isSuccess =
|
|
205
|
-
log('info', `Agent ${item.agent} (${item.id}) completed via output detection (${isSuccess ? 'success' : 'error'})`);
|
|
236
|
+
const isSuccess = processExitCode === 0;
|
|
237
|
+
log('info', `Agent ${item.agent} (${item.id}) completed via output detection (exit code ${processExitCode}, ${isSuccess ? 'success' : 'error'})`);
|
|
206
238
|
|
|
207
239
|
// Extract output text for the output.log — read full file for complete parsing
|
|
208
240
|
const outputLogPath = path.join(AGENTS_DIR, item.agent, 'output.log');
|
|
209
241
|
try {
|
|
210
|
-
const fullLog = safeRead(liveLogPath) ||
|
|
242
|
+
const fullLog = safeRead(liveLogPath) || liveLogTail;
|
|
211
243
|
const { text } = shared.parseStreamJsonOutput(fullLog);
|
|
212
|
-
safeWrite(outputLogPath, `# Output for dispatch ${item.id}\n# Exit code: ${
|
|
244
|
+
safeWrite(outputLogPath, `# Output for dispatch ${item.id}\n# Exit code: ${processExitCode}\n# Completed: ${ts()}\n# Detected via output scan\n\n## Result\n${text || '(no text)'}\n`);
|
|
213
245
|
} catch (e) { log('warn', 'parse output result: ' + e.message); }
|
|
214
246
|
|
|
215
|
-
completeDispatch(item.id, isSuccess ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR,
|
|
247
|
+
completeDispatch(item.id, isSuccess ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR,
|
|
248
|
+
isSuccess ? 'Completed (detected from output)' : `Exited with code ${processExitCode} (detected from output)`);
|
|
216
249
|
|
|
217
|
-
// Run post-completion hooks via shared helper (async — fire and forget in timeout context)
|
|
218
|
-
|
|
219
|
-
|
|
250
|
+
// Run post-completion hooks via shared helper (async — fire and forget in timeout context).
|
|
251
|
+
// Pass the actual exit code so autoRecovery (PR-created-but-failed) still works correctly.
|
|
252
|
+
const fullLogForHooks = safeRead(liveLogPath) || liveLogTail;
|
|
253
|
+
runPostCompletionHooks(item, item.agent, processExitCode, fullLogForHooks, config).catch(e => log('warn', 'post-completion hooks: ' + e.message));
|
|
220
254
|
|
|
221
255
|
if (hasProcess) {
|
|
222
256
|
shared.killImmediate(activeProcesses.get(item.id)?.proc);
|
|
@@ -224,6 +258,12 @@ function checkTimeouts(config) {
|
|
|
224
258
|
}
|
|
225
259
|
continue; // Skip orphan/hung detection — we handled it
|
|
226
260
|
}
|
|
261
|
+
// Note: we DO NOT trigger on `"type":"result"` alone. There is a ~1s race between
|
|
262
|
+
// claude CLI emitting the result event and spawn-agent.js writing [process-exit] —
|
|
263
|
+
// engine.js's onAgentClose handler fires within that window for tracked processes
|
|
264
|
+
// and handles completion correctly. Triggering on result-event here would race the
|
|
265
|
+
// close handler and risk marking SUCCESS based on subtype before the actual exit
|
|
266
|
+
// code is known (#1792).
|
|
227
267
|
} catch (e) { log('warn', 'output completion detection: ' + e.message); }
|
|
228
268
|
|
|
229
269
|
// Resolve per-type heartbeat timeout: per-type map → base heartbeatTimeout fallback
|
|
@@ -247,9 +287,20 @@ function checkTimeouts(config) {
|
|
|
247
287
|
// Agent completed but close event didn't fire — let orphan/hung detection handle it.
|
|
248
288
|
// Don't set isBlocking — use base heartbeat timeout.
|
|
249
289
|
} else {
|
|
250
|
-
// Find the last tool_use call in the output — check if it's a known blocking tool
|
|
290
|
+
// Find the last tool_use call in the output — check if it's a known blocking tool.
|
|
291
|
+
//
|
|
292
|
+
// Lookback depth (1000 lines) is sized for the heartbeat-noise scenario from #1792:
|
|
293
|
+
// a long-running Monitor / Bash / PowerShell call goes silent for 15+ minutes while
|
|
294
|
+
// a cold Gradle build runs. During that silence the ENGINE writes a heartbeat line
|
|
295
|
+
// every 30s (engine.js heartbeatTimer), so the live log accumulates ~120 heartbeat
|
|
296
|
+
// lines per hour AFTER the original tool_use line. A 30-line lookback misses the
|
|
297
|
+
// tool_use entirely, the detector treats the silence as non-blocking, and the
|
|
298
|
+
// agent gets killed at heartbeatTimeout despite legitimately waiting on a
|
|
299
|
+
// background process. 1000 lines covers ~8 hours of pure heartbeat noise — well
|
|
300
|
+
// beyond Monitor's 30 min effective timeout floor.
|
|
251
301
|
const lines = liveLog.split('\n');
|
|
252
|
-
|
|
302
|
+
const TOOL_USE_LOOKBACK = 1000;
|
|
303
|
+
for (let i = lines.length - 1; i >= Math.max(0, lines.length - TOOL_USE_LOOKBACK); i--) {
|
|
253
304
|
const line = lines[i];
|
|
254
305
|
if (!line.includes('"tool_use"')) continue;
|
|
255
306
|
try {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yemi33/minions",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.1582",
|
|
4
4
|
"description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
|
|
5
5
|
"bin": {
|
|
6
6
|
"minions": "bin/minions.js"
|
|
@@ -64,6 +64,8 @@ The engine kills agents that produce no stdout for `heartbeatTimeout` (default *
|
|
|
64
64
|
|
|
65
65
|
Why: each line that the build emits arrives as a notification, which resets the heartbeat. You see live progress in the dashboard. The Monitor call itself is recognised by the engine as a blocking tool (heartbeat extended ~30 min).
|
|
66
66
|
|
|
67
|
+
> ⚠️ **Never use `Monitor({ command: "tail -F <file> | grep ..." })` for long builds.** It looks tidy — only the lines you care about — but it is a heartbeat trap. Cold Gradle / MSBuild / `cargo build` spend 3–8 minutes in a startup + dependency-resolution phase that produces output that **does not match** typical filter terms (`BUILD SUCCESSFUL`, `BUILD FAILED`, `error:`). The grep filter swallows every line, Monitor emits zero notifications, the heartbeat fires at 300s, and the engine kills the agent mid-build. Always pass `bash_id` directly — every output line resets the heartbeat, and noisy output is the *whole point* of the pattern.
|
|
68
|
+
|
|
67
69
|
### Pattern B — Single Bash call with explicit `timeout`
|
|
68
70
|
|
|
69
71
|
```
|
|
@@ -75,6 +77,7 @@ The engine reads `input.timeout` from the tool call and extends the heartbeat to
|
|
|
75
77
|
### What NOT to do
|
|
76
78
|
|
|
77
79
|
- Do NOT run `./gradlew`, `mvn`, `dotnet test`, or any cold-cache build as a default `Bash` call (no `timeout`, no `run_in_background`). It will hit the 120s Bash default, then the 300s heartbeat, and the engine will kill you.
|
|
80
|
+
- Do NOT use `Monitor({ command: "tail | grep ..." })` for any build with a long startup phase whose output does not match the filter (cold Gradle, MSBuild, fresh `npm install`, `cargo build`). The grep filter suppresses the build's startup output, Monitor emits nothing, the heartbeat fires at 300s, and the agent is killed. Use `Monitor({ bash_id })` instead — noisy output is better than a dead agent.
|
|
78
81
|
- Do NOT loop `sleep` to "wait it out" — sleep produces no stdout and looks identical to a hang.
|
|
79
82
|
- Do NOT pipe through `tee` thinking that helps — heartbeat reads agent stdout, not the underlying file.
|
|
80
83
|
|