@yemi33/minions 0.1.1634 → 0.1.1636
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/README.md +11 -11
- package/dashboard.js +42 -9
- package/docs/auto-discovery.md +17 -15
- package/docs/blog-first-successful-dispatch.md +7 -10
- package/docs/engine-restart.md +8 -11
- package/docs/human-vs-automated.md +3 -4
- package/docs/pr-review-fix-loop.md +1 -1
- package/docs/rfc-completion-json.md +5 -5
- package/engine/copilot-models.json +1 -1
- package/engine/lifecycle.js +1 -1
- package/engine/queries.js +4 -4
- package/engine/shared.js +4 -12
- package/engine/timeout.js +59 -168
- package/engine.js +11 -42
- package/package.json +1 -1
- package/playbooks/build-and-test.md +22 -139
- package/playbooks/fix.md +1 -1
- package/playbooks/implement-shared.md +1 -1
- package/playbooks/implement.md +3 -7
- package/playbooks/shared-rules.md +4 -45
- package/playbooks/test.md +17 -40
- package/playbooks/verify.md +29 -141
- package/playbooks/work-item.md +1 -0
package/engine/timeout.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* engine/timeout.js —
|
|
3
|
-
* Extracted from engine.js for modularity. No logic changes.
|
|
2
|
+
* engine/timeout.js — Runtime timeout, stale-orphan cleanup, steering, and idle checks.
|
|
4
3
|
*/
|
|
5
4
|
|
|
6
5
|
const fs = require('fs');
|
|
@@ -124,6 +123,28 @@ function checkSteering(config) {
|
|
|
124
123
|
|
|
125
124
|
// ─── Timeout Checker ─────────────────────────────────────────────────────────
|
|
126
125
|
|
|
126
|
+
/**
 * Resolve the OS process id recorded for a tracked dispatch entry.
 *
 * Looks at `procInfo.proc.pid` first, then `procInfo.pid`, and coerces the
 * value with `Number()` so numeric strings are accepted.
 *
 * @param {object|null|undefined} procInfo - Entry from the active-process map.
 * @returns {number|null} A finite, positive pid, or `null` when none is recorded.
 */
function trackedProcessPid(procInfo) {
  const pid = Number(procInfo?.proc?.pid || procInfo?.pid || 0);
  return Number.isFinite(pid) && pid > 0 ? pid : null;
}

/**
 * Report whether the process behind a tracked dispatch entry still exists.
 *
 * Order of checks:
 *   1. No entry at all → not alive.
 *   2. The process handle already recorded an exit (own `exitCode` that is not
 *      `null`, or a non-null `signalCode`) → not alive.
 *   3. No usable pid → fall back to the handle: alive unless it is missing or
 *      flagged `killed`.
 *   4. Otherwise probe the pid with signal 0, which performs the existence /
 *      permission check without delivering a signal.
 *
 * @param {object|null|undefined} procInfo - Entry from the active-process map.
 * @returns {boolean} `true` when a process with the recorded pid exists.
 */
function isTrackedProcessAlive(procInfo) {
  if (!procInfo) return false;

  const proc = procInfo.proc;
  if (proc) {
    // An own `exitCode` that is no longer null means the handle saw the process exit.
    if (Object.prototype.hasOwnProperty.call(proc, 'exitCode') && proc.exitCode !== null) {
      return false;
    }
    // A process terminated by a signal keeps `exitCode === null` and sets `signalCode` instead.
    if (proc.signalCode != null) {
      return false;
    }
  }

  const pid = trackedProcessPid(procInfo);
  if (!pid) return !!proc && proc.killed !== true;

  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM: the pid exists but we may not signal it — that is still a live process.
    // Anything else (ESRCH, invalid argument, …) is treated as "not alive".
    return err?.code === 'EPERM';
  }
}
|
|
147
|
+
|
|
127
148
|
function checkTimeouts(config) {
|
|
128
149
|
const activeProcesses = engine().activeProcesses;
|
|
129
150
|
const engineRestartGraceUntil = engine().engineRestartGraceUntil;
|
|
@@ -132,10 +153,10 @@ function checkTimeouts(config) {
|
|
|
132
153
|
const { runPostCompletionHooks } = require('./lifecycle');
|
|
133
154
|
|
|
134
155
|
const timeout = config.engine?.agentTimeout || ENGINE_DEFAULTS.agentTimeout;
|
|
135
|
-
const
|
|
156
|
+
const defaultStaleOrphanTimeout = config.engine?.heartbeatTimeout || ENGINE_DEFAULTS.heartbeatTimeout;
|
|
136
157
|
|
|
137
|
-
//
|
|
138
|
-
const
|
|
158
|
+
// Optional per-type stale-orphan timeouts: merge ENGINE_DEFAULTS ← config overrides.
|
|
159
|
+
const perTypeStaleOrphanTimeouts = { ...ENGINE_DEFAULTS.heartbeatTimeouts, ...(config.engine?.heartbeatTimeouts || {}) };
|
|
139
160
|
|
|
140
161
|
// 1. Check tracked processes for hard timeout (supports per-item deadline from fan-out)
|
|
141
162
|
for (const [id, info] of activeProcesses.entries()) {
|
|
@@ -148,37 +169,32 @@ function checkTimeouts(config) {
|
|
|
148
169
|
}
|
|
149
170
|
}
|
|
150
171
|
|
|
151
|
-
// 2.
|
|
152
|
-
//
|
|
172
|
+
// 2. Stale-orphan check — for ALL active dispatch items (catches lost process handles after restart).
|
|
173
|
+
// Silence is not a failure for tracked live processes: long CLI commands can legitimately
|
|
174
|
+
// produce no stdout/stderr for extended periods.
|
|
153
175
|
const dispatchData = getDispatch();
|
|
154
176
|
const deadItems = [];
|
|
155
|
-
const
|
|
177
|
+
const legacyAnnotationClears = new Set();
|
|
156
178
|
|
|
157
179
|
for (const item of (dispatchData.active || [])) {
|
|
158
180
|
if (!item.agent) continue;
|
|
159
181
|
|
|
160
|
-
// Per-type
|
|
182
|
+
// Per-type stale-orphan timeout: look up work type from dispatch item, fall back to default.
|
|
161
183
|
const workType = item.workType || item.meta?.item?.type;
|
|
162
|
-
const
|
|
184
|
+
const staleOrphanTimeout = (workType && perTypeStaleOrphanTimeouts[workType]) || defaultStaleOrphanTimeout;
|
|
163
185
|
|
|
164
|
-
const
|
|
186
|
+
const procInfo = activeProcesses.get(item.id);
|
|
187
|
+
const hasProcess = !!procInfo;
|
|
188
|
+
const processAlive = isTrackedProcessAlive(procInfo);
|
|
165
189
|
const liveLogPath = path.join(AGENTS_DIR, item.agent, 'live-output.log');
|
|
166
190
|
let lastActivity = item.started_at ? new Date(item.started_at).getTime() : 0;
|
|
167
191
|
|
|
168
|
-
//
|
|
169
|
-
//
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
} else {
|
|
175
|
-
// Orphan case (no tracked process): use live-output.log mtime as fallback.
|
|
176
|
-
// No heartbeat timer is running for orphans, so mtime is accurate.
|
|
177
|
-
try {
|
|
178
|
-
const stat = fs.statSync(liveLogPath);
|
|
179
|
-
lastActivity = Math.max(lastActivity, stat.mtimeMs);
|
|
180
|
-
} catch { /* optional */ }
|
|
181
|
-
}
|
|
192
|
+
// live-output.log mtime is only used for stale-orphan cleanup and completion recovery.
|
|
193
|
+
// It is not used as an output-silence timeout for live tracked processes.
|
|
194
|
+
try {
|
|
195
|
+
const stat = fs.statSync(liveLogPath);
|
|
196
|
+
lastActivity = Math.max(lastActivity, stat.mtimeMs);
|
|
197
|
+
} catch { /* optional */ }
|
|
182
198
|
|
|
183
199
|
const silentMs = Date.now() - lastActivity;
|
|
184
200
|
const silentSec = Math.round(silentMs / 1000);
|
|
@@ -266,126 +282,26 @@ function checkTimeouts(config) {
|
|
|
266
282
|
// code is known (#1792).
|
|
267
283
|
} catch (e) { log('warn', 'output completion detection: ' + e.message); }
|
|
268
284
|
|
|
269
|
-
//
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
// These tools produce no stdout for extended periods — don't kill them prematurely
|
|
274
|
-
// Check for BOTH tracked and untracked processes (orphan case after engine restart)
|
|
275
|
-
// Skip if agent already completed — blocking tool detection on stale tool calls
|
|
276
|
-
// would extend the timeout indefinitely for dead agents (#716).
|
|
277
|
-
let isBlocking = false;
|
|
278
|
-
let blockingTimeout = itemHeartbeat;
|
|
279
|
-
let blockingTool = '';
|
|
280
|
-
if (silentMs > itemHeartbeat) {
|
|
281
|
-
try {
|
|
282
|
-
const liveLog = safeRead(liveLogPath);
|
|
283
|
-
if (liveLog) {
|
|
284
|
-
// If the output contains a result event or process-exit sentinel, the agent is done.
|
|
285
|
-
// Don't extend timeout for stale blocking tool calls from before the result (#716).
|
|
286
|
-
if (liveLog.includes('"type":"result"') || liveLog.includes('\n[process-exit]')) {
|
|
287
|
-
// Agent completed but close event didn't fire — let orphan/hung detection handle it.
|
|
288
|
-
// Don't set isBlocking — use base heartbeat timeout.
|
|
289
|
-
} else {
|
|
290
|
-
// Find the last tool_use call in the output — check if it's a known blocking tool.
|
|
291
|
-
//
|
|
292
|
-
// Lookback depth (1000 lines) is sized for the heartbeat-noise scenario from #1792:
|
|
293
|
-
// a long-running Monitor / Bash / PowerShell call goes silent for 15+ minutes while
|
|
294
|
-
// a cold Gradle build runs. During that silence the ENGINE writes a heartbeat line
|
|
295
|
-
// every 30s (engine.js heartbeatTimer), so the live log accumulates ~120 heartbeat
|
|
296
|
-
// lines per hour AFTER the original tool_use line. A 30-line lookback misses the
|
|
297
|
-
// tool_use entirely, the detector treats the silence as non-blocking, and the
|
|
298
|
-
// agent gets killed at heartbeatTimeout despite legitimately waiting on a
|
|
299
|
-
// background process. 1000 lines covers ~8 hours of pure heartbeat noise — well
|
|
300
|
-
// beyond Monitor's 30 min effective timeout floor.
|
|
301
|
-
const lines = liveLog.split('\n');
|
|
302
|
-
const TOOL_USE_LOOKBACK = 1000;
|
|
303
|
-
for (let i = lines.length - 1; i >= Math.max(0, lines.length - TOOL_USE_LOOKBACK); i--) {
|
|
304
|
-
const line = lines[i];
|
|
305
|
-
if (!line.includes('"tool_use"')) continue;
|
|
306
|
-
try {
|
|
307
|
-
const parsed = JSON.parse(line);
|
|
308
|
-
const toolUse = parsed?.message?.content?.find?.(c => c.type === 'tool_use');
|
|
309
|
-
if (!toolUse) continue;
|
|
310
|
-
const input = toolUse.input || {};
|
|
311
|
-
const name = toolUse.name || '';
|
|
312
|
-
// TaskOutput with block:true — waiting for a background task
|
|
313
|
-
if (name === 'TaskOutput' && input.block === true) {
|
|
314
|
-
const taskTimeout = input.timeout || 600000; // default 10min
|
|
315
|
-
blockingTimeout = Math.max(itemHeartbeat, taskTimeout + 60000); // task timeout + 1min grace
|
|
316
|
-
isBlocking = true;
|
|
317
|
-
blockingTool = 'TaskOutput';
|
|
318
|
-
}
|
|
319
|
-
// Bash tool call — may be running a long build/install with no stdout
|
|
320
|
-
if (name === 'Bash') {
|
|
321
|
-
// Use explicit timeout if set, otherwise match Claude Code's actual Bash default (120s)
|
|
322
|
-
const bashTimeout = input.timeout || 120000;
|
|
323
|
-
blockingTimeout = Math.max(itemHeartbeat, bashTimeout + 60000);
|
|
324
|
-
isBlocking = true;
|
|
325
|
-
blockingTool = 'Bash';
|
|
326
|
-
}
|
|
327
|
-
// PowerShell tool call — Windows-native shell with same explicit-timeout
|
|
328
|
-
// semantics as Bash (input.timeout, max 600s). Required for projects that
|
|
329
|
-
// build via PowerShell on Windows (gradlew.bat, MSBuild, dotnet test) where
|
|
330
|
-
// the cold-start phase produces no stdout for several minutes (#1786).
|
|
331
|
-
if (name === 'PowerShell') {
|
|
332
|
-
const psTimeout = input.timeout || 120000;
|
|
333
|
-
blockingTimeout = Math.max(itemHeartbeat, psTimeout + 60000);
|
|
334
|
-
isBlocking = true;
|
|
335
|
-
blockingTool = 'PowerShell';
|
|
336
|
-
}
|
|
337
|
-
// Monitor tool call — blocks waiting for stdout-line notifications from a
|
|
338
|
-
// background process started via Bash with run_in_background. Between
|
|
339
|
-
// notifications the call produces no output, so the heartbeat monitor
|
|
340
|
-
// must extend timeout. No fixed timeout on Monitor — match Agent (30min)
|
|
341
|
-
// since both are inherently long-running waits (#1786).
|
|
342
|
-
if (name === 'Monitor') {
|
|
343
|
-
blockingTimeout = Math.max(itemHeartbeat, 1800000); // 30min for background process waits
|
|
344
|
-
isBlocking = true;
|
|
345
|
-
blockingTool = 'Monitor';
|
|
346
|
-
}
|
|
347
|
-
// Agent (subagent) tool call — parent waits silently for child to complete
|
|
348
|
-
if (name === 'Agent') {
|
|
349
|
-
blockingTimeout = Math.max(itemHeartbeat, 1800000); // 30min for subagents
|
|
350
|
-
isBlocking = true;
|
|
351
|
-
blockingTool = 'Agent';
|
|
352
|
-
}
|
|
353
|
-
break; // only check the most recent tool_use
|
|
354
|
-
} catch { /* JSON parse — line may not be valid JSON */ }
|
|
355
|
-
}
|
|
356
|
-
if (isBlocking) {
|
|
357
|
-
// Only log on transition — avoid spamming every tick while blocking persists
|
|
358
|
-
if (!item._blockingToolCall) {
|
|
359
|
-
log('info', `Agent ${item.agent} (${item.id}) is in a blocking tool call (${blockingTool}) — extended timeout to ${Math.round(blockingTimeout / 1000)}s (silent for ${silentSec}s)`, { event: 'blocking_tool_call_detected' });
|
|
360
|
-
}
|
|
361
|
-
blockingAnnotations.set(item.id, {
|
|
362
|
-
tool: blockingTool,
|
|
363
|
-
silentMs,
|
|
364
|
-
remainingMs: Math.max(0, blockingTimeout - silentMs),
|
|
365
|
-
});
|
|
366
|
-
}
|
|
367
|
-
} // close else
|
|
368
|
-
} // close if (liveLog)
|
|
369
|
-
} catch (e) { log('warn', 'blocking tool detection: ' + e.message); }
|
|
370
|
-
}
|
|
371
|
-
// Agent recovered from blocking state — clear annotation
|
|
372
|
-
if (!isBlocking && item._blockingToolCall) {
|
|
373
|
-
blockingAnnotations.set(item.id, null);
|
|
285
|
+
// Blocking tool annotations are no longer needed: live tracked processes are allowed to
|
|
286
|
+
// be quiet regardless of which command/tool is running.
|
|
287
|
+
if (item._blockingToolCall) {
|
|
288
|
+
legacyAnnotationClears.add(item.id);
|
|
374
289
|
}
|
|
375
290
|
|
|
376
|
-
const effectiveTimeout = isBlocking ? blockingTimeout : itemHeartbeat;
|
|
377
|
-
|
|
378
291
|
// Skip recently-steered agents — they're being killed and re-spawned
|
|
379
|
-
const procInfo = activeProcesses.get(item.id);
|
|
380
292
|
if (procInfo?._steeringAt && Date.now() - procInfo._steeringAt < 60000) continue;
|
|
381
293
|
|
|
382
|
-
|
|
294
|
+
if (processAlive) {
|
|
295
|
+
continue;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
// Capture live-output.log file state for orphan diagnostics
|
|
383
299
|
// (#W-mo248lkjwgsu original, #W-mo25loq8kjer pid annotation).
|
|
384
300
|
// Four distinguishable failure modes:
|
|
385
301
|
// logExists=false → spawn call itself threw, no log ever written
|
|
386
302
|
// logExists=true pidPresent=false → engine stub written but spawn died before emitting pid line
|
|
387
|
-
// logExists=true pidPresent=true silent → process spawned (pid recorded) but
|
|
388
|
-
// logExists=true pidPresent=true size>pid →
|
|
303
|
+
// logExists=true pidPresent=true silent → process spawned (pid recorded) but no recent output
|
|
304
|
+
// logExists=true pidPresent=true size>pid → process handle was lost after output was written
|
|
389
305
|
//
|
|
390
306
|
// The pid line `[<iso>] pid: <N>` is stamped by engine.js immediately after runFile() returns.
|
|
391
307
|
// Its presence → the child process was actually spawned; absence → spawn itself failed or the
|
|
@@ -408,33 +324,15 @@ function checkTimeouts(config) {
|
|
|
408
324
|
_logState = `logExists=true logSize=${lst.size} pidPresent=${pidPresent}`;
|
|
409
325
|
} catch { /* ENOENT — keep default */ }
|
|
410
326
|
|
|
411
|
-
if (!
|
|
412
|
-
// No tracked process AND no recent output past
|
|
413
|
-
log('warn', `Orphan detected: ${item.agent} (${item.id}) — no process tracked, silent for ${silentSec}s
|
|
327
|
+
if (!processAlive && silentMs > staleOrphanTimeout && (Date.now() > engineRestartGraceUntil || engineRestartGraceExempt?.has(item.id))) {
|
|
328
|
+
// No live tracked process AND silent past the stale-orphan timeout AND (grace period expired OR confirmed-dead at restart) → orphaned
|
|
329
|
+
log('warn', `Orphan detected: ${item.agent} (${item.id}) — no live process tracked, silent for ${silentSec}s [${_logState}]`);
|
|
414
330
|
dispatch().updateAgentStatus(item.id, AGENT_STATUS.TIMED_OUT, `Orphaned — no process, silent for ${silentSec}s`);
|
|
415
331
|
// Clear session so retry starts fresh
|
|
416
332
|
try { shared.safeUnlink(path.join(AGENTS_DIR, item.agent, 'session.json')); } catch {}
|
|
417
333
|
deadItems.push({ item, reason: `Orphaned — no process, silent for ${silentSec}s` });
|
|
418
|
-
|
|
419
|
-
// Has process but no output past effective timeout → hung
|
|
420
|
-
log('warn', `Hung agent: ${item.agent} (${item.id}) — process exists but no output for ${silentSec}s${isBlocking ? ' (blocking timeout exceeded)' : ''} [${_logState}]`);
|
|
421
|
-
dispatch().updateAgentStatus(item.id, AGENT_STATUS.TIMED_OUT, `Hung — no output for ${silentSec}s`);
|
|
422
|
-
const procInfo = activeProcesses.get(item.id);
|
|
423
|
-
if (procInfo) {
|
|
424
|
-
shared.killGracefully(procInfo.proc, 5000);
|
|
425
|
-
// On Unix, also kill child process tree (killGracefully only hits parent PID)
|
|
426
|
-
if (process.platform !== 'win32' && procInfo.proc?.pid) {
|
|
427
|
-
setTimeout(() => {
|
|
428
|
-
try { shared.exec(`pkill -KILL -P ${procInfo.proc.pid}`, { timeout: 3000 }); } catch { /* children may already be dead */ }
|
|
429
|
-
}, 6000); // after grace period
|
|
430
|
-
}
|
|
431
|
-
activeProcesses.delete(item.id);
|
|
432
|
-
}
|
|
433
|
-
// Clear session so retry starts fresh instead of resuming the killed session
|
|
434
|
-
try { shared.safeUnlink(path.join(AGENTS_DIR, item.agent, 'session.json')); } catch {}
|
|
435
|
-
deadItems.push({ item, reason: `Hung — no output for ${silentSec}s` });
|
|
334
|
+
activeProcesses.delete(item.id);
|
|
436
335
|
}
|
|
437
|
-
// If has process and recent output → healthy, let it run
|
|
438
336
|
}
|
|
439
337
|
|
|
440
338
|
// Clean up dead items
|
|
@@ -442,19 +340,12 @@ function checkTimeouts(config) {
|
|
|
442
340
|
completeDispatch(item.id, DISPATCH_RESULT.ERROR, reason);
|
|
443
341
|
}
|
|
444
342
|
|
|
445
|
-
//
|
|
446
|
-
|
|
447
|
-
if (blockingAnnotations.size > 0) {
|
|
343
|
+
// Clear legacy blocking-tool annotations; process liveness no longer depends on tool parsing.
|
|
344
|
+
if (legacyAnnotationClears.size > 0) {
|
|
448
345
|
const { mutateDispatch: mutateDispatchFn } = dispatch();
|
|
449
346
|
mutateDispatchFn((dp) => {
|
|
450
347
|
for (const activeItem of dp.active) {
|
|
451
|
-
if (
|
|
452
|
-
const ann = blockingAnnotations.get(activeItem.id);
|
|
453
|
-
if (ann) {
|
|
454
|
-
activeItem._blockingToolCall = ann;
|
|
455
|
-
} else {
|
|
456
|
-
delete activeItem._blockingToolCall;
|
|
457
|
-
}
|
|
348
|
+
if (legacyAnnotationClears.has(activeItem.id)) delete activeItem._blockingToolCall;
|
|
458
349
|
}
|
|
459
350
|
});
|
|
460
351
|
}
|
package/engine.js
CHANGED
|
@@ -145,7 +145,7 @@ const { runPostCompletionHooks, updateWorkItemStatus, syncPrdItemStatus, reconci
|
|
|
145
145
|
// ─── Agent Spawner ──────────────────────────────────────────────────────────
|
|
146
146
|
|
|
147
147
|
const activeProcesses = new Map(); // dispatchId → { proc, agentId, startedAt }
|
|
148
|
-
const realActivityMap = new Map(); // dispatchId → timestamp of last
|
|
148
|
+
const realActivityMap = new Map(); // dispatchId → timestamp of last agent stdout/stderr
|
|
149
149
|
// tempAgents imported from engine/routing.js
|
|
150
150
|
let engineRestartGraceUntil = 0; // timestamp — suppress orphan detection until this time
|
|
151
151
|
const engineRestartGraceExempt = new Set(); // dispatch IDs with confirmed-dead PIDs at restart — bypass grace period
|
|
@@ -983,17 +983,12 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
983
983
|
throw spawnErr;
|
|
984
984
|
}
|
|
985
985
|
|
|
986
|
-
// Seed realActivityMap and stamp PID immediately — BEFORE any handlers
|
|
986
|
+
// Seed realActivityMap and stamp PID immediately — BEFORE any handlers (#W-mo25loq8kjer).
|
|
987
987
|
// Why NOW, not later in the function:
|
|
988
|
-
// 1.
|
|
989
|
-
// tracked processes; when the map has no entry, it falls back to item.started_at (dispatch time,
|
|
990
|
-
// which is 20-60s before actual spawn for write tasks doing worktree setup). Read-only tasks
|
|
991
|
-
// that produce no stdout for minutes (explore, security audit, large scans) were hitting
|
|
992
|
-
// heartbeatTimeout prematurely — clock had already been running since dispatch.
|
|
993
|
-
// 2. Error-handler race. The `proc.on('error', ...)` handler below calls realActivityMap.delete(id)
|
|
988
|
+
// 1. Error-handler race. The `proc.on('error', ...)` handler below calls realActivityMap.delete(id)
|
|
994
989
|
// on synchronous spawn failures. Seeding before registering handlers ensures delete sees a value
|
|
995
990
|
// to clear rather than leaving an absent-then-absent no-op that downstream code must guard.
|
|
996
|
-
//
|
|
991
|
+
// 2. Orphan diagnostics. The PID line gives timeout.js a deterministic way to tell "spawn died
|
|
997
992
|
// before first write" (stub-only log) from "process started and is hung" (stub + pid line).
|
|
998
993
|
realActivityMap.set(id, Date.now());
|
|
999
994
|
try {
|
|
@@ -1003,24 +998,12 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1003
998
|
const MAX_OUTPUT = 1024 * 1024; // 1MB
|
|
1004
999
|
let stdout = '';
|
|
1005
1000
|
let stderr = '';
|
|
1006
|
-
let lastOutputAt = Date.now();
|
|
1007
|
-
let heartbeatTimer = null;
|
|
1008
1001
|
let _trustCheckDone = false;
|
|
1009
1002
|
const _spawnTime = Date.now();
|
|
1010
1003
|
|
|
1011
|
-
// Keep live log active even when the agent produces no stdout/stderr for long stretches.
|
|
1012
|
-
// This makes "silent but running" states visible in the dashboard tail view.
|
|
1013
|
-
heartbeatTimer = setInterval(() => {
|
|
1014
|
-
const silentMs = Date.now() - lastOutputAt;
|
|
1015
|
-
if (silentMs < 30000) return;
|
|
1016
|
-
const silentSec = Math.round(silentMs / 1000);
|
|
1017
|
-
try { fs.appendFileSync(liveOutputPath, `[heartbeat] running — no output for ${silentSec}s\n`); } catch { /* optional */ }
|
|
1018
|
-
}, 30000);
|
|
1019
|
-
|
|
1020
1004
|
proc.stdout.on('data', (data) => {
|
|
1021
1005
|
const chunk = data.toString();
|
|
1022
|
-
|
|
1023
|
-
realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
|
|
1006
|
+
realActivityMap.set(id, Date.now());
|
|
1024
1007
|
if (stdout.length < MAX_OUTPUT) stdout += chunk.slice(0, MAX_OUTPUT - stdout.length);
|
|
1025
1008
|
try { fs.appendFileSync(liveOutputPath, chunk); } catch { /* optional */ }
|
|
1026
1009
|
|
|
@@ -1057,14 +1040,12 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1057
1040
|
|
|
1058
1041
|
proc.stderr.on('data', (data) => {
|
|
1059
1042
|
const chunk = data.toString();
|
|
1060
|
-
|
|
1061
|
-
realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
|
|
1043
|
+
realActivityMap.set(id, Date.now());
|
|
1062
1044
|
if (stderr.length < MAX_OUTPUT) stderr += chunk.slice(0, MAX_OUTPUT - stderr.length);
|
|
1063
1045
|
try { fs.appendFileSync(liveOutputPath, '[stderr] ' + chunk); } catch { /* optional */ }
|
|
1064
1046
|
});
|
|
1065
1047
|
|
|
1066
1048
|
async function onAgentClose(code) {
|
|
1067
|
-
if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
|
|
1068
1049
|
log('info', `Agent ${agentId} (${id}) exited with code ${code}`);
|
|
1069
1050
|
|
|
1070
1051
|
// Emit worker-state transition: FINISHED or FAILED
|
|
@@ -1180,33 +1161,22 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1180
1161
|
// Reset output buffers so post-completion parsing only sees the resumed session
|
|
1181
1162
|
stdout = '';
|
|
1182
1163
|
stderr = '';
|
|
1183
|
-
lastOutputAt = Date.now();
|
|
1184
|
-
|
|
1185
|
-
// Restart heartbeat for the resumed process
|
|
1186
|
-
if (heartbeatTimer) clearInterval(heartbeatTimer);
|
|
1187
|
-
heartbeatTimer = setInterval(() => {
|
|
1188
|
-
try { fs.appendFileSync(liveOutputPath, `\n[heartbeat] running — no output for ${Math.round((Date.now() - lastOutputAt) / 1000)}s\n`); } catch {}
|
|
1189
|
-
}, 30000);
|
|
1190
|
-
|
|
1191
1164
|
// Re-wire stdout/stderr handlers (same as original)
|
|
1192
1165
|
resumeProc.stdout.on('data', (data) => {
|
|
1193
1166
|
const chunk = data.toString();
|
|
1194
|
-
|
|
1195
|
-
realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
|
|
1167
|
+
realActivityMap.set(id, Date.now());
|
|
1196
1168
|
if (stdout.length < MAX_OUTPUT) stdout += chunk.slice(0, MAX_OUTPUT - stdout.length);
|
|
1197
1169
|
try { fs.appendFileSync(liveOutputPath, chunk); } catch { /* optional */ }
|
|
1198
1170
|
});
|
|
1199
1171
|
resumeProc.stderr.on('data', (data) => {
|
|
1200
1172
|
const chunk = data.toString();
|
|
1201
|
-
|
|
1202
|
-
realActivityMap.set(id, Date.now()); // Track real agent output separately from heartbeat
|
|
1173
|
+
realActivityMap.set(id, Date.now());
|
|
1203
1174
|
if (stderr.length < MAX_OUTPUT) stderr += chunk.slice(0, MAX_OUTPUT - stderr.length);
|
|
1204
1175
|
try { fs.appendFileSync(liveOutputPath, '[stderr] ' + chunk); } catch { /* optional */ }
|
|
1205
1176
|
});
|
|
1206
1177
|
|
|
1207
1178
|
// Re-wire close handler for the resumed process
|
|
1208
1179
|
resumeProc.on('close', (resumeCode) => {
|
|
1209
|
-
if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
|
|
1210
1180
|
try { fs.unlinkSync(steerPromptPath); } catch { /* cleanup */ }
|
|
1211
1181
|
if (resumeCode !== 0) {
|
|
1212
1182
|
log('warn', `Steering resume for ${agentId} exited with code ${resumeCode} | stderr: ${stderr.slice(-300).replace(/\n/g, ' ')}`);
|
|
@@ -1262,7 +1232,7 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1262
1232
|
}
|
|
1263
1233
|
|
|
1264
1234
|
activeProcesses.delete(id);
|
|
1265
|
-
realActivityMap.delete(id);
|
|
1235
|
+
realActivityMap.delete(id);
|
|
1266
1236
|
|
|
1267
1237
|
// If timeout checker already finalized this dispatch, don't overwrite work-item status again.
|
|
1268
1238
|
// This avoids races where close-handler marks an auto-retried item as failed.
|
|
@@ -1301,7 +1271,7 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1301
1271
|
const { resultSummary, autoRecovered } = await runPostCompletionHooks(dispatchItem, agentId, code, stdout, config);
|
|
1302
1272
|
|
|
1303
1273
|
// Move from active to completed in dispatch (single source of truth for agent status)
|
|
1304
|
-
// autoRecovered: agent failed
|
|
1274
|
+
// autoRecovered: agent failed after creating PRs — treat as success
|
|
1305
1275
|
const effectiveResult = (code === 0 || autoRecovered) ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR;
|
|
1306
1276
|
const completeOpts = effectiveResult === DISPATCH_RESULT.ERROR && failureClass ? { failureClass } : {};
|
|
1307
1277
|
// Extract last 5 non-empty stderr lines as error context when exit code is non-zero
|
|
@@ -1379,10 +1349,9 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1379
1349
|
proc.on('close', onAgentClose);
|
|
1380
1350
|
|
|
1381
1351
|
proc.on('error', (err) => {
|
|
1382
|
-
if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
|
|
1383
1352
|
log('error', `Failed to spawn agent ${agentId}: ${err.message}`);
|
|
1384
1353
|
activeProcesses.delete(id);
|
|
1385
|
-
realActivityMap.delete(id);
|
|
1354
|
+
realActivityMap.delete(id);
|
|
1386
1355
|
completeDispatch(id, DISPATCH_RESULT.ERROR, `Spawn error: ${err.message}`);
|
|
1387
1356
|
});
|
|
1388
1357
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yemi33/minions",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.1636",
|
|
4
4
|
"description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
|
|
5
5
|
"bin": {
|
|
6
6
|
"minions": "bin/minions.js"
|
|
@@ -8,156 +8,39 @@ Repo: {{repo_name}} | Org: {{ado_org}} | Project: {{ado_project}}
|
|
|
8
8
|
Team root: {{team_root}}
|
|
9
9
|
Project path: {{project_path}}
|
|
10
10
|
|
|
11
|
-
##
|
|
11
|
+
## Mission
|
|
12
12
|
|
|
13
|
-
A new PR has been created: **{{pr_id}}**
|
|
13
|
+
A new PR has been created: **{{pr_id}}** - "{{pr_title}}"
|
|
14
14
|
Branch: `{{pr_branch}}` | Author: {{pr_author}}
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
Run the project's normal build/test verification for this PR and report whether it is ready for human review. If it is a runnable app, identify the local URL and the exact command needed to run it.
|
|
17
17
|
|
|
18
|
-
##
|
|
18
|
+
## Long-Running Commands
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
Builds, dependency installs, tests, and dev servers can be quiet for a long time. Let normal CLI commands run naturally; do not add artificial heartbeat output or split commands just to show progress.
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
## Approach
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
Work from the current checkout prepared by the engine. Read the repo's own instructions first (`CLAUDE.md`, README, package files, Makefiles, project scripts) and adapt to the build system you find.
|
|
25
25
|
|
|
26
|
-
|
|
27
|
-
```bash
|
|
28
|
-
# Examples — use whatever the project needs:
|
|
29
|
-
yarn install # or npm install
|
|
30
|
-
pip install -r requirements.txt
|
|
31
|
-
dotnet restore
|
|
32
|
-
```
|
|
26
|
+
If build or tests fail, report the relevant errors clearly and stop. Do not fix code, push commits, or create PRs from this task.
|
|
33
27
|
|
|
34
|
-
|
|
28
|
+
If a server/app should be run for review, include the URL and a copy-pasteable run command with absolute paths. If the server must survive after the agent exits, start it detached and record the PID, restart command, and stop command; otherwise just provide the command for the user.
|
|
35
29
|
|
|
36
|
-
|
|
37
|
-
```bash
|
|
38
|
-
# Examples:
|
|
39
|
-
yarn build # or npm run build
|
|
40
|
-
dotnet build
|
|
41
|
-
cargo build
|
|
42
|
-
```
|
|
30
|
+
## Findings
|
|
43
31
|
|
|
44
|
-
|
|
32
|
+
Write findings to `{{team_root}}/notes/inbox/{{agent_id}}-bt-{{pr_number}}-{{date}}.md` only after successful verification.
|
|
45
33
|
|
|
46
|
-
|
|
34
|
+
Include:
|
|
35
|
+
- Branch, author, and project
|
|
36
|
+
- Build status and important warnings/errors
|
|
37
|
+
- Test status and failed test names if any
|
|
38
|
+
- Local server status, URL, run command, PID, restart command, and stop command if applicable
|
|
39
|
+
- A short summary of whether the PR is ready to review
|
|
47
40
|
|
|
48
|
-
|
|
41
|
+
## Constraints
|
|
49
42
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
dotnet test
|
|
55
|
-
```
|
|
56
|
-
|
|
57
|
-
Report test results: how many passed, failed, skipped.
|
|
58
|
-
|
|
59
|
-
### 5. Start a local dev server (if applicable)
|
|
60
|
-
|
|
61
|
-
Determine if this project is a **webapp** (has a dev server, serves HTTP, has a UI):
|
|
62
|
-
- Check package.json for `dev`, `start`, `serve` scripts
|
|
63
|
-
- Check for frameworks: Next.js, React, Angular, Vue, Express, Flask, ASP.NET
|
|
64
|
-
- Check CLAUDE.md for run instructions
|
|
65
|
-
|
|
66
|
-
If it IS a webapp:
|
|
67
|
-
1. Start the dev server **detached from your process** so it survives after you exit.
|
|
68
|
-
- If the repo docs provide a local run or background-start command, use that.
|
|
69
|
-
- Otherwise, use the detached-process mechanism that fits the current environment. Do not assume Bash, PowerShell, or any specific shell unless the repo or runtime clearly provides it.
|
|
70
|
-
2. Wait a few seconds, then verify it using the repo's documented smoke test, health check, startup output, or the lightest project-appropriate manual check.
|
|
71
|
-
3. Note the localhost URL, port, process identifier/PID, or equivalent runtime details the repo exposes.
|
|
72
|
-
4. Output the exact restart command with **absolute worktree paths**.
|
|
73
|
-
5. Include the stop command or shutdown procedure that matches how you started it.
|
|
74
|
-
|
|
75
|
-
If it is NOT a webapp (library, CLI tool, backend service without UI), skip this step.
|
|
76
|
-
|
|
77
|
-
## Output Format
|
|
78
|
-
|
|
79
|
-
Write your findings to `{{team_root}}/notes/inbox/{{agent_id}}-bt-{{pr_number}}-{{date}}.md` **only after a successful verification run**: the build passed, required tests passed, and any applicable local server is running or not applicable.
|
|
80
|
-
|
|
81
|
-
If the build fails, tests fail, dependency setup fails, or a required local server cannot start, do **not** write an inbox note. Follow the failure handling below and report the failure in your final response instead.
|
|
82
|
-
|
|
83
|
-
Structure your report exactly like this:
|
|
84
|
-
|
|
85
|
-
```markdown
|
|
86
|
-
## Build & Test Report: {{pr_id}}
|
|
87
|
-
|
|
88
|
-
**Branch:** {{pr_branch}}
|
|
89
|
-
**Author:** {{pr_author}}
|
|
90
|
-
**Project:** {{project_name}}
|
|
91
|
-
|
|
92
|
-
### Build
|
|
93
|
-
- Status: PASS
|
|
94
|
-
- Notes: (any warnings or issues)
|
|
95
|
-
|
|
96
|
-
### Tests
|
|
97
|
-
- Status: PASS / SKIPPED
|
|
98
|
-
- Results: X passed, 0 failed, Z skipped
|
|
99
|
-
- Failed tests: none
|
|
100
|
-
|
|
101
|
-
### Local Server
|
|
102
|
-
- Status: RUNNING / NOT_APPLICABLE
|
|
103
|
-
- URL: http://localhost:XXXX (if running)
|
|
104
|
-
- PID / Process: <pid or equivalent identifier, if running>
|
|
105
|
-
- Restart Command: `cd <absolute-path-to-worktree> && <exact start command>`
|
|
106
|
-
- Stop Command: `<exact stop command or shutdown procedure>`
|
|
107
|
-
|
|
108
|
-
### Summary
|
|
109
|
-
(1-2 sentence overall assessment — is this PR safe to review?)
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
## Auto-file Work Items on Failure
|
|
113
|
-
|
|
114
|
-
If the build or tests fail, create a work item so another agent can fix it. Write a JSON entry to the project's work queue:
|
|
115
|
-
|
|
116
|
-
```bash
|
|
117
|
-
# Read existing items, append new one, write back
|
|
118
|
-
node -e "
|
|
119
|
-
const fs = require('fs');
|
|
120
|
-
const p = '{{project_path}}/.minions/work-items.json';
|
|
121
|
-
const items = JSON.parse(fs.readFileSync(p, 'utf8') || '[]');
|
|
122
|
-
const id = 'W' + String(items.reduce((m,i) => Math.max(m, parseInt((i.id||'').match(/(\d+)$/)?.[1]||0)), 0) + 1).padStart(3, '0');
|
|
123
|
-
items.push({
|
|
124
|
-
id,
|
|
125
|
-
title: 'Fix build/test failure on PR {{pr_id}}: <SHORT DESCRIPTION OF FAILURE>',
|
|
126
|
-
type: 'fix',
|
|
127
|
-
priority: 'high',
|
|
128
|
-
description: '<PASTE THE BUILD/TEST ERROR OUTPUT HERE — keep it under 2000 chars>',
|
|
129
|
-
status: 'pending',
|
|
130
|
-
created: new Date().toISOString(),
|
|
131
|
-
createdBy: '{{agent_id}}',
|
|
132
|
-
pr: '{{pr_id}}',
|
|
133
|
-
branch: '{{pr_branch}}'
|
|
134
|
-
});
|
|
135
|
-
fs.writeFileSync(p, JSON.stringify(items, null, 2));
|
|
136
|
-
console.log('Filed work item:', id);
|
|
137
|
-
"
|
|
138
|
-
```
|
|
139
|
-
|
|
140
|
-
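Replace `<SHORT DESCRIPTION OF FAILURE>` and `<PASTE THE BUILD/TEST ERROR OUTPUT HERE — keep it under 2000 chars>` with the actual error details. Because this text sits inside a single-quoted JavaScript string within a double-quoted shell argument, escape any quotes, backticks, `$` characters, and newlines in what you paste. The engine will pick this up on the next tick and dispatch a fix agent.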
|
|
141
|
-
|
|
142
|
-
## Rules
|
|
143
|
-
|
|
144
|
-
- **Do NOT create pull requests** — this is a build/test task only
|
|
145
|
-
- **Do NOT push commits** or modify code
|
|
146
|
-
- **Do NOT attempt to fix build/test failures** — report them and file a work item
|
|
147
|
-
- If starting a dev server, output the **exact restart command with absolute paths** so the user can restart it:
|
|
148
|
-
```
|
|
149
|
-
## Restart Command
|
|
150
|
-
cd <absolute-path-to-worktree> && <exact start command>
|
|
151
|
-
```
|
|
152
|
-
- Also include the server URL, PID/process identifier, and matching stop command.
|
|
153
|
-
- Use the worktree path, NOT the main project path, for all commands
|
|
154
|
-
- The worktree will persist after your process ends so the user can inspect it
|
|
155
|
-
|
|
156
|
-
## Do not clean up the worktree
|
|
157
|
-
|
|
158
|
-
Leave the worktree in place at `{{project_path}}/../worktrees/bt-{{pr_number}}` — the user needs it to review the running app. The engine will clean it up automatically after the PR is merged or closed.
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
## When to Stop
|
|
162
|
-
|
|
163
|
-
Your task is complete once you have: (1) built the project, (2) run tests, (3) started the app if applicable, and (4) written the success findings to the inbox file. If verification failed, file the failure work item (when applicable), report the failure in your final response, and then stop; do not write an inbox file.
|
|
43
|
+
- Do not create pull requests or push commits.
|
|
44
|
+
- Do not modify code unless the task is explicitly converted into a fix task.
|
|
45
|
+
- Use the current checkout/worktree prepared by the engine.
|
|
46
|
+
- Do not remove worktrees; the engine handles cleanup automatically.
|
package/playbooks/fix.md
CHANGED
|
@@ -45,7 +45,7 @@ Before pushing, prove the review fix did not break the branch:
|
|
|
45
45
|
- Fix regressions you introduced. If failures are pre-existing or unrelated, capture the evidence and include it in the PR comment.
|
|
46
46
|
- Do not push code that breaks existing tests or the build because of your changes.
|
|
47
47
|
|
|
48
|
-
|
|
48
|
+
Long builds, dependency installs, and tests may be quiet for several minutes. Let the normal CLI command run naturally; do not add artificial heartbeat output or split commands just to show progress.
|
|
49
49
|
|
|
50
50
|
## Publish & Comment on PR
|
|
51
51
|
|