@yemi33/minions 0.1.1633 → 0.1.1635
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/README.md +11 -11
- package/dashboard.js +46 -0
- package/docs/auto-discovery.md +17 -15
- package/docs/blog-first-successful-dispatch.md +7 -10
- package/docs/engine-restart.md +8 -11
- package/docs/human-vs-automated.md +3 -4
- package/docs/pr-review-fix-loop.md +1 -1
- package/docs/rfc-completion-json.md +5 -5
- package/engine/copilot-models.json +1 -1
- package/engine/lifecycle.js +1 -1
- package/engine/playbook.js +2 -1
- package/engine/queries.js +4 -4
- package/engine/shared.js +4 -12
- package/engine/timeout.js +59 -168
- package/engine.js +11 -42
- package/package.json +1 -1
- package/playbooks/build-and-test.md +22 -139
- package/playbooks/docs.md +113 -0
- package/playbooks/fix.md +1 -1
- package/playbooks/implement-shared.md +1 -1
- package/playbooks/implement.md +3 -7
- package/playbooks/shared-rules.md +4 -45
- package/playbooks/test.md +17 -40
- package/playbooks/verify.md +29 -141
- package/playbooks/work-item.md +1 -0
- package/prompts/cc-system.md +2 -0
package/CHANGELOG.md
CHANGED
package/README.md
CHANGED
|
@@ -227,7 +227,7 @@ You can also run scripts directly: `node ~/.minions/engine.js start`, `node ~/.m
|
|
|
227
227
|
- **Pipelines** — multi-stage workflows chaining tasks, meetings, plans, and more. Cron triggers or manual. Artifacts flow between stages.
|
|
228
228
|
- **Eval loop** — after implementation, auto-dispatches review → fix cycles (configurable iterations and cost ceiling per work item)
|
|
229
229
|
- **Pinned notes** — critical context pinned to all agent prompts via `pinned.md`
|
|
230
|
-
- **
|
|
230
|
+
- **Process-based liveness** — live agents may be quiet; output staleness is only used for orphan cleanup after process tracking is lost
|
|
231
231
|
- **Auto-cleanup** — stale temp files, orphaned worktrees, zombie processes cleaned every 10 minutes
|
|
232
232
|
|
|
233
233
|
## Dashboard
|
|
@@ -403,7 +403,7 @@ No bash or shell involved — Node spawns Node directly. Dependency branches are
|
|
|
403
403
|
- **MCP servers** — inherited from `~/.claude.json` (no extra config needed)
|
|
404
404
|
- **Full tool access** — all built-in tools plus all MCP tools
|
|
405
405
|
- **Permission mode** — `bypassPermissions` (no interactive prompts)
|
|
406
|
-
- **Output format** — `stream-json` (real-time streaming for live dashboard +
|
|
406
|
+
- **Output format** — `stream-json` (real-time streaming for live dashboard + completion recovery)
|
|
407
407
|
|
|
408
408
|
### Post-Completion
|
|
409
409
|
|
|
@@ -462,15 +462,15 @@ Playbooks are fully customizable — edit the shared templates in `playbooks/` t
|
|
|
462
462
|
|
|
463
463
|
## Health Monitoring
|
|
464
464
|
|
|
465
|
-
###
|
|
465
|
+
### Liveness Check (every tick)
|
|
466
466
|
|
|
467
|
-
|
|
468
|
-
- **
|
|
469
|
-
- **
|
|
470
|
-
- **
|
|
471
|
-
- **No process +
|
|
467
|
+
Agent liveness mirrors a normal CLI process:
|
|
468
|
+
- **Tracked process alive** → keep running, even if stdout/stderr are quiet
|
|
469
|
+
- **Tracked process exceeds `agentTimeout`** → stop and mark timed out
|
|
470
|
+
- **Tracked process exits** → handle normal completion/failure
|
|
471
|
+
- **No tracked process + stale output** → treat as an orphan from engine restart/process loss and mark failed
|
|
472
472
|
|
|
473
|
-
|
|
473
|
+
Builds, dependency installs, tests, and other CLI commands can legitimately produce no output for long periods. The engine does not infer "hung" from stdout/stderr silence while it still has a live process handle. `heartbeatTimeout` is only the stale-orphan grace window used when the engine has lost process tracking.
|
|
474
474
|
|
|
475
475
|
### Automated Cleanup (every 10 ticks)
|
|
476
476
|
|
|
@@ -532,7 +532,7 @@ Engine behavior is controlled via `config.json`. Key settings:
|
|
|
532
532
|
| `tickInterval` | 60000 (1min) | Milliseconds between engine ticks |
|
|
533
533
|
| `maxConcurrent` | 5 | Max agents running simultaneously |
|
|
534
534
|
| `agentTimeout` | 18000000 (5h) | Max total agent runtime |
|
|
535
|
-
| `heartbeatTimeout` | 300000 (5min) |
|
|
535
|
+
| `heartbeatTimeout` | 300000 (5min) | Stale-orphan grace after process tracking is lost |
|
|
536
536
|
| `maxTurns` | 100 | Max Claude CLI turns per agent session |
|
|
537
537
|
| `inboxConsolidateThreshold` | 5 | Inbox files needed before consolidation |
|
|
538
538
|
| `worktreeCreateTimeout` | 300000 (5min) | Timeout for each `git worktree add` attempt |
|
|
@@ -649,7 +649,7 @@ To move to a new machine: `npm install -g @yemi33/minions && minions init --forc
|
|
|
649
649
|
pipeline.js <- Multi-stage pipeline orchestration
|
|
650
650
|
meeting.js <- Meeting creation, rounds, conclusion
|
|
651
651
|
cleanup.js <- Worktree + temp file cleanup
|
|
652
|
-
timeout.js <- Agent timeout and
|
|
652
|
+
timeout.js <- Agent timeout and orphan detection
|
|
653
653
|
cooldown.js <- Dispatch cooldown with exponential backoff
|
|
654
654
|
github.js <- GitHub PR polling, comment polling, reconciliation
|
|
655
655
|
routing.js <- Agent routing and temp agent management
|
package/dashboard.js
CHANGED
|
@@ -25,6 +25,9 @@ const ado = require('./engine/ado');
|
|
|
25
25
|
const gh = require('./engine/github');
|
|
26
26
|
const issues = require('./engine/issues');
|
|
27
27
|
const watchesMod = require('./engine/watches');
|
|
28
|
+
const routing = require('./engine/routing');
|
|
29
|
+
const playbook = require('./engine/playbook');
|
|
30
|
+
const dispatchMod = require('./engine/dispatch');
|
|
28
31
|
const os = require('os');
|
|
29
32
|
|
|
30
33
|
const { safeRead, safeReadDir, safeWrite, safeJson, safeJsonObj, safeJsonArr, safeUnlink, mutateJsonFileLocked, mutateWorkItems, getProjects: _getProjects, DONE_STATUSES, WI_STATUS, reopenWorkItem } = shared;
|
|
@@ -1238,6 +1241,49 @@ async function executeCCActions(actions) {
|
|
|
1238
1241
|
results.push({ type: action.type, id, ok: true });
|
|
1239
1242
|
break;
|
|
1240
1243
|
}
|
|
1244
|
+
case 'build-and-test': {
|
|
1245
|
+
// Resolve PR by number, ID, or URL — same lookup that drives the link-pr / PR-row paths.
|
|
1246
|
+
const allPrs = getPullRequests().filter(p => !p._ghost);
|
|
1247
|
+
const pr = shared.findPrRecord(allPrs, action.pr) || null;
|
|
1248
|
+
if (!pr) {
|
|
1249
|
+
results.push({ type: 'build-and-test', error: `PR not found: ${action.pr}` });
|
|
1250
|
+
break;
|
|
1251
|
+
}
|
|
1252
|
+
// Resolve project: explicit param wins, else the PR's _project. There is no further fallback — an unresolved project is reported as an error below.
|
|
1253
|
+
const projectName = action.project || pr._project || null;
|
|
1254
|
+
const project = projectName
|
|
1255
|
+
? PROJECTS.find(p => p.name?.toLowerCase() === String(projectName).toLowerCase())
|
|
1256
|
+
: null;
|
|
1257
|
+
if (!project) {
|
|
1258
|
+
results.push({ type: 'build-and-test', error: `Project not found for PR ${pr.id}: ${projectName || '(none)'}` });
|
|
1259
|
+
break;
|
|
1260
|
+
}
|
|
1261
|
+
// Pick agent: explicit param wins; else routing for 'test' work type.
|
|
1262
|
+
let agentId = action.agent && CONFIG.agents?.[action.agent] ? action.agent : null;
|
|
1263
|
+
if (!agentId) {
|
|
1264
|
+
agentId = routing.resolveAgent('test', CONFIG, { authorAgent: pr.agent });
|
|
1265
|
+
}
|
|
1266
|
+
if (!agentId) {
|
|
1267
|
+
results.push({ type: 'build-and-test', error: 'No available agent for test routing' });
|
|
1268
|
+
break;
|
|
1269
|
+
}
|
|
1270
|
+
const prNumber = shared.getPrNumber(pr);
|
|
1271
|
+
const dispatchKey = `cc-bt-${project.name}-${pr.id}`;
|
|
1272
|
+
const item = playbook.buildPrDispatch(agentId, CONFIG, project, pr, 'test', {
|
|
1273
|
+
pr_id: pr.id, pr_number: prNumber, pr_title: pr.title || '', pr_branch: pr.branch || '',
|
|
1274
|
+
pr_author: pr.agent || '', pr_url: pr.url || '',
|
|
1275
|
+
project_path: project.localPath || '',
|
|
1276
|
+
task: `Build & test ${pr.id}: ${pr.title || ''}`,
|
|
1277
|
+
}, `Build & test ${pr.id}: ${pr.title || ''}`,
|
|
1278
|
+
{ dispatchKey, source: 'cc-build-and-test', pr, branch: pr.branch, project: { name: project.name, localPath: project.localPath } });
|
|
1279
|
+
if (!item) {
|
|
1280
|
+
results.push({ type: 'build-and-test', error: 'Failed to render build-and-test playbook' });
|
|
1281
|
+
break;
|
|
1282
|
+
}
|
|
1283
|
+
const id = dispatchMod.addToDispatch(item);
|
|
1284
|
+
results.push({ type: 'build-and-test', id, agent: agentId, pr: pr.id, ok: true });
|
|
1285
|
+
break;
|
|
1286
|
+
}
|
|
1241
1287
|
case 'note': {
|
|
1242
1288
|
shared.writeToInbox('command-center', shared.slugify(action.title || 'note'), `# ${action.title || 'Note'}\n\n${action.content || action.description || ''}`);
|
|
1243
1289
|
results.push({ type: 'note', ok: true });
|
package/docs/auto-discovery.md
CHANGED
|
@@ -8,7 +8,7 @@ The engine runs a tick every 60 seconds (configurable via `config.json` → `eng
|
|
|
8
8
|
|
|
9
9
|
```
|
|
10
10
|
tick()
|
|
11
|
-
1. checkTimeouts()
|
|
11
|
+
1. checkTimeouts() Enforce runtime limits and stale-orphan cleanup
|
|
12
12
|
2. consolidateInbox() Merge learnings into notes.md (Haiku-powered)
|
|
13
13
|
2.5 runCleanup() Periodic cleanup (every 10 ticks ≈ 10min)
|
|
14
14
|
2.6 pollPrStatus() Poll ADO + GitHub for build, review, merge status (wall-clock cadence from prPollStatusEvery × tickInterval, default ≈ 12min)
|
|
@@ -283,7 +283,7 @@ proc.on('close')
|
|
|
283
283
|
├─ Post-completion hooks:
|
|
284
284
|
│ review → update PR minionsReview in pull-requests.json, vote on ADO
|
|
285
285
|
│ fix → set PR minionsReview back to "waiting"
|
|
286
|
-
│ build-test →
|
|
286
|
+
│ build-test → record verification result and findings
|
|
287
287
|
│
|
|
288
288
|
├─ Check for learnings in notes/inbox/
|
|
289
289
|
│ (warns if agent didn't write findings)
|
|
@@ -346,10 +346,10 @@ ADO + GitHub REST ── pollPrStatus() ──► pull-requests.json
|
|
|
346
346
|
│
|
|
347
347
|
┌───────────┬───────┼───────┬──────────┐
|
|
348
348
|
▼ ▼ ▼ ▼ ▼
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
│
|
|
352
|
-
consolidateInbox()
|
|
349
|
+
output.log notes/ PRs work-items localhost
|
|
350
|
+
(per agent) inbox/*.md .json .json (if webapp,
|
|
351
|
+
│ from build
|
|
352
|
+
consolidateInbox() & test)
|
|
353
353
|
(at 5+ files)
|
|
354
354
|
│
|
|
355
355
|
▼
|
|
@@ -359,18 +359,20 @@ ADO + GitHub REST ── pollPrStatus() ──► pull-requests.json
|
|
|
359
359
|
playbooks)
|
|
360
360
|
```
|
|
361
361
|
|
|
362
|
-
## Timeout & Stale Detection
|
|
362
|
+
## Timeout & Stale-Orphan Detection
|
|
363
363
|
|
|
364
364
|
Two layers of protection:
|
|
365
365
|
|
|
366
366
|
**Agent timeout** (`engine.agentTimeout`, default 5 hours / 18,000,000ms):
|
|
367
|
-
-
|
|
368
|
-
- Sends SIGTERM, then SIGKILL after
|
|
367
|
+
- Applies to tracked live processes regardless of output activity
|
|
368
|
+
- Sends SIGTERM, then SIGKILL after a short grace period
|
|
369
369
|
|
|
370
|
-
**Stale detection** (`engine.heartbeatTimeout`, default 5 min / 300,000ms):
|
|
371
|
-
-
|
|
372
|
-
-
|
|
373
|
-
-
|
|
370
|
+
**Stale-orphan detection** (`engine.heartbeatTimeout`, default 5 min / 300,000ms):
|
|
371
|
+
- Applies only when an active dispatch has no live tracked process
|
|
372
|
+
- Uses `live-output.log` mtime as indirect evidence after engine restart or process-handle loss
|
|
373
|
+
- Marks stale orphaned dispatches failed and resets the agent to idle
|
|
374
|
+
|
|
375
|
+
Lack of stdout/stderr is not treated as a hang while the engine still has a live process handle. Long builds, dependency installs, and tests can legitimately run quietly.
|
|
374
376
|
|
|
375
377
|
## Cooldown Behavior
|
|
376
378
|
|
|
@@ -391,8 +393,8 @@ All discovery behavior is controlled via `config.json`:
|
|
|
391
393
|
"engine": {
|
|
392
394
|
"tickInterval": 60000, // ms between ticks
|
|
393
395
|
"maxConcurrent": 5, // max agents running at once
|
|
394
|
-
"agentTimeout": 18000000, // 5 hours —
|
|
395
|
-
"heartbeatTimeout": 300000, // 5min —
|
|
396
|
+
"agentTimeout": 18000000, // 5 hours — hard runtime limit
|
|
397
|
+
"heartbeatTimeout": 300000, // 5min — stale-orphan grace after process tracking is lost
|
|
396
398
|
"maxTurns": 100, // max claude CLI turns per agent
|
|
397
399
|
"worktreeCreateTimeout": 300000, // timeout for git worktree add on large repos
|
|
398
400
|
"worktreeCreateRetries": 1 // retry count for transient add failures
|
|
@@ -59,9 +59,7 @@ proc.stdin.end();
|
|
|
59
59
|
|
|
60
60
|
### Output Format: json vs stream-json
|
|
61
61
|
|
|
62
|
-
`--output-format json` produces **one JSON blob at exit**. No streaming output during execution. This broke
|
|
63
|
-
- Live output in dashboard (nothing to show until agent finishes)
|
|
64
|
-
- Heartbeat monitoring (no file writes to check mtime against)
|
|
62
|
+
`--output-format json` produces **one JSON blob at exit**. No streaming output during execution. This broke live output in the dashboard and made restart recovery less observable.
|
|
65
63
|
|
|
66
64
|
Fix: switched to `--output-format stream-json` — streams events as they happen.
|
|
67
65
|
|
|
@@ -73,15 +71,15 @@ Agents would hang waiting for permission prompts (invisible in headless mode). A
|
|
|
73
71
|
|
|
74
72
|
Claude Code sets `CLAUDECODE` env var to prevent nested sessions. Spawned agents inherit it and refuse to start. The engine strips it from `childEnv`, but the wrapper script was using `process.env` (which re-inherits from the parent). Fixed by stripping in both places.
|
|
75
73
|
|
|
76
|
-
###
|
|
74
|
+
### Process Liveness vs Stale-Orphan Detection
|
|
77
75
|
|
|
78
76
|
Original approach: kill agents after a fixed time threshold (staleThreshold). Problem: agents can legitimately run for hours on complex tasks.
|
|
79
77
|
|
|
80
|
-
New approach:
|
|
78
|
+
New approach: rely on the tracked process while the engine has a live process handle, regardless of whether stdout/stderr are quiet. `live-output.log` mtime is only used after process tracking is lost, such as an engine restart, to clean up stale orphaned dispatches.
|
|
81
79
|
|
|
82
80
|
### Engine Restart Orphan Problem
|
|
83
81
|
|
|
84
|
-
When the engine restarts, the in-memory `activeProcesses` Map is lost. Active dispatch items stay in `dispatch.json` but the engine has no process handle. Old stale detection (6h threshold) was too slow to catch this.
|
|
82
|
+
When the engine restarts, the in-memory `activeProcesses` Map is lost. Active dispatch items stay in `dispatch.json` but the engine has no process handle. Old stale detection (6h threshold) was too slow to catch this. Stale-orphan detection uses recent `live-output.log` activity and catches abandoned dispatches after the restart grace window.
|
|
85
83
|
|
|
86
84
|
## The Successful Run
|
|
87
85
|
|
|
@@ -107,9 +105,9 @@ When the engine restarts, the in-memory `activeProcesses` Map is lost. Active di
|
|
|
107
105
|
|
|
108
106
|
1. **Never pass user content through shell expansion.** Use stdin or direct args via Node's `spawn` (without shell).
|
|
109
107
|
2. **On Windows, npm-installed CLI tools are shell wrappers.** Resolve the actual `.js` entry point and spawn via `node`.
|
|
110
|
-
3. **Streaming output format is essential** for
|
|
108
|
+
3. **Streaming output format is essential** for live dashboards and restart recovery. One-shot JSON hides everything until exit.
|
|
111
109
|
4. **Environment variable inheritance is tricky** with nested spawns. Strip at every level.
|
|
112
|
-
5. **
|
|
110
|
+
5. **Process liveness beats output-silence heuristics** for agents that can run quiet CLI commands for long periods.
|
|
113
111
|
|
|
114
112
|
## The Spawn Chain (Final Working Version)
|
|
115
113
|
|
|
@@ -120,9 +118,8 @@ engine.js (tick loop)
|
|
|
120
118
|
→ spawn(process.execPath, ['cli.js', '-p', '--system-prompt', content, ...args])
|
|
121
119
|
→ claude-code runs with prompt via stdin
|
|
122
120
|
→ agent works, streams JSON events to stdout
|
|
123
|
-
→ engine captures to live-output.log (
|
|
121
|
+
→ engine captures to live-output.log (dashboard + restart recovery)
|
|
124
122
|
→ dashboard polls /api/agent/:id/live (3s refresh)
|
|
125
123
|
```
|
|
126
124
|
|
|
127
125
|
No bash. No shell. No metacharacter interpretation. Just Node spawning Node.
|
|
128
|
-
|
package/docs/engine-restart.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
## The Problem
|
|
4
4
|
|
|
5
|
-
When the engine restarts, it loses its in-memory process handles (`activeProcesses` Map). Claude CLI agents spawned before the restart
|
|
5
|
+
When the engine restarts, it loses its in-memory process handles (`activeProcesses` Map). Claude CLI agents spawned before the restart may still be running as OS processes, but the engine can't monitor their process state, detect exit codes, or manage their lifecycle. Stale-orphan detection keeps these dispatch records from staying active forever after the restart grace period expires.
|
|
6
6
|
|
|
7
7
|
## What's Persisted vs Lost
|
|
8
8
|
|
|
@@ -10,7 +10,7 @@ When the engine restarts, it loses its in-memory process handles (`activeProcess
|
|
|
10
10
|
|-------|---------|-----------------|
|
|
11
11
|
| Dispatch queue (pending/active/completed) | `engine/dispatch.json` | Yes |
|
|
12
12
|
| Agent status (working/idle/error) | Derived from `engine/dispatch.json` | Yes |
|
|
13
|
-
| Agent live output | `agents/*/live-output.log` | Yes (mtime used
|
|
13
|
+
| Agent live output | `agents/*/live-output.log` | Yes (mtime used for orphan cleanup) |
|
|
14
14
|
| Process handles (`ChildProcess`) | In-memory Map | **No** |
|
|
15
15
|
| Cooldown timestamps | In-memory Map | **No** (repopulated from `engine/cooldowns.json`) |
|
|
16
16
|
|
|
@@ -29,14 +29,11 @@ Configurable via `config.json`:
|
|
|
29
29
|
}
|
|
30
30
|
```
|
|
31
31
|
|
|
32
|
-
### 2.
|
|
32
|
+
### 2. Process-Based Liveness
|
|
33
33
|
|
|
34
|
-
|
|
34
|
+
After the grace period expires, a dispatch with a tracked live process keeps running until the process exits or exceeds `engine.agentTimeout`. Quiet stdout/stderr alone is not a hang signal; long builds, dependency installs, and tests can legitimately be silent.
|
|
35
35
|
|
|
36
|
-
|
|
37
|
-
- **`Bash` with long timeout (>5 min)** — timeout extended to the bash timeout + 1 min
|
|
38
|
-
|
|
39
|
-
This works for both tracked processes and orphans (no process handle).
|
|
36
|
+
If there is no live tracked process, the engine uses `live-output.log` mtime as indirect evidence. Once the log is stale for `engine.heartbeatTimeout`, the dispatch is treated as an orphan and marked failed.
|
|
40
37
|
|
|
41
38
|
### 3. Stop Warning
|
|
42
39
|
|
|
@@ -86,7 +83,7 @@ T+0-20m Ticks run. Orphan detection skipped (grace period).
|
|
|
86
83
|
Engine detects completed output on next tick via file scan.
|
|
87
84
|
|
|
88
85
|
T+20m Grace period expires.
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
86
|
+
Stale-orphan detection resumes.
|
|
87
|
+
Dispatch with live tracked process → keep running.
|
|
88
|
+
Dispatch with no live process and stale output → orphaned.
|
|
92
89
|
```
|
|
@@ -50,8 +50,8 @@ These run continuously without you:
|
|
|
50
50
|
- **Build failure detection** — auto-files fix tasks when CI fails
|
|
51
51
|
- **Inbox consolidation** — LLM-powered dedup and categorization when inbox hits threshold
|
|
52
52
|
- **Knowledge base classification** — auto-assigns category to consolidated notes
|
|
53
|
-
- **
|
|
54
|
-
- **
|
|
53
|
+
- **Process-based liveness** — tracks running agent processes and enforces the hard runtime limit
|
|
54
|
+
- **Stale-orphan detection** — cleans up dispatches after process tracking is lost
|
|
55
55
|
- **Metrics collection** — tracks tasks, errors, PRs, approvals per agent
|
|
56
56
|
- **Dispatch priority** — fixes first, then reviews, then implementations
|
|
57
57
|
- **Cooldown & backoff** — prevents re-dispatching recently failed items
|
|
@@ -98,11 +98,10 @@ If you start the engine and dashboard, then leave:
|
|
|
98
98
|
2. Discovers pending work items, PRD gaps, PR reviews needed
|
|
99
99
|
3. Dispatches agents (up to max concurrent)
|
|
100
100
|
4. Agents create worktrees, write code, create PRs
|
|
101
|
-
5. Engine monitors for
|
|
101
|
+
5. Engine monitors for process exit, hard timeouts, stale orphans, and build failures
|
|
102
102
|
6. Successful work → PRs appear in your ADO/GitHub queue
|
|
103
103
|
7. Failed work → marked failed, waiting for your retry
|
|
104
104
|
8. Notes consolidated into team knowledge automatically
|
|
105
105
|
9. Worktrees cleaned up after PRs merge
|
|
106
106
|
|
|
107
107
|
**What blocks:** Plans waiting for approval. PRs waiting for your review vote. Failed tasks waiting for retry. Everything else keeps moving.
|
|
108
|
-
|
|
@@ -96,7 +96,7 @@ When multiple problems coexist, earlier triggers get the first chance to enqueue
|
|
|
96
96
|
| Build fix before CI runs | `_buildFixPushedAt` grace period (10min) |
|
|
97
97
|
| Duplicate dispatch | `dispatchKey` dedup + cooldown |
|
|
98
98
|
| Stale review status | Pre-dispatch live API check |
|
|
99
|
-
| Orphan detection |
|
|
99
|
+
| Orphan detection | Stale-orphan timeout + output scan |
|
|
100
100
|
|
|
101
101
|
## Key files
|
|
102
102
|
|
|
@@ -28,7 +28,7 @@ The engine reconstructs control-plane state from the unstructured stdout of `cla
|
|
|
28
28
|
| 6 | `parseStructuredCompletion` (`lifecycle.js:1494`) | Last ` ```completion ` fenced block, parsed as `key: value` | An agent that includes a ` ```completion ` block in a quoted file (e.g. another playbook) overrides its own real status |
|
|
29
29
|
| 7 | `classifyFailure` (`lifecycle.js:2096`) | Failure-class regexes on combined stdout/stderr (`max_turns`, `permission denied`, `merge conflict`, …) | An agent that quotes one error class while genuinely failing on another gets the wrong recovery recipe |
|
|
30
30
|
| 8 | `checkForLearnings` (`lifecycle.js:1266`) | Filesystem scan for `notes/inbox/*<agentId>*<date>*` | Not stdout-based, but date-collisions cause cross-task attribution |
|
|
31
|
-
| 9 | `checkTimeouts` (`engine/timeout.js:189-219`) | Tail of `live-output.log` for `
|
|
31
|
+
| 9 | `checkTimeouts` (`engine/timeout.js:189-219`) | Tail of `live-output.log` for `[process-exit]` markers — completion-via-output detection after process tracking is lost | Lower-risk: this is engine/CLI output, not agent-authored content |
|
|
32
32
|
|
|
33
33
|
Sites 1–8 are agent-spoofable (intentionally or accidentally). Site 9 is claude-CLI-emitted and stays on stdout — see §6.
|
|
34
34
|
|
|
@@ -44,7 +44,7 @@ The current ` ```completion ` fenced block (Site 6) was a half-step toward struc
|
|
|
44
44
|
5. Zero new dependencies — file write + JSON parse, same toolbox as the rest of Minions.
|
|
45
45
|
|
|
46
46
|
**Non-goals.**
|
|
47
|
-
1. Replacing `live-output.log`
|
|
47
|
+
1. Replacing process tracking or `live-output.log` recovery. A live tracked process is the authoritative liveness signal; `live-output.log` remains useful for completion recovery after process tracking is lost — see §6.
|
|
48
48
|
2. Replacing `safeWrite`/`mutateJsonFileLocked` for engine state files. `completion.json` is one-shot, write-once, agent-authored — no concurrent writers.
|
|
49
49
|
3. Hardening against a *malicious* agent. An attacker who controls the agent process could write any completion.json. The threat model is *accidental spoofing by quoted text* and *forward compatibility with structured tool outputs*.
|
|
50
50
|
|
|
@@ -269,8 +269,8 @@ The flag name `engine.requireCompletionFile` mirrors existing engine flags (`aut
|
|
|
269
269
|
|
|
270
270
|
These paths stay on stdout / live-output.log:
|
|
271
271
|
|
|
272
|
-
1. **`engine/timeout.js` completion-via-output detection** (`timeout.js:189-219`). The signal
|
|
273
|
-
2. **
|
|
272
|
+
1. **`engine/timeout.js` completion-via-output detection** (`timeout.js:189-219`). The signal is the engine-written `[process-exit]` sentinel, emitted even if the agent crashed before writing completion.json. Removing it would mean orphans that finished during process-handle loss are never reconciled.
|
|
273
|
+
2. **Stale-orphan cleanup via `live-output.log` mtime** (`timeout.js:178`). `completion.json` is written once at exit, so `live-output.log` remains the best indirect signal after the engine loses process tracking.
|
|
274
274
|
3. **`parseStreamJsonOutput` for `resultSummary`** in `parseAgentOutput` (`lifecycle.js:1483`). This extracts the human-readable summary from the CLI's stream-json. Even after the flip, `completion.summary` is *also* extracted, but the stream-json text remains the canonical "what did the agent say last" — used in dashboards, agent history, Teams notifications. The two coexist: `completion.summary` is for routing decisions, the stream-json text is for display.
|
|
275
275
|
4. **Inbox-file skill scan** (`lifecycle.js:2013-2024`). Some agents write skills into their inbox findings file (a deliberate human-discoverable artifact). The completion file deprecates inline ` ```skill ` blocks in stdout, but the inbox file scan is opt-in and stays — it's a different surface (a real file the agent intentionally wrote, not regex-scraped from stdout).
|
|
276
276
|
|
|
@@ -308,7 +308,7 @@ These paths stay on stdout / live-output.log:
|
|
|
308
308
|
| Agent quotes a previous ` ```completion ` block → wrong status | ✅ | ` ```completion ` parser removed in Phase 4 |
|
|
309
309
|
| Agent quotes one error class while failing on another → wrong recovery recipe | ✅ | `failure.class` is explicit; if missing or invalid, falls through to `FAILURE_CLASS.UNKNOWN` (safe default) |
|
|
310
310
|
| Decompose agent emits a ` ```json ` block earlier in reasoning → corrupted children | ✅ | `decomposition.subItems` is explicit |
|
|
311
|
-
|
|
|
311
|
+
| Orphaned agent never reaches the write site → no completion.json | ⚠️ | Engine's stale-orphan cleanup (`timeout.js`) catches this after process tracking is lost; completed agents can still be reconciled via the `[process-exit]` sentinel |
|
|
312
312
|
| Malicious agent writes a fake completion.json (e.g. claims `noop` to avoid retry) | ❌ | Out of scope — see §2 non-goals. An adversarial agent owns its own write path regardless. |
|
|
313
313
|
|
|
314
314
|
The key shift: **the agent's intent is now in a place no quoted text can reach.** Stdout becomes display-only.
|
package/engine/lifecycle.js
CHANGED
|
@@ -1665,7 +1665,7 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
|
|
|
1665
1665
|
log('info', `Structured completion reports PR (${structuredCompletion.pr}) but regex sync found none — PR may already be tracked`);
|
|
1666
1666
|
}
|
|
1667
1667
|
|
|
1668
|
-
// Auto-recover: if a failed implement/fix agent created PRs, it likely succeeded before
|
|
1668
|
+
// Auto-recover: if a failed implement/fix agent created PRs, it likely succeeded before the failure surfaced.
|
|
1669
1669
|
const prCreatingType = type === WORK_TYPE.IMPLEMENT || type === WORK_TYPE.IMPLEMENT_LARGE || type === WORK_TYPE.FIX;
|
|
1670
1670
|
const autoRecovered = !isSuccess && prsCreatedCount > 0 && prCreatingType && !!meta?.item?.id;
|
|
1671
1671
|
if (autoRecovered) {
|
package/engine/playbook.js
CHANGED
|
@@ -278,6 +278,7 @@ const PLAYBOOK_REQUIRED_VARS = {
|
|
|
278
278
|
'decompose': ['item_id', 'item_description', 'project_path'],
|
|
279
279
|
'verify': ['task_description'],
|
|
280
280
|
'test': ['item_name'],
|
|
281
|
+
'docs': ['item_id', 'item_name'],
|
|
281
282
|
'work-item': ['item_id', 'item_name'],
|
|
282
283
|
'meeting-investigate': ['meeting_title', 'agenda'],
|
|
283
284
|
'meeting-debate': ['meeting_title', 'agenda'],
|
|
@@ -630,7 +631,7 @@ function selectPlaybook(workType, item) {
|
|
|
630
631
|
if (workType === WORK_TYPE.REVIEW && !item?._pr && !item?.pr_id) {
|
|
631
632
|
return 'work-item';
|
|
632
633
|
}
|
|
633
|
-
const typeSpecificPlaybooks = ['explore', 'review', 'test', 'plan-to-prd', 'plan', 'ask', 'verify', 'decompose', 'meeting-investigate', 'meeting-debate', 'meeting-conclude'];
|
|
634
|
+
const typeSpecificPlaybooks = ['explore', 'review', 'test', 'plan-to-prd', 'plan', 'ask', 'verify', 'decompose', 'docs', 'meeting-investigate', 'meeting-debate', 'meeting-conclude'];
|
|
634
635
|
return typeSpecificPlaybooks.includes(workType) ? workType : 'work-item';
|
|
635
636
|
}
|
|
636
637
|
|
package/engine/queries.js
CHANGED
|
@@ -300,7 +300,7 @@ function getAgentStatus(agentId) {
|
|
|
300
300
|
branch: active.meta?.branch || '',
|
|
301
301
|
started_at: active.started_at || active.created_at || null,
|
|
302
302
|
};
|
|
303
|
-
// Surface blocking
|
|
303
|
+
// Surface any legacy blocking-tool annotation until timeout.js clears it.
|
|
304
304
|
if (active._blockingToolCall) {
|
|
305
305
|
result._blockingToolCall = active._blockingToolCall;
|
|
306
306
|
}
|
|
@@ -355,12 +355,12 @@ function getAgentStatus(agentId) {
|
|
|
355
355
|
|
|
356
356
|
// Fallback: derive active state from work-item markers.
|
|
357
357
|
// This protects UI status when dispatch.json briefly desyncs from work-item files.
|
|
358
|
-
// Guard: only trust dispatched state within 2x
|
|
358
|
+
// Guard: only trust dispatched state within 2x stale-orphan timeout to prevent stale
|
|
359
359
|
// dispatched items from permanently showing an agent as working after a dead process.
|
|
360
360
|
try {
|
|
361
361
|
const config = getConfig();
|
|
362
|
-
const
|
|
363
|
-
const staleThresholdMs =
|
|
362
|
+
const staleOrphanTimeout = config.engine?.heartbeatTimeout || ENGINE_DEFAULTS.heartbeatTimeout;
|
|
363
|
+
const staleThresholdMs = staleOrphanTimeout * 2;
|
|
364
364
|
const now = Date.now();
|
|
365
365
|
const allItems = getWorkItems(config);
|
|
366
366
|
const latestInFlight = allItems
|
package/engine/shared.js
CHANGED
|
@@ -699,8 +699,8 @@ const ENGINE_DEFAULTS = {
|
|
|
699
699
|
maxConcurrent: 5,
|
|
700
700
|
inboxConsolidateThreshold: 5,
|
|
701
701
|
agentTimeout: 18000000, // 5h
|
|
702
|
-
heartbeatTimeout: 300000, // 5min —
|
|
703
|
-
heartbeatTimeouts: {}, // per-type overrides; merged
|
|
702
|
+
heartbeatTimeout: 300000, // 5min — stale-orphan grace after process tracking is lost
|
|
703
|
+
heartbeatTimeouts: {}, // optional per-type stale-orphan overrides; merged at runtime (see timeout.js)
|
|
704
704
|
maxTurns: 100,
|
|
705
705
|
worktreeCreateTimeout: 300000, // 5min for git worktree add on large Windows repos
|
|
706
706
|
worktreeCreateRetries: 1, // retry once on transient timeout/lock races
|
|
@@ -758,7 +758,7 @@ const ENGINE_DEFAULTS = {
|
|
|
758
758
|
copilotReasoningSummaries: false, // Copilot --enable-reasoning-summaries (Anthropic-family models only)
|
|
759
759
|
maxBudgetUsd: undefined, // fleet USD ceiling for --max-budget-usd (per-agent override: agents.<id>.maxBudgetUsd). Honors 0 via ?? so a literal cap of $0 works
|
|
760
760
|
disableModelDiscovery: false, // skip runtime.listModels() REST calls fleet-wide (settings UI falls back to free-text)
|
|
761
|
-
heartbeatTimeouts: {},
|
|
761
|
+
heartbeatTimeouts: {},
|
|
762
762
|
maxPendingContexts: 20, // cap pendingContexts arrays in cooldowns.json to prevent unbounded growth
|
|
763
763
|
maxPendingContextEntryBytes: 256 * 1024, // 256 KB — cap each pendingContexts entry to prevent huge PR comments from bloating cooldowns.json
|
|
764
764
|
maxDispatchPromptBytes: 1024 * 1024, // 1 MB — dispatch items with prompts larger than this sidecar to engine/contexts/ to prevent dispatch.json OOM (#1167)
|
|
@@ -1063,14 +1063,6 @@ const WORK_TYPE = {
|
|
|
1063
1063
|
MEETING: 'meeting', EXPLORE: 'explore', ASK: 'ask', TEST: 'test', DOCS: 'docs',
|
|
1064
1064
|
};
|
|
1065
1065
|
|
|
1066
|
-
// Per-work-type heartbeat timeouts (ms) — read-heavy tasks need longer silence windows.
|
|
1067
|
-
// Keyed by WORK_TYPE constants; types not listed fall back to ENGINE_DEFAULTS.heartbeatTimeout.
|
|
1068
|
-
Object.assign(ENGINE_DEFAULTS.heartbeatTimeouts, {
|
|
1069
|
-
[WORK_TYPE.EXPLORE]: 600000, // 10 min — spends most time reading/analyzing, minimal stdout
|
|
1070
|
-
[WORK_TYPE.ASK]: 600000, // 10 min — research-heavy, long silent analysis periods
|
|
1071
|
-
[WORK_TYPE.REVIEW]: 480000, // 8 min — code review reads extensively before producing output
|
|
1072
|
-
});
|
|
1073
|
-
|
|
1074
1066
|
const PLAN_STATUS = {
|
|
1075
1067
|
ACTIVE: 'active', AWAITING_APPROVAL: 'awaiting-approval', APPROVED: 'approved',
|
|
1076
1068
|
PAUSED: 'paused', REJECTED: 'rejected', COMPLETED: 'completed',
|
|
@@ -1161,7 +1153,7 @@ const FAILURE_CLASS = {
|
|
|
1161
1153
|
PERMISSION_BLOCKED: 'permission-blocked', // Trust gate, permission denied, auth failure
|
|
1162
1154
|
MERGE_CONFLICT: 'merge-conflict', // Git merge conflict in worktree or dependency
|
|
1163
1155
|
BUILD_FAILURE: 'build-failure', // Compilation, lint, or test failure
|
|
1164
|
-
TIMEOUT: 'timeout', // Hard timeout or
|
|
1156
|
+
TIMEOUT: 'timeout', // Hard runtime timeout or stale-orphan timeout
|
|
1165
1157
|
EMPTY_OUTPUT: 'empty-output', // Agent produced no meaningful output
|
|
1166
1158
|
SPAWN_ERROR: 'spawn-error', // Process failed to start or crashed immediately
|
|
1167
1159
|
NETWORK_ERROR: 'network-error', // API rate limit, DNS, connectivity
|