agentctl-swarm 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/daemon.js +336 -0
- package/lib/daemon.test.js +314 -0
- package/lib/health-monitor.js +244 -0
- package/lib/health-monitor.test.js +183 -0
- package/lib/spawner.js +230 -0
- package/lib/spawner.test.js +182 -0
- package/lib/supervisor.js +510 -0
- package/lib/supervisor.test.js +327 -0
- package/owl/behaviors/promotion.md +30 -0
- package/owl/behaviors/recovery.md +58 -0
- package/owl/behaviors/scaling.md +39 -0
- package/owl/behaviors/swarm-lifecycle.md +38 -0
- package/owl/components/daemon.md +46 -0
- package/owl/components/health-monitor.md +37 -0
- package/owl/components/spawner.md +38 -0
- package/owl/components/supervisor.md +47 -0
- package/owl/constraints.md +42 -0
- package/owl/product.md +23 -0
- package/package.json +13 -0
package/lib/supervisor.test.js

@@ -0,0 +1,327 @@
```js
/**
 * Supervisor Tests
 */

import { test, describe, before, after, beforeEach, afterEach } from 'node:test';
import assert from 'node:assert';
import fs from 'fs';
import path from 'path';
import os from 'os';
import { Supervisor } from './supervisor.js';
import { DaemonState } from './daemon.js';

const tmpBase = path.join(os.tmpdir(), `supervisor-test-${Date.now()}`);

function makeConfig(overrides = {}) {
  const id = Math.random().toString(36).slice(2, 8);
  return {
    count: 3,
    maxActive: 2,
    basePath: path.join(tmpBase, `workspace-${id}`),
    pidfile: path.join(tmpBase, `swarm-${id}.pid`),
    logDir: path.join(tmpBase, `logs-${id}`),
    heartbeatIntervalMs: 100,
    persist: false,
    ...overrides,
  };
}

describe('Supervisor', () => {
  before(() => {
    fs.mkdirSync(tmpBase, { recursive: true });
  });

  after(() => {
    fs.rmSync(tmpBase, { recursive: true, force: true });
  });

  test('start spawns N daemons', () => {
    const sup = new Supervisor(makeConfig({ count: 5 }));
    sup.start();

    const status = sup.status();
    assert.strictEqual(status.total, 5);
    assert.strictEqual(status.idle, 5);
    assert.strictEqual(status.active, 0);
    assert.strictEqual(status.running, true);

    sup.stop();
  });

  test('pidfile prevents double start', () => {
    const config = makeConfig();
    const sup1 = new Supervisor(config);
    sup1.start();

    const sup2 = new Supervisor(config);
    assert.throws(() => sup2.start(), /already running/);

    sup1.stop();
  });

  test('pidfile removed on stop', () => {
    const config = makeConfig();
    const sup = new Supervisor(config);
    sup.start();
    assert.ok(fs.existsSync(config.pidfile));

    sup.stop();
    assert.ok(!fs.existsSync(config.pidfile));
  });

  test('stale pidfile is cleaned up', () => {
    const config = makeConfig();
    // Write a pidfile with a dead PID
    fs.mkdirSync(path.dirname(config.pidfile), { recursive: true });
    fs.writeFileSync(config.pidfile, '999999999');

    const sup = new Supervisor(config);
    sup.start(); // Should succeed despite stale pidfile

    assert.strictEqual(sup.status().running, true);
    sup.stop();
  });

  test('stop cleans up all daemons', () => {
    const sup = new Supervisor(makeConfig({ count: 3, persist: false }));
    sup.start();
    assert.strictEqual(sup.processTable.size, 3);

    sup.stop();
    assert.strictEqual(sup.processTable.size, 0);
    assert.strictEqual(sup.running, false);
  });

  test('stop with persist keeps workspaces', () => {
    const config = makeConfig({ count: 2, persist: true });
    const sup = new Supervisor(config);
    sup.start();

    // Get workspace paths before stopping
    const workspaces = [];
    for (const [, entry] of sup.processTable) {
      workspaces.push(entry.daemon.workspace);
    }

    sup.stop();

    // Workspaces should still exist
    for (const ws of workspaces) {
      assert.ok(fs.existsSync(ws), `Workspace should persist: ${ws}`);
    }
  });

  test('promotion respects maxActive', () => {
    const sup = new Supervisor(makeConfig({ count: 3, maxActive: 1 }));
    sup.start();

    const agents = [...sup.processTable.values()];

    // Override _spawnClaude BEFORE handleMessage (supervisor auto-approves synchronously)
    agents[0].daemon._spawnClaude = function () {
      this.state = DaemonState.ACTIVE;
      this.emit('promoted', { agentId: this.agentId, pid: null, task: this.currentTask });
    };

    // First promotion — auto-approved by supervisor
    agents[0].daemon.handleMessage({
      type: 'ASSIGN',
      agentId: agents[0].daemon.agentId,
      task: { component: 'test1', prompt: 'task 1' },
    });

    assert.strictEqual(sup.activeCount, 1);

    // Second promotion should be queued (maxActive=1)
    agents[1].daemon.handleMessage({
      type: 'ASSIGN',
      agentId: agents[1].daemon.agentId,
      task: { component: 'test2', prompt: 'task 2' },
    });

    assert.strictEqual(sup.activeCount, 1);
    assert.strictEqual(sup.promotionQueue.length, 1);

    sup.stop();
  });

  test('promotion queue drains on demotion', () => {
    const sup = new Supervisor(makeConfig({ count: 3, maxActive: 1 }));
    sup.start();
    const logs = [];
    sup.on('log', l => logs.push(l));

    const agents = [...sup.processTable.values()];

    // Override _spawnClaude BEFORE triggering promotions
    agents[0].daemon._spawnClaude = function () {
      this.state = DaemonState.ACTIVE;
      this.emit('promoted', { agentId: this.agentId, pid: null, task: this.currentTask });
    };
    agents[1].daemon._spawnClaude = function () {
      this.state = DaemonState.ACTIVE;
      this.emit('promoted', { agentId: this.agentId, pid: null, task: this.currentTask });
    };

    // Promote first (auto-approved)
    agents[0].daemon.handleMessage({
      type: 'ASSIGN',
      agentId: agents[0].daemon.agentId,
      task: { component: 'test1', prompt: 'task 1' },
    });

    // Queue second (maxActive=1, slot full)
    agents[1].daemon.handleMessage({
      type: 'ASSIGN',
      agentId: agents[1].daemon.agentId,
      task: { component: 'test2', prompt: 'task 2' },
    });
    assert.strictEqual(sup.promotionQueue.length, 1);

    // Demote first — should auto-promote second from queue
    agents[0].daemon._handleClaudeExit(0, null, 'done', '');

    assert.strictEqual(sup.promotionQueue.length, 0);

    sup.stop();
  });

  test('token budget pauses promotions', () => {
    const sup = new Supervisor(makeConfig({ count: 2, maxActive: 5, tokenBudget: 100 }));
    sup.start();

    sup.tokensUsed = 100; // Exhaust budget

    const agents = [...sup.processTable.values()];
    const unclaims = [];
    agents[0].daemon.on('unclaim', u => unclaims.push(u));

    agents[0].daemon.handleMessage({
      type: 'ASSIGN',
      agentId: agents[0].daemon.agentId,
      task: { component: 'test', prompt: 'task' },
    });

    assert.strictEqual(sup.promotionsPaused, true);
    assert.strictEqual(unclaims.length, 1);
    assert.ok(unclaims[0].reason.includes('budget'));

    sup.stop();
  });

  test('scale up adds daemons', () => {
    const sup = new Supervisor(makeConfig({ count: 2 }));
    sup.start();
    assert.strictEqual(sup.processTable.size, 2);

    const result = sup.scale(5);
    assert.strictEqual(result.from, 2);
    assert.strictEqual(result.to, 5);
    assert.strictEqual(result.added, 3);
    assert.strictEqual(sup.processTable.size, 5);

    sup.stop();
  });

  test('scale down removes idle daemons', () => {
    const sup = new Supervisor(makeConfig({ count: 5 }));
    sup.start();

    const result = sup.scale(2);
    assert.strictEqual(result.from, 5);
    assert.strictEqual(result.removed, 3);
    assert.strictEqual(sup.processTable.size, 2);

    sup.stop();
  });

  test('scale down preserves active agents', () => {
    const sup = new Supervisor(makeConfig({ count: 3, maxActive: 3 }));
    sup.start();

    const agents = [...sup.processTable.values()];

    // Override _spawnClaude BEFORE handleMessage
    agents[0].daemon._spawnClaude = function () {
      this.state = DaemonState.ACTIVE;
      this.emit('promoted', { agentId: this.agentId, pid: null, task: this.currentTask });
    };

    // Promote one agent (auto-approved)
    agents[0].daemon.handleMessage({
      type: 'ASSIGN',
      agentId: agents[0].daemon.agentId,
      task: { component: 'busy', prompt: 'working' },
    });

    // Scale down to 1 — should only remove idle daemons
    const result = sup.scale(1);
    assert.strictEqual(result.removed, 2); // 2 idle removed
    // The active agent should still be in the process table
    assert.ok(sup.processTable.size >= 1);

    sup.stop();
  });

  test('scale to zero stops swarm', () => {
    const sup = new Supervisor(makeConfig({ count: 3 }));
    sup.start();
    sup.scale(0);
    assert.strictEqual(sup.running, false);
    assert.strictEqual(sup.processTable.size, 0);
  });

  test('reloadConfig updates maxActive', () => {
    const sup = new Supervisor(makeConfig({ count: 2, maxActive: 1 }));
    sup.start();

    assert.strictEqual(sup.maxActive, 1);
    sup.reloadConfig({ maxActive: 10 });
    assert.strictEqual(sup.maxActive, 10);

    sup.stop();
  });

  test('reloadConfig resumes promotions if budget increased', () => {
    const sup = new Supervisor(makeConfig({ count: 2, tokenBudget: 100 }));
    sup.start();
    sup.tokensUsed = 100;
    sup.promotionsPaused = true;

    sup.reloadConfig({ tokenBudget: 200 });
    assert.strictEqual(sup.promotionsPaused, false);

    sup.stop();
  });

  test('status returns complete swarm info', () => {
    const sup = new Supervisor(makeConfig({ count: 3 }));
    sup.start();

    const status = sup.status();
    assert.strictEqual(status.running, true);
    assert.ok(status.uptime >= 0);
    assert.strictEqual(status.total, 3);
    assert.strictEqual(status.active, 0);
    assert.strictEqual(status.idle, 3);
    assert.strictEqual(status.agents.length, 3);
    assert.ok(status.agents[0].agentId);
    assert.ok(status.agents[0].name);
    assert.strictEqual(status.agents[0].state, DaemonState.IDLE);

    sup.stop();
  });

  test('supervisor never executes agent work', () => {
    const sup = new Supervisor(makeConfig());
    // Verify no execute/run/build methods exist on supervisor
    assert.strictEqual(typeof sup.execute, 'undefined');
    assert.strictEqual(typeof sup.run, 'undefined');
    assert.strictEqual(typeof sup.build, 'undefined');
    // It only manages — start, stop, scale, status
    assert.strictEqual(typeof sup.start, 'function');
    assert.strictEqual(typeof sup.stop, 'function');
    assert.strictEqual(typeof sup.scale, 'function');
    assert.strictEqual(typeof sup.status, 'function');
  });
});
```
package/owl/behaviors/promotion.md

@@ -0,0 +1,30 @@
```markdown
# promotion

how a daemon transitions from idle listener to active agent executing a task.

## flow

1. daemon sees a task announcement or ASSIGN message on the work channel
2. daemon evaluates role match: does the task match its assigned role?
3. if match: daemon sends CLAIM <component> to the work channel
4. coordinator responds with ASSIGN <component> <agent-id> or REJECTED
5. if REJECTED: daemon returns to idle
6. if ASSIGNED: daemon sends PROMOTE-REQUEST to supervisor (via IPC)
7. supervisor checks: active count < max-active AND token budget remaining
8. if denied: daemon sends UNCLAIM <component> to work channel, returns to idle
9. if approved: supervisor marks daemon as "promoting"
10. daemon writes task context to <workspace>/context.md
11. daemon spawns: claude -p "<task prompt with spec context>" --cwd <workspace>
12. supervisor marks daemon as "active", starts tracking the claude PID
13. daemon monitors the claude process stdout/stderr
14. daemon forwards relevant output as status messages to work channel

## demotion

1. claude process exits (success or failure)
2. daemon captures exit code and final output
3. daemon sends DONE or FAIL to work channel
4. daemon saves summary to <workspace>/context.md
5. daemon sends DEMOTE notification to supervisor (via IPC)
6. supervisor marks daemon as "idle", decrements active count
7. daemon resumes listening on work channel
```
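To make the handshake concrete, here is a minimal daemon-side sketch of steps 6 through 14. The `awaitIpc`, `sendIpc`, and `sendWork` transports and the DENY/PROMOTED message names are invented for illustration; none of them are taken from the package's actual lib/daemon.js.

```js
// Illustrative sketch only: the transport helpers and message type names below
// are assumptions, not the package's real API.
import { spawn } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';

async function tryPromote(daemon, task) {
  // step 6: ask the supervisor for an active slot over IPC
  const reply = await daemon.awaitIpc({ type: 'PROMOTE-REQUEST', task });

  if (reply.type === 'DENY') {
    // step 8: release the claim and stay idle
    daemon.sendWork(`UNCLAIM ${task.component}`);
    return;
  }

  // step 10: persist task context before starting the session
  fs.writeFileSync(path.join(daemon.workspace, 'context.md'), task.prompt);

  // step 11: one claude -p session per task, rooted in the workspace
  const child = spawn('claude', ['-p', task.prompt], { cwd: daemon.workspace });

  // step 12: hand the PID to the supervisor so it can track the session
  daemon.sendIpc({ type: 'PROMOTED', agentId: daemon.agentId, pid: child.pid });

  // steps 13-14: relay session output to the work channel as status messages
  child.stdout.on('data', chunk => daemon.sendWork(`STATUS ${task.component} ${chunk}`));
  child.on('exit', code =>
    daemon.sendWork(code === 0 ? `DONE ${task.component}` : `FAIL ${task.component}`));
}
```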
package/owl/behaviors/recovery.md

@@ -0,0 +1,58 @@
```markdown
# recovery

how the swarm handles things going wrong. each failure mode has a distinct recovery strategy.

## ws disconnect

1. daemon detects websocket close or error event
2. daemon attempts reconnect with backoff: 1s, 2s, 4s, 8s, max 30s
3. on reconnect: daemon re-joins work channel, sends HEARTBEAT
4. if reconnect fails after 5 attempts: daemon reports to supervisor via IPC
5. supervisor may restart the daemon process entirely

an active agent that loses WS connection continues working locally. it queues status messages and flushes them on reconnect.

## agent crash

1. supervisor detects child process exit via SIGCHLD
2. supervisor checks exit code: 0 = clean exit, non-zero = crash
3. on crash: supervisor increments restart-count for that agent
4. supervisor applies exponential backoff: delay = min(2^restart-count seconds, 300s)
5. after delay: supervisor invokes spawner to verify workspace integrity
6. supervisor starts a new daemon process in the same workspace
7. new daemon reads context.md to understand what it was doing
8. if restart-count > 5 within 30 minutes: supervisor marks agent as "degraded" and stops retrying

## quota exhaustion

1. claude -p exits with a quota-exceeded error (detected via exit code or stderr)
2. daemon reports QUOTA-EXHAUSTED to supervisor
3. supervisor pauses ALL promotions across the swarm
4. supervisor logs alert and waits for quota reset
5. supervisor periodically tests quota availability (one small probe every 5 minutes)
6. on quota restoration: supervisor resumes promotions, requeues the failed task

## context overflow

1. claude session runs out of context window and exits
2. daemon detects context-overflow in stderr
3. daemon saves partial work to context.md
4. daemon reports FAIL <task> context-overflow to work channel
5. coordinator may break the task into smaller subtasks and re-assign
6. if task cannot be broken down: escalate to human

## tool loop

1. supervisor detects that an active agent has been running for longer than max-task-duration (configurable, default 30m)
2. supervisor sends SIGTERM to the claude process
3. daemon captures partial output, saves to context.md
4. daemon reports FAIL <task> timeout to work channel
5. coordinator decides whether to retry with a more focused prompt or escalate

## sandbox denial

1. claude session fails because a required tool was denied by the sandbox
2. daemon detects permission-denied pattern in stderr
3. daemon reports BLOCKED <task> sandbox-denied to work channel
4. coordinator may reassign to an agent with broader permissions
5. if no agent has the required permissions: escalate to human
```
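The ws-disconnect policy above is a standard capped exponential backoff. A self-contained sketch, assuming a hypothetical `connect()` that resolves to an open socket and a `reportToSupervisor` IPC hook:

```js
// Capped exponential backoff: 1s, 2s, 4s, 8s, ..., capped at 30s, giving up
// after 5 attempts. The numbers come from the spec above; everything else is
// an assumption for illustration.
async function reconnectWithBackoff(connect, reportToSupervisor, maxAttempts = 5) {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const delayMs = Math.min(1000 * 2 ** attempt, 30_000);
    await new Promise(resolve => setTimeout(resolve, delayMs));
    try {
      const ws = await connect();   // re-establish the websocket
      ws.send('HEARTBEAT');         // re-announce after re-joining the channel
      return ws;
    } catch {
      // connection still down; fall through to the next, longer delay
    }
  }
  // all attempts failed: escalate so the supervisor can restart the daemon
  reportToSupervisor({ type: 'ALERT', reason: 'ws-reconnect-failed' });
  return null;
}
```

The agent-crash restart delay is the same pattern with a higher cap: `Math.min(1000 * 2 ** restartCount, 300_000)`.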
package/owl/behaviors/scaling.md

@@ -0,0 +1,39 @@
```markdown
# scaling

how to add or remove agents from a running swarm without disrupting active work.

## scale up

1. user runs: agentctl swarm scale 20 (current count is 10)
2. supervisor calculates delta: 20 - 10 = 10 new agents needed
3. supervisor invokes spawner to create 10 new workspaces and identities
4. supervisor starts 10 new daemon processes
5. new daemons connect to agentchat and begin sending HEARTBEAT
6. supervisor updates process table with new entries
7. supervisor logs: "scaled up: 10 -> 20 daemons"

## scale down

1. user runs: agentctl swarm scale 5 (current count is 20)
2. supervisor calculates delta: 20 - 5 = 15 agents to remove
3. supervisor selects agents to remove: idle daemons first, then longest-idle
4. active agents are NEVER selected for removal — they finish their current task
5. supervisor sends SIGTERM to selected daemons
6. daemons disconnect from agentchat and exit cleanly
7. if persist: false, spawner tears down removed workspaces
8. supervisor updates process table
9. supervisor logs: "scaled down: 20 -> 5 daemons (15 removed, 0 active preserved)"

## scale to zero

1. user runs: agentctl swarm scale 0
2. equivalent to agentctl swarm stop — full shutdown flow applies
3. active agents are given 10s to save context before SIGKILL

## live reconfig

1. user modifies swarm.yaml and sends SIGHUP to supervisor
2. supervisor reloads config
3. changes to max-active, token-budget, and heartbeat-interval take effect immediately
4. changes to workspace-base or identity-template only affect newly spawned agents
5. supervisor logs: "config reloaded: max-active 5->10, budget 100k->200k"
```
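The scale-down selection rule (idle first, longest-idle first, never active) reduces to a filter and a sort. A sketch, assuming process-table entries carry a `status` field and a hypothetical `lastActiveAt` timestamp:

```js
// Pick daemons to remove on scale-down. Active agents are never candidates;
// among idle daemons, the one idle longest goes first.
function pickForRemoval(processTable, removeCount) {
  return [...processTable.values()]
    .filter(entry => entry.status === 'idle')
    .sort((a, b) => a.lastActiveAt - b.lastActiveAt) // oldest activity = longest idle
    .slice(0, removeCount); // may return fewer than requested if agents are busy
}
```

This matches the "scale down preserves active agents" test above, where scaling from 3 to 1 with one active agent removes only the two idle daemons.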
package/owl/behaviors/swarm-lifecycle.md

@@ -0,0 +1,38 @@
```markdown
# swarm-lifecycle

the full sequence from starting a swarm to shutting it down.

## startup flow

1. user runs: agentctl swarm start --count 10 --config swarm.yaml
2. supervisor reads and validates config
3. supervisor acquires pidfile lock (fails if another swarm is running)
4. supervisor invokes spawner to create N workspaces and identities
5. supervisor starts N daemon processes, one per workspace
6. each daemon connects to agentchat and joins the work channel
7. each daemon sends its first HEARTBEAT
8. supervisor logs: "swarm started: N daemons, 0 active"

## steady state

1. daemons idle, sending HEARTBEAT every 30s
2. health-monitor tracks heartbeats and resource usage
3. when a task appears on the work channel, eligible daemons send CLAIM
4. coordinator ACKs one daemon — daemon requests promotion from supervisor
5. supervisor approves promotion if budget and active-limit allow
6. daemon spawns claude -p session, transitions to active
7. active agent works until task completes or fails
8. agent reports DONE or FAIL, supervisor demotes back to daemon
9. daemon saves context.md and returns to idle

## shutdown flow

1. user runs: agentctl swarm stop (or supervisor receives SIGTERM)
2. supervisor sends SIGTERM to all child processes
3. active agents save context.md and exit
4. daemons disconnect from agentchat and exit
5. supervisor waits up to 10s for clean exits
6. supervisor sends SIGKILL to any remaining children
7. supervisor removes pidfile
8. if config has persist: false, spawner tears down all workspaces
9. supervisor logs: "swarm stopped: N agents shutdown"
```
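Steps 2 through 7 of the shutdown flow are the classic SIGTERM/grace/SIGKILL pattern. A sketch, assuming `children` is a Map of agent-id to ChildProcess; the `process.kill(pid, 0)` probe is a liveness check, not a signal delivery:

```js
import fs from 'node:fs';

// Liveness probe: signal 0 delivers nothing but throws if the PID is gone.
function isAlive(pid) {
  try { process.kill(pid, 0); return true; } catch { return false; }
}

async function shutdown(children, pidfile, graceMs = 10_000) {
  for (const child of children.values()) child.kill('SIGTERM'); // step 2

  // step 5: wait up to 10s for clean exits
  const deadline = Date.now() + graceMs;
  while (Date.now() < deadline && [...children.values()].some(c => isAlive(c.pid))) {
    await new Promise(resolve => setTimeout(resolve, 100));
  }

  // step 6: forcibly kill any survivors
  for (const child of children.values()) {
    if (isAlive(child.pid)) child.kill('SIGKILL');
  }

  fs.rmSync(pidfile, { force: true }); // step 7
}
```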
package/owl/components/daemon.md

@@ -0,0 +1,46 @@
```markdown
# daemon

a lightweight idle process that listens for tasks on agentchat and promotes to a full agent session when work is available. there is one daemon per swarm slot.

## state

- agent identity (agentchat id + name)
- workspace path
- assigned role (builder, auditor, qa, or general)
- status: idle, promoting, active, demoting, crashed
- agentchat connection (websocket)
- current task (null when idle)

## capabilities

- connect to agentchat server and join the work channel
- listen for task announcements and ASSIGN messages
- evaluate whether a task matches its role
- request promotion from supervisor when a matching task is found
- on promotion: spawn a claude -p session with the task prompt and workspace context
- forward agent output to agentchat as status messages
- detect when the claude session exits (success or failure)
- report task completion or failure to the work channel
- return to idle state after task completion (demotion)
- save minimal context to <workspace>/context.md on demotion for potential resume

## interfaces

exposes:
- CLAIM <component> - sent to work channel when daemon wants a task
- HEARTBEAT <agent-id> <status> - periodic health signal to supervisor
- DONE <task-id> <result> - task completed successfully
- FAIL <task-id> <reason> - task failed

depends on:
- supervisor for lifecycle management (start, stop, promote, demote)
- agentchat server for communication
- claude CLI for active work sessions
- spawner-provisioned workspace and identity

## invariants

- an idle daemon sends only HEARTBEAT messages — no other agentchat traffic
- a daemon never starts a claude session without supervisor approval (promotion)
- workspace files are only modified during active (promoted) state
- context.md is written on every demotion for crash recovery
```
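The idle-traffic invariant is easy to picture as a single timer. A sketch of the heartbeat loop; the state values mirror the status list in this spec, though the real `DaemonState` exported by lib/daemon.js may differ in shape, and `sendToSupervisor` is an assumed transport:

```js
// States mirror the spec's status list; the package's actual DaemonState
// export may be shaped differently.
const DaemonState = Object.freeze({
  IDLE: 'idle',
  PROMOTING: 'promoting',
  ACTIVE: 'active',
  DEMOTING: 'demoting',
  CRASHED: 'crashed',
});

// Idle daemons emit HEARTBEAT and nothing else (per the first invariant).
// The 30s default comes from swarm-lifecycle.md.
function startHeartbeat(daemon, sendToSupervisor, intervalMs = 30_000) {
  const timer = setInterval(() => {
    sendToSupervisor(`HEARTBEAT ${daemon.agentId} ${daemon.state}`);
  }, intervalMs);
  return () => clearInterval(timer); // caller stops the loop on shutdown
}
```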
package/owl/components/health-monitor.md

@@ -0,0 +1,37 @@
```markdown
# health-monitor

tracks agent health via heartbeats and resource usage. reports problems to the supervisor for action.

## state

- heartbeat table: map of agent-id to {last-seen, status, consecutive-misses}
- resource table: map of agent-id to {memory-mb, cpu-percent, uptime-seconds}
- alert thresholds (from config)

## capabilities

- receive HEARTBEAT messages from daemons (via agentchat or IPC)
- track time since last heartbeat per agent
- detect missed heartbeats (configurable threshold, default 3 consecutive misses at 30s interval = 90s timeout)
- query process stats (memory, cpu) for each agent PID
- report unresponsive agents to supervisor for restart
- report resource limit violations to supervisor for throttling or kill
- log health events to ~/.agentctl/logs/health.log

## interfaces

exposes:
- health-status(agent-id) -> {alive, last-seen, memory-mb, cpu-pct}
- health-summary() -> status of all agents
- ALERT <agent-id> <reason> - sent to supervisor when intervention needed

depends on:
- daemon HEARTBEAT messages
- OS process stats (/proc or ps on darwin)
- supervisor for acting on alerts

## invariants

- health-monitor never kills processes directly — it only reports to supervisor
- an agent is declared dead only after consecutive-misses exceeds threshold (no single-miss kills)
- health checks do not interfere with agent work (read-only process inspection)
```
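The no-single-miss invariant comes down to comparing elapsed time against the heartbeat interval. A sketch of the sweep, using the spec's heartbeat-table fields; `alert` is an assumed callback to the supervisor:

```js
// Sweep the heartbeat table and flag agents only after the configured number
// of consecutive misses (default 3 at a 30s interval, i.e. a 90s timeout).
function checkHeartbeats(heartbeatTable, alert, intervalMs = 30_000, maxMisses = 3) {
  const now = Date.now();
  for (const [agentId, entry] of heartbeatTable) {
    entry.consecutiveMisses = Math.floor((now - entry.lastSeen) / intervalMs);
    if (entry.consecutiveMisses >= maxMisses) {
      // report only: per the invariants, the monitor never kills anything itself
      alert(`ALERT ${agentId} unresponsive (${entry.consecutiveMisses} missed heartbeats)`);
    }
  }
}
```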
package/owl/components/spawner.md

@@ -0,0 +1,38 @@
```markdown
# spawner

creates isolated workspaces and agent identities for new swarm members. invoked by the supervisor at startup and when scaling up.

## state

- base workspace path (configurable, default ~/dev/claude/)
- identity template (role-based prompts and CLAUDE.md files)
- list of created workspaces (for cleanup on shutdown)

## capabilities

- create a workspace directory: ~/dev/claude/<agent-name>/
- clone the target repo into the workspace (if specified in config)
- create a feature branch: swarm/<agent-name>/<task-id>
- generate an agentchat identity and write it to <workspace>/.agentchat/identities/<name>.json
- write a CLAUDE.md file with the agent's role, constraints, and channel assignments
- write a context.md file with initial state (empty or from previous session)
- set up .gitignore with security entries (*.key, *.pem, .env, etc.)
- clean up workspace on agent removal (rm -rf after confirmation)

## interfaces

exposes:
- spawn(config) -> {workspace, identity, pid-placeholder}
- teardown(agent-id) -> removes workspace and identity

depends on:
- git for repo cloning and branch creation
- filesystem for workspace creation
- agentchat identity format (Ed25519 keypair)

## invariants

- each workspace is a complete, independent directory — no shared state between agents
- spawner never starts an agent process — it only prepares the environment
- teardown requires explicit confirmation (no silent deletion)
- .gitignore is always written before any other files
```
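A sketch of the provisioning order described above. The .gitignore-first invariant is the important part; the file contents, the identity JSON layout, and the Ed25519 generation via node's built-in crypto are illustrative assumptions:

```js
import crypto from 'node:crypto';
import fs from 'node:fs';
import path from 'node:path';

function provisionWorkspace(baseDir, agentName, role) {
  const workspace = path.join(baseDir, agentName);
  fs.mkdirSync(path.join(workspace, '.agentchat', 'identities'), { recursive: true });

  // invariant: security entries land on disk before any other file
  fs.writeFileSync(path.join(workspace, '.gitignore'), '*.key\n*.pem\n.env\n');

  // agentchat identity: an Ed25519 keypair, serialized to the identities dir
  // (the real identity file format is not specified here and is assumed)
  const { publicKey, privateKey } = crypto.generateKeyPairSync('ed25519');
  const identity = {
    name: agentName,
    publicKey: publicKey.export({ type: 'spki', format: 'pem' }),
    privateKey: privateKey.export({ type: 'pkcs8', format: 'pem' }),
  };
  fs.writeFileSync(
    path.join(workspace, '.agentchat', 'identities', `${agentName}.json`),
    JSON.stringify(identity, null, 2),
  );

  fs.writeFileSync(path.join(workspace, 'CLAUDE.md'), `role: ${role}\n`);
  fs.writeFileSync(path.join(workspace, 'context.md'), ''); // empty initial state
  return { workspace, identity: identity.name };
}
```

Note that, per the invariants, nothing here starts a process: the spawner only prepares the environment the daemon will later run in.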
package/owl/components/supervisor.md

@@ -0,0 +1,47 @@
```markdown
# supervisor

the top-level process that manages the swarm. there is one supervisor per machine.

## state

- swarm config (parsed from swarm.yaml)
- process table: map of agent-id to {pid, status, role, workspace, restart-count, last-heartbeat}
- promotion queue: ordered list of daemons waiting for an active slot
- token budget: remaining tokens across the swarm
- pidfile lock at ~/.agentctl/swarm.pid

## capabilities

- parse swarm config and validate settings
- invoke spawner to create N agent workspaces and identities
- start daemon processes and track their PIDs
- promote daemons to active agents when tasks are available and budget allows
- demote active agents back to daemon state when idle too long
- restart crashed agents with exponential backoff (1s, 2s, 4s, 8s... max 5m)
- enforce max concurrent active agent limit
- pause all promotions when token budget threshold reached
- graceful shutdown: send SIGTERM to all children, wait 10s, SIGKILL survivors
- respond to SIGHUP by reloading config without restarting agents
- write structured logs to ~/.agentctl/logs/supervisor.log

## interfaces

exposes:
- CLI: agentctl swarm start [--count N] [--config path]
- CLI: agentctl swarm stop
- CLI: agentctl swarm status
- CLI: agentctl swarm scale <N>
- CLI: agentctl swarm logs [agent-id]

depends on:
- spawner for workspace/identity creation
- health-monitor for heartbeat tracking
- agentchat server for agent communication
- claude CLI (claude -p) for running agent sessions

## invariants

- exactly one supervisor runs per machine (pidfile enforced)
- supervisor never executes agent work — it only manages processes
- all child processes die when supervisor dies (process group)
- restart backoff resets after 5 minutes of stable uptime
```
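The pidfile invariant, including the stale-pidfile cleanup exercised by the tests above, fits in a few lines. A sketch; the error message is chosen to satisfy the tests' /already running/ assertion:

```js
import fs from 'node:fs';

// Acquire the single-supervisor lock. A leftover pidfile from a dead
// supervisor is treated as stale and reclaimed (cf. the "stale pidfile" test).
function acquirePidfile(pidfile) {
  if (fs.existsSync(pidfile)) {
    const pid = parseInt(fs.readFileSync(pidfile, 'utf8'), 10);
    let alive = true;
    try {
      process.kill(pid, 0); // signal 0: liveness probe, delivers nothing
    } catch (err) {
      alive = err.code !== 'ESRCH'; // ESRCH means no such process
    }
    if (alive) throw new Error(`swarm already running (pid ${pid})`);
    fs.rmSync(pidfile); // stale pidfile; safe to reclaim the lock
  }
  fs.writeFileSync(pidfile, String(process.pid));
}
```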