agentctl-swarm 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,327 @@
1
+ /**
2
+ * Supervisor Tests
3
+ */
4
+
5
+ import { test, describe, before, after, beforeEach, afterEach } from 'node:test';
6
+ import assert from 'node:assert';
7
+ import fs from 'fs';
8
+ import path from 'path';
9
+ import os from 'os';
10
+ import { Supervisor } from './supervisor.js';
11
+ import { DaemonState } from './daemon.js';
12
+
13
+ const tmpBase = path.join(os.tmpdir(), `supervisor-test-${Date.now()}`); // per-run scratch root under the OS temp dir; the timestamp separates runs
14
+
15
+ function makeConfig(overrides = {}) { // builds a Supervisor config with its own paths under tmpBase; keys in `overrides` replace the defaults
16
+ const id = Math.random().toString(36).slice(2, 8); // short random base-36 suffix so each config gets a distinct workspace, pidfile and log dir
17
+ return {
18
+ count: 3, // default number of daemons; most tests override this
19
+ maxActive: 2, // default cap on concurrently promoted agents
20
+ basePath: path.join(tmpBase, `workspace-${id}`),
21
+ pidfile: path.join(tmpBase, `swarm-${id}.pid`),
22
+ logDir: path.join(tmpBase, `logs-${id}`),
23
+ heartbeatIntervalMs: 100, // presumably kept short so tests stay fast — confirm against supervisor.js
24
+ persist: false, // tests pass persist: true when they need workspaces to survive stop()
25
+ ...overrides, // spread last so caller-supplied values win
26
+ };
27
+ }
28
+
29
+ describe('Supervisor', () => {
30
+ before(() => { // create the shared scratch root before the tests in this suite run
31
+ fs.mkdirSync(tmpBase, { recursive: true });
32
+ });
33
+
34
+ after(() => { // delete the scratch root and anything the tests left under it
35
+ fs.rmSync(tmpBase, { recursive: true, force: true });
36
+ });
37
+
38
+ test('start spawns N daemons', () => { // start() with count: 5 should report five idle daemons and none active
39
+ const sup = new Supervisor(makeConfig({ count: 5 }));
40
+ sup.start();
41
+
42
+ const status = sup.status();
43
+ assert.strictEqual(status.total, 5);
44
+ assert.strictEqual(status.idle, 5);
45
+ assert.strictEqual(status.active, 0);
46
+ assert.strictEqual(status.running, true);
47
+
48
+ sup.stop(); // NOTE(review): not reached if an assertion above throws, leaving the supervisor started
49
+ });
50
+
51
+ test('pidfile prevents double start', () => { // a second Supervisor using the same pidfile path must refuse to start
52
+ const config = makeConfig(); // one config object, so both supervisors share the same pidfile path
53
+ const sup1 = new Supervisor(config);
54
+ sup1.start();
55
+
56
+ const sup2 = new Supervisor(config);
57
+ assert.throws(() => sup2.start(), /already running/); // the thrown error's message must match /already running/
58
+
59
+ sup1.stop();
60
+ });
61
+
62
+ test('pidfile removed on stop', () => { // the pidfile exists while running and is gone after stop()
63
+ const config = makeConfig();
64
+ const sup = new Supervisor(config);
65
+ sup.start();
66
+ assert.ok(fs.existsSync(config.pidfile));
67
+
68
+ sup.stop();
69
+ assert.ok(!fs.existsSync(config.pidfile));
70
+ });
71
+
72
+ test('stale pidfile is cleaned up', () => { // a leftover pidfile naming a non-live PID must not block start()
73
+ const config = makeConfig();
74
+ // Write a pidfile with a dead PID
75
+ fs.mkdirSync(path.dirname(config.pidfile), { recursive: true });
76
+ fs.writeFileSync(config.pidfile, '999999999'); // presumably larger than any real PID — confirm for target platforms
77
+
78
+ const sup = new Supervisor(config);
79
+ sup.start(); // Should succeed despite stale pidfile
80
+
81
+ assert.strictEqual(sup.status().running, true);
82
+ sup.stop();
83
+ });
84
+
85
+ test('stop cleans up all daemons', () => { // stop() should empty the process table and clear the running flag
86
+ const sup = new Supervisor(makeConfig({ count: 3, persist: false }));
87
+ sup.start();
88
+ assert.strictEqual(sup.processTable.size, 3);
89
+
90
+ sup.stop();
91
+ assert.strictEqual(sup.processTable.size, 0);
92
+ assert.strictEqual(sup.running, false);
93
+ });
94
+
95
+ test('stop with persist keeps workspaces', () => { // with persist: true, workspace directories must remain on disk after stop()
96
+ const config = makeConfig({ count: 2, persist: true });
97
+ const sup = new Supervisor(config);
98
+ sup.start();
99
+
100
+ // Get workspace paths before stopping
101
+ const workspaces = [];
102
+ for (const [, entry] of sup.processTable) { // processTable is iterated as [key, entry] pairs; only the entry is needed
103
+ workspaces.push(entry.daemon.workspace);
104
+ }
105
+
106
+ sup.stop();
107
+
108
+ // Workspaces should still exist
109
+ for (const ws of workspaces) {
110
+ assert.ok(fs.existsSync(ws), `Workspace should persist: ${ws}`);
111
+ }
112
+ });
113
+
114
+ test('promotion respects maxActive', () => { // with maxActive: 1, a second ASSIGN must be queued rather than promoted
115
+ const sup = new Supervisor(makeConfig({ count: 3, maxActive: 1 }));
116
+ sup.start();
117
+
118
+ const agents = [...sup.processTable.values()];
119
+
120
+ // Override _spawnClaude BEFORE handleMessage (supervisor auto-approves synchronously)
121
+ agents[0].daemon._spawnClaude = function () { // stub: marks the daemon ACTIVE and emits 'promoted' with pid: null
122
+ this.state = DaemonState.ACTIVE;
123
+ this.emit('promoted', { agentId: this.agentId, pid: null, task: this.currentTask });
124
+ };
125
+
126
+ // First promotion — auto-approved by supervisor
127
+ agents[0].daemon.handleMessage({
128
+ type: 'ASSIGN',
129
+ agentId: agents[0].daemon.agentId,
130
+ task: { component: 'test1', prompt: 'task 1' },
131
+ });
132
+
133
+ assert.strictEqual(sup.activeCount, 1);
134
+
135
+ // Second promotion should be queued (maxActive=1)
136
+ agents[1].daemon.handleMessage({
137
+ type: 'ASSIGN',
138
+ agentId: agents[1].daemon.agentId,
139
+ task: { component: 'test2', prompt: 'task 2' },
140
+ });
141
+
142
+ assert.strictEqual(sup.activeCount, 1); // still one active: the second request did not take a slot
143
+ assert.strictEqual(sup.promotionQueue.length, 1);
144
+
145
+ sup.stop();
146
+ });
147
+
148
+ test('promotion queue drains on demotion', () => { // when the active agent finishes, the queued promotion should leave the queue
149
+ const sup = new Supervisor(makeConfig({ count: 3, maxActive: 1 }));
150
+ sup.start();
151
+ const logs = []; // NOTE(review): collected below but never asserted on in this test
152
+ sup.on('log', l => logs.push(l));
153
+
154
+ const agents = [...sup.processTable.values()];
155
+
156
+ // Override _spawnClaude BEFORE triggering promotions
157
+ agents[0].daemon._spawnClaude = function () { // stub: marks the daemon ACTIVE and emits 'promoted' with pid: null
158
+ this.state = DaemonState.ACTIVE;
159
+ this.emit('promoted', { agentId: this.agentId, pid: null, task: this.currentTask });
160
+ };
161
+ agents[1].daemon._spawnClaude = function () { // same stub for the second daemon, which is promoted once the queue drains
162
+ this.state = DaemonState.ACTIVE;
163
+ this.emit('promoted', { agentId: this.agentId, pid: null, task: this.currentTask });
164
+ };
165
+
166
+ // Promote first (auto-approved)
167
+ agents[0].daemon.handleMessage({
168
+ type: 'ASSIGN',
169
+ agentId: agents[0].daemon.agentId,
170
+ task: { component: 'test1', prompt: 'task 1' },
171
+ });
172
+
173
+ // Queue second (maxActive=1, slot full)
174
+ agents[1].daemon.handleMessage({
175
+ type: 'ASSIGN',
176
+ agentId: agents[1].daemon.agentId,
177
+ task: { component: 'test2', prompt: 'task 2' },
178
+ });
179
+ assert.strictEqual(sup.promotionQueue.length, 1);
180
+
181
+ // Demote first — should auto-promote second from queue
182
+ agents[0].daemon._handleClaudeExit(0, null, 'done', ''); // NOTE(review): args look like (exit code, signal, stdout, stderr) — confirm against daemon.js
183
+
184
+ assert.strictEqual(sup.promotionQueue.length, 0);
185
+
186
+ sup.stop();
187
+ });
188
+
189
+ test('token budget pauses promotions', () => { // with the budget used up, an ASSIGN should pause promotions and make the daemon emit 'unclaim'
190
+ const sup = new Supervisor(makeConfig({ count: 2, maxActive: 5, tokenBudget: 100 })); // maxActive (5) exceeds count (2), presumably so only the budget can block promotion
191
+ sup.start();
192
+
193
+ sup.tokensUsed = 100; // Exhaust budget
194
+
195
+ const agents = [...sup.processTable.values()];
196
+ const unclaims = []; // payloads of every 'unclaim' event from the first daemon
197
+ agents[0].daemon.on('unclaim', u => unclaims.push(u));
198
+
199
+ agents[0].daemon.handleMessage({
200
+ type: 'ASSIGN',
201
+ agentId: agents[0].daemon.agentId,
202
+ task: { component: 'test', prompt: 'task' },
203
+ });
204
+
205
+ assert.strictEqual(sup.promotionsPaused, true);
206
+ assert.strictEqual(unclaims.length, 1);
207
+ assert.ok(unclaims[0].reason.includes('budget')); // the unclaim reason text must mention the budget
208
+
209
+ sup.stop();
210
+ });
211
+
212
+ test('scale up adds daemons', () => { // scale(5) from 2 should add three daemons and report from/to/added
213
+ const sup = new Supervisor(makeConfig({ count: 2 }));
214
+ sup.start();
215
+ assert.strictEqual(sup.processTable.size, 2);
216
+
217
+ const result = sup.scale(5);
218
+ assert.strictEqual(result.from, 2);
219
+ assert.strictEqual(result.to, 5);
220
+ assert.strictEqual(result.added, 3);
221
+ assert.strictEqual(sup.processTable.size, 5);
222
+
223
+ sup.stop();
224
+ });
225
+
226
+ test('scale down removes idle daemons', () => { // scale(2) from 5 all-idle daemons should remove three
227
+ const sup = new Supervisor(makeConfig({ count: 5 }));
228
+ sup.start();
229
+
230
+ const result = sup.scale(2);
231
+ assert.strictEqual(result.from, 5);
232
+ assert.strictEqual(result.removed, 3);
233
+ assert.strictEqual(sup.processTable.size, 2);
234
+
235
+ sup.stop();
236
+ });
237
+
238
+ test('scale down preserves active agents', () => { // scaling down with one active agent should remove only the idle daemons
239
+ const sup = new Supervisor(makeConfig({ count: 3, maxActive: 3 }));
240
+ sup.start();
241
+
242
+ const agents = [...sup.processTable.values()];
243
+
244
+ // Override _spawnClaude BEFORE handleMessage
245
+ agents[0].daemon._spawnClaude = function () { // stub: marks the daemon ACTIVE and emits 'promoted' with pid: null
246
+ this.state = DaemonState.ACTIVE;
247
+ this.emit('promoted', { agentId: this.agentId, pid: null, task: this.currentTask });
248
+ };
249
+
250
+ // Promote one agent (auto-approved)
251
+ agents[0].daemon.handleMessage({
252
+ type: 'ASSIGN',
253
+ agentId: agents[0].daemon.agentId,
254
+ task: { component: 'busy', prompt: 'working' },
255
+ });
256
+
257
+ // Scale down to 1 — should only remove idle daemons
258
+ const result = sup.scale(1);
259
+ assert.strictEqual(result.removed, 2); // 2 idle removed
260
+ // The active agent should still be in the process table
261
+ assert.ok(sup.processTable.size >= 1); // NOTE(review): only a lower bound on table size; does not check that agents[0] itself survived
262
+
263
+ sup.stop();
264
+ });
265
+
266
+ test('scale to zero stops swarm', () => { // scale(0) should leave the supervisor stopped with an empty process table
267
+ const sup = new Supervisor(makeConfig({ count: 3 }));
268
+ sup.start();
269
+ sup.scale(0); // no explicit stop() follows: the assertions expect scale(0) to have shut the swarm down
270
+ assert.strictEqual(sup.running, false);
271
+ assert.strictEqual(sup.processTable.size, 0);
272
+ });
273
+
274
+ test('reloadConfig updates maxActive', () => { // reloadConfig({ maxActive }) should change sup.maxActive on a started supervisor
275
+ const sup = new Supervisor(makeConfig({ count: 2, maxActive: 1 }));
276
+ sup.start();
277
+
278
+ assert.strictEqual(sup.maxActive, 1);
279
+ sup.reloadConfig({ maxActive: 10 });
280
+ assert.strictEqual(sup.maxActive, 10);
281
+
282
+ sup.stop();
283
+ });
284
+
285
+ test('reloadConfig resumes promotions if budget increased', () => { // raising tokenBudget above tokensUsed should clear promotionsPaused
286
+ const sup = new Supervisor(makeConfig({ count: 2, tokenBudget: 100 }));
287
+ sup.start();
288
+ sup.tokensUsed = 100; // budget fully consumed
289
+ sup.promotionsPaused = true; // set the paused flag directly rather than triggering it through an ASSIGN
290
+
291
+ sup.reloadConfig({ tokenBudget: 200 });
292
+ assert.strictEqual(sup.promotionsPaused, false);
293
+
294
+ sup.stop();
295
+ });
296
+
297
+ test('status returns complete swarm info', () => { // checks the status() fields: running, uptime, counts, and per-agent entries
298
+ const sup = new Supervisor(makeConfig({ count: 3 }));
299
+ sup.start();
300
+
301
+ const status = sup.status();
302
+ assert.strictEqual(status.running, true);
303
+ assert.ok(status.uptime >= 0);
304
+ assert.strictEqual(status.total, 3);
305
+ assert.strictEqual(status.active, 0);
306
+ assert.strictEqual(status.idle, 3);
307
+ assert.strictEqual(status.agents.length, 3);
308
+ assert.ok(status.agents[0].agentId); // only the first agent entry is inspected for agentId, name and state
309
+ assert.ok(status.agents[0].name);
310
+ assert.strictEqual(status.agents[0].state, DaemonState.IDLE);
311
+
312
+ sup.stop();
313
+ });
314
+
315
+ test('supervisor never executes agent work', () => { // API-surface check: management methods exist, work-executing methods do not
316
+ const sup = new Supervisor(makeConfig()); // start() is never called here, so no stop() is needed
317
+ // Verify no execute/run/build methods exist on supervisor
318
+ assert.strictEqual(typeof sup.execute, 'undefined');
319
+ assert.strictEqual(typeof sup.run, 'undefined');
320
+ assert.strictEqual(typeof sup.build, 'undefined');
321
+ // It only manages — start, stop, scale, status
322
+ assert.strictEqual(typeof sup.start, 'function');
323
+ assert.strictEqual(typeof sup.stop, 'function');
324
+ assert.strictEqual(typeof sup.scale, 'function');
325
+ assert.strictEqual(typeof sup.status, 'function');
326
+ });
327
+ });
@@ -0,0 +1,30 @@
1
+ # promotion
2
+
3
+ how a daemon transitions from idle listener to active agent executing a task.
4
+
5
+ ## flow
6
+
7
+ 1. daemon sees a task announcement or ASSIGN message on the work channel
8
+ 2. daemon evaluates role match: does the task match its assigned role?
9
+ 3. if match: daemon sends CLAIM <component> to the work channel
10
+ 4. coordinator responds with ASSIGN <component> <agent-id> or REJECTED
11
+ 5. if REJECTED: daemon returns to idle
12
+ 6. if ASSIGN names this daemon: daemon sends PROMOTE-REQUEST to supervisor (via IPC)
13
+ 7. supervisor checks: active count < max-active AND token budget remaining
14
+ 8. if denied: daemon sends UNCLAIM <component> to work channel, returns to idle
15
+ 9. if approved: supervisor marks daemon as "promoting"
16
+ 10. daemon writes task context to <workspace>/context.md
17
+ 11. daemon spawns: claude -p "<task prompt with spec context>" --cwd <workspace>
18
+ 12. supervisor marks daemon as "active", starts tracking the claude PID
19
+ 13. daemon monitors the claude process stdout/stderr
20
+ 14. daemon forwards relevant output as status messages to work channel
21
+
22
+ ## demotion
23
+
24
+ 1. claude process exits (success or failure)
25
+ 2. daemon captures exit code and final output
26
+ 3. daemon sends DONE or FAIL to work channel
27
+ 4. daemon saves summary to <workspace>/context.md
28
+ 5. daemon sends DEMOTE notification to supervisor (via IPC)
29
+ 6. supervisor marks daemon as "idle", decrements active count
30
+ 7. daemon resumes listening on work channel
@@ -0,0 +1,58 @@
1
+ # recovery
2
+
3
+ how the swarm handles things going wrong. each failure mode has a distinct recovery strategy.
4
+
5
+ ## ws disconnect
6
+
7
+ 1. daemon detects websocket close or error event
8
+ 2. daemon attempts reconnect with backoff: 1s, 2s, 4s, 8s, max 30s
9
+ 3. on reconnect: daemon re-joins work channel, sends HEARTBEAT
10
+ 4. if reconnect fails after 5 attempts: daemon reports to supervisor via IPC
11
+ 5. supervisor may restart the daemon process entirely
12
+
13
+ an active agent that loses WS connection continues working locally. it queues status messages and flushes them on reconnect.
14
+
15
+ ## agent crash
16
+
17
+ 1. supervisor detects child process exit via SIGCHLD
18
+ 2. supervisor checks exit code: 0 = clean exit, non-zero = crash
19
+ 3. on crash: supervisor increments restart-count for that agent
20
+ 4. supervisor applies exponential backoff: delay = min(2^(restart-count - 1) seconds, 300s), i.e. 1s, 2s, 4s, 8s, ... capped at 5m
21
+ 5. after delay: supervisor invokes spawner to verify workspace integrity
22
+ 6. supervisor starts a new daemon process in the same workspace
23
+ 7. new daemon reads context.md to understand what it was doing
24
+ 8. if restart-count > 5 within 30 minutes: supervisor marks agent as "degraded" and stops retrying
25
+
26
+ ## quota exhaustion
27
+
28
+ 1. claude -p exits with a quota-exceeded error (detected via exit code or stderr)
29
+ 2. daemon reports QUOTA-EXHAUSTED to supervisor
30
+ 3. supervisor pauses ALL promotions across the swarm
31
+ 4. supervisor logs alert and waits for quota reset
32
+ 5. supervisor periodically tests quota availability (one small probe every 5 minutes)
33
+ 6. on quota restoration: supervisor resumes promotions, requeues the failed task
34
+
35
+ ## context overflow
36
+
37
+ 1. claude session runs out of context window and exits
38
+ 2. daemon detects context-overflow in stderr
39
+ 3. daemon saves partial work to context.md
40
+ 4. daemon reports FAIL <task> context-overflow to work channel
41
+ 5. coordinator may break the task into smaller subtasks and re-assign
42
+ 6. if task cannot be broken down: escalate to human
43
+
44
+ ## tool loop
45
+
46
+ 1. supervisor detects that an active agent has been running for longer than max-task-duration (configurable, default 30m)
47
+ 2. supervisor sends SIGTERM to the claude process
48
+ 3. daemon captures partial output, saves to context.md
49
+ 4. daemon reports FAIL <task> timeout to work channel
50
+ 5. coordinator decides whether to retry with a more focused prompt or escalate
51
+
52
+ ## sandbox denial
53
+
54
+ 1. claude session fails because a required tool was denied by the sandbox
55
+ 2. daemon detects permission-denied pattern in stderr
56
+ 3. daemon reports BLOCKED <task> sandbox-denied to work channel
57
+ 4. coordinator may reassign to an agent with broader permissions
58
+ 5. if no agent has the required permissions: escalate to human
@@ -0,0 +1,39 @@
1
+ # scaling
2
+
3
+ how to add or remove agents from a running swarm without disrupting active work.
4
+
5
+ ## scale up
6
+
7
+ 1. user runs: agentctl swarm scale 20 (current count is 10)
8
+ 2. supervisor calculates delta: 20 - 10 = 10 new agents needed
9
+ 3. supervisor invokes spawner to create 10 new workspaces and identities
10
+ 4. supervisor starts 10 new daemon processes
11
+ 5. new daemons connect to agentchat and begin sending HEARTBEAT
12
+ 6. supervisor updates process table with new entries
13
+ 7. supervisor logs: "scaled up: 10 -> 20 daemons"
14
+
15
+ ## scale down
16
+
17
+ 1. user runs: agentctl swarm scale 5 (current count is 20)
18
+ 2. supervisor calculates delta: 20 - 5 = 15 agents to remove
19
+ 3. supervisor selects agents to remove: only idle daemons are eligible, longest-idle first
20
+ 4. active agents are NEVER selected for removal — they finish their current task
21
+ 5. supervisor sends SIGTERM to selected daemons
22
+ 6. daemons disconnect from agentchat and exit cleanly
23
+ 7. if persist: false, spawner tears down removed workspaces
24
+ 8. supervisor updates process table
25
+ 9. supervisor logs: "scaled down: 20 -> 5 daemons (15 removed, 0 active preserved)"
26
+
27
+ ## scale to zero
28
+
29
+ 1. user runs: agentctl swarm scale 0
30
+ 2. equivalent to agentctl swarm stop — full shutdown flow applies
31
+ 3. active agents are given 10s to save context before SIGKILL
32
+
33
+ ## live reconfig
34
+
35
+ 1. user modifies swarm.yaml and sends SIGHUP to supervisor
36
+ 2. supervisor reloads config
37
+ 3. changes to max-active, token-budget, and heartbeat-interval take effect immediately
38
+ 4. changes to workspace-base or identity-template only affect newly spawned agents
39
+ 5. supervisor logs: "config reloaded: max-active 5->10, budget 100k->200k"
@@ -0,0 +1,38 @@
1
+ # swarm-lifecycle
2
+
3
+ the full sequence from starting a swarm to shutting it down.
4
+
5
+ ## startup flow
6
+
7
+ 1. user runs: agentctl swarm start --count 10 --config swarm.yaml
8
+ 2. supervisor reads and validates config
9
+ 3. supervisor acquires pidfile lock (fails if another swarm is running)
10
+ 4. supervisor invokes spawner to create N workspaces and identities
11
+ 5. supervisor starts N daemon processes, one per workspace
12
+ 6. each daemon connects to agentchat and joins the work channel
13
+ 7. each daemon sends its first HEARTBEAT
14
+ 8. supervisor logs: "swarm started: N daemons, 0 active"
15
+
16
+ ## steady state
17
+
18
+ 1. daemons idle, sending HEARTBEAT every 30s
19
+ 2. health-monitor tracks heartbeats and resource usage
20
+ 3. when a task appears on the work channel, eligible daemons send CLAIM
21
+ 4. coordinator answers one daemon's CLAIM with ASSIGN — that daemon requests promotion from supervisor
22
+ 5. supervisor approves promotion if budget and active-limit allow
23
+ 6. daemon spawns claude -p session, transitions to active
24
+ 7. active agent works until task completes or fails
25
+ 8. agent reports DONE or FAIL, supervisor demotes back to daemon
26
+ 9. daemon saves context.md and returns to idle
27
+
28
+ ## shutdown flow
29
+
30
+ 1. user runs: agentctl swarm stop (or supervisor receives SIGTERM)
31
+ 2. supervisor sends SIGTERM to all child processes
32
+ 3. active agents save context.md and exit
33
+ 4. daemons disconnect from agentchat and exit
34
+ 5. supervisor waits up to 10s for clean exits
35
+ 6. supervisor sends SIGKILL to any remaining children
36
+ 7. supervisor removes pidfile
37
+ 8. if config has persist: false, spawner tears down all workspaces
38
+ 9. supervisor logs: "swarm stopped: N agents shutdown"
@@ -0,0 +1,46 @@
1
+ # daemon
2
+
3
+ a lightweight idle process that listens for tasks on agentchat and promotes to a full agent session when work is available. there is one daemon per swarm slot.
4
+
5
+ ## state
6
+
7
+ - agent identity (agentchat id + name)
8
+ - workspace path
9
+ - assigned role (builder, auditor, qa, or general)
10
+ - status: idle, promoting, active, demoting, crashed
11
+ - agentchat connection (websocket)
12
+ - current task (null when idle)
13
+
14
+ ## capabilities
15
+
16
+ - connect to agentchat server and join the work channel
17
+ - listen for task announcements and ASSIGN messages
18
+ - evaluate whether a task matches its role
19
+ - request promotion from supervisor when a matching task is found
20
+ - on promotion: spawn a claude -p session with the task prompt and workspace context
21
+ - forward agent output to agentchat as status messages
22
+ - detect when the claude session exits (success or failure)
23
+ - report task completion or failure to the work channel
24
+ - return to idle state after task completion (demotion)
25
+ - save minimal context to <workspace>/context.md on demotion for potential resume
26
+
27
+ ## interfaces
28
+
29
+ exposes:
30
+ - CLAIM <component> - sent to work channel when daemon wants a task
31
+ - HEARTBEAT <agent-id> <status> - periodic health signal to supervisor
32
+ - DONE <task-id> <result> - task completed successfully
33
+ - FAIL <task-id> <reason> - task failed
34
+
35
+ depends on:
36
+ - supervisor for lifecycle management (start, stop, promote, demote)
37
+ - agentchat server for communication
38
+ - claude CLI for active work sessions
39
+ - spawner-provisioned workspace and identity
40
+
41
+ ## invariants
42
+
43
+ - an idle daemon sends only HEARTBEAT messages and CLAIMs for matching tasks — no other agentchat traffic
44
+ - a daemon never starts a claude session without supervisor approval (promotion)
45
+ - workspace files are only modified during active (promoted) state
46
+ - context.md is written on every demotion for crash recovery
@@ -0,0 +1,37 @@
1
+ # health-monitor
2
+
3
+ tracks agent health via heartbeats and resource usage. reports problems to the supervisor for action.
4
+
5
+ ## state
6
+
7
+ - heartbeat table: map of agent-id to {last-seen, status, consecutive-misses}
8
+ - resource table: map of agent-id to {memory-mb, cpu-percent, uptime-seconds}
9
+ - alert thresholds (from config)
10
+
11
+ ## capabilities
12
+
13
+ - receive HEARTBEAT messages from daemons (via agentchat or IPC)
14
+ - track time since last heartbeat per agent
15
+ - detect missed heartbeats (configurable threshold, default 3 consecutive misses at 30s interval = 90s timeout)
16
+ - query process stats (memory, cpu) for each agent PID
17
+ - report unresponsive agents to supervisor for restart
18
+ - report resource limit violations to supervisor for throttling or kill
19
+ - log health events to ~/.agentctl/logs/health.log
20
+
21
+ ## interfaces
22
+
23
+ exposes:
24
+ - health-status(agent-id) -> {alive, last-seen, memory-mb, cpu-pct}
25
+ - health-summary() -> status of all agents
26
+ - ALERT <agent-id> <reason> - sent to supervisor when intervention needed
27
+
28
+ depends on:
29
+ - daemon HEARTBEAT messages
30
+ - OS process stats (/proc or ps on darwin)
31
+ - supervisor for acting on alerts
32
+
33
+ ## invariants
34
+
35
+ - health-monitor never kills processes directly — it only reports to supervisor
36
+ - an agent is declared dead only after consecutive-misses reaches the threshold (no single-miss kills)
37
+ - health checks do not interfere with agent work (read-only process inspection)
@@ -0,0 +1,38 @@
1
+ # spawner
2
+
3
+ creates isolated workspaces and agent identities for new swarm members. invoked by the supervisor at startup and when scaling up.
4
+
5
+ ## state
6
+
7
+ - base workspace path (configurable, default ~/dev/claude/)
8
+ - identity template (role-based prompts and CLAUDE.md files)
9
+ - list of created workspaces (for cleanup on shutdown)
10
+
11
+ ## capabilities
12
+
13
+ - create a workspace directory: ~/dev/claude/<agent-name>/
14
+ - clone the target repo into the workspace (if specified in config)
15
+ - create a feature branch: swarm/<agent-name>/<task-id>
16
+ - generate an agentchat identity and write it to <workspace>/.agentchat/identities/<name>.json
17
+ - write a CLAUDE.md file with the agent's role, constraints, and channel assignments
18
+ - write a context.md file with initial state (empty or from previous session)
19
+ - set up .gitignore with security entries (*.key, *.pem, .env, etc.)
20
+ - clean up workspace on agent removal (rm -rf after confirmation)
21
+
22
+ ## interfaces
23
+
24
+ exposes:
25
+ - spawn(config) -> {workspace, identity, pid-placeholder}
26
+ - teardown(agent-id) -> removes workspace and identity
27
+
28
+ depends on:
29
+ - git for repo cloning and branch creation
30
+ - filesystem for workspace creation
31
+ - agentchat identity format (Ed25519 keypair)
32
+
33
+ ## invariants
34
+
35
+ - each workspace is a complete, independent directory — no shared state between agents
36
+ - spawner never starts an agent process — it only prepares the environment
37
+ - teardown requires explicit confirmation (no silent deletion)
38
+ - .gitignore is always written before any other files
@@ -0,0 +1,47 @@
1
+ # supervisor
2
+
3
+ the top-level process that manages the swarm. there is one supervisor per machine.
4
+
5
+ ## state
6
+
7
+ - swarm config (parsed from swarm.yaml)
8
+ - process table: map of agent-id to {pid, status, role, workspace, restart-count, last-heartbeat}
9
+ - promotion queue: ordered list of daemons waiting for an active slot
10
+ - token budget: remaining tokens across the swarm
11
+ - pidfile lock at ~/.agentctl/swarm.pid
12
+
13
+ ## capabilities
14
+
15
+ - parse swarm config and validate settings
16
+ - invoke spawner to create N agent workspaces and identities
17
+ - start daemon processes and track their PIDs
18
+ - promote daemons to active agents when tasks are available and budget allows
19
+ - demote active agents back to daemon state when idle too long
20
+ - restart crashed agents with exponential backoff (1s, 2s, 4s, 8s... max 5m)
21
+ - enforce max concurrent active agent limit
22
+ - pause all promotions when token budget threshold reached
23
+ - graceful shutdown: send SIGTERM to all children, wait 10s, SIGKILL survivors
24
+ - respond to SIGHUP by reloading config without restarting agents
25
+ - write structured logs to ~/.agentctl/logs/supervisor.log
26
+
27
+ ## interfaces
28
+
29
+ exposes:
30
+ - CLI: agentctl swarm start [--count N] [--config path]
31
+ - CLI: agentctl swarm stop
32
+ - CLI: agentctl swarm status
33
+ - CLI: agentctl swarm scale <N>
34
+ - CLI: agentctl swarm logs [agent-id]
35
+
36
+ depends on:
37
+ - spawner for workspace/identity creation
38
+ - health-monitor for heartbeat tracking
39
+ - agentchat server for agent communication
40
+ - claude CLI (claude -p) for running agent sessions
41
+
42
+ ## invariants
43
+
44
+ - exactly one supervisor runs per machine (pidfile enforced)
45
+ - supervisor never executes agent work — it only manages processes
46
+ - all child processes die when supervisor dies (process group)
47
+ - restart backoff resets after 5 minutes of stable uptime