@mclawnet/swarm 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/dist/__tests__/dynamic-add-integration.test.d.ts +2 -0
  2. package/dist/__tests__/dynamic-add-integration.test.d.ts.map +1 -0
  3. package/dist/__tests__/dynamic-add-integration.test.js +108 -0
  4. package/dist/__tests__/dynamic-add-integration.test.js.map +1 -0
  5. package/dist/__tests__/membership-change.test.d.ts +2 -0
  6. package/dist/__tests__/membership-change.test.d.ts.map +1 -0
  7. package/dist/__tests__/membership-change.test.js +132 -0
  8. package/dist/__tests__/membership-change.test.js.map +1 -0
  9. package/dist/__tests__/persistence.test.js +93 -4
  10. package/dist/__tests__/persistence.test.js.map +1 -1
  11. package/dist/__tests__/phase4-5-e2e.test.js +7 -7
  12. package/dist/__tests__/phase4-5-e2e.test.js.map +1 -1
  13. package/dist/__tests__/phase6-7-e2e.test.js +10 -7
  14. package/dist/__tests__/phase6-7-e2e.test.js.map +1 -1
  15. package/dist/__tests__/plan-sync.test.d.ts +2 -0
  16. package/dist/__tests__/plan-sync.test.d.ts.map +1 -0
  17. package/dist/__tests__/plan-sync.test.js +30 -0
  18. package/dist/__tests__/plan-sync.test.js.map +1 -0
  19. package/dist/__tests__/projects-fs.test.js +30 -0
  20. package/dist/__tests__/projects-fs.test.js.map +1 -1
  21. package/dist/__tests__/recovery-forwards-to-coordinator.test.js +1 -1
  22. package/dist/__tests__/recovery-forwards-to-coordinator.test.js.map +1 -1
  23. package/dist/__tests__/recovery-resume.test.js +15 -5
  24. package/dist/__tests__/recovery-resume.test.js.map +1 -1
  25. package/dist/__tests__/role-loader-editor.test.d.ts +2 -0
  26. package/dist/__tests__/role-loader-editor.test.d.ts.map +1 -0
  27. package/dist/__tests__/role-loader-editor.test.js +138 -0
  28. package/dist/__tests__/role-loader-editor.test.js.map +1 -0
  29. package/dist/__tests__/role-loader.test.js +9 -0
  30. package/dist/__tests__/role-loader.test.js.map +1 -1
  31. package/dist/__tests__/spawn-role-injects-briefings.test.js +40 -28
  32. package/dist/__tests__/spawn-role-injects-briefings.test.js.map +1 -1
  33. package/dist/__tests__/spawn-role-mutex.test.d.ts +2 -0
  34. package/dist/__tests__/spawn-role-mutex.test.d.ts.map +1 -0
  35. package/dist/__tests__/spawn-role-mutex.test.js +158 -0
  36. package/dist/__tests__/spawn-role-mutex.test.js.map +1 -0
  37. package/dist/__tests__/spawn-role-rollback.test.d.ts +2 -0
  38. package/dist/__tests__/spawn-role-rollback.test.d.ts.map +1 -0
  39. package/dist/__tests__/spawn-role-rollback.test.js +180 -0
  40. package/dist/__tests__/spawn-role-rollback.test.js.map +1 -0
  41. package/dist/__tests__/swarm-coordinator-backend.test.d.ts +2 -0
  42. package/dist/__tests__/swarm-coordinator-backend.test.d.ts.map +1 -0
  43. package/dist/__tests__/swarm-coordinator-backend.test.js +334 -0
  44. package/dist/__tests__/swarm-coordinator-backend.test.js.map +1 -0
  45. package/dist/__tests__/swarm-coordinator-init.test.js +2 -2
  46. package/dist/__tests__/swarm-coordinator-init.test.js.map +1 -1
  47. package/dist/__tests__/swarm-coordinator-plan-sync.test.d.ts +2 -0
  48. package/dist/__tests__/swarm-coordinator-plan-sync.test.d.ts.map +1 -0
  49. package/dist/__tests__/swarm-coordinator-plan-sync.test.js +263 -0
  50. package/dist/__tests__/swarm-coordinator-plan-sync.test.js.map +1 -0
  51. package/dist/__tests__/swarm-coordinator-resume.test.js +27 -17
  52. package/dist/__tests__/swarm-coordinator-resume.test.js.map +1 -1
  53. package/dist/__tests__/swarm-coordinator-roleId.test.js +24 -9
  54. package/dist/__tests__/swarm-coordinator-roleId.test.js.map +1 -1
  55. package/dist/__tests__/sync-plan-status.test.d.ts +2 -0
  56. package/dist/__tests__/sync-plan-status.test.d.ts.map +1 -0
  57. package/dist/__tests__/sync-plan-status.test.js +198 -0
  58. package/dist/__tests__/sync-plan-status.test.js.map +1 -0
  59. package/dist/__tests__/template-loader-editor.test.d.ts +2 -0
  60. package/dist/__tests__/template-loader-editor.test.d.ts.map +1 -0
  61. package/dist/__tests__/template-loader-editor.test.js +141 -0
  62. package/dist/__tests__/template-loader-editor.test.js.map +1 -0
  63. package/dist/index.d.ts +5 -2
  64. package/dist/index.d.ts.map +1 -1
  65. package/dist/index.js +3 -2
  66. package/dist/index.js.map +1 -1
  67. package/dist/persistence.d.ts +5 -1
  68. package/dist/persistence.d.ts.map +1 -1
  69. package/dist/persistence.js +4 -1
  70. package/dist/persistence.js.map +1 -1
  71. package/dist/plan-sync.d.ts +10 -0
  72. package/dist/plan-sync.d.ts.map +1 -0
  73. package/dist/plan-sync.js +37 -0
  74. package/dist/plan-sync.js.map +1 -0
  75. package/dist/projects-fs.d.ts +2 -0
  76. package/dist/projects-fs.d.ts.map +1 -1
  77. package/dist/projects-fs.js +1 -0
  78. package/dist/projects-fs.js.map +1 -1
  79. package/dist/recovery.d.ts +1 -1
  80. package/dist/recovery.js +1 -1
  81. package/dist/roles/role-loader.d.ts +19 -0
  82. package/dist/roles/role-loader.d.ts.map +1 -1
  83. package/dist/roles/role-loader.js +115 -4
  84. package/dist/roles/role-loader.js.map +1 -1
  85. package/dist/roles/types.d.ts +13 -0
  86. package/dist/roles/types.d.ts.map +1 -1
  87. package/dist/swarm-coordinator.d.ts +87 -9
  88. package/dist/swarm-coordinator.d.ts.map +1 -1
  89. package/dist/swarm-coordinator.js +392 -60
  90. package/dist/swarm-coordinator.js.map +1 -1
  91. package/dist/templates/template-loader.d.ts +26 -0
  92. package/dist/templates/template-loader.d.ts.map +1 -1
  93. package/dist/templates/template-loader.js +128 -14
  94. package/dist/templates/template-loader.js.map +1 -1
  95. package/dist/types.d.ts +25 -1
  96. package/dist/types.d.ts.map +1 -1
  97. package/package.json +6 -6
  98. package/roles/queen.md +26 -0
  99. package/dist/message-router.d.ts +0 -37
  100. package/dist/message-router.d.ts.map +0 -1
  101. package/dist/message-router.js +0 -60
  102. package/dist/message-router.js.map +0 -1
@@ -11,10 +11,12 @@ import { InboxWatcher } from "./inbox-watcher.js";
11
11
  import { randomUUID } from "node:crypto";
12
12
  import { EvolutionPipeline } from "@mclawnet/skill-manager";
13
13
  import { TaskStore, computeLeadBriefing, computeMemberBriefing, computeTaskBriefing, formatLeadBriefing, formatMemberBriefing, formatTaskBriefing, projectRoot } from "@mclawnet/task";
14
+ import { pickStrongestStatus } from "./plan-sync.js";
14
15
  import { existsSync } from "node:fs";
15
16
  import { homedir } from "node:os";
16
17
  import { join } from "node:path";
17
18
  import { createLogger } from "@mclawnet/logger";
19
+ import { DEFAULT_SANDBOX } from "@mclawnet/shared";
18
20
  const log = createLogger({ module: "swarm" });
19
21
  // Queen periodic patrol: pure safety-net heartbeat now that crash detection
20
22
  // is event-driven via handleRoleCrashed. Worker `task_set_status` /
@@ -49,6 +51,18 @@ export class SwarmCoordinator {
49
51
  swarms = new Map();
50
52
  inboxRelay;
51
53
  inboxWatcher;
54
+ /**
55
+ * Per-swarm spawn mutex (PR#5). A simple Promise-chain serialises every
56
+ * `spawnRole` call against a given swarmId so the
57
+ * `nextInstanceSeq.get / set` read-modify-write pair is atomic from the
58
+ * caller's perspective. Without this, two concurrent dynamic adds (e.g.
59
+ * the UI clicking "+ 添加成员" twice quickly while a prior spawn awaits
60
+ * its backend createSession) both observe seq=N and collide on the
61
+ * resulting `dev-N` instanceId. The chain element is replaced on every
62
+ * call so completed entries garbage-collect naturally; we never delete
63
+ * the key (cheap one-entry-per-swarm and avoids races on cleanup).
64
+ */
65
+ spawnLocks = new Map();
52
66
  constructor(sessionAdapter, hub,
53
67
  /**
54
68
  * Optional factory that resolves a per-swarm TaskStore (workDir-scoped).
@@ -95,6 +109,85 @@ export class SwarmCoordinator {
95
109
  }
96
110
  return this.taskStoreFactory;
97
111
  }
112
+ /**
113
+ * Persist swarm snapshot to recovery.json, first reconciling plan task
114
+ * statuses from the live TaskStore (Bug #2 方案 B). All save call sites
115
+ * MUST go through this wrapper instead of saveSwarmSnapshot directly so
116
+ * the on-disk plan never lies about task progress.
117
+ *
118
+ * Why this exists:
119
+ * `swarm.plan` is a snapshot of queen's initial output — queen sets
120
+ * each task.status to "pending" once and never updates the plan
121
+ * object. Without this sync, recovery.json would keep showing
122
+ * "pending" forever, misleading both the queen LLM (when re-reading
123
+ * its own plan after compaction) and human readers.
124
+ */
125
+ persistSwarm(swarm) {
126
+ try {
127
+ this.syncPlanStatusFromTasks(swarm);
128
+ }
129
+ catch (err) {
130
+ // Sync failure must not block persistence — recovery.json is the
131
+ // critical artifact for kill+restart; a stale plan is far better
132
+ // than no snapshot at all.
133
+ log.warn({ err, swarmId: swarm.id }, "syncPlanStatusFromTasks failed, persisting anyway");
134
+ }
135
+ saveSwarmSnapshot(swarm);
136
+ }
137
+ /**
138
+ * Match plan tasks to live task store entries via the explicit
139
+ * `planTaskId` foreign key (PR#1). Live tasks without a planTaskId are
140
+ * ignored — the queen is now responsible for tagging plan-derived tasks
141
+ * when she calls `task_create` / `task_create_from_message`. When multiple
142
+ * live tasks share the same planTaskId (e.g. queen retried a failure),
143
+ * `pickStrongestStatus` folds them: completed > in_progress > pending >
144
+ * cancelled.
145
+ *
146
+ * Replaces the prior subject-prefix heuristic (PR #73 / Bug #2 方案 B),
147
+ * which was fragile to queens rephrasing subjects and t1↔t10 collisions.
148
+ */
149
+ syncPlanStatusFromTasks(swarm) {
150
+ const plan = swarm.plan;
151
+ if (!plan || !plan.phases || !swarm.workDir)
152
+ return;
153
+ const taskStore = this.resolveTaskStore(swarm.workDir);
154
+ // No-op when no TaskStore is wired (e.g. unit tests with bare
155
+ // construction, or callers that intentionally opted out of DI).
156
+ // Plan stays with queen's original statuses — a stale display is
157
+ // strictly better than a runtime crash here.
158
+ if (!taskStore)
159
+ return;
160
+ const liveTasks = taskStore.listBySwarm(swarm.id);
161
+ if (liveTasks.length === 0)
162
+ return;
163
+ // Group live tasks by planTaskId. Tasks without a planTaskId are
164
+ // intentionally skipped — they are ad-hoc work that doesn't belong to
165
+ // any plan node, and silently subject-matching them would re-introduce
166
+ // the very ambiguity this PR removes.
167
+ const byPlanId = new Map();
168
+ for (const t of liveTasks) {
169
+ if (!t.planTaskId)
170
+ continue;
171
+ const arr = byPlanId.get(t.planTaskId) ?? [];
172
+ arr.push(t);
173
+ byPlanId.set(t.planTaskId, arr);
174
+ }
175
+ let synced = 0;
176
+ let total = 0;
177
+ for (const phase of plan.phases) {
178
+ if (!phase.tasks)
179
+ continue;
180
+ for (const task of phase.tasks) {
181
+ total++;
182
+ const matches = byPlanId.get(task.id);
183
+ if (!matches || matches.length === 0)
184
+ continue;
185
+ task.status = pickStrongestStatus(matches.map((t) => t.status));
186
+ synced++;
187
+ }
188
+ }
189
+ log.debug({ swarmId: swarm.id, planTasksTotal: total, planTasksSynced: synced }, "syncPlanStatusFromTasks");
190
+ }
98
191
  // ── Public API ──────────────────────────────────────────────────────
99
192
  /** Create a new swarm with two-phase initialization. */
100
193
  async create(swarmSessionId, options) {
@@ -106,6 +199,38 @@ export class SwarmCoordinator {
106
199
  if (options.templateName) {
107
200
  const tpl = loadTemplate(options.templateName);
108
201
  roleSpecs = tpl.roles.map((r) => ({ roleName: r.roleName, count: r.count, eager: r.eager }));
202
+ // Overlay per-role backend from options.roles (UI selector). To stay
203
+ // correct under duplicate roleNames in the template (e.g. two
204
+ // {roleName:"developer"} entries), walk options.roles in order and
205
+ // claim the FIRST template spec whose name matches AND whose backend
206
+ // hasn't been overlaid yet.
207
+ //
208
+ // Note on UI/coordinator asymmetry: SwarmConfigInline collapses duplicate
209
+ // roleNames in its selection list via `.find()`, so today the UI only
210
+ // ever sends one entry per roleName and at most one slot gets overlaid
211
+ // here. This positional-claim algorithm is intentionally more general so
212
+ // non-UI callers (tests, future bulk APIs) can pass multiple entries with
213
+ // the same roleName and get distinct overlays.
214
+ if (options.roles) {
215
+ const claimed = new Set();
216
+ for (const uiRole of options.roles) {
217
+ const idx = roleSpecs.findIndex((s, i) => !claimed.has(i) && s.roleName === uiRole.roleName);
218
+ if (idx >= 0) {
219
+ claimed.add(idx);
220
+ if (uiRole.backend) {
221
+ roleSpecs[idx] = { ...roleSpecs[idx], backend: uiRole.backend };
222
+ }
223
+ // Mirror backend overlay for sandbox so user's per-role choice in
224
+ // SwarmConfigInline reaches spawnRole → role.definition.sandbox →
225
+ // persistence.json. Without this, only dynamically-added roles
226
+ // ever carried `sandbox`; the 6 initial roles silently dropped to
227
+ // undefined and recovery.json had no record of the user's pick.
228
+ if (uiRole.sandbox) {
229
+ roleSpecs[idx] = { ...roleSpecs[idx], sandbox: uiRole.sandbox };
230
+ }
231
+ }
232
+ }
233
+ }
109
234
  }
110
235
  else {
111
236
  roleSpecs = options.roles ?? [];
@@ -115,6 +240,7 @@ export class SwarmCoordinator {
115
240
  hubSessionId: swarmSessionId,
116
241
  workDir: options.workDir,
117
242
  teamName: options.templateName,
243
+ displayName: options.displayName,
118
244
  roles: new Map(),
119
245
  plan: null,
120
246
  nextInstanceSeq: new Map(),
@@ -125,7 +251,6 @@ export class SwarmCoordinator {
125
251
  planStatus: "none",
126
252
  };
127
253
  // Store pending role specs for on-demand spawning
128
- swarm._pendingRoleSpecs = roleSpecs;
129
254
  this.swarms.set(swarmSessionId, swarm);
130
255
  // Snapshot may already exist from a prior run (continuation, or
131
256
  // restart-after-crash with the same swarmId). Record this BEFORE any
@@ -153,7 +278,12 @@ export class SwarmCoordinator {
153
278
  for (const spec of eagerSpecs) {
154
279
  const count = spec.count ?? 1;
155
280
  for (let i = 0; i < count; i++) {
156
- const role = await this.spawnRole(swarmSessionId, spec.roleName, undefined, spec.customPrompt, spec.customDefinition);
281
+ const role = await this.spawnRole(swarmSessionId, spec.roleName, {
282
+ customPrompt: spec.customPrompt,
283
+ customDefinition: spec.customDefinition,
284
+ backendOverride: spec.backend,
285
+ sandboxOverride: spec.sandbox,
286
+ });
157
287
  trackOpen(role);
158
288
  }
159
289
  }
@@ -163,13 +293,12 @@ export class SwarmCoordinator {
163
293
  swarm.status = "running";
164
294
  const queen = this.findQueen(swarm);
165
295
  if (queen) {
166
- const roleList = this.buildRoleListString(swarm);
167
- // Notify queen that initialization is complete with full member list
168
- await this.deliverInbox(swarm, queen.instanceId, {
169
- from: "system",
170
- type: "system",
171
- data: `[系统] 蜂群初始化完成。当前成员:\n${roleList}\n\n等待任务分配。`,
172
- });
296
+ // Initial spawn delivers a "成员变更" envelope listing every just-spawned
297
+ // role as "+ 新增". Refactored from the prior bespoke "蜂群初始化完成"
298
+ // envelope so dynamic adds (PR#5) reuse exactly the same protocol;
299
+ // queen-side handling stays uniform across init vs. runtime adds.
300
+ const initiallyAdded = [...swarm.roles.values()].filter((r) => r.definition.type !== "queen");
301
+ await this.notifyMembershipChange(swarm, { added: initiallyAdded });
173
302
  // Inject continuation context from previous run if applicable
174
303
  if (options.isContinuation) {
175
304
  const summary = this.buildContinuationSummary(swarmSessionId);
@@ -192,7 +321,7 @@ export class SwarmCoordinator {
192
321
  }
193
322
  log.info({ swarmId: swarmSessionId, roleCount: swarm.roles.size }, "swarm created");
194
323
  // Persistence: save initial snapshot
195
- saveSwarmSnapshot(swarm);
324
+ this.persistSwarm(swarm);
196
325
  // Start inbox watcher to react to inbox file changes (best-effort).
197
326
  if (options.workDir) {
198
327
  try {
@@ -262,8 +391,11 @@ export class SwarmCoordinator {
262
391
  const { swarm, role } = this.findByRoleSessionId(roleSessionId);
263
392
  if (!swarm || !role)
264
393
  return false;
394
+ // Output is already normalized to Claude SDK shape by
395
+ // SessionManager's onOutput gateway (see normalize-backend-output.ts).
396
+ const event = data;
265
397
  // Extract text content from the streaming event
266
- const text = extractTextFromEvent(data);
398
+ const text = extractTextFromEvent(event);
267
399
  // Parse swarm action blocks
268
400
  if (text) {
269
401
  // Legacy-format safety net: if a reviewer regresses to the old
@@ -289,7 +421,7 @@ export class SwarmCoordinator {
289
421
  if (plan) {
290
422
  swarm.plan = plan;
291
423
  swarm.planStatus = "draft";
292
- saveSwarmSnapshot(swarm);
424
+ this.persistSwarm(swarm);
293
425
  this.sendStatusUpdate(swarm);
294
426
  log.info({ swarmId: swarm.id, instanceId: role.instanceId }, "plan updated (draft)");
295
427
  this.requestPlanReview(swarm, plan).catch((err) => {
@@ -307,7 +439,7 @@ export class SwarmCoordinator {
307
439
  sessionId: swarm.hubSessionId,
308
440
  instanceId: role.instanceId,
309
441
  roleName: role.roleName,
310
- data,
442
+ data: event,
311
443
  });
312
444
  return true;
313
445
  }
@@ -321,18 +453,18 @@ export class SwarmCoordinator {
321
453
  return false;
322
454
  // Update role status
323
455
  role.status = "idle";
324
- // Persist per-role claudeSessionId — turn_complete frame carries the
456
+ // Persist per-role backendSessionId — turn_complete frame carries the
325
457
  // backend's real session UUID. We need it so a future restart can
326
458
  // `--resume` this exact role's conversation (Task 4 / Phase 4-5).
327
- if (info.claudeSessionId && role.claudeSessionId !== info.claudeSessionId) {
328
- role.claudeSessionId = info.claudeSessionId;
459
+ if (info.backendSessionId && role.backendSessionId !== info.backendSessionId) {
460
+ role.backendSessionId = info.backendSessionId;
329
461
  // saveSwarmSnapshot is sync-fire (proper-lockfile internally async);
330
462
  // call directly — failures should not break turn completion.
331
463
  try {
332
- saveSwarmSnapshot(swarm);
464
+ this.persistSwarm(swarm);
333
465
  }
334
466
  catch (err) {
335
- log.warn({ err, swarmId: swarm.id, instanceId: role.instanceId }, "failed to persist claudeSessionId on turn complete");
467
+ log.warn({ err, swarmId: swarm.id, instanceId: role.instanceId }, "failed to persist backendSessionId on turn complete");
336
468
  }
337
469
  }
338
470
  // Settle inbox echoes for this turn (fire-and-forget).
@@ -349,37 +481,55 @@ export class SwarmCoordinator {
349
481
  return true;
350
482
  }
351
483
  /**
352
- * Persist the per-role claudeSessionId immediately (e.g. from `system/init`
484
+ * Persist the per-role backendSessionId immediately (e.g. from `system/init`
353
485
  * frame, before the first turn_complete). Returns true if a role was found
354
486
  * and updated; false otherwise (no-op for non-swarm sessions).
355
487
  */
356
- setRoleClaudeSessionId(swarmId, instanceId, claudeSessionId) {
488
+ setRoleBackendSessionId(swarmId, instanceId, backendSessionId) {
357
489
  const swarm = this.swarms.get(swarmId);
358
490
  if (!swarm)
359
491
  return false;
360
492
  const role = swarm.roles.get(instanceId);
361
493
  if (!role)
362
494
  return false;
363
- if (role.claudeSessionId === claudeSessionId)
495
+ if (role.backendSessionId === backendSessionId)
364
496
  return true;
365
- role.claudeSessionId = claudeSessionId;
497
+ role.backendSessionId = backendSessionId;
366
498
  try {
367
- saveSwarmSnapshot(swarm);
499
+ this.persistSwarm(swarm);
368
500
  }
369
501
  catch (err) {
370
- log.warn({ err, swarmId, instanceId }, "failed to persist claudeSessionId");
502
+ log.warn({ err, swarmId, instanceId }, "failed to persist backendSessionId");
371
503
  }
372
504
  return true;
373
505
  }
374
- /** Convenience: same as setRoleClaudeSessionId but takes the `${swarmId}::${instanceId}` roleSessionId. */
375
- setRoleClaudeSessionIdBySession(roleSessionId, claudeSessionId) {
506
+ /** Convenience: same as setRoleBackendSessionId but takes the `${swarmId}::${instanceId}` roleSessionId. */
507
+ setRoleBackendSessionIdBySession(roleSessionId, backendSessionId) {
376
508
  const { swarm, role } = this.findByRoleSessionId(roleSessionId);
377
509
  if (!swarm || !role)
378
510
  return false;
379
- return this.setRoleClaudeSessionId(swarm.id, role.instanceId, claudeSessionId);
511
+ return this.setRoleBackendSessionId(swarm.id, role.instanceId, backendSessionId);
380
512
  }
381
- /** Spawn a new role instance in a swarm. */
382
- async spawnRole(swarmId, roleName, taskPrompt, customPrompt, customDefinition, additionalDirs, resumeId, presetInstanceId) {
513
+ /**
514
+ * Public spawnRole entry point. Wraps the real body in a per-swarm
515
+ * Promise-chain mutex (see `spawnLocks`) so concurrent calls against the
516
+ * same swarmId serialise — preventing nextInstanceSeq collisions on
517
+ * parallel dynamic adds. Calls against different swarmIds run in
518
+ * parallel as before.
519
+ */
520
+ async spawnRole(swarmId, roleName, opts = {}) {
521
+ const prev = this.spawnLocks.get(swarmId) ?? Promise.resolve();
522
+ // `.catch(() => undefined)` here ensures a previous spawn's rejection does NOT
523
+ // poison subsequent waiters — they only need ORDER guarantees, not
524
+ // shared success/failure semantics. Each call's own rejection still
525
+ // surfaces through `next` to its own awaiter.
526
+ const next = prev.catch(() => undefined).then(() => this.doSpawnRole(swarmId, roleName, opts));
527
+ this.spawnLocks.set(swarmId, next);
528
+ return next;
529
+ }
530
+ /** Spawn a new role instance in a swarm (real implementation, serialized by spawnRole). */
531
+ async doSpawnRole(swarmId, roleName, opts = {}) {
532
+ const { taskPrompt, customPrompt, customDefinition, additionalDirs, resumeId, presetInstanceId, backendOverride, sandboxOverride, isDynamicAdd, } = opts;
383
533
  const swarm = this.swarms.get(swarmId);
384
534
  if (!swarm)
385
535
  throw new Error(`Swarm ${swarmId} not found`);
@@ -394,6 +544,7 @@ export class SwarmCoordinator {
394
544
  capabilities: customDefinition.capabilities ?? [],
395
545
  color: customDefinition.color,
396
546
  promptBody: customDefinition.promptBody,
547
+ backend: customDefinition.backend,
397
548
  };
398
549
  }
399
550
  else {
@@ -403,10 +554,36 @@ export class SwarmCoordinator {
403
554
  if (customPrompt) {
404
555
  definition.promptBody = customPrompt;
405
556
  }
557
+ // Per-spawn backend override (UI selector) takes precedence over the
558
+ // role definition's static backend. Frozen here for the lifetime of the
559
+ // role instance.
560
+ if (backendOverride) {
561
+ definition = { ...definition, backend: backendOverride };
562
+ }
563
+ // Per-spawn sandbox override — same precedence rule. Each adapter
564
+ // maps the abstract level to its own permission model (see
565
+ // SpawnOptions.sandbox in @mclawnet/agent).
566
+ if (sandboxOverride) {
567
+ definition = { ...definition, sandbox: sandboxOverride };
568
+ }
569
+ // Final default: role files don't currently declare `sandbox`, and not
570
+ // every caller passes an override (e.g. legacy recovery, programmatic
571
+ // spawns). Pin to DEFAULT_SANDBOX so snapshots and the membership-change
572
+ // envelope always show a concrete level instead of `undefined`.
573
+ if (!definition.sandbox) {
574
+ definition = { ...definition, sandbox: DEFAULT_SANDBOX };
575
+ }
576
+ log.info({
577
+ swarmId,
578
+ roleName,
579
+ backendOverride,
580
+ definitionBackend: definition.backend,
581
+ hasCustomDefinition: !!customDefinition,
582
+ }, "spawnRole: backend resolved");
406
583
  let instanceId;
407
584
  if (presetInstanceId) {
408
585
  // Recovery path: preserve the original instanceId so per-role state
409
- // (logs, inbox, claudeSessionId) lines up with prior snapshot.
586
+ // (logs, inbox, backendSessionId) lines up with prior snapshot.
410
587
  instanceId = presetInstanceId;
411
588
  }
412
589
  else {
@@ -422,9 +599,12 @@ export class SwarmCoordinator {
422
599
  roleSessionId,
423
600
  status: "spawning",
424
601
  currentTask: taskPrompt,
425
- claudeSessionId: resumeId,
602
+ backendSessionId: resumeId,
426
603
  };
427
604
  swarm.roles.set(instanceId, roleInstance);
605
+ // Track whether we successfully opened the backend session so the
606
+ // dynamic-add rollback (below) knows whether to also kill the adapter.
607
+ let sessionOpened = false;
428
608
  // Build role list for prompt
429
609
  const roleList = this.buildRoleListString(swarm);
430
610
  const systemPrompt = buildRolePrompt(definition, instanceId, roleList, {
@@ -451,29 +631,107 @@ export class SwarmCoordinator {
451
631
  // Spawn Claude CLI process via SessionAdapter
452
632
  // SessionManager handles memory injection (Pipeline A: memory prompt + roleId hint) via roleId
453
633
  const tools = resolveRoleTools(definition);
454
- await this.sessionAdapter.createSession({
455
- sessionId: roleSessionId,
456
- workDir: swarm.workDir,
457
- systemPrompt: finalPrompt,
458
- roleId,
459
- additionalDirs,
460
- // Task 5: when recovering, the caller passes resumeId = role.claudeSessionId
461
- // so the Claude conversation continues with `--resume`. Fresh spawns
462
- // (Task 3 default) leave it undefined for a new conversation.
463
- resumeId,
464
- allowedTools: tools.allowedTools,
465
- disallowedTools: tools.disallowedTools,
466
- });
634
+ try {
635
+ await this.sessionAdapter.createSession({
636
+ sessionId: roleSessionId,
637
+ workDir: swarm.workDir,
638
+ systemPrompt: finalPrompt,
639
+ roleId,
640
+ additionalDirs,
641
+ // Task 5: when recovering, the caller passes resumeId = role.backendSessionId
642
+ // so the Claude conversation continues with `--resume`. Fresh spawns
643
+ // (Task 3 default) leave it undefined for a new conversation.
644
+ resumeId,
645
+ allowedTools: tools.allowedTools,
646
+ disallowedTools: tools.disallowedTools,
647
+ backend: definition.backend,
648
+ sandbox: definition.sandbox,
649
+ });
650
+ sessionOpened = true;
651
+ }
652
+ catch (err) {
653
+ // Wrap the underlying backend error with role context so the user can
654
+ // pinpoint which role failed when (for example) codex CLI is missing.
655
+ // M3.S5 #5 acceptance: "未装 codex CLI 时配 codex role → 启动报错指明
656
+ // 哪个 role 缺 codex". Without this wrap, the surfaced message is
657
+ // "Failed to spawn codex CLI: ENOENT" with no role identifier.
658
+ const backendLabel = definition.backend ?? "claude";
659
+ const cause = err instanceof Error ? err.message : String(err);
660
+ const wrapped = new Error(`role ${roleName} (${instanceId}, backend=${backendLabel}) failed to spawn: ${cause}`);
661
+ // Preserve original error for callers that inspect stack/cause
662
+ wrapped.cause = err;
663
+ // Dynamic-add rollback (PR#5): the caller invoked spawnRole at runtime
664
+ // via swarm_add_role, not as part of the initial create() batch. The
665
+ // outer create() does not run cleanupPartialCreate for these calls, so
666
+ // we MUST locally undo the in-memory mutations (role row + seq bump)
667
+ // and free the freshly allocated instanceId for reuse. Initial-batch
668
+ // callers (isDynamicAdd=false) keep the legacy behaviour where the row
669
+ // stays in place and cleanupPartialCreate at the create() layer tears
670
+ // it down on outer failure — mixing the two would double-cleanup.
671
+ if (isDynamicAdd) {
672
+ try {
673
+ swarm.roles.delete(instanceId);
674
+ }
675
+ catch { /* ignore */ }
676
+ if (!presetInstanceId) {
677
+ const cur = swarm.nextInstanceSeq.get(roleName) ?? 0;
678
+ if (cur > 0)
679
+ swarm.nextInstanceSeq.set(roleName, cur - 1);
680
+ }
681
+ await this.notifyMembershipChangeFailed(swarm, roleName, wrapped);
682
+ }
683
+ throw wrapped;
684
+ }
467
685
  roleInstance.status = "active";
468
686
  // Persistence: save snapshot after role spawned. Skipped during recover()
469
687
  // where the caller saves once at the end (avoids N writes for N roles).
470
688
  if (!swarm._suppressSnapshot) {
471
- saveSwarmSnapshot(swarm);
689
+ try {
690
+ this.persistSwarm(swarm);
691
+ }
692
+ catch (err) {
693
+ // Dynamic-add rollback also covers persistSwarm failure — without
694
+ // this branch a disk-full / lockfile crash would leave an in-memory
695
+ // role with no snapshot, surfacing as a phantom worker on the next
696
+ // restart. Initial-batch callers continue to surface the error
697
+ // through cleanupPartialCreate.
698
+ if (isDynamicAdd) {
699
+ if (sessionOpened) {
700
+ try {
701
+ await this.sessionAdapter.closeSession(roleSessionId);
702
+ }
703
+ catch { /* ignore */ }
704
+ }
705
+ try {
706
+ swarm.roles.delete(instanceId);
707
+ }
708
+ catch { /* ignore */ }
709
+ if (!presetInstanceId) {
710
+ const cur = swarm.nextInstanceSeq.get(roleName) ?? 0;
711
+ if (cur > 0)
712
+ swarm.nextInstanceSeq.set(roleName, cur - 1);
713
+ }
714
+ await this.notifyMembershipChangeFailed(swarm, roleName, err);
715
+ }
716
+ throw err;
717
+ }
472
718
  }
473
719
  // Send swarm status update to Hub
474
720
  this.sendStatusUpdate(swarm);
475
721
  // Flush any pending inbox messages (fire-and-forget).
476
722
  void this.inboxRelay.deliver(swarmId, instanceId);
723
+ // Dynamic-add: notify the queen via the unified membership-change
724
+ // protocol so she can decide whether to dispatch the new worker.
725
+ // Best-effort — failure here must not unwind the spawn; the worker is
726
+ // already running and the swarm is in a consistent state.
727
+ if (isDynamicAdd) {
728
+ try {
729
+ await this.notifyMembershipChange(swarm, { added: [roleInstance] });
730
+ }
731
+ catch (err) {
732
+ log.warn({ err, swarmId, instanceId }, "notifyMembershipChange failed after dynamic spawn (non-fatal)");
733
+ }
734
+ }
477
735
  log.info({ swarmId, instanceId, roleName, roleId }, "role spawned");
478
736
  return roleInstance;
479
737
  }
@@ -481,9 +739,9 @@ export class SwarmCoordinator {
481
739
  * Recover a previously persisted swarm by id.
482
740
  *
483
741
  * Locates the snapshot via `listRecoverableSwarmIds()`, then for each role
484
- * in the snapshot spawns a Claude session via `--resume role.claudeSessionId`
742
+ * in the snapshot spawns a Claude session via `--resume role.backendSessionId`
485
743
  * (when present) so the per-role conversation continues. Roles without a
486
- * stored claudeSessionId start fresh.
744
+ * stored backendSessionId start fresh.
487
745
  *
488
746
  * After all roles are spawned, drains each role's offline inbox via
489
747
  * `inboxRelay.deliver`. Drain failures are best-effort: warn but never throw.
@@ -507,12 +765,13 @@ export class SwarmCoordinator {
507
765
  }
508
766
  // Bootstrap the swarm shell. We deliberately bypass create() because
509
767
  // create() would respawn eager roles from template defaults — losing the
510
- // per-role instanceId / claudeSessionId from the snapshot.
768
+ // per-role instanceId / backendSessionId from the snapshot.
511
769
  const swarm = {
512
770
  id: swarmId,
513
771
  hubSessionId: snapshot.hubSessionId,
514
772
  workDir: snapshot.workDir,
515
773
  teamName: snapshot.teamName,
774
+ displayName: snapshot.displayName,
516
775
  roles: new Map(),
517
776
  plan: snapshot.plan ?? null,
518
777
  nextInstanceSeq: new Map(Object.entries(snapshot.nextInstanceSeq ?? {})),
@@ -523,7 +782,7 @@ export class SwarmCoordinator {
523
782
  planStatus: snapshot.planStatus ?? "none",
524
783
  };
525
784
  this.swarms.set(swarmId, swarm);
526
- // Respawn each role with the same instanceId, passing claudeSessionId
785
+ // Respawn each role with the same instanceId, passing backendSessionId
527
786
  // through as resumeId so SessionAdapter can `--resume` the conversation.
528
787
  // Suppress per-role snapshot writes — we save once at the end with the
529
788
  // full role set (and partialRecover marker if any role failed).
@@ -531,10 +790,13 @@ export class SwarmCoordinator {
531
790
  let partialRecover = false;
532
791
  for (const r of snapshot.roles) {
533
792
  try {
534
- await this.spawnRole(swarmId, r.roleName, r.currentTask, undefined, // customPrompt
535
- undefined, // customDefinition
536
- undefined, // additionalDirs
537
- r.claudeSessionId, r.instanceId);
793
+ await this.spawnRole(swarmId, r.roleName, {
794
+ taskPrompt: r.currentTask,
795
+ resumeId: r.backendSessionId,
796
+ presetInstanceId: r.instanceId,
797
+ backendOverride: r.backend,
798
+ sandboxOverride: r.sandbox,
799
+ });
538
800
  }
539
801
  catch (err) {
540
802
  partialRecover = true;
@@ -543,7 +805,7 @@ export class SwarmCoordinator {
543
805
  }
544
806
  delete swarm._suppressSnapshot;
545
807
  swarm.partialRecover = partialRecover;
546
- saveSwarmSnapshot(swarm);
808
+ this.persistSwarm(swarm);
547
809
  // Drain offline inboxes — best-effort per role.
548
810
  for (const role of swarm.roles.values()) {
549
811
  try {
@@ -587,7 +849,7 @@ export class SwarmCoordinator {
587
849
  await this.sessionAdapter.closeSession(role.roleSessionId);
588
850
  swarm.roles.delete(instanceId);
589
851
  // Persistence: save snapshot after role stopped
590
- saveSwarmSnapshot(swarm);
852
+ this.persistSwarm(swarm);
591
853
  this.sendStatusUpdate(swarm);
592
854
  log.info({ swarmId, instanceId }, "role stopped");
593
855
  }
@@ -721,7 +983,7 @@ export class SwarmCoordinator {
721
983
  log.warn({ swarmId: swarm.id, instanceId: role.instanceId, roleName: role.roleName, reason }, "role crashed — flipping status and notifying queen");
722
984
  role.status = "stopped";
723
985
  try {
724
- saveSwarmSnapshot(swarm);
986
+ this.persistSwarm(swarm);
725
987
  }
726
988
  catch (err) {
727
989
  log.warn({ err, swarmId: swarm.id }, "handleRoleCrashed: saveSwarmSnapshot failed");
@@ -969,7 +1231,7 @@ export class SwarmCoordinator {
969
1231
  swarm.planReviewTimer = undefined;
970
1232
  }
971
1233
  swarm.planStatus = args.verdict === "approved" ? "approved" : "rejected";
972
- saveSwarmSnapshot(swarm);
1234
+ this.persistSwarm(swarm);
973
1235
  this.sendStatusUpdate(swarm);
974
1236
  const queen = this.findQueen(swarm);
975
1237
  if (!queen) {
@@ -1224,6 +1486,60 @@ export class SwarmCoordinator {
1224
1486
  await this.deliverInbox(swarm, role.instanceId, msg);
1225
1487
  }
1226
1488
  }
1489
+ /**
1490
+ * Deliver a unified "成员变更" envelope to the queen describing one
1491
+ * membership delta (added and/or removed roles) plus the post-change
1492
+ * roster. Used both by the initial spawn flow (create()) and by the
1493
+ * dynamic add path (spawnRole with isDynamicAdd:true) so queen-side
1494
+ * handling is the same regardless of when membership changed.
1495
+ *
1496
+ * No-op when the swarm has no queen (e.g. mid-shutdown), matching the
1497
+ * defensive behaviour of other helpers that target the queen.
1498
+ */
1499
+ async notifyMembershipChange(swarm, change) {
1500
+ const queen = this.findQueen(swarm);
1501
+ if (!queen)
1502
+ return;
1503
+ const lines = ["[系统] 成员变更:"];
1504
+ for (const r of change.added ?? []) {
1505
+ const backend = r.definition?.backend ?? "claude";
1506
+ const sandbox = r.definition?.sandbox ?? "workspace-write";
1507
+ lines.push(` + 新增 ${r.instanceId} (${r.roleName}, ${backend}, ${sandbox})`);
1508
+ }
1509
+ for (const id of change.removed ?? []) {
1510
+ lines.push(` - 移除 ${id}`);
1511
+ }
1512
+ lines.push("", "当前成员:", this.buildRoleListString(swarm));
1513
+ await this.deliverInbox(swarm, queen.instanceId, {
1514
+ from: "system",
1515
+ type: "system",
1516
+ data: lines.join("\n"),
1517
+ });
1518
+ }
1519
+ /**
1520
+ * Dynamic-add rollback companion to {@link notifyMembershipChange}: tell the
1521
+ * queen a runtime "+ 添加成员" attempt failed and the roster did NOT change,
1522
+ * so she does not sit waiting for a worker that never came online or assume
1523
+ * her dispatch plan can target a not-yet-existent instanceId. Best-effort —
1524
+ * any inbox failure is swallowed so the caller's `throw` (which surfaces the
1525
+ * underlying spawn error to the UI) is preserved.
1526
+ */
1527
+ async notifyMembershipChangeFailed(swarm, roleName, err) {
1528
+ try {
1529
+ const queen = this.findQueen(swarm);
1530
+ if (!queen)
1531
+ return;
1532
+ const reason = err instanceof Error ? err.message : String(err);
1533
+ await this.deliverInbox(swarm, queen.instanceId, {
1534
+ from: "system",
1535
+ type: "system",
1536
+ data: `[系统] 添加成员失败:尝试为蜂群添加 ${roleName} 角色失败,原因:${reason}。当前名单未变更。`,
1537
+ });
1538
+ }
1539
+ catch (notifyErr) {
1540
+ log.warn({ err: notifyErr, swarmId: swarm.id, roleName }, "notifyMembershipChangeFailed: deliverInbox to queen failed (non-fatal)");
1541
+ }
1542
+ }
1227
1543
  findQueen(swarm) {
1228
1544
  for (const role of swarm.roles.values()) {
1229
1545
  if (role.definition.type === "queen" && role.status !== "stopped")
@@ -1290,13 +1606,13 @@ export class SwarmCoordinator {
1290
1606
  if (!reviewer) {
1291
1607
  // No reviewer — auto-approve
1292
1608
  swarm.planStatus = "approved";
1293
- saveSwarmSnapshot(swarm);
1609
+ this.persistSwarm(swarm);
1294
1610
  this.sendStatusUpdate(swarm);
1295
1611
  log.info({ swarmId: swarm.id }, "plan auto-approved (no reviewer)");
1296
1612
  return;
1297
1613
  }
1298
1614
  swarm.planStatus = "reviewing";
1299
- saveSwarmSnapshot(swarm);
1615
+ this.persistSwarm(swarm);
1300
1616
  this.sendStatusUpdate(swarm);
1301
1617
  const planJson = JSON.stringify(plan, null, 2);
1302
1618
  await this.deliverInbox(swarm, reviewer.instanceId, {
@@ -1394,7 +1710,7 @@ ${planJson}
1394
1710
  if (swarm.idleCheckCount >= swarm.maxIdleChecks) {
1395
1711
  swarm.isPaused = true;
1396
1712
  swarm.status = "paused";
1397
- saveSwarmSnapshot(swarm);
1713
+ this.persistSwarm(swarm);
1398
1714
  this.sendStatusUpdate(swarm);
1399
1715
  await this.deliverInbox(swarm, queen.instanceId, {
1400
1716
  from: "system",
@@ -1442,6 +1758,22 @@ ${planJson}
1442
1758
  status: r.status,
1443
1759
  currentTask: r.currentTask,
1444
1760
  color: r.definition.color,
1761
+ // CRITICAL: include backend so the hub's swarm.status handler stores
1762
+ // the per-role backend in `crewConfig.roles` on the DB session. Hub
1763
+ // overwrites the existing crewConfig.roles on each status update
1764
+ // (chat-handler.ts), so omitting backend here silently corrupts the
1765
+ // DB — any path that later recreates the swarm from crewConfig
1766
+ // (continuation after finish, race during recovery) spawns roles
1767
+ // with default backend=claude. Recovered codex roles were silently
1768
+ // downgraded to claude with no error.
1769
+ backend: r.definition.backend,
1770
+ // Same anti-downgrade reasoning as `backend` above — without
1771
+ // including sandbox in the wire payload, the hub's swarm.status
1772
+ // handler writes crewConfig.roles WITHOUT sandbox, and any path
1773
+ // that later recreates the swarm from crewConfig (continuation,
1774
+ // race during recovery) would silently downgrade roles to the
1775
+ // default workspace-write sandbox.
1776
+ sandbox: r.definition.sandbox,
1445
1777
  })),
1446
1778
  plan: swarm.plan,
1447
1779
  };