@mclawnet/swarm 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240)
  1. package/README.md +118 -0
  2. package/dist/__tests__/action-parser.test.js +29 -82
  3. package/dist/__tests__/action-parser.test.js.map +1 -1
  4. package/dist/__tests__/coordinator-create-tx.test.d.ts +2 -0
  5. package/dist/__tests__/coordinator-create-tx.test.d.ts.map +1 -0
  6. package/dist/__tests__/coordinator-create-tx.test.js +114 -0
  7. package/dist/__tests__/coordinator-create-tx.test.js.map +1 -0
  8. package/dist/__tests__/coordinator-inbox-migration.test.d.ts +2 -0
  9. package/dist/__tests__/coordinator-inbox-migration.test.d.ts.map +1 -0
  10. package/dist/__tests__/coordinator-inbox-migration.test.js +56 -0
  11. package/dist/__tests__/coordinator-inbox-migration.test.js.map +1 -0
  12. package/dist/__tests__/inbox-integration.test.d.ts +2 -0
  13. package/dist/__tests__/inbox-integration.test.d.ts.map +1 -0
  14. package/dist/__tests__/inbox-integration.test.js +120 -0
  15. package/dist/__tests__/inbox-integration.test.js.map +1 -0
  16. package/dist/__tests__/inbox-persistence-recovery.test.d.ts +2 -0
  17. package/dist/__tests__/inbox-persistence-recovery.test.d.ts.map +1 -0
  18. package/dist/__tests__/inbox-persistence-recovery.test.js +139 -0
  19. package/dist/__tests__/inbox-persistence-recovery.test.js.map +1 -0
  20. package/dist/__tests__/inbox-relay-interceptor.test.d.ts +2 -0
  21. package/dist/__tests__/inbox-relay-interceptor.test.d.ts.map +1 -0
  22. package/dist/__tests__/inbox-relay-interceptor.test.js +156 -0
  23. package/dist/__tests__/inbox-relay-interceptor.test.js.map +1 -0
  24. package/dist/__tests__/inbox-relay.test.d.ts +2 -0
  25. package/dist/__tests__/inbox-relay.test.d.ts.map +1 -0
  26. package/dist/__tests__/inbox-relay.test.js +318 -0
  27. package/dist/__tests__/inbox-relay.test.js.map +1 -0
  28. package/dist/__tests__/inbox-store.test.d.ts +2 -0
  29. package/dist/__tests__/inbox-store.test.d.ts.map +1 -0
  30. package/dist/__tests__/inbox-store.test.js +129 -0
  31. package/dist/__tests__/inbox-store.test.js.map +1 -0
  32. package/dist/__tests__/inbox-watcher.test.d.ts +2 -0
  33. package/dist/__tests__/inbox-watcher.test.d.ts.map +1 -0
  34. package/dist/__tests__/inbox-watcher.test.js +104 -0
  35. package/dist/__tests__/inbox-watcher.test.js.map +1 -0
  36. package/dist/__tests__/persistence-path.test.d.ts +2 -0
  37. package/dist/__tests__/persistence-path.test.d.ts.map +1 -0
  38. package/dist/__tests__/persistence-path.test.js +79 -0
  39. package/dist/__tests__/persistence-path.test.js.map +1 -0
  40. package/dist/__tests__/persistence-robust.test.d.ts +2 -0
  41. package/dist/__tests__/persistence-robust.test.d.ts.map +1 -0
  42. package/dist/__tests__/persistence-robust.test.js +125 -0
  43. package/dist/__tests__/persistence-robust.test.js.map +1 -0
  44. package/dist/__tests__/persistence.test.d.ts +2 -0
  45. package/dist/__tests__/persistence.test.d.ts.map +1 -0
  46. package/dist/__tests__/persistence.test.js +105 -0
  47. package/dist/__tests__/persistence.test.js.map +1 -0
  48. package/dist/__tests__/phase4-5-e2e.test.d.ts +2 -0
  49. package/dist/__tests__/phase4-5-e2e.test.d.ts.map +1 -0
  50. package/dist/__tests__/phase4-5-e2e.test.js +203 -0
  51. package/dist/__tests__/phase4-5-e2e.test.js.map +1 -0
  52. package/dist/__tests__/phase6-7-e2e.test.d.ts +2 -0
  53. package/dist/__tests__/phase6-7-e2e.test.d.ts.map +1 -0
  54. package/dist/__tests__/phase6-7-e2e.test.js +93 -0
  55. package/dist/__tests__/phase6-7-e2e.test.js.map +1 -0
  56. package/dist/__tests__/project-files.test.d.ts +2 -0
  57. package/dist/__tests__/project-files.test.d.ts.map +1 -0
  58. package/dist/__tests__/project-files.test.js +143 -0
  59. package/dist/__tests__/project-files.test.js.map +1 -0
  60. package/dist/__tests__/projects-fs.test.d.ts +2 -0
  61. package/dist/__tests__/projects-fs.test.d.ts.map +1 -0
  62. package/dist/__tests__/projects-fs.test.js +107 -0
  63. package/dist/__tests__/projects-fs.test.js.map +1 -0
  64. package/dist/__tests__/recovery-cross-project.test.d.ts +2 -0
  65. package/dist/__tests__/recovery-cross-project.test.d.ts.map +1 -0
  66. package/dist/__tests__/recovery-cross-project.test.js +87 -0
  67. package/dist/__tests__/recovery-cross-project.test.js.map +1 -0
  68. package/dist/__tests__/recovery-forwards-to-coordinator.test.d.ts +2 -0
  69. package/dist/__tests__/recovery-forwards-to-coordinator.test.d.ts.map +1 -0
  70. package/dist/__tests__/recovery-forwards-to-coordinator.test.js +59 -0
  71. package/dist/__tests__/recovery-forwards-to-coordinator.test.js.map +1 -0
  72. package/dist/__tests__/recovery-resume.test.d.ts +2 -0
  73. package/dist/__tests__/recovery-resume.test.d.ts.map +1 -0
  74. package/dist/__tests__/recovery-resume.test.js +132 -0
  75. package/dist/__tests__/recovery-resume.test.js.map +1 -0
  76. package/dist/__tests__/retrospective.test.js +1 -0
  77. package/dist/__tests__/retrospective.test.js.map +1 -1
  78. package/dist/__tests__/role-loader-preamble-all.test.d.ts +2 -0
  79. package/dist/__tests__/role-loader-preamble-all.test.d.ts.map +1 -0
  80. package/dist/__tests__/role-loader-preamble-all.test.js +38 -0
  81. package/dist/__tests__/role-loader-preamble-all.test.js.map +1 -0
  82. package/dist/__tests__/role-loader-tools.test.d.ts +2 -0
  83. package/dist/__tests__/role-loader-tools.test.d.ts.map +1 -0
  84. package/dist/__tests__/role-loader-tools.test.js +39 -0
  85. package/dist/__tests__/role-loader-tools.test.js.map +1 -0
  86. package/dist/__tests__/role-loader.test.js +116 -1
  87. package/dist/__tests__/role-loader.test.js.map +1 -1
  88. package/dist/__tests__/role-prompt-no-legacy-protocol.test.d.ts +2 -0
  89. package/dist/__tests__/role-prompt-no-legacy-protocol.test.d.ts.map +1 -0
  90. package/dist/__tests__/role-prompt-no-legacy-protocol.test.js +37 -0
  91. package/dist/__tests__/role-prompt-no-legacy-protocol.test.js.map +1 -0
  92. package/dist/__tests__/role-tools.test.d.ts +2 -0
  93. package/dist/__tests__/role-tools.test.d.ts.map +1 -0
  94. package/dist/__tests__/role-tools.test.js +80 -0
  95. package/dist/__tests__/role-tools.test.js.map +1 -0
  96. package/dist/__tests__/spawn-role-injects-briefings.test.d.ts +2 -0
  97. package/dist/__tests__/spawn-role-injects-briefings.test.d.ts.map +1 -0
  98. package/dist/__tests__/spawn-role-injects-briefings.test.js +182 -0
  99. package/dist/__tests__/spawn-role-injects-briefings.test.js.map +1 -0
  100. package/dist/__tests__/spawn-role-tool-policy.test.d.ts +2 -0
  101. package/dist/__tests__/spawn-role-tool-policy.test.d.ts.map +1 -0
  102. package/dist/__tests__/spawn-role-tool-policy.test.js +96 -0
  103. package/dist/__tests__/spawn-role-tool-policy.test.js.map +1 -0
  104. package/dist/__tests__/swarm-coordinator-inbox-watcher.test.d.ts +2 -0
  105. package/dist/__tests__/swarm-coordinator-inbox-watcher.test.d.ts.map +1 -0
  106. package/dist/__tests__/swarm-coordinator-inbox-watcher.test.js +61 -0
  107. package/dist/__tests__/swarm-coordinator-inbox-watcher.test.js.map +1 -0
  108. package/dist/__tests__/swarm-coordinator-inbox.test.d.ts +2 -0
  109. package/dist/__tests__/swarm-coordinator-inbox.test.d.ts.map +1 -0
  110. package/dist/__tests__/swarm-coordinator-inbox.test.js +182 -0
  111. package/dist/__tests__/swarm-coordinator-inbox.test.js.map +1 -0
  112. package/dist/__tests__/swarm-coordinator-init.test.js +36 -8
  113. package/dist/__tests__/swarm-coordinator-init.test.js.map +1 -1
  114. package/dist/__tests__/swarm-coordinator-legacy-plan-review-warn.test.d.ts +2 -0
  115. package/dist/__tests__/swarm-coordinator-legacy-plan-review-warn.test.d.ts.map +1 -0
  116. package/dist/__tests__/swarm-coordinator-legacy-plan-review-warn.test.js +113 -0
  117. package/dist/__tests__/swarm-coordinator-legacy-plan-review-warn.test.js.map +1 -0
  118. package/dist/__tests__/swarm-coordinator-plan-review-intercept.test.d.ts +2 -0
  119. package/dist/__tests__/swarm-coordinator-plan-review-intercept.test.d.ts.map +1 -0
  120. package/dist/__tests__/swarm-coordinator-plan-review-intercept.test.js +465 -0
  121. package/dist/__tests__/swarm-coordinator-plan-review-intercept.test.js.map +1 -0
  122. package/dist/__tests__/swarm-coordinator-plan-review-recovery.test.d.ts +2 -0
  123. package/dist/__tests__/swarm-coordinator-plan-review-recovery.test.d.ts.map +1 -0
  124. package/dist/__tests__/swarm-coordinator-plan-review-recovery.test.js +284 -0
  125. package/dist/__tests__/swarm-coordinator-plan-review-recovery.test.js.map +1 -0
  126. package/dist/__tests__/swarm-coordinator-plan-review.test.d.ts +2 -0
  127. package/dist/__tests__/swarm-coordinator-plan-review.test.d.ts.map +1 -0
  128. package/dist/__tests__/swarm-coordinator-plan-review.test.js +294 -0
  129. package/dist/__tests__/swarm-coordinator-plan-review.test.js.map +1 -0
  130. package/dist/__tests__/swarm-coordinator-resume.test.d.ts +2 -0
  131. package/dist/__tests__/swarm-coordinator-resume.test.d.ts.map +1 -0
  132. package/dist/__tests__/swarm-coordinator-resume.test.js +93 -0
  133. package/dist/__tests__/swarm-coordinator-resume.test.js.map +1 -0
  134. package/dist/__tests__/swarm-coordinator-roleId.test.js +2 -2
  135. package/dist/__tests__/swarm-coordinator-roleId.test.js.map +1 -1
  136. package/dist/__tests__/swarm-destroy-detach.test.d.ts +2 -0
  137. package/dist/__tests__/swarm-destroy-detach.test.d.ts.map +1 -0
  138. package/dist/__tests__/swarm-destroy-detach.test.js +135 -0
  139. package/dist/__tests__/swarm-destroy-detach.test.js.map +1 -0
  140. package/dist/action-parser.d.ts +0 -9
  141. package/dist/action-parser.d.ts.map +1 -1
  142. package/dist/action-parser.js +0 -114
  143. package/dist/action-parser.js.map +1 -1
  144. package/dist/inbox-relay.d.ts +50 -0
  145. package/dist/inbox-relay.d.ts.map +1 -0
  146. package/dist/inbox-relay.js +168 -0
  147. package/dist/inbox-relay.js.map +1 -0
  148. package/dist/inbox-store.d.ts +25 -0
  149. package/dist/inbox-store.d.ts.map +1 -0
  150. package/dist/inbox-store.js +95 -0
  151. package/dist/inbox-store.js.map +1 -0
  152. package/dist/inbox-watcher.d.ts +13 -0
  153. package/dist/inbox-watcher.d.ts.map +1 -0
  154. package/dist/inbox-watcher.js +89 -0
  155. package/dist/inbox-watcher.js.map +1 -0
  156. package/dist/index.d.ts +8 -3
  157. package/dist/index.d.ts.map +1 -1
  158. package/dist/index.js +5 -2
  159. package/dist/index.js.map +1 -1
  160. package/dist/persistence.d.ts +19 -5
  161. package/dist/persistence.d.ts.map +1 -1
  162. package/dist/persistence.js +97 -22
  163. package/dist/persistence.js.map +1 -1
  164. package/dist/project-files.d.ts +60 -0
  165. package/dist/project-files.d.ts.map +1 -0
  166. package/dist/project-files.js +214 -0
  167. package/dist/project-files.js.map +1 -0
  168. package/dist/projects-fs.d.ts +28 -0
  169. package/dist/projects-fs.d.ts.map +1 -0
  170. package/dist/projects-fs.js +111 -0
  171. package/dist/projects-fs.js.map +1 -0
  172. package/dist/recovery.d.ts +12 -0
  173. package/dist/recovery.d.ts.map +1 -1
  174. package/dist/recovery.js +14 -19
  175. package/dist/recovery.js.map +1 -1
  176. package/dist/roles/role-loader.d.ts +28 -1
  177. package/dist/roles/role-loader.d.ts.map +1 -1
  178. package/dist/roles/role-loader.js +73 -1
  179. package/dist/roles/role-loader.js.map +1 -1
  180. package/dist/roles/role-tools.d.ts +16 -0
  181. package/dist/roles/role-tools.d.ts.map +1 -0
  182. package/dist/roles/role-tools.js +25 -0
  183. package/dist/roles/role-tools.js.map +1 -0
  184. package/dist/roles/types.d.ts +4 -0
  185. package/dist/roles/types.d.ts.map +1 -1
  186. package/dist/swarm-coordinator.d.ts +176 -12
  187. package/dist/swarm-coordinator.d.ts.map +1 -1
  188. package/dist/swarm-coordinator.js +863 -370
  189. package/dist/swarm-coordinator.js.map +1 -1
  190. package/dist/types.d.ts +26 -0
  191. package/dist/types.d.ts.map +1 -1
  192. package/package.json +9 -6
  193. package/roles/analyst-livermore.md +6 -30
  194. package/roles/designer-rams.md +2 -30
  195. package/roles/dev-torvalds.md +8 -44
  196. package/roles/developer.md +5 -21
  197. package/roles/director-jia.md +20 -49
  198. package/roles/editor-boyong.md +8 -40
  199. package/roles/macro-dalio.md +6 -30
  200. package/roles/planner-maoni.md +24 -53
  201. package/roles/pm-jobs.md +20 -71
  202. package/roles/preset-analyst-simons.md +2 -18
  203. package/roles/preset-architect-knuth.md +2 -18
  204. package/roles/preset-designer-norman.md +2 -18
  205. package/roles/preset-designer.md +2 -18
  206. package/roles/preset-dev-carmack.md +2 -18
  207. package/roles/preset-dev-gosling.md +2 -18
  208. package/roles/preset-developer.md +7 -23
  209. package/roles/preset-manager-grove.md +2 -18
  210. package/roles/preset-manager-musk.md +2 -18
  211. package/roles/preset-pm.md +7 -34
  212. package/roles/preset-researcher-feynman.md +2 -18
  213. package/roles/preset-reviewer.md +5 -21
  214. package/roles/preset-strategist-buffett.md +2 -18
  215. package/roles/preset-strategist-munger.md +2 -18
  216. package/roles/preset-strategist-sunzi.md +2 -18
  217. package/roles/preset-tester-beck.md +2 -18
  218. package/roles/preset-tester.md +5 -21
  219. package/roles/preset-writer-orwell.md +2 -18
  220. package/roles/preset-writer.md +2 -18
  221. package/roles/quant-simons.md +5 -32
  222. package/roles/queen.md +25 -41
  223. package/roles/reviewer-martin.md +11 -37
  224. package/roles/reviewer.md +20 -21
  225. package/roles/rhythm-tangsan.md +5 -29
  226. package/roles/risk-taleb.md +4 -32
  227. package/roles/script-shitiesheng.md +8 -31
  228. package/roles/storyboard-xuke.md +9 -29
  229. package/roles/strategist-soros.md +16 -73
  230. package/roles/tester-beck.md +4 -40
  231. package/roles/tester.md +5 -21
  232. package/roles/trader-jones.md +4 -32
  233. package/roles/vfx-guchangwei.md +8 -27
  234. package/roles/writer-zhouzi.md +7 -39
  235. package/templates/dev-team-pro.md +4 -1
  236. package/templates/dev-team.md +3 -1
  237. package/templates/minimal.md +2 -1
  238. package/templates/trading-team.md +6 -1
  239. package/templates/video-team.md +4 -1
  240. package/templates/writing-team.md +4 -1
@@ -1,35 +1,90 @@
1
1
  import { loadRole, buildRolePrompt } from "./roles/role-loader.js";
2
+ import { resolveRoleTools } from "./roles/role-tools.js";
2
3
  import { loadTemplate } from "./templates/template-loader.js";
3
- import { LocalMessageRouter } from "./message-router.js";
4
- import { parseSwarmActions, parsePlanFromText } from "./action-parser.js";
4
+ import { parsePlanFromText } from "./action-parser.js";
5
5
  import { BUILTIN_ROLES, initDatabase, syncToAutoMemory } from "@mclawnet/memory";
6
- import { saveSwarmSnapshot, deleteSwarmSnapshot, appendMessageLog, loadSwarmSnapshot, readMessageLog } from "./persistence.js";
6
+ import { saveSwarmSnapshot, deleteSwarmSnapshot, appendMessageLog, loadSwarmSnapshot, readMessageLog, listRecoverableSwarmIds } from "./persistence.js";
7
7
  import { runRetrospective } from "./retrospective.js";
8
+ import { InboxRelay } from "./inbox-relay.js";
9
+ import { InboxStore } from "./inbox-store.js";
10
+ import { InboxWatcher } from "./inbox-watcher.js";
11
+ import { randomUUID } from "node:crypto";
8
12
  import { EvolutionPipeline } from "@mclawnet/skill-manager";
13
+ import { TaskStore, computeLeadBriefing, computeMemberBriefing, computeTaskBriefing, formatLeadBriefing, formatMemberBriefing, formatTaskBriefing, projectRoot } from "@mclawnet/task";
14
+ import { existsSync } from "node:fs";
9
15
  import { homedir } from "node:os";
10
16
  import { join } from "node:path";
11
17
  import { createLogger } from "@mclawnet/logger";
12
- import { randomUUID } from "node:crypto";
13
18
  const log = createLogger({ module: "swarm" });
14
19
  const QUEEN_CHECK_INTERVAL_MS = 300_000; // 5min when workers active
15
20
  const QUEEN_CHECK_IDLE_INTERVAL_MS = 300_000; // 5min when all roles idle
16
- const SEND_MAX_RETRIES = 3;
17
- const SEND_RETRY_BASE_MS = 1_000; // 1s, 2s, 4s exponential backoff
18
- const sleep = (ms) => new Promise((r) => setTimeout(r, ms));
21
+ /**
22
+ * Hard deadline for the "reviewing" plan-status. If the reviewer fails to
23
+ * submit a verdict within this window the coordinator auto-approves with a
24
+ * warning so the queen is not deadlocked. Cleared on any explicit
25
+ * submitPlanReview call.
26
+ */
27
+ const PLAN_REVIEW_TIMEOUT_MS = 10 * 60 * 1000;
19
28
  /**
20
29
  * SwarmCoordinator — orchestrates multi-role Claude CLI swarms.
21
30
  *
22
31
  * Composes (not replaces) SessionAdapter. Each role instance is a separate
23
- * session in SessionAdapter. Communication between roles goes through
24
- * MessageRouter (currently LocalMessageRouter using stdin/stdout).
32
+ * session in SessionAdapter. Communication between roles flows through
33
+ * file-based InboxStore + InboxRelay.
25
34
  */
26
35
  export class SwarmCoordinator {
27
36
  sessionAdapter;
28
37
  hub;
38
+ taskStoreFactory;
39
+ home;
29
40
  swarms = new Map();
30
- constructor(sessionAdapter, hub) {
41
+ inboxRelay;
42
+ inboxWatcher;
43
+ constructor(sessionAdapter, hub,
44
+ /**
45
+ * Optional factory that resolves a per-swarm TaskStore (workDir-scoped).
46
+ * When present, spawnRole appends a "### 任务上下文" briefing section to
47
+ * the role's system prompt. Failures are best-effort logged.
48
+ *
49
+ * Backward compat: also accepts a pre-built single TaskStore instance.
50
+ */
51
+ taskStoreFactory,
52
+ /**
53
+ * Optional override for the user-home root (no `.clawnet` suffix).
54
+ * Threaded to InboxRelay and to inline `new InboxStore(...)` / `TaskStore`
55
+ * calls so test harnesses pinned to a tmpdir don't fall through to
56
+ * `homedir()` and write files into the real `~/.clawnet`. When omitted,
57
+ * inline call sites still honour `CLAWNET_HOME` env var as before.
58
+ */
59
+ home) {
31
60
  this.sessionAdapter = sessionAdapter;
32
61
  this.hub = hub;
62
+ this.taskStoreFactory = taskStoreFactory;
63
+ this.home = home;
64
+ this.inboxRelay = new InboxRelay(sessionAdapter, (id) => this.swarms.get(id), (swarmId, instanceId, msg) => this.handleInterceptedMessage(swarmId, instanceId, msg), home);
65
+ this.inboxWatcher = new InboxWatcher(this.inboxRelay);
66
+ }
67
+ /** Resolve the user-home root used for InboxStore / TaskStore lookups. */
68
+ resolveHome() {
69
+ return this.home ?? process.env.CLAWNET_HOME ?? homedir();
70
+ }
71
+ /**
72
+ * Resolve the TaskStore for a given swarm's workDir.
73
+ * Returns undefined if no factory was wired or if the factory returned undefined.
74
+ */
75
+ resolveTaskStore(workDir) {
76
+ if (!this.taskStoreFactory)
77
+ return undefined;
78
+ if (typeof this.taskStoreFactory === "function") {
79
+ try {
80
+ return this.taskStoreFactory(workDir);
81
+ }
82
+ catch (err) {
83
+ log.warn({ err, workDir }, "taskStoreFactory threw — skipping briefing");
84
+ return undefined;
85
+ }
86
+ }
87
+ return this.taskStoreFactory;
33
88
  }
34
89
  // ── Public API ──────────────────────────────────────────────────────
35
90
  /** Create a new swarm with two-phase initialization. */
@@ -46,11 +101,11 @@ export class SwarmCoordinator {
46
101
  else {
47
102
  roleSpecs = options.roles ?? [];
48
103
  }
49
- const router = new LocalMessageRouter(this.sessionAdapter);
50
104
  const swarm = {
51
105
  id: swarmSessionId,
52
106
  hubSessionId: swarmSessionId,
53
107
  workDir: options.workDir,
108
+ teamName: options.templateName,
54
109
  roles: new Map(),
55
110
  plan: null,
56
111
  nextInstanceSeq: new Map(),
@@ -60,66 +115,89 @@ export class SwarmCoordinator {
60
115
  status: "creating",
61
116
  planStatus: "none",
62
117
  };
63
- // Attach the router to the swarm for internal access
64
- swarm._router = router;
65
118
  // Store pending role specs for on-demand spawning
66
119
  swarm._pendingRoleSpecs = roleSpecs;
67
120
  this.swarms.set(swarmSessionId, swarm);
68
- // ── Phase 1: spawn eager roles (non-queen first, queen last) ────
69
- const eagerSpecs = roleSpecs.filter((r) => r.roleName === "queen" || r.eager === true);
70
- // Sort: queen last so her roleList includes all already-spawned roles
71
- eagerSpecs.sort((a, b) => {
72
- if (a.roleName === "queen")
73
- return 1;
74
- if (b.roleName === "queen")
75
- return -1;
76
- return 0;
77
- });
78
- for (const spec of eagerSpecs) {
79
- const count = spec.count ?? 1;
80
- for (let i = 0; i < count; i++) {
81
- await this.spawnRole(swarmSessionId, spec.roleName, router, undefined, spec.customPrompt, spec.customDefinition);
82
- }
83
- }
84
- // Start Queen periodic check
85
- this.startQueenCheck(swarmSessionId);
86
- // ── Phase 2: mark running + notify queen + send task ────────────
87
- swarm.status = "running";
88
- const queen = this.findQueen(swarm);
89
- if (queen) {
90
- const roleList = this.buildRoleListString(swarm);
91
- // Notify queen that initialization is complete with full member list
92
- router.send(queen.instanceId, {
93
- from: "system",
94
- type: "system",
95
- data: `[系统] 蜂群初始化完成。当前成员:\n${roleList}\n\n等待任务分配。`,
96
- timestamp: Date.now(),
121
+ // Snapshot may already exist from a prior run (continuation, or
122
+ // restart-after-crash with the same swarmId). Record this BEFORE any
123
+ // spawnRole writes a new one — cleanup must not delete what wasn't ours.
124
+ const snapshotPreexisted = options.workDir
125
+ ? existsSync(join(projectRoot(options.workDir, this.resolveHome()), "swarms", swarmSessionId, "recovery.json"))
126
+ : false;
127
+ // Track opened sessions so cleanupPartialCreate() can close them on failure.
128
+ const openedSessionIds = [];
129
+ const trackOpen = (role) => {
130
+ if (role?.roleSessionId)
131
+ openedSessionIds.push(role.roleSessionId);
132
+ };
133
+ try {
134
+ // ── Phase 1: spawn eager roles (non-queen first, queen last) ────
135
+ const eagerSpecs = roleSpecs.filter((r) => r.roleName === "queen" || r.eager === true);
136
+ // Sort: queen last so her roleList includes all already-spawned roles
137
+ eagerSpecs.sort((a, b) => {
138
+ if (a.roleName === "queen")
139
+ return 1;
140
+ if (b.roleName === "queen")
141
+ return -1;
142
+ return 0;
97
143
  });
98
- // Inject continuation context from previous run if applicable
99
- if (options.isContinuation) {
100
- const summary = this.buildContinuationSummary(swarmSessionId);
101
- if (summary) {
102
- router.send(queen.instanceId, {
103
- from: "system",
104
- type: "system",
105
- data: `[系统] 这是对之前蜂群任务的继续。以下是之前的工作摘要:\n${summary}\n\n请基于已有成果继续执行新的改进指令。`,
106
- timestamp: Date.now(),
107
- });
144
+ for (const spec of eagerSpecs) {
145
+ const count = spec.count ?? 1;
146
+ for (let i = 0; i < count; i++) {
147
+ const role = await this.spawnRole(swarmSessionId, spec.roleName, undefined, spec.customPrompt, spec.customDefinition);
148
+ trackOpen(role);
108
149
  }
109
150
  }
110
- // Send user task
111
- if (options.task) {
112
- router.send(queen.instanceId, {
113
- from: "user",
114
- type: "task",
115
- data: options.task,
116
- timestamp: Date.now(),
151
+ // Start Queen periodic check
152
+ this.startQueenCheck(swarmSessionId);
153
+ // ── Phase 2: mark running + notify queen + send task ────────────
154
+ swarm.status = "running";
155
+ const queen = this.findQueen(swarm);
156
+ if (queen) {
157
+ const roleList = this.buildRoleListString(swarm);
158
+ // Notify queen that initialization is complete with full member list
159
+ await this.deliverInbox(swarm, queen.instanceId, {
160
+ from: "system",
161
+ type: "system",
162
+ data: `[系统] 蜂群初始化完成。当前成员:\n${roleList}\n\n等待任务分配。`,
117
163
  });
164
+ // Inject continuation context from previous run if applicable
165
+ if (options.isContinuation) {
166
+ const summary = this.buildContinuationSummary(swarmSessionId);
167
+ if (summary) {
168
+ await this.deliverInbox(swarm, queen.instanceId, {
169
+ from: "system",
170
+ type: "system",
171
+ data: `[系统] 这是对之前蜂群任务的继续。以下是之前的工作摘要:\n${summary}\n\n请基于已有成果继续执行新的改进指令。`,
172
+ });
173
+ }
174
+ }
175
+ // Send user task
176
+ if (options.task) {
177
+ await this.deliverInbox(swarm, queen.instanceId, {
178
+ from: "user",
179
+ type: "task",
180
+ data: options.task,
181
+ });
182
+ }
183
+ }
184
+ log.info({ swarmId: swarmSessionId, roleCount: swarm.roles.size }, "swarm created");
185
+ // Persistence: save initial snapshot
186
+ saveSwarmSnapshot(swarm);
187
+ // Start inbox watcher to react to inbox file changes (best-effort).
188
+ if (options.workDir) {
189
+ try {
190
+ this.inboxWatcher.watch(swarmSessionId, options.workDir);
191
+ }
192
+ catch (err) {
193
+ log.warn({ err, swarmId: swarmSessionId }, "inboxWatcher.watch failed (best-effort)");
194
+ }
118
195
  }
119
196
  }
120
- log.info({ swarmId: swarmSessionId, roleCount: swarm.roles.size }, "swarm created");
121
- // Persistence: save initial snapshot
122
- saveSwarmSnapshot(swarm);
197
+ catch (err) {
198
+ await this.cleanupPartialCreate(swarmSessionId, openedSessionIds, snapshotPreexisted, err);
199
+ throw err;
200
+ }
123
201
  }
124
202
  /** Handle user message directed at the swarm. */
125
203
  async handleUserMessage(swarmSessionId, content, targetInstance) {
@@ -128,12 +206,10 @@ export class SwarmCoordinator {
128
206
  log.error({ swarmId: swarmSessionId }, "handleUserMessage: swarm not found");
129
207
  return;
130
208
  }
131
- const router = swarm._router;
132
209
  const message = {
133
210
  from: "user",
134
211
  type: "task",
135
212
  data: content,
136
- timestamp: Date.now(),
137
213
  };
138
214
  // Auto-resume if paused
139
215
  if (swarm.status === "paused") {
@@ -142,21 +218,28 @@ export class SwarmCoordinator {
142
218
  swarm.idleCheckCount = 0;
143
219
  this.startQueenCheck(swarmSessionId);
144
220
  log.info({ swarmId: swarmSessionId }, "swarm resumed");
221
+ for (const r of swarm.roles.values()) {
222
+ if (r.status === "spawning" || r.status === "stopped")
223
+ continue;
224
+ void this.inboxRelay.deliver(swarmSessionId, r.instanceId);
225
+ }
145
226
  }
146
227
  if (targetInstance) {
147
228
  // Directed at a specific instance
148
- router.send(targetInstance, message);
149
- appendMessageLog(swarmSessionId, { type: "user_message", from: "user", to: targetInstance, data: content, timestamp: Date.now() });
229
+ await this.deliverInbox(swarm, targetInstance, message);
230
+ if (swarm.workDir) {
231
+ appendMessageLog(swarm.workDir, swarmSessionId, { type: "user_message", from: "user", to: targetInstance, data: content, timestamp: Date.now() });
232
+ }
150
233
  }
151
234
  else {
152
235
  // Default: send to Queen
153
236
  const queen = this.findQueen(swarm);
154
237
  if (queen) {
155
- router.send(queen.instanceId, message);
238
+ await this.deliverInbox(swarm, queen.instanceId, message);
156
239
  }
157
240
  else {
158
241
  // No queen — broadcast to all
159
- router.broadcast(message);
242
+ await this.broadcastInbox(swarm, message);
160
243
  }
161
244
  }
162
245
  }
@@ -170,14 +253,22 @@ export class SwarmCoordinator {
170
253
  const { swarm, role } = this.findByRoleSessionId(roleSessionId);
171
254
  if (!swarm || !role)
172
255
  return false;
173
- const router = swarm._router;
174
256
  // Extract text content from the streaming event
175
257
  const text = extractTextFromEvent(data);
176
258
  // Parse swarm action blocks
177
259
  if (text) {
178
- // Log when assistant event contains meaningful text (not every streaming chunk)
179
- if (text.includes("```swarm")) {
180
- log.info({ instanceId: role.instanceId, textLen: text.length }, "assistant output contains swarm block");
260
+ // Legacy-format safety net: if a reviewer regresses to the old
261
+ // `{"action":"report", "taskId":"plan_review", ...}` block while we are
262
+ // waiting on a verdict, warn loudly so the regression is diagnosable.
263
+ // The action-parser no longer consumes these blocks, so the swarm would
264
+ // otherwise silently wait until the 10-min auto-approve timer fires.
265
+ // We do NOT try to translate the block — reviewer must call
266
+ // `mcp__clawnet__plan_review_submit` instead. See PR plan-review-mcp-migration.
267
+ if (role.roleName === "reviewer" &&
268
+ swarm.planStatus === "reviewing" &&
269
+ /"action"\s*:\s*"report"/.test(text) &&
270
+ /"taskId"\s*:\s*"plan_review"/.test(text)) {
271
+ log.warn({ swarmId: swarm.id, instanceId: role.instanceId }, 'reviewer emitted legacy {"action":"report","taskId":"plan_review"} block — ignored. Call mcp__clawnet__plan_review_submit instead.');
181
272
  }
182
273
  // For queen: parse plan BEFORE executing actions, so review can gate task assignment
183
274
  if (role.definition.type === "queen") {
@@ -192,27 +283,14 @@ export class SwarmCoordinator {
192
283
  saveSwarmSnapshot(swarm);
193
284
  this.sendStatusUpdate(swarm);
194
285
  log.info({ swarmId: swarm.id, instanceId: role.instanceId }, "plan updated (draft)");
195
- this.requestPlanReview(swarm, plan);
286
+ this.requestPlanReview(swarm, plan).catch((err) => {
287
+ log.warn({ err, swarmId: swarm.id }, "requestPlanReview failed (best-effort)");
288
+ });
196
289
  }
197
290
  else if (hasPlanKeyword) {
198
291
  log.warn({ instanceId: role.instanceId, textSnippet: text.substring(0, 500) }, "queen output has plan keyword but parsePlanFromText returned null");
199
292
  }
200
293
  }
201
- const { actions } = parseSwarmActions(text);
202
- if (text.includes("```swarm") && actions.length === 0) {
203
- log.warn({ instanceId: role.instanceId, textSnippet: text.substring(0, 500) }, "swarm block found but parse failed");
204
- }
205
- // Execute actions sequentially — spawn must complete before subsequent sends
206
- (async () => {
207
- for (const action of actions) {
208
- try {
209
- await this.executeAction(swarm, role, router, action);
210
- }
211
- catch (err) {
212
- log.error({ err, swarmId: swarm.id, action: action.action }, "executeAction failed");
213
- }
214
- }
215
- })();
216
294
  }
217
295
  // Forward full output to Hub for UI display + DB storage
218
296
  this.hub.send({
@@ -234,6 +312,22 @@ export class SwarmCoordinator {
234
312
  return false;
235
313
  // Update role status
236
314
  role.status = "idle";
315
+ // Persist per-role claudeSessionId — turn_complete frame carries the
316
+ // backend's real session UUID. We need it so a future restart can
317
+ // `--resume` this exact role's conversation (Task 4 / Phase 4-5).
318
+ if (info.claudeSessionId && role.claudeSessionId !== info.claudeSessionId) {
319
+ role.claudeSessionId = info.claudeSessionId;
320
+ // saveSwarmSnapshot is sync-fire (proper-lockfile internally async);
321
+ // call directly — failures should not break turn completion.
322
+ try {
323
+ saveSwarmSnapshot(swarm);
324
+ }
325
+ catch (err) {
326
+ log.warn({ err, swarmId: swarm.id, instanceId: role.instanceId }, "failed to persist claudeSessionId on turn complete");
327
+ }
328
+ }
329
+ // Settle inbox echoes for this turn (fire-and-forget).
330
+ void this.inboxRelay.onAgentTurnSettled(swarm.id, role.instanceId);
237
331
  // Forward to Hub
238
332
  this.hub.send({
239
333
  type: "swarm.turn_complete",
@@ -245,8 +339,38 @@ export class SwarmCoordinator {
245
339
  });
246
340
  return true;
247
341
  }
342
+ /**
343
+ * Persist the per-role claudeSessionId immediately (e.g. from `system/init`
344
+ * frame, before the first turn_complete). Returns true if a role was found
345
+ * and updated; false otherwise (no-op for non-swarm sessions).
346
+ */
347
+ setRoleClaudeSessionId(swarmId, instanceId, claudeSessionId) {
348
+ const swarm = this.swarms.get(swarmId);
349
+ if (!swarm)
350
+ return false;
351
+ const role = swarm.roles.get(instanceId);
352
+ if (!role)
353
+ return false;
354
+ if (role.claudeSessionId === claudeSessionId)
355
+ return true;
356
+ role.claudeSessionId = claudeSessionId;
357
+ try {
358
+ saveSwarmSnapshot(swarm);
359
+ }
360
+ catch (err) {
361
+ log.warn({ err, swarmId, instanceId }, "failed to persist claudeSessionId");
362
+ }
363
+ return true;
364
+ }
365
+ /** Convenience: same as setRoleClaudeSessionId but takes the `${swarmId}::${instanceId}` roleSessionId. */
366
+ setRoleClaudeSessionIdBySession(roleSessionId, claudeSessionId) {
367
+ const { swarm, role } = this.findByRoleSessionId(roleSessionId);
368
+ if (!swarm || !role)
369
+ return false;
370
+ return this.setRoleClaudeSessionId(swarm.id, role.instanceId, claudeSessionId);
371
+ }
248
372
  /** Spawn a new role instance in a swarm. */
249
- async spawnRole(swarmId, roleName, router, taskPrompt, customPrompt, customDefinition, additionalDirs) {
373
+ async spawnRole(swarmId, roleName, taskPrompt, customPrompt, customDefinition, additionalDirs, resumeId, presetInstanceId) {
250
374
  const swarm = this.swarms.get(swarmId);
251
375
  if (!swarm)
252
376
  throw new Error(`Swarm ${swarmId} not found`);
@@ -270,9 +394,17 @@ export class SwarmCoordinator {
270
394
  if (customPrompt) {
271
395
  definition.promptBody = customPrompt;
272
396
  }
273
- const seq = swarm.nextInstanceSeq.get(roleName) ?? 0;
274
- swarm.nextInstanceSeq.set(roleName, seq + 1);
275
- const instanceId = `${definition.shortName}-${seq}`;
397
+ let instanceId;
398
+ if (presetInstanceId) {
399
+ // Recovery path: preserve the original instanceId so per-role state
400
+ // (logs, inbox, claudeSessionId) lines up with prior snapshot.
401
+ instanceId = presetInstanceId;
402
+ }
403
+ else {
404
+ const seq = swarm.nextInstanceSeq.get(roleName) ?? 0;
405
+ swarm.nextInstanceSeq.set(roleName, seq + 1);
406
+ instanceId = `${definition.shortName}-${seq}`;
407
+ }
276
408
  const roleSessionId = `${swarmId}::${instanceId}`;
277
409
  const roleInstance = {
278
410
  instanceId,
@@ -281,32 +413,159 @@ export class SwarmCoordinator {
281
413
  roleSessionId,
282
414
  status: "spawning",
283
415
  currentTask: taskPrompt,
416
+ claudeSessionId: resumeId,
284
417
  };
285
418
  swarm.roles.set(instanceId, roleInstance);
286
419
  // Build role list for prompt
287
420
  const roleList = this.buildRoleListString(swarm);
288
- const systemPrompt = buildRolePrompt(definition, instanceId, roleList);
421
+ const systemPrompt = buildRolePrompt(definition, instanceId, roleList, {
422
+ swarmId,
423
+ workDir: swarm.workDir,
424
+ });
289
425
  const roleId = BUILTIN_ROLES[roleName]?.id ?? `role-${roleName}`;
426
+ // Boot-time briefing injection (Phase 7-E). Best-effort: failures here
427
+ // must not abort role spawn — log and continue with the bare prompt.
428
+ let finalPrompt = systemPrompt;
429
+ const taskStore = swarm.workDir
430
+ ? this.resolveTaskStore(swarm.workDir)
431
+ : undefined;
432
+ if (taskStore) {
433
+ try {
434
+ const briefingSection = this.buildBriefingSection(taskStore, swarmId, instanceId, definition);
435
+ if (briefingSection)
436
+ finalPrompt = systemPrompt + "\n\n" + briefingSection;
437
+ }
438
+ catch (err) {
439
+ log.warn({ err, instanceId }, "briefing injection failed (non-fatal)");
440
+ }
441
+ }
290
442
  // Spawn Claude CLI process via SessionAdapter
291
443
  // SessionManager handles memory injection (Pipeline A: memory prompt + roleId hint) via roleId
444
+ const tools = resolveRoleTools(definition);
292
445
  await this.sessionAdapter.createSession({
293
446
  sessionId: roleSessionId,
294
447
  workDir: swarm.workDir,
295
- systemPrompt,
448
+ systemPrompt: finalPrompt,
296
449
  roleId,
297
450
  additionalDirs,
451
+ // Task 5: when recovering, the caller passes resumeId = role.claudeSessionId
452
+ // so the Claude conversation continues with `--resume`. Fresh spawns
453
+ // (Task 3 default) leave it undefined for a new conversation.
454
+ resumeId,
455
+ allowedTools: tools.allowedTools,
456
+ disallowedTools: tools.disallowedTools,
298
457
  });
299
458
  roleInstance.status = "active";
300
- // Persistence: save snapshot after role spawned
301
- saveSwarmSnapshot(swarm);
302
- // Register in router
303
- const r = router ?? swarm._router;
304
- r.register(instanceId, { kind: "local", roleSessionId });
459
+ // Persistence: save snapshot after role spawned. Skipped during recover()
460
+ // where the caller saves once at the end (avoids N writes for N roles).
461
+ if (!swarm._suppressSnapshot) {
462
+ saveSwarmSnapshot(swarm);
463
+ }
305
464
  // Send swarm status update to Hub
306
465
  this.sendStatusUpdate(swarm);
466
+ // Flush any pending inbox messages (fire-and-forget).
467
+ void this.inboxRelay.deliver(swarmId, instanceId);
307
468
  log.info({ swarmId, instanceId, roleName, roleId }, "role spawned");
308
469
  return roleInstance;
309
470
  }
471
+ /**
472
+ * Recover a previously persisted swarm by id.
473
+ *
474
+ * Locates the snapshot via `listRecoverableSwarmIds()`, then for each role
475
+ * in the snapshot spawns a Claude session via `--resume role.claudeSessionId`
476
+ * (when present) so the per-role conversation continues. Roles without a
477
+ * stored claudeSessionId start fresh.
478
+ *
479
+ * After all roles are spawned, drains each role's offline inbox via
480
+ * `inboxRelay.deliver`. Drain failures are best-effort: warn but never throw.
481
+ *
482
+ * Recovered swarms start in `paused` state — caller must explicitly resume
483
+ * (matches existing behaviour in `recoverSwarm` helper).
484
+ */
485
+ async recover(swarmId) {
486
+ if (this.swarms.has(swarmId)) {
487
+ log.warn({ swarmId }, "recover: swarm already loaded — skipping");
488
+ return;
489
+ }
490
+ // Locate snapshot file across all known project workDirs.
491
+ const entry = listRecoverableSwarmIds().find((e) => e.swarmId === swarmId);
492
+ if (!entry) {
493
+ throw new Error(`recover: no snapshot found for swarm ${swarmId}`);
494
+ }
495
+ const snapshot = loadSwarmSnapshot(entry.workDir, swarmId);
496
+ if (!snapshot) {
497
+ throw new Error(`recover: snapshot for ${swarmId} could not be loaded`);
498
+ }
499
+ // Bootstrap the swarm shell. We deliberately bypass create() because
500
+ // create() would respawn eager roles from template defaults — losing the
501
+ // per-role instanceId / claudeSessionId from the snapshot.
502
+ const swarm = {
503
+ id: swarmId,
504
+ hubSessionId: snapshot.hubSessionId,
505
+ workDir: snapshot.workDir,
506
+ teamName: snapshot.teamName,
507
+ roles: new Map(),
508
+ plan: snapshot.plan ?? null,
509
+ nextInstanceSeq: new Map(Object.entries(snapshot.nextInstanceSeq ?? {})),
510
+ idleCheckCount: 0,
511
+ maxIdleChecks: 10,
512
+ isPaused: true,
513
+ status: "paused",
514
+ planStatus: snapshot.planStatus ?? "none",
515
+ };
516
+ this.swarms.set(swarmId, swarm);
517
+ // Respawn each role with the same instanceId, passing claudeSessionId
518
+ // through as resumeId so SessionAdapter can `--resume` the conversation.
519
+ // Suppress per-role snapshot writes — we save once at the end with the
520
+ // full role set (and partialRecover marker if any role failed).
521
+ swarm._suppressSnapshot = true;
522
+ let partialRecover = false;
523
+ for (const r of snapshot.roles) {
524
+ try {
525
+ await this.spawnRole(swarmId, r.roleName, r.currentTask, undefined, // customPrompt
526
+ undefined, // customDefinition
527
+ undefined, // additionalDirs
528
+ r.claudeSessionId, r.instanceId);
529
+ }
530
+ catch (err) {
531
+ partialRecover = true;
532
+ log.warn({ err, swarmId, instanceId: r.instanceId }, "recover: failed to spawn role");
533
+ }
534
+ }
535
+ delete swarm._suppressSnapshot;
536
+ swarm.partialRecover = partialRecover;
537
+ saveSwarmSnapshot(swarm);
538
+ // Drain offline inboxes — best-effort per role.
539
+ for (const role of swarm.roles.values()) {
540
+ try {
541
+ await this.inboxRelay.deliver(swarmId, role.instanceId);
542
+ }
543
+ catch (err) {
544
+ log.warn({ err, swarmId, instanceId: role.instanceId }, "recover: inbox drain failed (best-effort)");
545
+ }
546
+ }
547
+ log.info({ swarmId, roleCount: swarm.roles.size }, "swarm recovered");
548
+ // Re-arm plan-review timeout on recovery: planReviewTimer is runtime-only
549
+ // (see types.ts), so a snapshot persisted while planStatus="reviewing"
550
+ // would otherwise resume with no timer and deadlock the queen indefinitely
551
+ // if the reviewer never replies. Re-arm with the full PLAN_REVIEW_TIMEOUT_MS
552
+ // (we don't persist the original deadline; worst case the swarm waits one
553
+ // additional timeout window past restart, which is bounded — vs. the
554
+ // previous unbounded hang).
555
+ if (swarm.planStatus === "reviewing") {
556
+ this.armPlanReviewTimeout(swarm);
557
+ log.info({ swarmId, timeoutMs: PLAN_REVIEW_TIMEOUT_MS }, "recover: re-armed plan_review timeout");
558
+ }
559
+ // Start inbox watcher for the recovered swarm (best-effort).
560
+ if (swarm.workDir) {
561
+ try {
562
+ this.inboxWatcher.watch(swarmId, swarm.workDir);
563
+ }
564
+ catch (err) {
565
+ log.warn({ err, swarmId }, "inboxWatcher.watch failed (best-effort)");
566
+ }
567
+ }
568
+ }
310
569
  /** Stop a role instance. */
311
570
  async stopRole(swarmId, instanceId) {
312
571
  const swarm = this.swarms.get(swarmId);
@@ -316,8 +575,6 @@ export class SwarmCoordinator {
316
575
  if (!role)
317
576
  return;
318
577
  role.status = "stopped";
319
- const router = swarm._router;
320
- router.unregister(instanceId);
321
578
  await this.sessionAdapter.closeSession(role.roleSessionId);
322
579
  swarm.roles.delete(instanceId);
323
580
  // Persistence: save snapshot after role stopped
@@ -325,6 +582,68 @@ export class SwarmCoordinator {
325
582
  this.sendStatusUpdate(swarm);
326
583
  log.info({ swarmId, instanceId }, "role stopped");
327
584
  }
585
+ /**
586
+ * Roll back side-effects from a partially-completed `create()`.
587
+ *
588
+ * Called from create()'s catch block. Mirrors `destroy()` but is
589
+ * deliberately tolerant of partial state (timer may not exist, snapshot
590
+ * may not exist, sessions list comes from the caller's accumulator
591
+ * because `swarm.roles` may not yet contain every opened session).
592
+ *
593
+ * Best-effort: every step is wrapped — we never throw out of cleanup.
594
+ * Logs a warn so operators see WHY a swarm was rolled back.
595
+ */
596
+ async cleanupPartialCreate(swarmId, openedSessionIds, snapshotPreexisted, cause) {
597
+ const swarm = this.swarms.get(swarmId);
598
+ // 1. Stop the queen check timer if it was started.
599
+ if (swarm?.checkTimer) {
600
+ try {
601
+ clearTimeout(swarm.checkTimer);
602
+ }
603
+ catch { /* ignore */ }
604
+ swarm.checkTimer = undefined;
605
+ }
606
+ if (swarm?.planReviewTimer) {
607
+ try {
608
+ clearTimeout(swarm.planReviewTimer);
609
+ }
610
+ catch { /* ignore */ }
611
+ swarm.planReviewTimer = undefined;
612
+ }
613
+ // 2. Close every Claude session we managed to open.
614
+ for (const sid of openedSessionIds) {
615
+ try {
616
+ await this.sessionAdapter.closeSession(sid);
617
+ }
618
+ catch (err) {
619
+ log.warn({ err, swarmId, sessionId: sid }, "cleanupPartialCreate: closeSession failed");
620
+ }
621
+ }
622
+ // 3. Delete recovery.json ONLY if this create() call wrote it.
623
+ // If a snapshot existed before we started (continuation path, or
624
+ // agent restart while a previous snapshot is still on disk),
625
+ // leave it untouched — it belongs to a prior run.
626
+ if (swarm?.workDir) {
627
+ if (!snapshotPreexisted) {
628
+ try {
629
+ deleteSwarmSnapshot(swarm.workDir, swarmId);
630
+ }
631
+ catch (err) {
632
+ log.warn({ err, swarmId }, "cleanupPartialCreate: deleteSwarmSnapshot failed");
633
+ }
634
+ }
635
+ else {
636
+ log.warn({ swarmId }, "cleanupPartialCreate: preserved pre-existing recovery.json");
637
+ }
638
+ }
639
+ // 4. Remove from in-memory registry so the same id can be retried.
640
+ try {
641
+ this.inboxWatcher.unwatch(swarmId);
642
+ }
643
+ catch { /* ignore */ }
644
+ this.swarms.delete(swarmId);
645
+ log.warn({ swarmId, openedCount: openedSessionIds.length, snapshotPreexisted, err: cause instanceof Error ? cause.message : String(cause) }, "cleanupPartialCreate: rolled back partial swarm state");
646
+ }
328
647
  /** Destroy an entire swarm. */
329
648
  async destroy(swarmId) {
330
649
  const swarm = this.swarms.get(swarmId);
@@ -332,12 +651,30 @@ export class SwarmCoordinator {
332
651
  return;
333
652
  if (swarm.checkTimer)
334
653
  clearTimeout(swarm.checkTimer);
654
+ if (swarm.planReviewTimer) {
655
+ clearTimeout(swarm.planReviewTimer);
656
+ swarm.planReviewTimer = undefined;
657
+ }
335
658
  for (const role of swarm.roles.values()) {
336
659
  await this.sessionAdapter.closeSession(role.roleSessionId).catch(() => { });
337
660
  }
661
+ // Detach tasks before removing the snapshot — keeps task index consistent
662
+ // even if snapshot deletion fails. Best-effort: never block destroy.
663
+ await this.detachTasks(swarmId, swarm.workDir);
664
+ try {
665
+ this.inboxWatcher.unwatch(swarmId);
666
+ }
667
+ catch (err) {
668
+ log.warn({ err, swarmId }, "inboxWatcher.unwatch failed (best-effort)");
669
+ }
338
670
  this.swarms.delete(swarmId);
339
671
  // Persistence: remove snapshot
340
- deleteSwarmSnapshot(swarmId);
672
+ if (swarm.workDir) {
673
+ deleteSwarmSnapshot(swarm.workDir, swarmId);
674
+ }
675
+ else {
676
+ log.warn({ swarmId }, "destroy: swarm.workDir missing, cannot delete snapshot");
677
+ }
341
678
  log.info({ swarmId }, "swarm destroyed");
342
679
  }
343
680
  /** Check if a session ID belongs to any swarm. */
@@ -363,6 +700,10 @@ export class SwarmCoordinator {
363
700
  clearTimeout(swarm.checkTimer);
364
701
  swarm.checkTimer = undefined;
365
702
  }
703
+ if (swarm.planReviewTimer) {
704
+ clearTimeout(swarm.planReviewTimer);
705
+ swarm.planReviewTimer = undefined;
706
+ }
366
707
  await this.runRetroAndCleanup(swarmId, swarm);
367
708
  log.info({ swarmId }, "swarm completed — all resources released");
368
709
  }
@@ -377,16 +718,204 @@ export class SwarmCoordinator {
377
718
  clearTimeout(swarm.checkTimer);
378
719
  swarm.checkTimer = undefined;
379
720
  }
721
+ if (swarm.planReviewTimer) {
722
+ clearTimeout(swarm.planReviewTimer);
723
+ swarm.planReviewTimer = undefined;
724
+ }
380
725
  await this.runRetroAndCleanup(swarmId, swarm);
381
726
  log.info({ swarmId }, "swarm failed — all resources released");
382
727
  }
728
+ /**
729
+ * InboxRelay messageInterceptor hook. Consumes `plan_review_result`
730
+ * envelopes (written by the `plan_review_submit` MCP tool out-of-process)
731
+ * and converts them into the normalized `submitPlanReview()` flow:
732
+ * planStatus flip, timer clear, plan_approved/plan_rejected to queen.
733
+ *
734
+ * Returning `true` removes the raw message from the LLM payload and marks
735
+ * it delivered so it doesn't re-fire. Returning `false` lets the relay
736
+ * push the message normally.
737
+ *
738
+ * Failures (parse error, invalid verdict) are logged + consumed to avoid a
739
+ * tight reload loop on a malformed file. The submitPlanReview call is
740
+ * idempotent so duplicate reviewer envelopes (e.g. retries) are safe.
741
+ */
742
+ async handleInterceptedMessage(swarmId, instanceId, msg) {
743
+ if (msg.type !== "plan_review_result")
744
+ return false;
745
+ // Authorization: only reviewer instances may flip planStatus via this path.
746
+ // The MCP tool's `from.startsWith("reviewer")` guard is advisory across the
747
+ // process boundary — a malicious worker could call `message_send` directly
748
+ // with type:"plan_review_result". Enforce again here so the interceptor is
749
+ // the actual source of truth, mirroring the server-side check in the
750
+ // `plan_review_submit` tool handler.
751
+ //
752
+ // We deliberately do NOT allow `from: "system"` here: the timeout fallback
753
+ // calls `submitPlanReview` directly in-process, never via the inbox, so
754
+ // accepting "system" here would only widen the spoofing surface (any worker
755
+ // with `message_send` could forge `from:"system"` since `message_send` does
756
+ // not authenticate the sender).
757
+ if (!msg.from.startsWith("reviewer")) {
758
+ log.warn({ swarmId, instanceId, msgId: msg.id, from: msg.from }, "plan_review_result interceptor: rejected non-reviewer sender — consuming");
759
+ return true;
760
+ }
761
+ // Two-stage parse:
762
+ // 1. Try the canonical JSON envelope (what the `plan_review_submit` MCP
763
+ // tool always emits): `{ verdict, body }`.
764
+ // 2. Fall back to a markdown regex `^verdict:\s*(approved|rejected)`
765
+ // against the raw `data`. Reviewers in misconfigured environments
766
+ // (e.g. their Claude session is missing the clawnet-mcp tools because
767
+ // of the legacy mcp.json bug fixed in PR #102) sometimes hand-write a
768
+ // plan_review_result envelope by direct file write, leaving `data`
769
+ // as a markdown blob. Without this fallback the verdict is lost and
770
+ // the swarm waits the full 10-minute timeout.
771
+ //
772
+ // Spoofing surface is unchanged: the from-prefix check above runs BEFORE
773
+ // either parse path, so a malicious worker cannot forge an approval just
774
+ // by writing markdown.
775
+ let verdict;
776
+ let body = "";
777
+ try {
778
+ const parsed = JSON.parse(msg.data);
779
+ if (parsed.verdict === "approved" || parsed.verdict === "rejected") {
780
+ verdict = parsed.verdict;
781
+ body = typeof parsed.body === "string" ? parsed.body : "";
782
+ }
783
+ }
784
+ catch {
785
+ // Fall through to markdown fallback below.
786
+ }
787
+ if (!verdict && typeof msg.data === "string") {
788
+ const m = msg.data.match(/^\s*verdict\s*:\s*(approved|rejected)\b/im);
789
+ if (m) {
790
+ verdict = m[1].toLowerCase();
791
+ // Carry the entire markdown blob as the body so the queen sees the
792
+ // reviewer's full commentary, not just the verdict line.
793
+ body = msg.data;
794
+ log.info({ swarmId, instanceId, msgId: msg.id, verdict }, "plan_review_result interceptor: markdown verdict fallback used");
795
+ }
796
+ }
797
+ if (!verdict) {
798
+ log.warn({ swarmId, instanceId, msgId: msg.id }, "plan_review_result interceptor: no parsable verdict (neither JSON nor markdown) — consuming");
799
+ return true;
800
+ }
801
+ // IMPORTANT: schedule submitPlanReview asynchronously instead of awaiting.
802
+ // The interceptor runs *inside* InboxRelay.runDeliver for (swarmId, queen),
803
+ // and submitPlanReview internally calls deliverInbox → inboxRelay.deliver
804
+ // for the same (swarmId, queen). Single-flight would coalesce that nested
805
+ // call onto the in-flight promise we are currently inside, and awaiting it
806
+ // here would deadlock. Returning true synchronously consumes the message
807
+ // so the outer runDeliver can finish, releasing the single-flight slot
808
+ // before the deferred submitPlanReview resumes and fires its delivery.
809
+ void this.submitPlanReview({
810
+ swarmId,
811
+ from: msg.from,
812
+ verdict,
813
+ body,
814
+ }).catch((err) => log.warn({ err, swarmId, instanceId, msgId: msg.id, verdict }, "plan_review_result interceptor: deferred submitPlanReview failed"));
815
+ return true;
816
+ }
817
+ /**
818
+ * Apply a plan review verdict from a reviewer (or system on timeout).
819
+ * Idempotent: when planStatus is already "approved" or "rejected", logs and
820
+ * returns. Clears any pending planReviewTimer.
821
+ *
822
+ * Wired to the `plan_review_submit` MCP tool by the host (Task 3); this
823
+ * method itself does not call back into MCP.
824
+ */
825
+ async submitPlanReview(args) {
826
+ const swarm = this.swarms.get(args.swarmId);
827
+ if (!swarm)
828
+ throw new Error(`swarm not found: ${args.swarmId}`);
829
+ // Idempotency guard — a late reviewer submission after the timeout fired,
830
+ // or a duplicate MCP call, must not double-notify the queen.
831
+ if (swarm.planStatus === "approved" || swarm.planStatus === "rejected") {
832
+ log.warn({
833
+ swarmId: args.swarmId,
834
+ currentStatus: swarm.planStatus,
835
+ attemptedVerdict: args.verdict,
836
+ }, "plan review already finalised, ignoring duplicate submission");
837
+ return;
838
+ }
839
+ if (swarm.planStatus !== "reviewing") {
840
+ // Refuse out-of-band submissions (planStatus = none / draft / approved /
841
+ // rejected). The `approved` / `rejected` cases were already short-circuited
842
+ // above; the remaining states (`none`, `draft`) mean either the queen has
843
+ // not yet asked for review (premature/spoofed call), or the swarm is in a
844
+ // state where flipping planStatus would silently overwrite real progress.
845
+ // No-op + warn instead of "proceeding anyway" so a misbehaving reviewer
846
+ // can't corrupt state by submitting before requestPlanReview ever ran.
847
+ log.warn({ swarmId: args.swarmId, currentStatus: swarm.planStatus, attemptedVerdict: args.verdict, from: args.from }, "submitPlanReview: planStatus !== reviewing — refusing to apply verdict");
848
+ return;
849
+ }
850
+ if (swarm.planReviewTimer) {
851
+ clearTimeout(swarm.planReviewTimer);
852
+ swarm.planReviewTimer = undefined;
853
+ }
854
+ swarm.planStatus = args.verdict === "approved" ? "approved" : "rejected";
855
+ saveSwarmSnapshot(swarm);
856
+ this.sendStatusUpdate(swarm);
857
+ const queen = this.findQueen(swarm);
858
+ if (!queen) {
859
+ log.warn({ swarmId: args.swarmId, verdict: args.verdict }, "submitPlanReview: no queen found — verdict recorded but queen not notified");
860
+ return;
861
+ }
862
+ const messageType = args.verdict === "approved" ? "plan_approved" : "plan_rejected";
863
+ const data = JSON.stringify({
864
+ verdict: args.verdict,
865
+ body: args.body ?? "",
866
+ from: args.from,
867
+ });
868
+ await this.deliverInbox(swarm, queen.instanceId, {
869
+ from: "system",
870
+ type: messageType,
871
+ data,
872
+ });
873
+ log.info({ swarmId: args.swarmId, verdict: args.verdict, from: args.from }, "plan review submitted");
874
+ }
875
+ /**
876
+ * Defensive recovery: re-emit plan_approved / plan_rejected to the queen if
877
+ * planStatus is finalised but no matching system envelope exists in the
878
+ * queen's inbox. Called from the queen check-in tick.
879
+ *
880
+ * Catches the case where the deferred submitPlanReview's call to
881
+ * deliverInbox() threw (e.g. transient fs error or lockfile contention) and
882
+ * only logged a warning — leaving planStatus = "approved" but the queen
883
+ * never receiving plan_approved, which would otherwise wedge the swarm.
884
+ *
885
+ * Idempotent: if a system plan_approved/plan_rejected already exists in the
886
+ * queen inbox (delivered or undelivered), we do nothing.
887
+ */
888
+ async reconcilePlanStatusForQueen(swarm, queenInstanceId) {
889
+ if (swarm.planStatus !== "approved" && swarm.planStatus !== "rejected") {
890
+ return;
891
+ }
892
+ if (!swarm.workDir)
893
+ return;
894
+ const expectedType = swarm.planStatus === "approved" ? "plan_approved" : "plan_rejected";
895
+ const store = new InboxStore(swarm.workDir, swarm.id, this.resolveHome());
896
+ const queueAll = await store.readAll(queenInstanceId);
897
+ const existing = queueAll.find((m) => m.type === expectedType && m.from === "system");
898
+ if (existing)
899
+ return; // Already delivered (or queued).
900
+ log.warn({ swarmId: swarm.id, planStatus: swarm.planStatus }, `reconcilePlanStatus: queen missing ${expectedType} despite finalised planStatus — re-emitting`);
901
+ const data = JSON.stringify({
902
+ verdict: swarm.planStatus,
903
+ body: "[reconcile] previous notification was lost; re-issued by queen check-in",
904
+ from: "system",
905
+ });
906
+ await this.deliverInbox(swarm, queenInstanceId, {
907
+ from: "system",
908
+ type: expectedType,
909
+ data,
910
+ });
911
+ }
383
912
  // ── Private helpers ─────────────────────────────────────────────────
384
913
  /** Run retrospective analysis then clean up swarm resources. */
385
914
  async runRetroAndCleanup(swarmId, swarm) {
386
915
  // 1. Run retrospective (best-effort)
387
916
  try {
388
- const messages = readMessageLog(swarmId);
389
- const snapshot = loadSwarmSnapshot(swarmId);
917
+ const messages = swarm.workDir ? readMessageLog(swarm.workDir, swarmId) : [];
918
+ const snapshot = swarm.workDir ? loadSwarmSnapshot(swarm.workDir, swarmId) : null;
390
919
  if (snapshot && messages.length > 0) {
391
920
  const retroResult = await runRetrospective({
392
921
  swarmId,
@@ -442,10 +971,142 @@ export class SwarmCoordinator {
442
971
  await this.sessionAdapter.closeSession(role.roleSessionId).catch(() => { });
443
972
  }
444
973
  // 4. Delete recovery.json (keep messages.jsonl)
445
- deleteSwarmSnapshot(swarmId);
974
+ if (swarm.workDir) {
975
+ // Detach tasks before deleting snapshot — keeps task index consistent
976
+ // even if snapshot removal fails. Best-effort: never abort cleanup.
977
+ await this.detachTasks(swarmId, swarm.workDir);
978
+ deleteSwarmSnapshot(swarm.workDir, swarmId);
979
+ }
446
980
  // 5. Remove from memory
447
981
  this.swarms.delete(swarmId);
448
982
  }
983
+ /**
984
+ * Detach all tasks owned by a swarm: clear their swarmId and owner so they
985
+ * survive the swarm's lifecycle and can be reassigned. Best-effort — any
986
+ * failure is logged and swallowed; never blocks destroy/complete/fail.
987
+ *
988
+ * Tasks belonging to other swarms are not touched.
989
+ */
990
+ async detachTasks(swarmId, workDir) {
991
+ if (!workDir)
992
+ return;
993
+ try {
994
+ // TaskStore.projectRoot() prepends ".clawnet/" itself, so `home` here
995
+ // must be the user home (no `.clawnet` suffix). Previously this used
996
+ // `process.env.CLAWNET_DIR ?? join(homedir(), ".clawnet")`, producing
997
+ // a `~/.clawnet/.clawnet/projects/...` ghost layout for every detach.
998
+ const home = this.resolveHome();
999
+ const store = new TaskStore({ workDir, home });
1000
+ const entries = store.listBySwarm(swarmId);
1001
+ for (const entry of entries) {
1002
+ try {
1003
+ await store.update(entry.id, { swarmId: null, owner: null });
1004
+ }
1005
+ catch (err) {
1006
+ log.warn({ err, swarmId, taskId: entry.id }, "detachTasks: update failed for task (best-effort)");
1007
+ }
1008
+ }
1009
+ }
1010
+ catch (err) {
1011
+ log.warn({ err, swarmId }, "detachTasks: failed (best-effort)");
1012
+ }
1013
+ }
1014
+ /**
1015
+ * Build the boot-time "### 任务上下文" section appended to a role's system
1016
+ * prompt. Returns null when no taskStore is wired (caller already gates on
1017
+ * that, but we double-check here for safety).
1018
+ *
1019
+ * Queen vs. worker contract:
1020
+ * - queen → lead briefing (queenInstanceId is this role itself) + member briefing
1021
+ * - other → task briefing (owner-scoped) + member briefing
1022
+ *
1023
+ * The "### 任务上下文" anchor and inner section anchors are load-bearing
1024
+ * (matched verbatim by tests and by the agent itself).
1025
+ */
1026
+ buildBriefingSection(taskStore, swarmId, instanceId, definition) {
1027
+ const isQueen = definition.type === "queen";
1028
+ if (isQueen) {
1029
+ const leadBody = computeLeadBriefing(taskStore, swarmId, instanceId);
1030
+ const memberBody = computeMemberBriefing({ swarmId, instanceId, roleType: "queen" });
1031
+ return ("### 任务上下文\n\n" +
1032
+ formatLeadBriefing(leadBody) +
1033
+ "\n\n" +
1034
+ formatMemberBriefing(memberBody));
1035
+ }
1036
+ const taskBody = computeTaskBriefing(taskStore, swarmId, instanceId);
1037
+ const memberBody = computeMemberBriefing({ swarmId, instanceId, roleType: "worker" });
1038
+ return ("### 任务上下文\n\n" +
1039
+ formatTaskBriefing(taskBody, { instanceId }) +
1040
+ "\n\n" +
1041
+ formatMemberBriefing(memberBody));
1042
+ }
1043
+ // ── Inbox-based message delivery (Task 6 / PR4) ─────────────────────
1044
+ /**
1045
+ * Append a message to a target instance's inbox file then trigger
1046
+ * InboxRelay.deliver to push it through the agent's stdin (single-flight).
1047
+ * Best-effort: returns silently when the swarm has no workDir (in-memory
1048
+ * fixtures used by tests), keeping behaviour compatible with the previous
1049
+ * router.send path.
1050
+ */
1051
+ async deliverInbox(swarm, targetInstanceId, msg) {
1052
+ if (!swarm.workDir)
1053
+ return;
1054
+ const store = new InboxStore(swarm.workDir, swarm.id, this.resolveHome());
1055
+ const full = {
1056
+ id: `${msg.type}-${randomUUID()}`,
1057
+ from: msg.from,
1058
+ type: msg.type,
1059
+ data: msg.data,
1060
+ ...(msg.taskId ? { taskId: msg.taskId } : {}),
1061
+ timestamp: Date.now(),
1062
+ delivered: false,
1063
+ };
1064
+ await store.append(targetInstanceId, full);
1065
+ // Lifecycle short-circuit: InboxRelay.runDeliver gates on
1066
+ // `spawning`/`stopped` and would no-op anyway. Looping past it just burns
1067
+ // cycles for messages that will be drained later — `stopped` is terminal
1068
+ // (the inbox row stays for any successor); `spawning` is drained by the
1069
+ // post-spawn `inboxRelay.deliver` invoked once the role flips to active.
1070
+ const role = swarm.roles.get(targetInstanceId);
1071
+ if (!role || role.status === "stopped" || role.status === "spawning")
1072
+ return;
1073
+ // InboxRelay uses single-flight per instance: a deliver() call invoked
1074
+ // while another pass is in-flight is coalesced into the existing promise
1075
+ // and may NOT see our just-appended row (the in-flight pass already
1076
+ // ran readUndelivered before our append landed). Loop a few iterations,
1077
+ // each preceded by a microtask hop so the in-flight slot has cleared,
1078
+ // until our id is no longer undelivered (or a lifecycle gate trips).
1079
+ for (let attempt = 0; attempt < 4; attempt++) {
1080
+ await this.inboxRelay.deliver(swarm.id, targetInstanceId);
1081
+ // Yield once so InboxRelay's `.finally(delete inFlight)` runs before we
1082
+ // attempt the follow-up deliver — otherwise the next call would join
1083
+ // the still-in-flight promise instead of starting a fresh pass.
1084
+ await new Promise((r) => setImmediate(r));
1085
+ const remaining = await store.readUndelivered(targetInstanceId);
1086
+ if (!remaining.some((m) => m.id === full.id))
1087
+ return;
1088
+ // Re-check lifecycle: if the role transitioned to stopped between
1089
+ // attempts (e.g. shutdown raced with delivery), don't keep spinning.
1090
+ const cur = swarm.roles.get(targetInstanceId);
1091
+ if (!cur || cur.status === "stopped")
1092
+ return;
1093
+ }
1094
+ log.warn({ swarmId: swarm.id, instanceId: targetInstanceId, msgId: full.id, type: msg.type }, "deliverInbox: message remained undelivered after 4 attempts");
1095
+ }
1096
+ /**
1097
+ * Broadcast helper: deliverInbox to every role in the swarm except those
1098
+ * already stopped. Includes `spawning` roles intentionally — the inbox file
1099
+ * is durable, so the relay (or runPostStart drain in the spawn path) will
1100
+ * deliver the message once the role's stdin opens. Matches the legacy
1101
+ * router.broadcast behaviour, which never gated on "spawning".
1102
+ */
1103
+ async broadcastInbox(swarm, msg) {
1104
+ for (const role of swarm.roles.values()) {
1105
+ if (role.status === "stopped")
1106
+ continue;
1107
+ await this.deliverInbox(swarm, role.instanceId, msg);
1108
+ }
1109
+ }
449
1110
  findQueen(swarm) {
450
1111
  for (const role of swarm.roles.values()) {
451
1112
  if (role.definition.type === "queen" && role.status !== "stopped")
@@ -453,6 +1114,18 @@ export class SwarmCoordinator {
453
1114
  }
454
1115
  return undefined;
455
1116
  }
1117
+ /**
1118
+ * Public lookup: returns the active queen role instance for a swarm, or
1119
+ * undefined when the swarm/queen is not present. Identifies the queen by
1120
+ * `definition.type === "queen"` (not by roleName) so custom queen roles
1121
+ * with non-default names are also recognised.
+ *
+ * Thin wrapper: resolves the swarm from `this.swarms` and delegates the
+ * actual search to findQueen, which additionally requires the role's
+ * status to be something other than "stopped".
+ *
+ * @param swarmId Key used to look the swarm up in `this.swarms`.
+ * @returns The result of findQueen for that swarm, or undefined when no
+ *   swarm is registered under `swarmId`.
1122
+ */
1123
+ findQueenInstance(swarmId) {
1124
+ const swarm = this.swarms.get(swarmId);
1125
+ if (!swarm)
1126
+ return undefined;
1127
+ return this.findQueen(swarm);
1128
+ }
456
1129
  findReviewer(swarm) {
457
1130
  for (const role of swarm.roles.values()) {
458
1131
  if (role.roleName === "reviewer" && role.status !== "stopped")
@@ -462,7 +1135,10 @@ export class SwarmCoordinator {
462
1135
  }
463
1136
  /** Build a summary of the previous swarm run from messages.jsonl for continuation. */
464
1137
  buildContinuationSummary(swarmId) {
465
- const messages = readMessageLog(swarmId);
1138
+ const swarm = this.swarms.get(swarmId);
1139
+ if (!swarm?.workDir)
1140
+ return null;
1141
+ const messages = readMessageLog(swarm.workDir, swarmId);
466
1142
  if (messages.length === 0)
467
1143
  return null;
468
1144
  const MAX_LEN = 2000;
@@ -492,7 +1168,7 @@ export class SwarmCoordinator {
492
1168
  return summary;
493
1169
  }
494
1170
  /** Send plan to reviewer for review. Auto-approves if no reviewer available. */
495
- requestPlanReview(swarm, plan) {
1171
+ async requestPlanReview(swarm, plan) {
496
1172
  const reviewer = this.findReviewer(swarm);
497
1173
  if (!reviewer) {
498
1174
  // No reviewer — auto-approve
@@ -505,9 +1181,8 @@ export class SwarmCoordinator {
505
1181
  swarm.planStatus = "reviewing";
506
1182
  saveSwarmSnapshot(swarm);
507
1183
  this.sendStatusUpdate(swarm);
508
- const router = swarm._router;
509
1184
  const planJson = JSON.stringify(plan, null, 2);
510
- router.send(reviewer.instanceId, {
1185
+ await this.deliverInbox(swarm, reviewer.instanceId, {
511
1186
  from: "system",
512
1187
  type: "plan_review",
513
1188
  data: `[系统] 请审查以下执行计划,评估其可行性和风险。
@@ -523,48 +1198,38 @@ export class SwarmCoordinator {
523
1198
  ${planJson}
524
1199
  \`\`\`
525
1200
 
526
- 审查完成后,请用 report action 回复(taskId="plan_review"):
527
- - 如果批准:status="completed",output 中包含 "approved" 和你的审查意见
528
- - 如果拒绝:status="failed",output 中说明拒绝原因和改进建议`,
1201
+ 审查完成后,请通过 \`mcp__clawnet__plan_review_submit\` 工具提交结果(不要再用 report action):
1202
+ - 工具入参:\`{ workDir, swarmId, from: "<你的 instanceId,必须 reviewer-* 开头>", verdict: "approved" | "rejected", body?: "审查意见或拒绝原因" }\`
1203
+ - 批准:verdict="approved",body 中可补充意见
1204
+ - 拒绝:verdict="rejected",body 中说明拒绝原因和改进建议
1205
+ - 系统会在 10 分钟内未收到 verdict 时自动批准(带 timeout 警告),请尽快回复。`,
529
1206
  taskId: "plan_review",
530
- timestamp: Date.now(),
531
1207
  });
532
1208
  log.info({ swarmId: swarm.id, reviewer: reviewer.instanceId }, "plan sent to reviewer");
1209
+ // Register a timeout fallback so a non-responsive reviewer cannot deadlock
1210
+ // the queen. Cleared on submitPlanReview (success path) and on swarm
1211
+ // shutdown / recover. Idempotent — safe to call multiple times.
1212
+ this.armPlanReviewTimeout(swarm);
533
1213
  }
534
- /** Handle plan review result from reviewer. */
535
- handlePlanReviewResult(swarm, fromRole, action) {
536
- const router = swarm._router;
537
- const queen = this.findQueen(swarm);
538
- if (action.status === "completed" && action.output?.toLowerCase().includes("approved")) {
539
- swarm.planStatus = "approved";
540
- saveSwarmSnapshot(swarm);
541
- this.sendStatusUpdate(swarm);
542
- log.info({ swarmId: swarm.id, reviewer: fromRole.instanceId }, "plan approved by reviewer");
543
- // Notify queen to start task assignment
544
- if (queen) {
545
- router.send(queen.instanceId, {
546
- from: "system",
547
- type: "plan_approved",
548
- data: `[系统] 你的执行计划已通过审查。审查意见:${action.output}\n\n请开始按计划分配任务。`,
549
- timestamp: Date.now(),
550
- });
551
- }
552
- }
553
- else {
554
- swarm.planStatus = "rejected";
555
- saveSwarmSnapshot(swarm);
556
- this.sendStatusUpdate(swarm);
557
- log.info({ swarmId: swarm.id, reviewer: fromRole.instanceId }, "plan rejected by reviewer");
558
- // Notify queen to revise the plan
559
- if (queen) {
560
- router.send(queen.instanceId, {
561
- from: "system",
562
- type: "plan_rejected",
563
- data: `[系统] 你的执行计划未通过审查,请根据反馈修改。\n\n审查反馈:${action.output ?? "未提供具体原因"}`,
564
- timestamp: Date.now(),
565
- });
566
- }
567
- }
1214
+ /**
1215
+ * (Re)arm the plan-review timeout fallback. Clears any prior timer first so
1216
+ * call sites don't need to. The fire path delegates to submitPlanReview so
1217
+ * its idempotency guard protects against reviewer/timer races.
+ *
+ * The timer handle is stored on `swarm.planReviewTimer` and the delay is
+ * PLAN_REVIEW_TIMEOUT_MS (defined outside this method; the auto-approval
+ * text below mentions 10 minutes — presumably the same value, verify).
+ *
+ * When the timer fires it does nothing unless `swarm.planStatus` is still
+ * "reviewing". Otherwise it logs a warning and calls submitPlanReview with
+ * from "system" and verdict "approved". The call is fire-and-forget: a
+ * rejection is logged via log.error and never rethrown from the callback.
+ *
+ * @param swarm Swarm whose `id`, `planStatus` and `planReviewTimer` are used.
+ * @returns Nothing; the only effect is replacing `swarm.planReviewTimer`.
1218
+ */
1219
+ armPlanReviewTimeout(swarm) {
1220
+ if (swarm.planReviewTimer)
1221
+ clearTimeout(swarm.planReviewTimer);
1222
+ swarm.planReviewTimer = setTimeout(() => {
1223
+ if (swarm.planStatus !== "reviewing")
1224
+ return;
1225
+ log.warn({ swarmId: swarm.id }, "plan review timed out — auto-approving with warning");
1226
+ void this.submitPlanReview({
1227
+ swarmId: swarm.id,
1228
+ from: "system",
1229
+ verdict: "approved",
1230
+ body: "[timeout] reviewer 未在 10 分钟内提交审查结果,已自动通过;请人工复核。",
1231
+ }).catch((err) => log.error({ err, swarmId: swarm.id }, "auto-approve on timeout failed"));
1232
+ }, PLAN_REVIEW_TIMEOUT_MS);
568
1233
  }
569
1234
  findByRoleSessionId(roleSessionId) {
570
1235
  // roleSessionId format: "{swarmId}::{instanceId}"
@@ -579,240 +1244,68 @@ ${planJson}
579
1244
  const role = swarm.roles.get(instanceId);
580
1245
  return { swarm, role };
581
1246
  }
582
- async executeAction(swarm, fromRole, router, action) {
583
- log.info({ swarmId: swarm.id, from: fromRole.instanceId, action: action.action }, "executing action");
584
- switch (action.action) {
585
- case "send": {
586
- // Gate: queen cannot send task to workers while plan is under review
587
- if (fromRole.definition.type === "queen" &&
588
- action.type === "task" &&
589
- (swarm.planStatus === "draft" || swarm.planStatus === "reviewing")) {
590
- log.info({ swarmId: swarm.id, to: action.to, planStatus: swarm.planStatus }, "queen task send blocked: plan under review");
591
- const router2 = swarm._router;
592
- router2.send(fromRole.instanceId, {
593
- from: "system",
594
- type: "system",
595
- data: `[系统] 任务分配被拦截。当前计划状态为「${swarm.planStatus}」,请等待审查完成后再分配任务。`,
596
- timestamp: Date.now(),
597
- });
598
- break;
599
- }
600
- const msg = {
601
- messageId: randomUUID(),
602
- from: fromRole.instanceId,
603
- type: action.type,
604
- data: action.data,
605
- taskId: action.taskId,
606
- timestamp: Date.now(),
607
- };
608
- // On-demand spawn: if target not started yet, spawn it first
609
- if (!router.has(action.to)) {
610
- await this.ensureRoleStarted(swarm, action.to, router);
611
- }
612
- // Retry with exponential backoff
613
- this.sendWithRetry(swarm, fromRole, router, action.to, msg);
614
- break;
615
- }
616
- case "report": {
617
- // Update role status
618
- if (action.status === "completed" || action.status === "failed") {
619
- fromRole.currentTask = undefined;
620
- }
621
- // Release worker process on task completion (queen always stays alive)
622
- if (action.status === "completed" &&
623
- fromRole.definition.type !== "queen") {
624
- this.stopRole(swarm.id, fromRole.instanceId).catch((err) => {
625
- log.warn({ err, instanceId: fromRole.instanceId }, "failed to release worker after completion");
626
- });
627
- }
628
- // Check for plan review result
629
- if (action.taskId === "plan_review") {
630
- this.handlePlanReviewResult(swarm, fromRole, action);
631
- break;
1247
+ startQueenCheck(swarmId) {
1248
+ const swarm = this.swarms.get(swarmId);
1249
+ if (!swarm)
1250
+ return;
1251
+ const tick = async () => {
1252
+ try {
1253
+ // Stop ticking if swarm is no longer active
1254
+ if (swarm.status === "completed" || swarm.status === "failed" || swarm.status === "paused") {
1255
+ return;
632
1256
  }
633
- // Notify queen about the report (so she can coordinate next steps)
634
1257
  const queen = this.findQueen(swarm);
635
- if (queen && queen.instanceId !== fromRole.instanceId) {
636
- const reportMsg = {
637
- from: fromRole.instanceId,
638
- type: "report",
639
- data: `[${action.status}] ${action.output ?? "任务状态更新"}`,
640
- taskId: action.taskId,
641
- timestamp: Date.now(),
642
- };
643
- router.send(queen.instanceId, reportMsg);
644
- appendMessageLog(swarm.id, { type: "action", action: "report", from: fromRole.instanceId, to: queen.instanceId, status: action.status, data: action.output, timestamp: Date.now() });
645
- log.info({ swarmId: swarm.id, from: fromRole.instanceId, status: action.status, taskId: action.taskId }, "report forwarded to queen");
646
- }
647
- // Queen reporting completed/failed = swarm lifecycle transition
648
- if (fromRole.definition.type === "queen") {
649
- if (action.status === "completed") {
650
- this.complete(swarm.id).catch((err) => {
651
- log.error({ err, swarmId: swarm.id }, "failed to complete swarm");
652
- });
653
- }
654
- else if (action.status === "failed") {
655
- this.fail(swarm.id).catch((err) => {
656
- log.error({ err, swarmId: swarm.id }, "failed to mark swarm as failed");
657
- });
658
- }
1258
+ if (!queen || queen.status === "stopped")
1259
+ return;
1260
+ // Reconcile planStatus vs queen inbox: if planStatus is finalised
1261
+ // (approved/rejected) but the queen never received the corresponding
1262
+ // plan_approved / plan_rejected envelope, re-emit it. This recovers
1263
+ // from the rare case where the deferred submitPlanReview's call to
1264
+ // deliverInbox() failed and only logged a warning (the reviewer saw
1265
+ // {submitted:true} but the queen never woke up).
1266
+ await this.reconcilePlanStatusForQueen(swarm, queen.instanceId).catch((err) => log.warn({ err, swarmId }, "reconcilePlanStatus failed (best-effort)"));
1267
+ // Adaptive interval: slow down when only queen is left or all roles idle
1268
+ const activeWorkers = [...swarm.roles.values()].filter((r) => r.definition.type !== "queen" && r.status === "active");
1269
+ // Track consecutive idle checks
1270
+ if (activeWorkers.length > 0) {
1271
+ swarm.idleCheckCount = 0;
659
1272
  }
660
- // Forward status to Hub
661
- this.sendStatusUpdate(swarm);
662
- break;
663
- }
664
- case "broadcast": {
665
- const msg = {
666
- from: fromRole.instanceId,
667
- type: "broadcast",
668
- data: action.data,
669
- timestamp: Date.now(),
670
- };
671
- router.broadcast(msg, fromRole.instanceId);
672
- break;
673
- }
674
- case "spawn_role": {
675
- // Only queen can spawn roles — await to ensure route is registered before subsequent sends
676
- if (fromRole.definition.type === "queen") {
677
- await this.spawnRole(swarm.id, action.roleName, router, action.task);
1273
+ else {
1274
+ swarm.idleCheckCount++;
678
1275
  }
679
- break;
680
- }
681
- case "stop_role": {
682
- // Only queen can stop roles
683
- if (fromRole.definition.type === "queen") {
684
- this.stopRole(swarm.id, action.instanceId).catch((err) => {
685
- log.error({ err, instanceId: action.instanceId }, "failed to stop role");
1276
+ // Auto-pause if idle too long
1277
+ if (swarm.idleCheckCount >= swarm.maxIdleChecks) {
1278
+ swarm.isPaused = true;
1279
+ swarm.status = "paused";
1280
+ saveSwarmSnapshot(swarm);
1281
+ this.sendStatusUpdate(swarm);
1282
+ await this.deliverInbox(swarm, queen.instanceId, {
1283
+ from: "system",
1284
+ type: "system",
1285
+ data: `[系统] 蜂群已自动暂停。连续${swarm.maxIdleChecks}次巡查发现所有成员空闲,已暂停任务释放资源。用户可以resume恢复。`,
686
1286
  });
1287
+ log.info({ swarmId, idleChecks: swarm.maxIdleChecks }, "swarm auto-paused");
1288
+ return; // Don't schedule next tick
687
1289
  }
688
- break;
689
- }
690
- }
691
- }
692
- /** On-demand spawn: start a role instance if it's not currently running. */
693
- async ensureRoleStarted(swarm, instanceId, router) {
694
- // Already running?
695
- if (router.has(instanceId))
696
- return;
697
- // Determine role name from instanceId prefix (e.g. "dev-1" → "developer")
698
- const prefix = instanceId.replace(/-\d+$/, "");
699
- const pendingSpecs = swarm._pendingRoleSpecs;
700
- // Try to find matching role spec from the pending specs
701
- let roleName;
702
- let customPrompt;
703
- let customDefinition;
704
- if (pendingSpecs) {
705
- for (const spec of pendingSpecs) {
706
- const def = spec.customDefinition
707
- ? { shortName: spec.customDefinition.shortName }
708
- : (() => { try {
709
- return loadRole(spec.roleName);
710
- }
711
- catch {
712
- return null;
713
- } })();
714
- if (def && def.shortName === prefix) {
715
- roleName = spec.roleName;
716
- customPrompt = spec.customPrompt;
717
- customDefinition = spec.customDefinition;
718
- break;
719
- }
720
- }
721
- }
722
- if (!roleName) {
723
- log.warn({ swarmId: swarm.id, instanceId }, "cannot on-demand spawn: unknown role for instanceId");
724
- return;
725
- }
726
- log.info({ swarmId: swarm.id, instanceId, roleName }, "on-demand spawning role");
727
- await this.spawnRole(swarm.id, roleName, router, undefined, customPrompt, customDefinition);
728
- }
729
- /** Send a message with exponential backoff retry. Fire-and-forget (async). */
730
- async sendWithRetry(swarm, fromRole, router, targetInstanceId, msg) {
731
- for (let attempt = 0; attempt <= SEND_MAX_RETRIES; attempt++) {
732
- const result = router.send(targetInstanceId, msg);
733
- if (result.delivered) {
734
- appendMessageLog(swarm.id, {
735
- type: "action", action: "send",
736
- from: fromRole.instanceId, to: targetInstanceId,
737
- data: msg.data, messageId: msg.messageId,
738
- timestamp: Date.now(),
739
- });
740
- return;
741
- }
742
- log.warn({ swarmId: swarm.id, to: targetInstanceId, attempt, reason: result.reason, messageId: msg.messageId }, "message delivery failed");
743
- if (attempt < SEND_MAX_RETRIES) {
744
- await sleep(SEND_RETRY_BASE_MS * 2 ** attempt);
745
- }
746
- }
747
- // All retries exhausted — notify Queen
748
- log.error({ swarmId: swarm.id, to: targetInstanceId, messageId: msg.messageId }, "message delivery failed after all retries");
749
- appendMessageLog(swarm.id, {
750
- type: "action", action: "send",
751
- from: fromRole.instanceId, to: targetInstanceId,
752
- data: msg.data, messageId: msg.messageId,
753
- delivered: false,
754
- timestamp: Date.now(),
755
- });
756
- const queen = this.findQueen(swarm);
757
- if (queen) {
758
- router.send(queen.instanceId, {
759
- from: "system",
760
- type: "system",
761
- data: `[系统] 消息发送给 ${targetInstanceId} 失败(已重试 ${SEND_MAX_RETRIES} 次)。原始消息来自 ${fromRole.instanceId}。`,
762
- timestamp: Date.now(),
763
- });
764
- }
765
- }
766
- startQueenCheck(swarmId) {
767
- const swarm = this.swarms.get(swarmId);
768
- if (!swarm)
769
- return;
770
- const tick = () => {
771
- // Stop ticking if swarm is no longer active
772
- if (swarm.status === "completed" || swarm.status === "failed" || swarm.status === "paused") {
773
- return;
774
- }
775
- const queen = this.findQueen(swarm);
776
- if (!queen || queen.status === "stopped")
777
- return;
778
- const router = swarm._router;
779
- // Adaptive interval: slow down when only queen is left or all roles idle
780
- const activeWorkers = [...swarm.roles.values()].filter((r) => r.definition.type !== "queen" && r.status === "active");
781
- // Track consecutive idle checks
782
- if (activeWorkers.length > 0) {
783
- swarm.idleCheckCount = 0;
784
- }
785
- else {
786
- swarm.idleCheckCount++;
787
- }
788
- // Auto-pause if idle too long
789
- if (swarm.idleCheckCount >= swarm.maxIdleChecks) {
790
- swarm.isPaused = true;
791
- swarm.status = "paused";
792
- saveSwarmSnapshot(swarm);
793
- this.sendStatusUpdate(swarm);
794
- router.send(queen.instanceId, {
1290
+ const statusJson = JSON.stringify(this.buildStatusPayload(swarm), null, 2);
1291
+ await this.deliverInbox(swarm, queen.instanceId, {
795
1292
  from: "system",
796
1293
  type: "system",
797
- data: `[系统] 蜂群已自动暂停。连续${swarm.maxIdleChecks}次巡查发现所有成员空闲,已暂停任务释放资源。用户可以resume恢复。`,
798
- timestamp: Date.now(),
1294
+ data: `[系统] 定时巡查触发。请查看当前蜂群状态,判断是否需要采取行动。\n\n当前蜂群成员及状态:\n${statusJson}`,
799
1295
  });
800
- log.info({ swarmId, idleChecks: swarm.maxIdleChecks }, "swarm auto-paused");
801
- return; // Don't schedule next tick
1296
+ const nextInterval = activeWorkers.length > 0
1297
+ ? QUEEN_CHECK_INTERVAL_MS
1298
+ : QUEEN_CHECK_IDLE_INTERVAL_MS;
1299
+ swarm.checkTimer = setTimeout(() => { void tick().catch((err) => log.error({ err, swarmId }, "queen tick failed (best-effort)")); }, nextInterval);
1300
+ }
1301
+ catch (err) {
1302
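+ // deliverInbox / disk-IO failures must not surface as unhandled
1303
+ // rejection from the setTimeout fire-and-forget. Log and swallow.
1304
+ // NOTE(review): the reschedule sits inside the try above, so a tick that fails before reaching it arms no further tick — confirm the patrol is meant to stop here rather than retry.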
1305
+ log.error({ err, swarmId }, "queen tick failed (best-effort)");
802
1306
  }
803
- const statusJson = JSON.stringify(this.buildStatusPayload(swarm), null, 2);
804
- router.send(queen.instanceId, {
805
- from: "system",
806
- type: "system",
807
- data: `[系统] 定时巡查触发。请查看当前蜂群状态,判断是否需要采取行动。\n\n当前蜂群成员及状态:\n${statusJson}`,
808
- timestamp: Date.now(),
809
- });
810
- const nextInterval = activeWorkers.length > 0
811
- ? QUEEN_CHECK_INTERVAL_MS
812
- : QUEEN_CHECK_IDLE_INTERVAL_MS;
813
- swarm.checkTimer = setTimeout(tick, nextInterval);
814
1307
  };
815
- swarm.checkTimer = setTimeout(tick, QUEEN_CHECK_INTERVAL_MS);
1308
+ swarm.checkTimer = setTimeout(() => { void tick().catch((err) => log.error({ err, swarmId }, "queen tick failed (best-effort)")); }, QUEEN_CHECK_INTERVAL_MS);
816
1309
  }
817
1310
  buildRoleListString(swarm) {
818
1311
  const lines = [];