@mclawnet/swarm 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +118 -0
- package/dist/__tests__/action-parser.test.js +29 -82
- package/dist/__tests__/action-parser.test.js.map +1 -1
- package/dist/__tests__/coordinator-create-tx.test.d.ts +2 -0
- package/dist/__tests__/coordinator-create-tx.test.d.ts.map +1 -0
- package/dist/__tests__/coordinator-create-tx.test.js +114 -0
- package/dist/__tests__/coordinator-create-tx.test.js.map +1 -0
- package/dist/__tests__/coordinator-inbox-migration.test.d.ts +2 -0
- package/dist/__tests__/coordinator-inbox-migration.test.d.ts.map +1 -0
- package/dist/__tests__/coordinator-inbox-migration.test.js +56 -0
- package/dist/__tests__/coordinator-inbox-migration.test.js.map +1 -0
- package/dist/__tests__/inbox-integration.test.d.ts +2 -0
- package/dist/__tests__/inbox-integration.test.d.ts.map +1 -0
- package/dist/__tests__/inbox-integration.test.js +120 -0
- package/dist/__tests__/inbox-integration.test.js.map +1 -0
- package/dist/__tests__/inbox-persistence-recovery.test.d.ts +2 -0
- package/dist/__tests__/inbox-persistence-recovery.test.d.ts.map +1 -0
- package/dist/__tests__/inbox-persistence-recovery.test.js +139 -0
- package/dist/__tests__/inbox-persistence-recovery.test.js.map +1 -0
- package/dist/__tests__/inbox-relay-interceptor.test.d.ts +2 -0
- package/dist/__tests__/inbox-relay-interceptor.test.d.ts.map +1 -0
- package/dist/__tests__/inbox-relay-interceptor.test.js +156 -0
- package/dist/__tests__/inbox-relay-interceptor.test.js.map +1 -0
- package/dist/__tests__/inbox-relay.test.d.ts +2 -0
- package/dist/__tests__/inbox-relay.test.d.ts.map +1 -0
- package/dist/__tests__/inbox-relay.test.js +318 -0
- package/dist/__tests__/inbox-relay.test.js.map +1 -0
- package/dist/__tests__/inbox-store.test.d.ts +2 -0
- package/dist/__tests__/inbox-store.test.d.ts.map +1 -0
- package/dist/__tests__/inbox-store.test.js +129 -0
- package/dist/__tests__/inbox-store.test.js.map +1 -0
- package/dist/__tests__/inbox-watcher.test.d.ts +2 -0
- package/dist/__tests__/inbox-watcher.test.d.ts.map +1 -0
- package/dist/__tests__/inbox-watcher.test.js +104 -0
- package/dist/__tests__/inbox-watcher.test.js.map +1 -0
- package/dist/__tests__/persistence-path.test.d.ts +2 -0
- package/dist/__tests__/persistence-path.test.d.ts.map +1 -0
- package/dist/__tests__/persistence-path.test.js +79 -0
- package/dist/__tests__/persistence-path.test.js.map +1 -0
- package/dist/__tests__/persistence-robust.test.d.ts +2 -0
- package/dist/__tests__/persistence-robust.test.d.ts.map +1 -0
- package/dist/__tests__/persistence-robust.test.js +125 -0
- package/dist/__tests__/persistence-robust.test.js.map +1 -0
- package/dist/__tests__/persistence.test.d.ts +2 -0
- package/dist/__tests__/persistence.test.d.ts.map +1 -0
- package/dist/__tests__/persistence.test.js +105 -0
- package/dist/__tests__/persistence.test.js.map +1 -0
- package/dist/__tests__/phase4-5-e2e.test.d.ts +2 -0
- package/dist/__tests__/phase4-5-e2e.test.d.ts.map +1 -0
- package/dist/__tests__/phase4-5-e2e.test.js +203 -0
- package/dist/__tests__/phase4-5-e2e.test.js.map +1 -0
- package/dist/__tests__/phase6-7-e2e.test.d.ts +2 -0
- package/dist/__tests__/phase6-7-e2e.test.d.ts.map +1 -0
- package/dist/__tests__/phase6-7-e2e.test.js +93 -0
- package/dist/__tests__/phase6-7-e2e.test.js.map +1 -0
- package/dist/__tests__/recovery-cross-project.test.d.ts +2 -0
- package/dist/__tests__/recovery-cross-project.test.d.ts.map +1 -0
- package/dist/__tests__/recovery-cross-project.test.js +87 -0
- package/dist/__tests__/recovery-cross-project.test.js.map +1 -0
- package/dist/__tests__/recovery-forwards-to-coordinator.test.d.ts +2 -0
- package/dist/__tests__/recovery-forwards-to-coordinator.test.d.ts.map +1 -0
- package/dist/__tests__/recovery-forwards-to-coordinator.test.js +59 -0
- package/dist/__tests__/recovery-forwards-to-coordinator.test.js.map +1 -0
- package/dist/__tests__/recovery-resume.test.d.ts +2 -0
- package/dist/__tests__/recovery-resume.test.d.ts.map +1 -0
- package/dist/__tests__/recovery-resume.test.js +132 -0
- package/dist/__tests__/recovery-resume.test.js.map +1 -0
- package/dist/__tests__/retrospective.test.js +1 -0
- package/dist/__tests__/retrospective.test.js.map +1 -1
- package/dist/__tests__/role-loader-preamble-all.test.d.ts +2 -0
- package/dist/__tests__/role-loader-preamble-all.test.d.ts.map +1 -0
- package/dist/__tests__/role-loader-preamble-all.test.js +38 -0
- package/dist/__tests__/role-loader-preamble-all.test.js.map +1 -0
- package/dist/__tests__/role-loader-tools.test.d.ts +2 -0
- package/dist/__tests__/role-loader-tools.test.d.ts.map +1 -0
- package/dist/__tests__/role-loader-tools.test.js +39 -0
- package/dist/__tests__/role-loader-tools.test.js.map +1 -0
- package/dist/__tests__/role-loader.test.js +116 -1
- package/dist/__tests__/role-loader.test.js.map +1 -1
- package/dist/__tests__/role-prompt-no-legacy-protocol.test.d.ts +2 -0
- package/dist/__tests__/role-prompt-no-legacy-protocol.test.d.ts.map +1 -0
- package/dist/__tests__/role-prompt-no-legacy-protocol.test.js +37 -0
- package/dist/__tests__/role-prompt-no-legacy-protocol.test.js.map +1 -0
- package/dist/__tests__/role-tools.test.d.ts +2 -0
- package/dist/__tests__/role-tools.test.d.ts.map +1 -0
- package/dist/__tests__/role-tools.test.js +80 -0
- package/dist/__tests__/role-tools.test.js.map +1 -0
- package/dist/__tests__/spawn-role-injects-briefings.test.d.ts +2 -0
- package/dist/__tests__/spawn-role-injects-briefings.test.d.ts.map +1 -0
- package/dist/__tests__/spawn-role-injects-briefings.test.js +182 -0
- package/dist/__tests__/spawn-role-injects-briefings.test.js.map +1 -0
- package/dist/__tests__/spawn-role-tool-policy.test.d.ts +2 -0
- package/dist/__tests__/spawn-role-tool-policy.test.d.ts.map +1 -0
- package/dist/__tests__/spawn-role-tool-policy.test.js +96 -0
- package/dist/__tests__/spawn-role-tool-policy.test.js.map +1 -0
- package/dist/__tests__/swarm-coordinator-inbox-watcher.test.d.ts +2 -0
- package/dist/__tests__/swarm-coordinator-inbox-watcher.test.d.ts.map +1 -0
- package/dist/__tests__/swarm-coordinator-inbox-watcher.test.js +61 -0
- package/dist/__tests__/swarm-coordinator-inbox-watcher.test.js.map +1 -0
- package/dist/__tests__/swarm-coordinator-inbox.test.d.ts +2 -0
- package/dist/__tests__/swarm-coordinator-inbox.test.d.ts.map +1 -0
- package/dist/__tests__/swarm-coordinator-inbox.test.js +182 -0
- package/dist/__tests__/swarm-coordinator-inbox.test.js.map +1 -0
- package/dist/__tests__/swarm-coordinator-init.test.js +36 -8
- package/dist/__tests__/swarm-coordinator-init.test.js.map +1 -1
- package/dist/__tests__/swarm-coordinator-legacy-plan-review-warn.test.d.ts +2 -0
- package/dist/__tests__/swarm-coordinator-legacy-plan-review-warn.test.d.ts.map +1 -0
- package/dist/__tests__/swarm-coordinator-legacy-plan-review-warn.test.js +113 -0
- package/dist/__tests__/swarm-coordinator-legacy-plan-review-warn.test.js.map +1 -0
- package/dist/__tests__/swarm-coordinator-plan-review-intercept.test.d.ts +2 -0
- package/dist/__tests__/swarm-coordinator-plan-review-intercept.test.d.ts.map +1 -0
- package/dist/__tests__/swarm-coordinator-plan-review-intercept.test.js +465 -0
- package/dist/__tests__/swarm-coordinator-plan-review-intercept.test.js.map +1 -0
- package/dist/__tests__/swarm-coordinator-plan-review-recovery.test.d.ts +2 -0
- package/dist/__tests__/swarm-coordinator-plan-review-recovery.test.d.ts.map +1 -0
- package/dist/__tests__/swarm-coordinator-plan-review-recovery.test.js +284 -0
- package/dist/__tests__/swarm-coordinator-plan-review-recovery.test.js.map +1 -0
- package/dist/__tests__/swarm-coordinator-plan-review.test.d.ts +2 -0
- package/dist/__tests__/swarm-coordinator-plan-review.test.d.ts.map +1 -0
- package/dist/__tests__/swarm-coordinator-plan-review.test.js +294 -0
- package/dist/__tests__/swarm-coordinator-plan-review.test.js.map +1 -0
- package/dist/__tests__/swarm-coordinator-resume.test.d.ts +2 -0
- package/dist/__tests__/swarm-coordinator-resume.test.d.ts.map +1 -0
- package/dist/__tests__/swarm-coordinator-resume.test.js +93 -0
- package/dist/__tests__/swarm-coordinator-resume.test.js.map +1 -0
- package/dist/__tests__/swarm-coordinator-roleId.test.js +2 -2
- package/dist/__tests__/swarm-coordinator-roleId.test.js.map +1 -1
- package/dist/__tests__/swarm-destroy-detach.test.d.ts +2 -0
- package/dist/__tests__/swarm-destroy-detach.test.d.ts.map +1 -0
- package/dist/__tests__/swarm-destroy-detach.test.js +135 -0
- package/dist/__tests__/swarm-destroy-detach.test.js.map +1 -0
- package/dist/action-parser.d.ts +0 -9
- package/dist/action-parser.d.ts.map +1 -1
- package/dist/action-parser.js +0 -114
- package/dist/action-parser.js.map +1 -1
- package/dist/inbox-relay.d.ts +50 -0
- package/dist/inbox-relay.d.ts.map +1 -0
- package/dist/inbox-relay.js +168 -0
- package/dist/inbox-relay.js.map +1 -0
- package/dist/inbox-store.d.ts +25 -0
- package/dist/inbox-store.d.ts.map +1 -0
- package/dist/inbox-store.js +95 -0
- package/dist/inbox-store.js.map +1 -0
- package/dist/inbox-watcher.d.ts +13 -0
- package/dist/inbox-watcher.d.ts.map +1 -0
- package/dist/inbox-watcher.js +89 -0
- package/dist/inbox-watcher.js.map +1 -0
- package/dist/index.d.ts +4 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -2
- package/dist/index.js.map +1 -1
- package/dist/persistence.d.ts +19 -5
- package/dist/persistence.d.ts.map +1 -1
- package/dist/persistence.js +97 -22
- package/dist/persistence.js.map +1 -1
- package/dist/recovery.d.ts +12 -0
- package/dist/recovery.d.ts.map +1 -1
- package/dist/recovery.js +14 -19
- package/dist/recovery.js.map +1 -1
- package/dist/roles/role-loader.d.ts +28 -1
- package/dist/roles/role-loader.d.ts.map +1 -1
- package/dist/roles/role-loader.js +73 -1
- package/dist/roles/role-loader.js.map +1 -1
- package/dist/roles/role-tools.d.ts +16 -0
- package/dist/roles/role-tools.d.ts.map +1 -0
- package/dist/roles/role-tools.js +25 -0
- package/dist/roles/role-tools.js.map +1 -0
- package/dist/roles/types.d.ts +4 -0
- package/dist/roles/types.d.ts.map +1 -1
- package/dist/swarm-coordinator.d.ts +176 -12
- package/dist/swarm-coordinator.d.ts.map +1 -1
- package/dist/swarm-coordinator.js +863 -370
- package/dist/swarm-coordinator.js.map +1 -1
- package/dist/types.d.ts +26 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +9 -6
- package/roles/analyst-livermore.md +6 -30
- package/roles/designer-rams.md +2 -30
- package/roles/dev-torvalds.md +8 -44
- package/roles/developer.md +5 -21
- package/roles/director-jia.md +20 -49
- package/roles/editor-boyong.md +8 -40
- package/roles/macro-dalio.md +6 -30
- package/roles/planner-maoni.md +24 -53
- package/roles/pm-jobs.md +20 -71
- package/roles/preset-analyst-simons.md +2 -18
- package/roles/preset-architect-knuth.md +2 -18
- package/roles/preset-designer-norman.md +2 -18
- package/roles/preset-designer.md +2 -18
- package/roles/preset-dev-carmack.md +2 -18
- package/roles/preset-dev-gosling.md +2 -18
- package/roles/preset-developer.md +7 -23
- package/roles/preset-manager-grove.md +2 -18
- package/roles/preset-manager-musk.md +2 -18
- package/roles/preset-pm.md +7 -34
- package/roles/preset-researcher-feynman.md +2 -18
- package/roles/preset-reviewer.md +5 -21
- package/roles/preset-strategist-buffett.md +2 -18
- package/roles/preset-strategist-munger.md +2 -18
- package/roles/preset-strategist-sunzi.md +2 -18
- package/roles/preset-tester-beck.md +2 -18
- package/roles/preset-tester.md +5 -21
- package/roles/preset-writer-orwell.md +2 -18
- package/roles/preset-writer.md +2 -18
- package/roles/quant-simons.md +5 -32
- package/roles/queen.md +25 -41
- package/roles/reviewer-martin.md +11 -37
- package/roles/reviewer.md +20 -21
- package/roles/rhythm-tangsan.md +5 -29
- package/roles/risk-taleb.md +4 -32
- package/roles/script-shitiesheng.md +8 -31
- package/roles/storyboard-xuke.md +9 -29
- package/roles/strategist-soros.md +16 -73
- package/roles/tester-beck.md +4 -40
- package/roles/tester.md +5 -21
- package/roles/trader-jones.md +4 -32
- package/roles/vfx-guchangwei.md +8 -27
- package/roles/writer-zhouzi.md +7 -39
- package/templates/dev-team-pro.md +4 -1
- package/templates/dev-team.md +3 -1
- package/templates/minimal.md +2 -1
- package/templates/trading-team.md +6 -1
- package/templates/video-team.md +4 -1
- package/templates/writing-team.md +4 -1
|
@@ -1,35 +1,90 @@
|
|
|
1
1
|
import { loadRole, buildRolePrompt } from "./roles/role-loader.js";
|
|
2
|
+
import { resolveRoleTools } from "./roles/role-tools.js";
|
|
2
3
|
import { loadTemplate } from "./templates/template-loader.js";
|
|
3
|
-
import {
|
|
4
|
-
import { parseSwarmActions, parsePlanFromText } from "./action-parser.js";
|
|
4
|
+
import { parsePlanFromText } from "./action-parser.js";
|
|
5
5
|
import { BUILTIN_ROLES, initDatabase, syncToAutoMemory } from "@mclawnet/memory";
|
|
6
|
-
import { saveSwarmSnapshot, deleteSwarmSnapshot, appendMessageLog, loadSwarmSnapshot, readMessageLog } from "./persistence.js";
|
|
6
|
+
import { saveSwarmSnapshot, deleteSwarmSnapshot, appendMessageLog, loadSwarmSnapshot, readMessageLog, listRecoverableSwarmIds } from "./persistence.js";
|
|
7
7
|
import { runRetrospective } from "./retrospective.js";
|
|
8
|
+
import { InboxRelay } from "./inbox-relay.js";
|
|
9
|
+
import { InboxStore } from "./inbox-store.js";
|
|
10
|
+
import { InboxWatcher } from "./inbox-watcher.js";
|
|
11
|
+
import { randomUUID } from "node:crypto";
|
|
8
12
|
import { EvolutionPipeline } from "@mclawnet/skill-manager";
|
|
13
|
+
import { TaskStore, computeLeadBriefing, computeMemberBriefing, computeTaskBriefing, formatLeadBriefing, formatMemberBriefing, formatTaskBriefing, projectRoot } from "@mclawnet/task";
|
|
14
|
+
import { existsSync } from "node:fs";
|
|
9
15
|
import { homedir } from "node:os";
|
|
10
16
|
import { join } from "node:path";
|
|
11
17
|
import { createLogger } from "@mclawnet/logger";
|
|
12
|
-
import { randomUUID } from "node:crypto";
|
|
13
18
|
const log = createLogger({ module: "swarm" });
|
|
14
19
|
const QUEEN_CHECK_INTERVAL_MS = 300_000; // 5min when workers active
|
|
15
20
|
const QUEEN_CHECK_IDLE_INTERVAL_MS = 300_000; // 5min when all roles idle
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
21
|
+
/**
|
|
22
|
+
* Hard deadline for the "reviewing" plan-status. If the reviewer fails to
|
|
23
|
+
* submit a verdict within this window the coordinator auto-approves with a
|
|
24
|
+
* warning so the queen is not deadlocked. Cleared on any explicit
|
|
25
|
+
* submitPlanReview call.
|
|
26
|
+
*/
|
|
27
|
+
const PLAN_REVIEW_TIMEOUT_MS = 10 * 60 * 1000;
|
|
19
28
|
/**
|
|
20
29
|
* SwarmCoordinator — orchestrates multi-role Claude CLI swarms.
|
|
21
30
|
*
|
|
22
31
|
* Composes (not replaces) SessionAdapter. Each role instance is a separate
|
|
23
|
-
* session in SessionAdapter. Communication between roles
|
|
24
|
-
*
|
|
32
|
+
* session in SessionAdapter. Communication between roles flows through
|
|
33
|
+
* file-based InboxStore + InboxRelay.
|
|
25
34
|
*/
|
|
26
35
|
export class SwarmCoordinator {
|
|
27
36
|
sessionAdapter;
|
|
28
37
|
hub;
|
|
38
|
+
taskStoreFactory;
|
|
39
|
+
home;
|
|
29
40
|
swarms = new Map();
|
|
30
|
-
|
|
41
|
+
inboxRelay;
|
|
42
|
+
inboxWatcher;
|
|
43
|
+
constructor(sessionAdapter, hub,
|
|
44
|
+
/**
|
|
45
|
+
* Optional factory that resolves a per-swarm TaskStore (workDir-scoped).
|
|
46
|
+
* When present, spawnRole appends a "### 任务上下文" briefing section to
|
|
47
|
+
* the role's system prompt. Failures are best-effort logged.
|
|
48
|
+
*
|
|
49
|
+
* Backward compat: also accepts a pre-built single TaskStore instance.
|
|
50
|
+
*/
|
|
51
|
+
taskStoreFactory,
|
|
52
|
+
/**
|
|
53
|
+
* Optional override for the user-home root (no `.clawnet` suffix).
|
|
54
|
+
* Threaded to InboxRelay and to inline `new InboxStore(...)` / `TaskStore`
|
|
55
|
+
* calls so test harnesses pinned to a tmpdir don't fall through to
|
|
56
|
+
* `homedir()` and write files into the real `~/.clawnet`. When omitted,
|
|
57
|
+
* inline call sites still honour `CLAWNET_HOME` env var as before.
|
|
58
|
+
*/
|
|
59
|
+
home) {
|
|
31
60
|
this.sessionAdapter = sessionAdapter;
|
|
32
61
|
this.hub = hub;
|
|
62
|
+
this.taskStoreFactory = taskStoreFactory;
|
|
63
|
+
this.home = home;
|
|
64
|
+
this.inboxRelay = new InboxRelay(sessionAdapter, (id) => this.swarms.get(id), (swarmId, instanceId, msg) => this.handleInterceptedMessage(swarmId, instanceId, msg), home);
|
|
65
|
+
this.inboxWatcher = new InboxWatcher(this.inboxRelay);
|
|
66
|
+
}
|
|
67
|
+
/** Resolve the user-home root used for InboxStore / TaskStore lookups. */
|
|
68
|
+
resolveHome() {
|
|
69
|
+
return this.home ?? process.env.CLAWNET_HOME ?? homedir();
|
|
70
|
+
}
|
|
71
|
+
/**
|
|
72
|
+
* Resolve the TaskStore for a given swarm's workDir.
|
|
73
|
+
* Returns undefined if no factory was wired or if the factory returned undefined.
|
|
74
|
+
*/
|
|
75
|
+
resolveTaskStore(workDir) {
|
|
76
|
+
if (!this.taskStoreFactory)
|
|
77
|
+
return undefined;
|
|
78
|
+
if (typeof this.taskStoreFactory === "function") {
|
|
79
|
+
try {
|
|
80
|
+
return this.taskStoreFactory(workDir);
|
|
81
|
+
}
|
|
82
|
+
catch (err) {
|
|
83
|
+
log.warn({ err, workDir }, "taskStoreFactory threw — skipping briefing");
|
|
84
|
+
return undefined;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
return this.taskStoreFactory;
|
|
33
88
|
}
|
|
34
89
|
// ── Public API ──────────────────────────────────────────────────────
|
|
35
90
|
/** Create a new swarm with two-phase initialization. */
|
|
@@ -46,11 +101,11 @@ export class SwarmCoordinator {
|
|
|
46
101
|
else {
|
|
47
102
|
roleSpecs = options.roles ?? [];
|
|
48
103
|
}
|
|
49
|
-
const router = new LocalMessageRouter(this.sessionAdapter);
|
|
50
104
|
const swarm = {
|
|
51
105
|
id: swarmSessionId,
|
|
52
106
|
hubSessionId: swarmSessionId,
|
|
53
107
|
workDir: options.workDir,
|
|
108
|
+
teamName: options.templateName,
|
|
54
109
|
roles: new Map(),
|
|
55
110
|
plan: null,
|
|
56
111
|
nextInstanceSeq: new Map(),
|
|
@@ -60,66 +115,89 @@ export class SwarmCoordinator {
|
|
|
60
115
|
status: "creating",
|
|
61
116
|
planStatus: "none",
|
|
62
117
|
};
|
|
63
|
-
// Attach the router to the swarm for internal access
|
|
64
|
-
swarm._router = router;
|
|
65
118
|
// Store pending role specs for on-demand spawning
|
|
66
119
|
swarm._pendingRoleSpecs = roleSpecs;
|
|
67
120
|
this.swarms.set(swarmSessionId, swarm);
|
|
68
|
-
//
|
|
69
|
-
|
|
70
|
-
//
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
const roleList = this.buildRoleListString(swarm);
|
|
91
|
-
// Notify queen that initialization is complete with full member list
|
|
92
|
-
router.send(queen.instanceId, {
|
|
93
|
-
from: "system",
|
|
94
|
-
type: "system",
|
|
95
|
-
data: `[系统] 蜂群初始化完成。当前成员:\n${roleList}\n\n等待任务分配。`,
|
|
96
|
-
timestamp: Date.now(),
|
|
121
|
+
// Snapshot may already exist from a prior run (continuation, or
|
|
122
|
+
// restart-after-crash with the same swarmId). Record this BEFORE any
|
|
123
|
+
// spawnRole writes a new one — cleanup must not delete what wasn't ours.
|
|
124
|
+
const snapshotPreexisted = options.workDir
|
|
125
|
+
? existsSync(join(projectRoot(options.workDir, this.resolveHome()), "swarms", swarmSessionId, "recovery.json"))
|
|
126
|
+
: false;
|
|
127
|
+
// Track opened sessions so cleanupPartialCreate() can close them on failure.
|
|
128
|
+
const openedSessionIds = [];
|
|
129
|
+
const trackOpen = (role) => {
|
|
130
|
+
if (role?.roleSessionId)
|
|
131
|
+
openedSessionIds.push(role.roleSessionId);
|
|
132
|
+
};
|
|
133
|
+
try {
|
|
134
|
+
// ── Phase 1: spawn eager roles (non-queen first, queen last) ────
|
|
135
|
+
const eagerSpecs = roleSpecs.filter((r) => r.roleName === "queen" || r.eager === true);
|
|
136
|
+
// Sort: queen last so her roleList includes all already-spawned roles
|
|
137
|
+
eagerSpecs.sort((a, b) => {
|
|
138
|
+
if (a.roleName === "queen")
|
|
139
|
+
return 1;
|
|
140
|
+
if (b.roleName === "queen")
|
|
141
|
+
return -1;
|
|
142
|
+
return 0;
|
|
97
143
|
});
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
from: "system",
|
|
104
|
-
type: "system",
|
|
105
|
-
data: `[系统] 这是对之前蜂群任务的继续。以下是之前的工作摘要:\n${summary}\n\n请基于已有成果继续执行新的改进指令。`,
|
|
106
|
-
timestamp: Date.now(),
|
|
107
|
-
});
|
|
144
|
+
for (const spec of eagerSpecs) {
|
|
145
|
+
const count = spec.count ?? 1;
|
|
146
|
+
for (let i = 0; i < count; i++) {
|
|
147
|
+
const role = await this.spawnRole(swarmSessionId, spec.roleName, undefined, spec.customPrompt, spec.customDefinition);
|
|
148
|
+
trackOpen(role);
|
|
108
149
|
}
|
|
109
150
|
}
|
|
110
|
-
//
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
151
|
+
// Start Queen periodic check
|
|
152
|
+
this.startQueenCheck(swarmSessionId);
|
|
153
|
+
// ── Phase 2: mark running + notify queen + send task ────────────
|
|
154
|
+
swarm.status = "running";
|
|
155
|
+
const queen = this.findQueen(swarm);
|
|
156
|
+
if (queen) {
|
|
157
|
+
const roleList = this.buildRoleListString(swarm);
|
|
158
|
+
// Notify queen that initialization is complete with full member list
|
|
159
|
+
await this.deliverInbox(swarm, queen.instanceId, {
|
|
160
|
+
from: "system",
|
|
161
|
+
type: "system",
|
|
162
|
+
data: `[系统] 蜂群初始化完成。当前成员:\n${roleList}\n\n等待任务分配。`,
|
|
117
163
|
});
|
|
164
|
+
// Inject continuation context from previous run if applicable
|
|
165
|
+
if (options.isContinuation) {
|
|
166
|
+
const summary = this.buildContinuationSummary(swarmSessionId);
|
|
167
|
+
if (summary) {
|
|
168
|
+
await this.deliverInbox(swarm, queen.instanceId, {
|
|
169
|
+
from: "system",
|
|
170
|
+
type: "system",
|
|
171
|
+
data: `[系统] 这是对之前蜂群任务的继续。以下是之前的工作摘要:\n${summary}\n\n请基于已有成果继续执行新的改进指令。`,
|
|
172
|
+
});
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
// Send user task
|
|
176
|
+
if (options.task) {
|
|
177
|
+
await this.deliverInbox(swarm, queen.instanceId, {
|
|
178
|
+
from: "user",
|
|
179
|
+
type: "task",
|
|
180
|
+
data: options.task,
|
|
181
|
+
});
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
log.info({ swarmId: swarmSessionId, roleCount: swarm.roles.size }, "swarm created");
|
|
185
|
+
// Persistence: save initial snapshot
|
|
186
|
+
saveSwarmSnapshot(swarm);
|
|
187
|
+
// Start inbox watcher to react to inbox file changes (best-effort).
|
|
188
|
+
if (options.workDir) {
|
|
189
|
+
try {
|
|
190
|
+
this.inboxWatcher.watch(swarmSessionId, options.workDir);
|
|
191
|
+
}
|
|
192
|
+
catch (err) {
|
|
193
|
+
log.warn({ err, swarmId: swarmSessionId }, "inboxWatcher.watch failed (best-effort)");
|
|
194
|
+
}
|
|
118
195
|
}
|
|
119
196
|
}
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
197
|
+
catch (err) {
|
|
198
|
+
await this.cleanupPartialCreate(swarmSessionId, openedSessionIds, snapshotPreexisted, err);
|
|
199
|
+
throw err;
|
|
200
|
+
}
|
|
123
201
|
}
|
|
124
202
|
/** Handle user message directed at the swarm. */
|
|
125
203
|
async handleUserMessage(swarmSessionId, content, targetInstance) {
|
|
@@ -128,12 +206,10 @@ export class SwarmCoordinator {
|
|
|
128
206
|
log.error({ swarmId: swarmSessionId }, "handleUserMessage: swarm not found");
|
|
129
207
|
return;
|
|
130
208
|
}
|
|
131
|
-
const router = swarm._router;
|
|
132
209
|
const message = {
|
|
133
210
|
from: "user",
|
|
134
211
|
type: "task",
|
|
135
212
|
data: content,
|
|
136
|
-
timestamp: Date.now(),
|
|
137
213
|
};
|
|
138
214
|
// Auto-resume if paused
|
|
139
215
|
if (swarm.status === "paused") {
|
|
@@ -142,21 +218,28 @@ export class SwarmCoordinator {
|
|
|
142
218
|
swarm.idleCheckCount = 0;
|
|
143
219
|
this.startQueenCheck(swarmSessionId);
|
|
144
220
|
log.info({ swarmId: swarmSessionId }, "swarm resumed");
|
|
221
|
+
for (const r of swarm.roles.values()) {
|
|
222
|
+
if (r.status === "spawning" || r.status === "stopped")
|
|
223
|
+
continue;
|
|
224
|
+
void this.inboxRelay.deliver(swarmSessionId, r.instanceId);
|
|
225
|
+
}
|
|
145
226
|
}
|
|
146
227
|
if (targetInstance) {
|
|
147
228
|
// Directed at a specific instance
|
|
148
|
-
|
|
149
|
-
|
|
229
|
+
await this.deliverInbox(swarm, targetInstance, message);
|
|
230
|
+
if (swarm.workDir) {
|
|
231
|
+
appendMessageLog(swarm.workDir, swarmSessionId, { type: "user_message", from: "user", to: targetInstance, data: content, timestamp: Date.now() });
|
|
232
|
+
}
|
|
150
233
|
}
|
|
151
234
|
else {
|
|
152
235
|
// Default: send to Queen
|
|
153
236
|
const queen = this.findQueen(swarm);
|
|
154
237
|
if (queen) {
|
|
155
|
-
|
|
238
|
+
await this.deliverInbox(swarm, queen.instanceId, message);
|
|
156
239
|
}
|
|
157
240
|
else {
|
|
158
241
|
// No queen — broadcast to all
|
|
159
|
-
|
|
242
|
+
await this.broadcastInbox(swarm, message);
|
|
160
243
|
}
|
|
161
244
|
}
|
|
162
245
|
}
|
|
@@ -170,14 +253,22 @@ export class SwarmCoordinator {
|
|
|
170
253
|
const { swarm, role } = this.findByRoleSessionId(roleSessionId);
|
|
171
254
|
if (!swarm || !role)
|
|
172
255
|
return false;
|
|
173
|
-
const router = swarm._router;
|
|
174
256
|
// Extract text content from the streaming event
|
|
175
257
|
const text = extractTextFromEvent(data);
|
|
176
258
|
// Parse swarm action blocks
|
|
177
259
|
if (text) {
|
|
178
|
-
//
|
|
179
|
-
|
|
180
|
-
|
|
260
|
+
// Legacy-format safety net: if a reviewer regresses to the old
|
|
261
|
+
// `{"action":"report", "taskId":"plan_review", ...}` block while we are
|
|
262
|
+
// waiting on a verdict, warn loudly so the regression is diagnosable.
|
|
263
|
+
// The action-parser no longer consumes these blocks, so the swarm would
|
|
264
|
+
// otherwise silently wait until the 10-min auto-approve timer fires.
|
|
265
|
+
// We do NOT try to translate the block — reviewer must call
|
|
266
|
+
// `mcp__clawnet__plan_review_submit` instead. See PR plan-review-mcp-migration.
|
|
267
|
+
if (role.roleName === "reviewer" &&
|
|
268
|
+
swarm.planStatus === "reviewing" &&
|
|
269
|
+
/"action"\s*:\s*"report"/.test(text) &&
|
|
270
|
+
/"taskId"\s*:\s*"plan_review"/.test(text)) {
|
|
271
|
+
log.warn({ swarmId: swarm.id, instanceId: role.instanceId }, 'reviewer emitted legacy {"action":"report","taskId":"plan_review"} block — ignored. Call mcp__clawnet__plan_review_submit instead.');
|
|
181
272
|
}
|
|
182
273
|
// For queen: parse plan BEFORE executing actions, so review can gate task assignment
|
|
183
274
|
if (role.definition.type === "queen") {
|
|
@@ -192,27 +283,14 @@ export class SwarmCoordinator {
|
|
|
192
283
|
saveSwarmSnapshot(swarm);
|
|
193
284
|
this.sendStatusUpdate(swarm);
|
|
194
285
|
log.info({ swarmId: swarm.id, instanceId: role.instanceId }, "plan updated (draft)");
|
|
195
|
-
this.requestPlanReview(swarm, plan)
|
|
286
|
+
this.requestPlanReview(swarm, plan).catch((err) => {
|
|
287
|
+
log.warn({ err, swarmId: swarm.id }, "requestPlanReview failed (best-effort)");
|
|
288
|
+
});
|
|
196
289
|
}
|
|
197
290
|
else if (hasPlanKeyword) {
|
|
198
291
|
log.warn({ instanceId: role.instanceId, textSnippet: text.substring(0, 500) }, "queen output has plan keyword but parsePlanFromText returned null");
|
|
199
292
|
}
|
|
200
293
|
}
|
|
201
|
-
const { actions } = parseSwarmActions(text);
|
|
202
|
-
if (text.includes("```swarm") && actions.length === 0) {
|
|
203
|
-
log.warn({ instanceId: role.instanceId, textSnippet: text.substring(0, 500) }, "swarm block found but parse failed");
|
|
204
|
-
}
|
|
205
|
-
// Execute actions sequentially — spawn must complete before subsequent sends
|
|
206
|
-
(async () => {
|
|
207
|
-
for (const action of actions) {
|
|
208
|
-
try {
|
|
209
|
-
await this.executeAction(swarm, role, router, action);
|
|
210
|
-
}
|
|
211
|
-
catch (err) {
|
|
212
|
-
log.error({ err, swarmId: swarm.id, action: action.action }, "executeAction failed");
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
})();
|
|
216
294
|
}
|
|
217
295
|
// Forward full output to Hub for UI display + DB storage
|
|
218
296
|
this.hub.send({
|
|
@@ -234,6 +312,22 @@ export class SwarmCoordinator {
|
|
|
234
312
|
return false;
|
|
235
313
|
// Update role status
|
|
236
314
|
role.status = "idle";
|
|
315
|
+
// Persist per-role claudeSessionId — turn_complete frame carries the
|
|
316
|
+
// backend's real session UUID. We need it so a future restart can
|
|
317
|
+
// `--resume` this exact role's conversation (Task 4 / Phase 4-5).
|
|
318
|
+
if (info.claudeSessionId && role.claudeSessionId !== info.claudeSessionId) {
|
|
319
|
+
role.claudeSessionId = info.claudeSessionId;
|
|
320
|
+
// saveSwarmSnapshot is sync-fire (proper-lockfile internally async);
|
|
321
|
+
// call directly — failures should not break turn completion.
|
|
322
|
+
try {
|
|
323
|
+
saveSwarmSnapshot(swarm);
|
|
324
|
+
}
|
|
325
|
+
catch (err) {
|
|
326
|
+
log.warn({ err, swarmId: swarm.id, instanceId: role.instanceId }, "failed to persist claudeSessionId on turn complete");
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
// Settle inbox echoes for this turn (fire-and-forget).
|
|
330
|
+
void this.inboxRelay.onAgentTurnSettled(swarm.id, role.instanceId);
|
|
237
331
|
// Forward to Hub
|
|
238
332
|
this.hub.send({
|
|
239
333
|
type: "swarm.turn_complete",
|
|
@@ -245,8 +339,38 @@ export class SwarmCoordinator {
|
|
|
245
339
|
});
|
|
246
340
|
return true;
|
|
247
341
|
}
|
|
342
|
+
/**
|
|
343
|
+
* Persist the per-role claudeSessionId immediately (e.g. from `system/init`
|
|
344
|
+
* frame, before the first turn_complete). Returns true if a role was found
|
|
345
|
+
* and updated; false otherwise (no-op for non-swarm sessions).
|
|
346
|
+
*/
|
|
347
|
+
setRoleClaudeSessionId(swarmId, instanceId, claudeSessionId) {
|
|
348
|
+
const swarm = this.swarms.get(swarmId);
|
|
349
|
+
if (!swarm)
|
|
350
|
+
return false;
|
|
351
|
+
const role = swarm.roles.get(instanceId);
|
|
352
|
+
if (!role)
|
|
353
|
+
return false;
|
|
354
|
+
if (role.claudeSessionId === claudeSessionId)
|
|
355
|
+
return true;
|
|
356
|
+
role.claudeSessionId = claudeSessionId;
|
|
357
|
+
try {
|
|
358
|
+
saveSwarmSnapshot(swarm);
|
|
359
|
+
}
|
|
360
|
+
catch (err) {
|
|
361
|
+
log.warn({ err, swarmId, instanceId }, "failed to persist claudeSessionId");
|
|
362
|
+
}
|
|
363
|
+
return true;
|
|
364
|
+
}
|
|
365
|
+
/** Convenience: same as setRoleClaudeSessionId but takes the `${swarmId}::${instanceId}` roleSessionId. */
|
|
366
|
+
setRoleClaudeSessionIdBySession(roleSessionId, claudeSessionId) {
|
|
367
|
+
const { swarm, role } = this.findByRoleSessionId(roleSessionId);
|
|
368
|
+
if (!swarm || !role)
|
|
369
|
+
return false;
|
|
370
|
+
return this.setRoleClaudeSessionId(swarm.id, role.instanceId, claudeSessionId);
|
|
371
|
+
}
|
|
248
372
|
/** Spawn a new role instance in a swarm. */
|
|
249
|
-
async spawnRole(swarmId, roleName,
|
|
373
|
+
async spawnRole(swarmId, roleName, taskPrompt, customPrompt, customDefinition, additionalDirs, resumeId, presetInstanceId) {
|
|
250
374
|
const swarm = this.swarms.get(swarmId);
|
|
251
375
|
if (!swarm)
|
|
252
376
|
throw new Error(`Swarm ${swarmId} not found`);
|
|
@@ -270,9 +394,17 @@ export class SwarmCoordinator {
|
|
|
270
394
|
if (customPrompt) {
|
|
271
395
|
definition.promptBody = customPrompt;
|
|
272
396
|
}
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
397
|
+
let instanceId;
|
|
398
|
+
if (presetInstanceId) {
|
|
399
|
+
// Recovery path: preserve the original instanceId so per-role state
|
|
400
|
+
// (logs, inbox, claudeSessionId) lines up with prior snapshot.
|
|
401
|
+
instanceId = presetInstanceId;
|
|
402
|
+
}
|
|
403
|
+
else {
|
|
404
|
+
const seq = swarm.nextInstanceSeq.get(roleName) ?? 0;
|
|
405
|
+
swarm.nextInstanceSeq.set(roleName, seq + 1);
|
|
406
|
+
instanceId = `${definition.shortName}-${seq}`;
|
|
407
|
+
}
|
|
276
408
|
const roleSessionId = `${swarmId}::${instanceId}`;
|
|
277
409
|
const roleInstance = {
|
|
278
410
|
instanceId,
|
|
@@ -281,32 +413,159 @@ export class SwarmCoordinator {
|
|
|
281
413
|
roleSessionId,
|
|
282
414
|
status: "spawning",
|
|
283
415
|
currentTask: taskPrompt,
|
|
416
|
+
claudeSessionId: resumeId,
|
|
284
417
|
};
|
|
285
418
|
swarm.roles.set(instanceId, roleInstance);
|
|
286
419
|
// Build role list for prompt
|
|
287
420
|
const roleList = this.buildRoleListString(swarm);
|
|
288
|
-
const systemPrompt = buildRolePrompt(definition, instanceId, roleList
|
|
421
|
+
const systemPrompt = buildRolePrompt(definition, instanceId, roleList, {
|
|
422
|
+
swarmId,
|
|
423
|
+
workDir: swarm.workDir,
|
|
424
|
+
});
|
|
289
425
|
const roleId = BUILTIN_ROLES[roleName]?.id ?? `role-${roleName}`;
|
|
426
|
+
// Boot-time briefing injection (Phase 7-E). Best-effort: failures here
|
|
427
|
+
// must not abort role spawn — log and continue with the bare prompt.
|
|
428
|
+
let finalPrompt = systemPrompt;
|
|
429
|
+
const taskStore = swarm.workDir
|
|
430
|
+
? this.resolveTaskStore(swarm.workDir)
|
|
431
|
+
: undefined;
|
|
432
|
+
if (taskStore) {
|
|
433
|
+
try {
|
|
434
|
+
const briefingSection = this.buildBriefingSection(taskStore, swarmId, instanceId, definition);
|
|
435
|
+
if (briefingSection)
|
|
436
|
+
finalPrompt = systemPrompt + "\n\n" + briefingSection;
|
|
437
|
+
}
|
|
438
|
+
catch (err) {
|
|
439
|
+
log.warn({ err, instanceId }, "briefing injection failed (non-fatal)");
|
|
440
|
+
}
|
|
441
|
+
}
|
|
290
442
|
// Spawn Claude CLI process via SessionAdapter
|
|
291
443
|
// SessionManager handles memory injection (Pipeline A: memory prompt + roleId hint) via roleId
|
|
444
|
+
const tools = resolveRoleTools(definition);
|
|
292
445
|
await this.sessionAdapter.createSession({
|
|
293
446
|
sessionId: roleSessionId,
|
|
294
447
|
workDir: swarm.workDir,
|
|
295
|
-
systemPrompt,
|
|
448
|
+
systemPrompt: finalPrompt,
|
|
296
449
|
roleId,
|
|
297
450
|
additionalDirs,
|
|
451
|
+
// Task 5: when recovering, the caller passes resumeId = role.claudeSessionId
|
|
452
|
+
// so the Claude conversation continues with `--resume`. Fresh spawns
|
|
453
|
+
// (Task 3 default) leave it undefined for a new conversation.
|
|
454
|
+
resumeId,
|
|
455
|
+
allowedTools: tools.allowedTools,
|
|
456
|
+
disallowedTools: tools.disallowedTools,
|
|
298
457
|
});
|
|
299
458
|
roleInstance.status = "active";
|
|
300
|
-
// Persistence: save snapshot after role spawned
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
459
|
+
// Persistence: save snapshot after role spawned. Skipped during recover()
|
|
460
|
+
// where the caller saves once at the end (avoids N writes for N roles).
|
|
461
|
+
if (!swarm._suppressSnapshot) {
|
|
462
|
+
saveSwarmSnapshot(swarm);
|
|
463
|
+
}
|
|
305
464
|
// Send swarm status update to Hub
|
|
306
465
|
this.sendStatusUpdate(swarm);
|
|
466
|
+
// Flush any pending inbox messages (fire-and-forget).
|
|
467
|
+
void this.inboxRelay.deliver(swarmId, instanceId);
|
|
307
468
|
log.info({ swarmId, instanceId, roleName, roleId }, "role spawned");
|
|
308
469
|
return roleInstance;
|
|
309
470
|
}
|
|
471
|
+
/**
|
|
472
|
+
* Recover a previously persisted swarm by id.
|
|
473
|
+
*
|
|
474
|
+
* Locates the snapshot via `listRecoverableSwarmIds()`, then for each role
|
|
475
|
+
* in the snapshot spawns a Claude session via `--resume role.claudeSessionId`
|
|
476
|
+
* (when present) so the per-role conversation continues. Roles without a
|
|
477
|
+
* stored claudeSessionId start fresh.
|
|
478
|
+
*
|
|
479
|
+
* After all roles are spawned, drains each role's offline inbox via
|
|
480
|
+
* `inboxRelay.deliver`. Drain failures are best-effort: warn but never throw.
|
|
481
|
+
*
|
|
482
|
+
* Recovered swarms start in `paused` state — caller must explicitly resume
|
|
483
|
+
* (matches existing behaviour in `recoverSwarm` helper).
|
|
484
|
+
*/
|
|
485
|
+
async recover(swarmId) {
|
|
486
|
+
if (this.swarms.has(swarmId)) {
|
|
487
|
+
log.warn({ swarmId }, "recover: swarm already loaded — skipping");
|
|
488
|
+
return;
|
|
489
|
+
}
|
|
490
|
+
// Locate snapshot file across all known project workDirs.
|
|
491
|
+
const entry = listRecoverableSwarmIds().find((e) => e.swarmId === swarmId);
|
|
492
|
+
if (!entry) {
|
|
493
|
+
throw new Error(`recover: no snapshot found for swarm ${swarmId}`);
|
|
494
|
+
}
|
|
495
|
+
const snapshot = loadSwarmSnapshot(entry.workDir, swarmId);
|
|
496
|
+
if (!snapshot) {
|
|
497
|
+
throw new Error(`recover: snapshot for ${swarmId} could not be loaded`);
|
|
498
|
+
}
|
|
499
|
+
// Bootstrap the swarm shell. We deliberately bypass create() because
|
|
500
|
+
// create() would respawn eager roles from template defaults — losing the
|
|
501
|
+
// per-role instanceId / claudeSessionId from the snapshot.
|
|
502
|
+
const swarm = {
|
|
503
|
+
id: swarmId,
|
|
504
|
+
hubSessionId: snapshot.hubSessionId,
|
|
505
|
+
workDir: snapshot.workDir,
|
|
506
|
+
teamName: snapshot.teamName,
|
|
507
|
+
roles: new Map(),
|
|
508
|
+
plan: snapshot.plan ?? null,
|
|
509
|
+
nextInstanceSeq: new Map(Object.entries(snapshot.nextInstanceSeq ?? {})),
|
|
510
|
+
idleCheckCount: 0,
|
|
511
|
+
maxIdleChecks: 10,
|
|
512
|
+
isPaused: true,
|
|
513
|
+
status: "paused",
|
|
514
|
+
planStatus: snapshot.planStatus ?? "none",
|
|
515
|
+
};
|
|
516
|
+
this.swarms.set(swarmId, swarm);
|
|
517
|
+
// Respawn each role with the same instanceId, passing claudeSessionId
|
|
518
|
+
// through as resumeId so SessionAdapter can `--resume` the conversation.
|
|
519
|
+
// Suppress per-role snapshot writes — we save once at the end with the
|
|
520
|
+
// full role set (and partialRecover marker if any role failed).
|
|
521
|
+
swarm._suppressSnapshot = true;
|
|
522
|
+
let partialRecover = false;
|
|
523
|
+
for (const r of snapshot.roles) {
|
|
524
|
+
try {
|
|
525
|
+
await this.spawnRole(swarmId, r.roleName, r.currentTask, undefined, // customPrompt
|
|
526
|
+
undefined, // customDefinition
|
|
527
|
+
undefined, // additionalDirs
|
|
528
|
+
r.claudeSessionId, r.instanceId);
|
|
529
|
+
}
|
|
530
|
+
catch (err) {
|
|
531
|
+
partialRecover = true;
|
|
532
|
+
log.warn({ err, swarmId, instanceId: r.instanceId }, "recover: failed to spawn role");
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
delete swarm._suppressSnapshot;
|
|
536
|
+
swarm.partialRecover = partialRecover;
|
|
537
|
+
saveSwarmSnapshot(swarm);
|
|
538
|
+
// Drain offline inboxes — best-effort per role.
|
|
539
|
+
for (const role of swarm.roles.values()) {
|
|
540
|
+
try {
|
|
541
|
+
await this.inboxRelay.deliver(swarmId, role.instanceId);
|
|
542
|
+
}
|
|
543
|
+
catch (err) {
|
|
544
|
+
log.warn({ err, swarmId, instanceId: role.instanceId }, "recover: inbox drain failed (best-effort)");
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
log.info({ swarmId, roleCount: swarm.roles.size }, "swarm recovered");
|
|
548
|
+
// Re-arm plan-review timeout on recovery: planReviewTimer is runtime-only
|
|
549
|
+
// (see types.ts), so a snapshot persisted while planStatus="reviewing"
|
|
550
|
+
// would otherwise resume with no timer and deadlock the queen indefinitely
|
|
551
|
+
// if the reviewer never replies. Re-arm with the full PLAN_REVIEW_TIMEOUT_MS
|
|
552
|
+
// (we don't persist the original deadline; worst case the swarm waits one
|
|
553
|
+
// additional timeout window past restart, which is bounded — vs. the
|
|
554
|
+
// previous unbounded hang).
|
|
555
|
+
if (swarm.planStatus === "reviewing") {
|
|
556
|
+
this.armPlanReviewTimeout(swarm);
|
|
557
|
+
log.info({ swarmId, timeoutMs: PLAN_REVIEW_TIMEOUT_MS }, "recover: re-armed plan_review timeout");
|
|
558
|
+
}
|
|
559
|
+
// Start inbox watcher for the recovered swarm (best-effort).
|
|
560
|
+
if (swarm.workDir) {
|
|
561
|
+
try {
|
|
562
|
+
this.inboxWatcher.watch(swarmId, swarm.workDir);
|
|
563
|
+
}
|
|
564
|
+
catch (err) {
|
|
565
|
+
log.warn({ err, swarmId }, "inboxWatcher.watch failed (best-effort)");
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
}
|
|
310
569
|
/** Stop a role instance. */
|
|
311
570
|
async stopRole(swarmId, instanceId) {
|
|
312
571
|
const swarm = this.swarms.get(swarmId);
|
|
@@ -316,8 +575,6 @@ export class SwarmCoordinator {
|
|
|
316
575
|
if (!role)
|
|
317
576
|
return;
|
|
318
577
|
role.status = "stopped";
|
|
319
|
-
const router = swarm._router;
|
|
320
|
-
router.unregister(instanceId);
|
|
321
578
|
await this.sessionAdapter.closeSession(role.roleSessionId);
|
|
322
579
|
swarm.roles.delete(instanceId);
|
|
323
580
|
// Persistence: save snapshot after role stopped
|
|
@@ -325,6 +582,68 @@ export class SwarmCoordinator {
|
|
|
325
582
|
this.sendStatusUpdate(swarm);
|
|
326
583
|
log.info({ swarmId, instanceId }, "role stopped");
|
|
327
584
|
}
|
|
585
|
+
/**
|
|
586
|
+
* Roll back side-effects from a partially-completed `create()`.
|
|
587
|
+
*
|
|
588
|
+
* Called from create()'s catch block. Mirrors `destroy()` but is
|
|
589
|
+
* deliberately tolerant of partial state (timer may not exist, snapshot
|
|
590
|
+
* may not exist, sessions list comes from the caller's accumulator
|
|
591
|
+
* because `swarm.roles` may not yet contain every opened session).
|
|
592
|
+
*
|
|
593
|
+
* Best-effort: every step is wrapped — we never throw out of cleanup.
|
|
594
|
+
* Logs a warn so operators see WHY a swarm was rolled back.
|
|
595
|
+
*/
|
|
596
|
+
async cleanupPartialCreate(swarmId, openedSessionIds, snapshotPreexisted, cause) {
|
|
597
|
+
const swarm = this.swarms.get(swarmId);
|
|
598
|
+
// 1. Stop the queen check timer if it was started.
|
|
599
|
+
if (swarm?.checkTimer) {
|
|
600
|
+
try {
|
|
601
|
+
clearTimeout(swarm.checkTimer);
|
|
602
|
+
}
|
|
603
|
+
catch { /* ignore */ }
|
|
604
|
+
swarm.checkTimer = undefined;
|
|
605
|
+
}
|
|
606
|
+
if (swarm?.planReviewTimer) {
|
|
607
|
+
try {
|
|
608
|
+
clearTimeout(swarm.planReviewTimer);
|
|
609
|
+
}
|
|
610
|
+
catch { /* ignore */ }
|
|
611
|
+
swarm.planReviewTimer = undefined;
|
|
612
|
+
}
|
|
613
|
+
// 2. Close every Claude session we managed to open.
|
|
614
|
+
for (const sid of openedSessionIds) {
|
|
615
|
+
try {
|
|
616
|
+
await this.sessionAdapter.closeSession(sid);
|
|
617
|
+
}
|
|
618
|
+
catch (err) {
|
|
619
|
+
log.warn({ err, swarmId, sessionId: sid }, "cleanupPartialCreate: closeSession failed");
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
// 3. Delete recovery.json ONLY if this create() call wrote it.
|
|
623
|
+
// If a snapshot existed before we started (continuation path, or
|
|
624
|
+
// agent restart while a previous snapshot is still on disk),
|
|
625
|
+
// leave it untouched — it belongs to a prior run.
|
|
626
|
+
if (swarm?.workDir) {
|
|
627
|
+
if (!snapshotPreexisted) {
|
|
628
|
+
try {
|
|
629
|
+
deleteSwarmSnapshot(swarm.workDir, swarmId);
|
|
630
|
+
}
|
|
631
|
+
catch (err) {
|
|
632
|
+
log.warn({ err, swarmId }, "cleanupPartialCreate: deleteSwarmSnapshot failed");
|
|
633
|
+
}
|
|
634
|
+
}
|
|
635
|
+
else {
|
|
636
|
+
log.warn({ swarmId }, "cleanupPartialCreate: preserved pre-existing recovery.json");
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
// 4. Remove from in-memory registry so the same id can be retried.
|
|
640
|
+
try {
|
|
641
|
+
this.inboxWatcher.unwatch(swarmId);
|
|
642
|
+
}
|
|
643
|
+
catch { /* ignore */ }
|
|
644
|
+
this.swarms.delete(swarmId);
|
|
645
|
+
log.warn({ swarmId, openedCount: openedSessionIds.length, snapshotPreexisted, err: cause instanceof Error ? cause.message : String(cause) }, "cleanupPartialCreate: rolled back partial swarm state");
|
|
646
|
+
}
|
|
328
647
|
/** Destroy an entire swarm. */
|
|
329
648
|
async destroy(swarmId) {
|
|
330
649
|
const swarm = this.swarms.get(swarmId);
|
|
@@ -332,12 +651,30 @@ export class SwarmCoordinator {
|
|
|
332
651
|
return;
|
|
333
652
|
if (swarm.checkTimer)
|
|
334
653
|
clearTimeout(swarm.checkTimer);
|
|
654
|
+
if (swarm.planReviewTimer) {
|
|
655
|
+
clearTimeout(swarm.planReviewTimer);
|
|
656
|
+
swarm.planReviewTimer = undefined;
|
|
657
|
+
}
|
|
335
658
|
for (const role of swarm.roles.values()) {
|
|
336
659
|
await this.sessionAdapter.closeSession(role.roleSessionId).catch(() => { });
|
|
337
660
|
}
|
|
661
|
+
// Detach tasks before removing the snapshot — keeps task index consistent
|
|
662
|
+
// even if snapshot deletion fails. Best-effort: never block destroy.
|
|
663
|
+
await this.detachTasks(swarmId, swarm.workDir);
|
|
664
|
+
try {
|
|
665
|
+
this.inboxWatcher.unwatch(swarmId);
|
|
666
|
+
}
|
|
667
|
+
catch (err) {
|
|
668
|
+
log.warn({ err, swarmId }, "inboxWatcher.unwatch failed (best-effort)");
|
|
669
|
+
}
|
|
338
670
|
this.swarms.delete(swarmId);
|
|
339
671
|
// Persistence: remove snapshot
|
|
340
|
-
|
|
672
|
+
if (swarm.workDir) {
|
|
673
|
+
deleteSwarmSnapshot(swarm.workDir, swarmId);
|
|
674
|
+
}
|
|
675
|
+
else {
|
|
676
|
+
log.warn({ swarmId }, "destroy: swarm.workDir missing, cannot delete snapshot");
|
|
677
|
+
}
|
|
341
678
|
log.info({ swarmId }, "swarm destroyed");
|
|
342
679
|
}
|
|
343
680
|
/** Check if a session ID belongs to any swarm. */
|
|
@@ -363,6 +700,10 @@ export class SwarmCoordinator {
|
|
|
363
700
|
clearTimeout(swarm.checkTimer);
|
|
364
701
|
swarm.checkTimer = undefined;
|
|
365
702
|
}
|
|
703
|
+
if (swarm.planReviewTimer) {
|
|
704
|
+
clearTimeout(swarm.planReviewTimer);
|
|
705
|
+
swarm.planReviewTimer = undefined;
|
|
706
|
+
}
|
|
366
707
|
await this.runRetroAndCleanup(swarmId, swarm);
|
|
367
708
|
log.info({ swarmId }, "swarm completed — all resources released");
|
|
368
709
|
}
|
|
@@ -377,16 +718,204 @@ export class SwarmCoordinator {
|
|
|
377
718
|
clearTimeout(swarm.checkTimer);
|
|
378
719
|
swarm.checkTimer = undefined;
|
|
379
720
|
}
|
|
721
|
+
if (swarm.planReviewTimer) {
|
|
722
|
+
clearTimeout(swarm.planReviewTimer);
|
|
723
|
+
swarm.planReviewTimer = undefined;
|
|
724
|
+
}
|
|
380
725
|
await this.runRetroAndCleanup(swarmId, swarm);
|
|
381
726
|
log.info({ swarmId }, "swarm failed — all resources released");
|
|
382
727
|
}
|
|
728
|
+
/**
|
|
729
|
+
* InboxRelay messageInterceptor hook. Consumes `plan_review_result`
|
|
730
|
+
* envelopes (written by the `plan_review_submit` MCP tool out-of-process)
|
|
731
|
+
* and converts them into the normalized `submitPlanReview()` flow:
|
|
732
|
+
* planStatus flip, timer clear, plan_approved/plan_rejected to queen.
|
|
733
|
+
*
|
|
734
|
+
* Returning `true` removes the raw message from the LLM payload and marks
|
|
735
|
+
* it delivered so it doesn't re-fire. Returning `false` lets the relay
|
|
736
|
+
* push the message normally.
|
|
737
|
+
*
|
|
738
|
+
* Failures (parse error, invalid verdict) are logged + consumed to avoid a
|
|
739
|
+
* tight reload loop on a malformed file. The submitPlanReview call is
|
|
740
|
+
* idempotent so duplicate reviewer envelopes (e.g. retries) are safe.
|
|
741
|
+
*/
|
|
742
|
+
async handleInterceptedMessage(swarmId, instanceId, msg) {
|
|
743
|
+
if (msg.type !== "plan_review_result")
|
|
744
|
+
return false;
|
|
745
|
+
// Authorization: only reviewer instances may flip planStatus via this path.
|
|
746
|
+
// The MCP tool's `from.startsWith("reviewer")` guard is advisory across the
|
|
747
|
+
// process boundary — a malicious worker could call `message_send` directly
|
|
748
|
+
// with type:"plan_review_result". Enforce again here so the interceptor is
|
|
749
|
+
// the actual source of truth, mirroring the server-side check in the
|
|
750
|
+
// `plan_review_submit` tool handler.
|
|
751
|
+
//
|
|
752
|
+
// We deliberately do NOT allow `from: "system"` here: the timeout fallback
|
|
753
|
+
// calls `submitPlanReview` directly in-process, never via the inbox, so
|
|
754
|
+
// accepting "system" here would only widen the spoofing surface (any worker
|
|
755
|
+
// with `message_send` could forge `from:"system"` since `message_send` does
|
|
756
|
+
// not authenticate the sender).
|
|
757
|
+
if (!msg.from.startsWith("reviewer")) {
|
|
758
|
+
log.warn({ swarmId, instanceId, msgId: msg.id, from: msg.from }, "plan_review_result interceptor: rejected non-reviewer sender — consuming");
|
|
759
|
+
return true;
|
|
760
|
+
}
|
|
761
|
+
// Two-stage parse:
|
|
762
|
+
// 1. Try the canonical JSON envelope (what the `plan_review_submit` MCP
|
|
763
|
+
// tool always emits): `{ verdict, body }`.
|
|
764
|
+
// 2. Fall back to a markdown regex `^verdict:\s*(approved|rejected)`
|
|
765
|
+
// against the raw `data`. Reviewers in misconfigured environments
|
|
766
|
+
// (e.g. their Claude session is missing the clawnet-mcp tools because
|
|
767
|
+
// of the legacy mcp.json bug fixed in PR #102) sometimes hand-write a
|
|
768
|
+
// plan_review_result envelope by direct file write, leaving `data`
|
|
769
|
+
// as a markdown blob. Without this fallback the verdict is lost and
|
|
770
|
+
// the swarm waits the full 10-minute timeout.
|
|
771
|
+
//
|
|
772
|
+
// Spoofing surface is unchanged: the from-prefix check above runs BEFORE
|
|
773
|
+
// either parse path, so a malicious worker cannot forge an approval just
|
|
774
|
+
// by writing markdown.
|
|
775
|
+
let verdict;
|
|
776
|
+
let body = "";
|
|
777
|
+
try {
|
|
778
|
+
const parsed = JSON.parse(msg.data);
|
|
779
|
+
if (parsed.verdict === "approved" || parsed.verdict === "rejected") {
|
|
780
|
+
verdict = parsed.verdict;
|
|
781
|
+
body = typeof parsed.body === "string" ? parsed.body : "";
|
|
782
|
+
}
|
|
783
|
+
}
|
|
784
|
+
catch {
|
|
785
|
+
// Fall through to markdown fallback below.
|
|
786
|
+
}
|
|
787
|
+
if (!verdict && typeof msg.data === "string") {
|
|
788
|
+
const m = msg.data.match(/^\s*verdict\s*:\s*(approved|rejected)\b/im);
|
|
789
|
+
if (m) {
|
|
790
|
+
verdict = m[1].toLowerCase();
|
|
791
|
+
// Carry the entire markdown blob as the body so the queen sees the
|
|
792
|
+
// reviewer's full commentary, not just the verdict line.
|
|
793
|
+
body = msg.data;
|
|
794
|
+
log.info({ swarmId, instanceId, msgId: msg.id, verdict }, "plan_review_result interceptor: markdown verdict fallback used");
|
|
795
|
+
}
|
|
796
|
+
}
|
|
797
|
+
if (!verdict) {
|
|
798
|
+
log.warn({ swarmId, instanceId, msgId: msg.id }, "plan_review_result interceptor: no parsable verdict (neither JSON nor markdown) — consuming");
|
|
799
|
+
return true;
|
|
800
|
+
}
|
|
801
|
+
// IMPORTANT: schedule submitPlanReview asynchronously instead of awaiting.
|
|
802
|
+
// The interceptor runs *inside* InboxRelay.runDeliver for (swarmId, queen),
|
|
803
|
+
// and submitPlanReview internally calls deliverInbox → inboxRelay.deliver
|
|
804
|
+
// for the same (swarmId, queen). Single-flight would coalesce that nested
|
|
805
|
+
// call onto the in-flight promise we are currently inside, and awaiting it
|
|
806
|
+
// here would deadlock. Returning true synchronously consumes the message
|
|
807
|
+
// so the outer runDeliver can finish, releasing the single-flight slot
|
|
808
|
+
// before the deferred submitPlanReview resumes and fires its delivery.
|
|
809
|
+
void this.submitPlanReview({
|
|
810
|
+
swarmId,
|
|
811
|
+
from: msg.from,
|
|
812
|
+
verdict,
|
|
813
|
+
body,
|
|
814
|
+
}).catch((err) => log.warn({ err, swarmId, instanceId, msgId: msg.id, verdict }, "plan_review_result interceptor: deferred submitPlanReview failed"));
|
|
815
|
+
return true;
|
|
816
|
+
}
|
|
817
|
+
/**
|
|
818
|
+
* Apply a plan review verdict from a reviewer (or system on timeout).
|
|
819
|
+
* Idempotent: when planStatus is already "approved" or "rejected", logs and
|
|
820
|
+
* returns. Clears any pending planReviewTimer.
|
|
821
|
+
*
|
|
822
|
+
* Wired to the `plan_review_submit` MCP tool by the host (Task 3); this
|
|
823
|
+
* method itself does not call back into MCP.
|
|
824
|
+
*/
|
|
825
|
+
async submitPlanReview(args) {
|
|
826
|
+
const swarm = this.swarms.get(args.swarmId);
|
|
827
|
+
if (!swarm)
|
|
828
|
+
throw new Error(`swarm not found: ${args.swarmId}`);
|
|
829
|
+
// Idempotency guard — a late reviewer submission after the timeout fired,
|
|
830
|
+
// or a duplicate MCP call, must not double-notify the queen.
|
|
831
|
+
if (swarm.planStatus === "approved" || swarm.planStatus === "rejected") {
|
|
832
|
+
log.warn({
|
|
833
|
+
swarmId: args.swarmId,
|
|
834
|
+
currentStatus: swarm.planStatus,
|
|
835
|
+
attemptedVerdict: args.verdict,
|
|
836
|
+
}, "plan review already finalised, ignoring duplicate submission");
|
|
837
|
+
return;
|
|
838
|
+
}
|
|
839
|
+
if (swarm.planStatus !== "reviewing") {
|
|
840
|
+
// Refuse out-of-band submissions (planStatus = none / draft / approved /
|
|
841
|
+
// rejected). The `approved` / `rejected` cases were already short-circuited
|
|
842
|
+
// above; the remaining states (`none`, `draft`) mean either the queen has
|
|
843
|
+
// not yet asked for review (premature/spoofed call), or the swarm is in a
|
|
844
|
+
// state where flipping planStatus would silently overwrite real progress.
|
|
845
|
+
// No-op + warn instead of "proceeding anyway" so a misbehaving reviewer
|
|
846
|
+
// can't corrupt state by submitting before requestPlanReview ever ran.
|
|
847
|
+
log.warn({ swarmId: args.swarmId, currentStatus: swarm.planStatus, attemptedVerdict: args.verdict, from: args.from }, "submitPlanReview: planStatus !== reviewing — refusing to apply verdict");
|
|
848
|
+
return;
|
|
849
|
+
}
|
|
850
|
+
if (swarm.planReviewTimer) {
|
|
851
|
+
clearTimeout(swarm.planReviewTimer);
|
|
852
|
+
swarm.planReviewTimer = undefined;
|
|
853
|
+
}
|
|
854
|
+
swarm.planStatus = args.verdict === "approved" ? "approved" : "rejected";
|
|
855
|
+
saveSwarmSnapshot(swarm);
|
|
856
|
+
this.sendStatusUpdate(swarm);
|
|
857
|
+
const queen = this.findQueen(swarm);
|
|
858
|
+
if (!queen) {
|
|
859
|
+
log.warn({ swarmId: args.swarmId, verdict: args.verdict }, "submitPlanReview: no queen found — verdict recorded but queen not notified");
|
|
860
|
+
return;
|
|
861
|
+
}
|
|
862
|
+
const messageType = args.verdict === "approved" ? "plan_approved" : "plan_rejected";
|
|
863
|
+
const data = JSON.stringify({
|
|
864
|
+
verdict: args.verdict,
|
|
865
|
+
body: args.body ?? "",
|
|
866
|
+
from: args.from,
|
|
867
|
+
});
|
|
868
|
+
await this.deliverInbox(swarm, queen.instanceId, {
|
|
869
|
+
from: "system",
|
|
870
|
+
type: messageType,
|
|
871
|
+
data,
|
|
872
|
+
});
|
|
873
|
+
log.info({ swarmId: args.swarmId, verdict: args.verdict, from: args.from }, "plan review submitted");
|
|
874
|
+
}
|
|
875
|
+
/**
|
|
876
|
+
* Defensive recovery: re-emit plan_approved / plan_rejected to the queen if
|
|
877
|
+
* planStatus is finalised but no matching system envelope exists in the
|
|
878
|
+
* queen's inbox. Called from the queen check-in tick.
|
|
879
|
+
*
|
|
880
|
+
* Catches the case where the deferred submitPlanReview's call to
|
|
881
|
+
* deliverInbox() threw (e.g. transient fs error or lockfile contention) and
|
|
882
|
+
* only logged a warning — leaving planStatus = "approved" but the queen
|
|
883
|
+
* never receiving plan_approved, which would otherwise wedge the swarm.
|
|
884
|
+
*
|
|
885
|
+
* Idempotent: if a system plan_approved/plan_rejected already exists in the
|
|
886
|
+
* queen inbox (delivered or undelivered), we do nothing.
|
|
887
|
+
*/
|
|
888
|
+
async reconcilePlanStatusForQueen(swarm, queenInstanceId) {
|
|
889
|
+
if (swarm.planStatus !== "approved" && swarm.planStatus !== "rejected") {
|
|
890
|
+
return;
|
|
891
|
+
}
|
|
892
|
+
if (!swarm.workDir)
|
|
893
|
+
return;
|
|
894
|
+
const expectedType = swarm.planStatus === "approved" ? "plan_approved" : "plan_rejected";
|
|
895
|
+
const store = new InboxStore(swarm.workDir, swarm.id, this.resolveHome());
|
|
896
|
+
const queueAll = await store.readAll(queenInstanceId);
|
|
897
|
+
const existing = queueAll.find((m) => m.type === expectedType && m.from === "system");
|
|
898
|
+
if (existing)
|
|
899
|
+
return; // Already delivered (or queued).
|
|
900
|
+
log.warn({ swarmId: swarm.id, planStatus: swarm.planStatus }, `reconcilePlanStatus: queen missing ${expectedType} despite finalised planStatus — re-emitting`);
|
|
901
|
+
const data = JSON.stringify({
|
|
902
|
+
verdict: swarm.planStatus,
|
|
903
|
+
body: "[reconcile] previous notification was lost; re-issued by queen check-in",
|
|
904
|
+
from: "system",
|
|
905
|
+
});
|
|
906
|
+
await this.deliverInbox(swarm, queenInstanceId, {
|
|
907
|
+
from: "system",
|
|
908
|
+
type: expectedType,
|
|
909
|
+
data,
|
|
910
|
+
});
|
|
911
|
+
}
|
|
383
912
|
// ── Private helpers ─────────────────────────────────────────────────
|
|
384
913
|
/** Run retrospective analysis then clean up swarm resources. */
|
|
385
914
|
async runRetroAndCleanup(swarmId, swarm) {
|
|
386
915
|
// 1. Run retrospective (best-effort)
|
|
387
916
|
try {
|
|
388
|
-
const messages = readMessageLog(swarmId);
|
|
389
|
-
const snapshot = loadSwarmSnapshot(swarmId);
|
|
917
|
+
const messages = swarm.workDir ? readMessageLog(swarm.workDir, swarmId) : [];
|
|
918
|
+
const snapshot = swarm.workDir ? loadSwarmSnapshot(swarm.workDir, swarmId) : null;
|
|
390
919
|
if (snapshot && messages.length > 0) {
|
|
391
920
|
const retroResult = await runRetrospective({
|
|
392
921
|
swarmId,
|
|
@@ -442,10 +971,142 @@ export class SwarmCoordinator {
|
|
|
442
971
|
await this.sessionAdapter.closeSession(role.roleSessionId).catch(() => { });
|
|
443
972
|
}
|
|
444
973
|
// 4. Delete recovery.json (keep messages.jsonl)
|
|
445
|
-
|
|
974
|
+
if (swarm.workDir) {
|
|
975
|
+
// Detach tasks before deleting snapshot — keeps task index consistent
|
|
976
|
+
// even if snapshot removal fails. Best-effort: never abort cleanup.
|
|
977
|
+
await this.detachTasks(swarmId, swarm.workDir);
|
|
978
|
+
deleteSwarmSnapshot(swarm.workDir, swarmId);
|
|
979
|
+
}
|
|
446
980
|
// 5. Remove from memory
|
|
447
981
|
this.swarms.delete(swarmId);
|
|
448
982
|
}
|
|
983
|
+
/**
|
|
984
|
+
* Detach all tasks owned by a swarm: clear their swarmId and owner so they
|
|
985
|
+
* survive the swarm's lifecycle and can be reassigned. Best-effort — any
|
|
986
|
+
* failure is logged and swallowed; never blocks destroy/complete/fail.
|
|
987
|
+
*
|
|
988
|
+
* Tasks belonging to other swarms are not touched.
|
|
989
|
+
*/
|
|
990
|
+
async detachTasks(swarmId, workDir) {
|
|
991
|
+
if (!workDir)
|
|
992
|
+
return;
|
|
993
|
+
try {
|
|
994
|
+
// TaskStore.projectRoot() prepends ".clawnet/" itself, so `home` here
|
|
995
|
+
// must be the user home (no `.clawnet` suffix). Previously this used
|
|
996
|
+
// `process.env.CLAWNET_DIR ?? join(homedir(), ".clawnet")`, producing
|
|
997
|
+
// a `~/.clawnet/.clawnet/projects/...` ghost layout for every detach.
|
|
998
|
+
const home = this.resolveHome();
|
|
999
|
+
const store = new TaskStore({ workDir, home });
|
|
1000
|
+
const entries = store.listBySwarm(swarmId);
|
|
1001
|
+
for (const entry of entries) {
|
|
1002
|
+
try {
|
|
1003
|
+
await store.update(entry.id, { swarmId: null, owner: null });
|
|
1004
|
+
}
|
|
1005
|
+
catch (err) {
|
|
1006
|
+
log.warn({ err, swarmId, taskId: entry.id }, "detachTasks: update failed for task (best-effort)");
|
|
1007
|
+
}
|
|
1008
|
+
}
|
|
1009
|
+
}
|
|
1010
|
+
catch (err) {
|
|
1011
|
+
log.warn({ err, swarmId }, "detachTasks: failed (best-effort)");
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
/**
|
|
1015
|
+
* Build the boot-time "### 任务上下文" section appended to a role's system
|
|
1016
|
+
* prompt. Returns null when no taskStore is wired (caller already gates on
|
|
1017
|
+
* that, but we double-check here for safety).
|
|
1018
|
+
*
|
|
1019
|
+
* Queen vs. worker contract:
|
|
1020
|
+
* - queen → lead briefing (queenInstanceId is this role itself) + member briefing
|
|
1021
|
+
* - other → task briefing (owner-scoped) + member briefing
|
|
1022
|
+
*
|
|
1023
|
+
* The "### 任务上下文" anchor and inner section anchors are load-bearing
|
|
1024
|
+
* (matched verbatim by tests and by the agent itself).
|
|
1025
|
+
*/
|
|
1026
|
+
buildBriefingSection(taskStore, swarmId, instanceId, definition) {
|
|
1027
|
+
const isQueen = definition.type === "queen";
|
|
1028
|
+
if (isQueen) {
|
|
1029
|
+
const leadBody = computeLeadBriefing(taskStore, swarmId, instanceId);
|
|
1030
|
+
const memberBody = computeMemberBriefing({ swarmId, instanceId, roleType: "queen" });
|
|
1031
|
+
return ("### 任务上下文\n\n" +
|
|
1032
|
+
formatLeadBriefing(leadBody) +
|
|
1033
|
+
"\n\n" +
|
|
1034
|
+
formatMemberBriefing(memberBody));
|
|
1035
|
+
}
|
|
1036
|
+
const taskBody = computeTaskBriefing(taskStore, swarmId, instanceId);
|
|
1037
|
+
const memberBody = computeMemberBriefing({ swarmId, instanceId, roleType: "worker" });
|
|
1038
|
+
return ("### 任务上下文\n\n" +
|
|
1039
|
+
formatTaskBriefing(taskBody, { instanceId }) +
|
|
1040
|
+
"\n\n" +
|
|
1041
|
+
formatMemberBriefing(memberBody));
|
|
1042
|
+
}
|
|
1043
|
+
// ── Inbox-based message delivery (Task 6 / PR4) ─────────────────────
|
|
1044
|
+
/**
|
|
1045
|
+
* Append a message to a target instance's inbox file then trigger
|
|
1046
|
+
* InboxRelay.deliver to push it through the agent's stdin (single-flight).
|
|
1047
|
+
* Best-effort: returns silently when the swarm has no workDir (in-memory
|
|
1048
|
+
* fixtures used by tests), keeping behaviour compatible with the previous
|
|
1049
|
+
* router.send path.
|
|
1050
|
+
*/
|
|
1051
|
+
async deliverInbox(swarm, targetInstanceId, msg) {
|
|
1052
|
+
if (!swarm.workDir)
|
|
1053
|
+
return;
|
|
1054
|
+
const store = new InboxStore(swarm.workDir, swarm.id, this.resolveHome());
|
|
1055
|
+
const full = {
|
|
1056
|
+
id: `${msg.type}-${randomUUID()}`,
|
|
1057
|
+
from: msg.from,
|
|
1058
|
+
type: msg.type,
|
|
1059
|
+
data: msg.data,
|
|
1060
|
+
...(msg.taskId ? { taskId: msg.taskId } : {}),
|
|
1061
|
+
timestamp: Date.now(),
|
|
1062
|
+
delivered: false,
|
|
1063
|
+
};
|
|
1064
|
+
await store.append(targetInstanceId, full);
|
|
1065
|
+
// Lifecycle short-circuit: InboxRelay.runDeliver gates on
|
|
1066
|
+
// `spawning`/`stopped` and would no-op anyway. Looping past it just burns
|
|
1067
|
+
// cycles for messages that will be drained later — `stopped` is terminal
|
|
1068
|
+
// (the inbox row stays for any successor); `spawning` is drained by the
|
|
1069
|
+
// post-spawn `inboxRelay.deliver` invoked once the role flips to active.
|
|
1070
|
+
const role = swarm.roles.get(targetInstanceId);
|
|
1071
|
+
if (!role || role.status === "stopped" || role.status === "spawning")
|
|
1072
|
+
return;
|
|
1073
|
+
// InboxRelay uses single-flight per instance: a deliver() call invoked
|
|
1074
|
+
// while another pass is in-flight is coalesced into the existing promise
|
|
1075
|
+
// and may NOT see our just-appended row (the in-flight pass already
|
|
1076
|
+
// ran readUndelivered before our append landed). Loop a few iterations,
|
|
1077
|
+
// each preceded by a microtask hop so the in-flight slot has cleared,
|
|
1078
|
+
// until our id is no longer undelivered (or a lifecycle gate trips).
|
|
1079
|
+
for (let attempt = 0; attempt < 4; attempt++) {
|
|
1080
|
+
await this.inboxRelay.deliver(swarm.id, targetInstanceId);
|
|
1081
|
+
// Yield once so InboxRelay's `.finally(delete inFlight)` runs before we
|
|
1082
|
+
// attempt the follow-up deliver — otherwise the next call would join
|
|
1083
|
+
// the still-in-flight promise instead of starting a fresh pass.
|
|
1084
|
+
await new Promise((r) => setImmediate(r));
|
|
1085
|
+
const remaining = await store.readUndelivered(targetInstanceId);
|
|
1086
|
+
if (!remaining.some((m) => m.id === full.id))
|
|
1087
|
+
return;
|
|
1088
|
+
// Re-check lifecycle: if the role transitioned to stopped between
|
|
1089
|
+
// attempts (e.g. shutdown raced with delivery), don't keep spinning.
|
|
1090
|
+
const cur = swarm.roles.get(targetInstanceId);
|
|
1091
|
+
if (!cur || cur.status === "stopped")
|
|
1092
|
+
return;
|
|
1093
|
+
}
|
|
1094
|
+
log.warn({ swarmId: swarm.id, instanceId: targetInstanceId, msgId: full.id, type: msg.type }, "deliverInbox: message remained undelivered after 4 attempts");
|
|
1095
|
+
}
|
|
1096
|
+
/**
|
|
1097
|
+
* Broadcast helper: deliverInbox to every role in the swarm except those
|
|
1098
|
+
* already stopped. Includes `spawning` roles intentionally — the inbox file
|
|
1099
|
+
* is durable, so the relay (or runPostStart drain in the spawn path) will
|
|
1100
|
+
* deliver the message once the role's stdin opens. Matches the legacy
|
|
1101
|
+
* router.broadcast behaviour, which never gated on "spawning".
|
|
1102
|
+
*/
|
|
1103
|
+
async broadcastInbox(swarm, msg) {
|
|
1104
|
+
for (const role of swarm.roles.values()) {
|
|
1105
|
+
if (role.status === "stopped")
|
|
1106
|
+
continue;
|
|
1107
|
+
await this.deliverInbox(swarm, role.instanceId, msg);
|
|
1108
|
+
}
|
|
1109
|
+
}
|
|
449
1110
|
findQueen(swarm) {
|
|
450
1111
|
for (const role of swarm.roles.values()) {
|
|
451
1112
|
if (role.definition.type === "queen" && role.status !== "stopped")
|
|
@@ -453,6 +1114,18 @@ export class SwarmCoordinator {
|
|
|
453
1114
|
}
|
|
454
1115
|
return undefined;
|
|
455
1116
|
}
|
|
1117
|
+
/**
|
|
1118
|
+
* Public lookup: returns the active queen role instance for a swarm, or
|
|
1119
|
+
* undefined when the swarm/queen is not present. Identifies the queen by
|
|
1120
|
+
* `definition.type === "queen"` (not by roleName) so custom queen roles
|
|
1121
|
+
* with non-default names are also recognised.
|
|
1122
|
+
*/
|
|
1123
|
+
findQueenInstance(swarmId) {
|
|
1124
|
+
const swarm = this.swarms.get(swarmId);
|
|
1125
|
+
if (!swarm)
|
|
1126
|
+
return undefined;
|
|
1127
|
+
return this.findQueen(swarm);
|
|
1128
|
+
}
|
|
456
1129
|
findReviewer(swarm) {
|
|
457
1130
|
for (const role of swarm.roles.values()) {
|
|
458
1131
|
if (role.roleName === "reviewer" && role.status !== "stopped")
|
|
@@ -462,7 +1135,10 @@ export class SwarmCoordinator {
|
|
|
462
1135
|
}
|
|
463
1136
|
/** Build a summary of the previous swarm run from messages.jsonl for continuation. */
|
|
464
1137
|
buildContinuationSummary(swarmId) {
|
|
465
|
-
const
|
|
1138
|
+
const swarm = this.swarms.get(swarmId);
|
|
1139
|
+
if (!swarm?.workDir)
|
|
1140
|
+
return null;
|
|
1141
|
+
const messages = readMessageLog(swarm.workDir, swarmId);
|
|
466
1142
|
if (messages.length === 0)
|
|
467
1143
|
return null;
|
|
468
1144
|
const MAX_LEN = 2000;
|
|
@@ -492,7 +1168,7 @@ export class SwarmCoordinator {
|
|
|
492
1168
|
return summary;
|
|
493
1169
|
}
|
|
494
1170
|
/** Send plan to reviewer for review. Auto-approves if no reviewer available. */
|
|
495
|
-
requestPlanReview(swarm, plan) {
|
|
1171
|
+
async requestPlanReview(swarm, plan) {
|
|
496
1172
|
const reviewer = this.findReviewer(swarm);
|
|
497
1173
|
if (!reviewer) {
|
|
498
1174
|
// No reviewer — auto-approve
|
|
@@ -505,9 +1181,8 @@ export class SwarmCoordinator {
|
|
|
505
1181
|
swarm.planStatus = "reviewing";
|
|
506
1182
|
saveSwarmSnapshot(swarm);
|
|
507
1183
|
this.sendStatusUpdate(swarm);
|
|
508
|
-
const router = swarm._router;
|
|
509
1184
|
const planJson = JSON.stringify(plan, null, 2);
|
|
510
|
-
|
|
1185
|
+
await this.deliverInbox(swarm, reviewer.instanceId, {
|
|
511
1186
|
from: "system",
|
|
512
1187
|
type: "plan_review",
|
|
513
1188
|
data: `[系统] 请审查以下执行计划,评估其可行性和风险。
|
|
@@ -523,48 +1198,38 @@ export class SwarmCoordinator {
|
|
|
523
1198
|
${planJson}
|
|
524
1199
|
\`\`\`
|
|
525
1200
|
|
|
526
|
-
|
|
527
|
-
-
|
|
528
|
-
-
|
|
1201
|
+
审查完成后,请通过 \`mcp__clawnet__plan_review_submit\` 工具提交结果(不要再用 report action):
|
|
1202
|
+
- 工具入参:\`{ workDir, swarmId, from: "<你的 instanceId,必须 reviewer-* 开头>", verdict: "approved" | "rejected", body?: "审查意见或拒绝原因" }\`
|
|
1203
|
+
- 批准:verdict="approved",body 中可补充意见
|
|
1204
|
+
- 拒绝:verdict="rejected",body 中说明拒绝原因和改进建议
|
|
1205
|
+
- 系统会在 10 分钟内未收到 verdict 时自动批准(带 timeout 警告),请尽快回复。`,
|
|
529
1206
|
taskId: "plan_review",
|
|
530
|
-
timestamp: Date.now(),
|
|
531
1207
|
});
|
|
532
1208
|
log.info({ swarmId: swarm.id, reviewer: reviewer.instanceId }, "plan sent to reviewer");
|
|
1209
|
+
// Register a timeout fallback so a non-responsive reviewer cannot deadlock
|
|
1210
|
+
// the queen. Cleared on submitPlanReview (success path) and on swarm
|
|
1211
|
+
// shutdown / recover. Idempotent — safe to call multiple times.
|
|
1212
|
+
this.armPlanReviewTimeout(swarm);
|
|
533
1213
|
}
|
|
534
|
-
/**
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
}
|
|
552
|
-
}
|
|
553
|
-
else {
|
|
554
|
-
swarm.planStatus = "rejected";
|
|
555
|
-
saveSwarmSnapshot(swarm);
|
|
556
|
-
this.sendStatusUpdate(swarm);
|
|
557
|
-
log.info({ swarmId: swarm.id, reviewer: fromRole.instanceId }, "plan rejected by reviewer");
|
|
558
|
-
// Notify queen to revise the plan
|
|
559
|
-
if (queen) {
|
|
560
|
-
router.send(queen.instanceId, {
|
|
561
|
-
from: "system",
|
|
562
|
-
type: "plan_rejected",
|
|
563
|
-
data: `[系统] 你的执行计划未通过审查,请根据反馈修改。\n\n审查反馈:${action.output ?? "未提供具体原因"}`,
|
|
564
|
-
timestamp: Date.now(),
|
|
565
|
-
});
|
|
566
|
-
}
|
|
567
|
-
}
|
|
1214
|
+
/**
|
|
1215
|
+
* (Re)arm the plan-review timeout fallback. Clears any prior timer first so
|
|
1216
|
+
* call sites don't need to. The fire path delegates to submitPlanReview so
|
|
1217
|
+
* its idempotency guard protects against reviewer/timer races.
|
|
1218
|
+
*/
|
|
1219
|
+
armPlanReviewTimeout(swarm) {
|
|
1220
|
+
if (swarm.planReviewTimer)
|
|
1221
|
+
clearTimeout(swarm.planReviewTimer);
|
|
1222
|
+
swarm.planReviewTimer = setTimeout(() => {
|
|
1223
|
+
if (swarm.planStatus !== "reviewing")
|
|
1224
|
+
return;
|
|
1225
|
+
log.warn({ swarmId: swarm.id }, "plan review timed out — auto-approving with warning");
|
|
1226
|
+
void this.submitPlanReview({
|
|
1227
|
+
swarmId: swarm.id,
|
|
1228
|
+
from: "system",
|
|
1229
|
+
verdict: "approved",
|
|
1230
|
+
body: "[timeout] reviewer 未在 10 分钟内提交审查结果,已自动通过;请人工复核。",
|
|
1231
|
+
}).catch((err) => log.error({ err, swarmId: swarm.id }, "auto-approve on timeout failed"));
|
|
1232
|
+
}, PLAN_REVIEW_TIMEOUT_MS);
|
|
568
1233
|
}
|
|
569
1234
|
findByRoleSessionId(roleSessionId) {
|
|
570
1235
|
// roleSessionId format: "{swarmId}::{instanceId}"
|
|
@@ -579,240 +1244,68 @@ ${planJson}
|
|
|
579
1244
|
const role = swarm.roles.get(instanceId);
|
|
580
1245
|
return { swarm, role };
|
|
581
1246
|
}
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
const router2 = swarm._router;
|
|
592
|
-
router2.send(fromRole.instanceId, {
|
|
593
|
-
from: "system",
|
|
594
|
-
type: "system",
|
|
595
|
-
data: `[系统] 任务分配被拦截。当前计划状态为「${swarm.planStatus}」,请等待审查完成后再分配任务。`,
|
|
596
|
-
timestamp: Date.now(),
|
|
597
|
-
});
|
|
598
|
-
break;
|
|
599
|
-
}
|
|
600
|
-
const msg = {
|
|
601
|
-
messageId: randomUUID(),
|
|
602
|
-
from: fromRole.instanceId,
|
|
603
|
-
type: action.type,
|
|
604
|
-
data: action.data,
|
|
605
|
-
taskId: action.taskId,
|
|
606
|
-
timestamp: Date.now(),
|
|
607
|
-
};
|
|
608
|
-
// On-demand spawn: if target not started yet, spawn it first
|
|
609
|
-
if (!router.has(action.to)) {
|
|
610
|
-
await this.ensureRoleStarted(swarm, action.to, router);
|
|
611
|
-
}
|
|
612
|
-
// Retry with exponential backoff
|
|
613
|
-
this.sendWithRetry(swarm, fromRole, router, action.to, msg);
|
|
614
|
-
break;
|
|
615
|
-
}
|
|
616
|
-
case "report": {
|
|
617
|
-
// Update role status
|
|
618
|
-
if (action.status === "completed" || action.status === "failed") {
|
|
619
|
-
fromRole.currentTask = undefined;
|
|
620
|
-
}
|
|
621
|
-
// Release worker process on task completion (queen always stays alive)
|
|
622
|
-
if (action.status === "completed" &&
|
|
623
|
-
fromRole.definition.type !== "queen") {
|
|
624
|
-
this.stopRole(swarm.id, fromRole.instanceId).catch((err) => {
|
|
625
|
-
log.warn({ err, instanceId: fromRole.instanceId }, "failed to release worker after completion");
|
|
626
|
-
});
|
|
627
|
-
}
|
|
628
|
-
// Check for plan review result
|
|
629
|
-
if (action.taskId === "plan_review") {
|
|
630
|
-
this.handlePlanReviewResult(swarm, fromRole, action);
|
|
631
|
-
break;
|
|
1247
|
+
startQueenCheck(swarmId) {
|
|
1248
|
+
const swarm = this.swarms.get(swarmId);
|
|
1249
|
+
if (!swarm)
|
|
1250
|
+
return;
|
|
1251
|
+
const tick = async () => {
|
|
1252
|
+
try {
|
|
1253
|
+
// Stop ticking if swarm is no longer active
|
|
1254
|
+
if (swarm.status === "completed" || swarm.status === "failed" || swarm.status === "paused") {
|
|
1255
|
+
return;
|
|
632
1256
|
}
|
|
633
|
-
// Notify queen about the report (so she can coordinate next steps)
|
|
634
1257
|
const queen = this.findQueen(swarm);
|
|
635
|
-
if (queen
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
if (action.status === "completed") {
|
|
650
|
-
this.complete(swarm.id).catch((err) => {
|
|
651
|
-
log.error({ err, swarmId: swarm.id }, "failed to complete swarm");
|
|
652
|
-
});
|
|
653
|
-
}
|
|
654
|
-
else if (action.status === "failed") {
|
|
655
|
-
this.fail(swarm.id).catch((err) => {
|
|
656
|
-
log.error({ err, swarmId: swarm.id }, "failed to mark swarm as failed");
|
|
657
|
-
});
|
|
658
|
-
}
|
|
1258
|
+
if (!queen || queen.status === "stopped")
|
|
1259
|
+
return;
|
|
1260
|
+
// Reconcile planStatus vs queen inbox: if planStatus is finalised
|
|
1261
|
+
// (approved/rejected) but the queen never received the corresponding
|
|
1262
|
+
// plan_approved / plan_rejected envelope, re-emit it. This recovers
|
|
1263
|
+
// from the rare case where the deferred submitPlanReview's call to
|
|
1264
|
+
// deliverInbox() failed and only logged a warning (the reviewer saw
|
|
1265
|
+
// {submitted:true} but the queen never woke up).
|
|
1266
|
+
await this.reconcilePlanStatusForQueen(swarm, queen.instanceId).catch((err) => log.warn({ err, swarmId }, "reconcilePlanStatus failed (best-effort)"));
|
|
1267
|
+
// Adaptive interval: slow down when only queen is left or all roles idle
|
|
1268
|
+
const activeWorkers = [...swarm.roles.values()].filter((r) => r.definition.type !== "queen" && r.status === "active");
|
|
1269
|
+
// Track consecutive idle checks
|
|
1270
|
+
if (activeWorkers.length > 0) {
|
|
1271
|
+
swarm.idleCheckCount = 0;
|
|
659
1272
|
}
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
break;
|
|
663
|
-
}
|
|
664
|
-
case "broadcast": {
|
|
665
|
-
const msg = {
|
|
666
|
-
from: fromRole.instanceId,
|
|
667
|
-
type: "broadcast",
|
|
668
|
-
data: action.data,
|
|
669
|
-
timestamp: Date.now(),
|
|
670
|
-
};
|
|
671
|
-
router.broadcast(msg, fromRole.instanceId);
|
|
672
|
-
break;
|
|
673
|
-
}
|
|
674
|
-
case "spawn_role": {
|
|
675
|
-
// Only queen can spawn roles — await to ensure route is registered before subsequent sends
|
|
676
|
-
if (fromRole.definition.type === "queen") {
|
|
677
|
-
await this.spawnRole(swarm.id, action.roleName, router, action.task);
|
|
1273
|
+
else {
|
|
1274
|
+
swarm.idleCheckCount++;
|
|
678
1275
|
}
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
this.
|
|
685
|
-
|
|
1276
|
+
// Auto-pause if idle too long
|
|
1277
|
+
if (swarm.idleCheckCount >= swarm.maxIdleChecks) {
|
|
1278
|
+
swarm.isPaused = true;
|
|
1279
|
+
swarm.status = "paused";
|
|
1280
|
+
saveSwarmSnapshot(swarm);
|
|
1281
|
+
this.sendStatusUpdate(swarm);
|
|
1282
|
+
await this.deliverInbox(swarm, queen.instanceId, {
|
|
1283
|
+
from: "system",
|
|
1284
|
+
type: "system",
|
|
1285
|
+
data: `[系统] 蜂群已自动暂停。连续${swarm.maxIdleChecks}次巡查发现所有成员空闲,已暂停任务释放资源。用户可以resume恢复。`,
|
|
686
1286
|
});
|
|
1287
|
+
log.info({ swarmId, idleChecks: swarm.maxIdleChecks }, "swarm auto-paused");
|
|
1288
|
+
return; // Don't schedule next tick
|
|
687
1289
|
}
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
}
|
|
691
|
-
}
|
|
692
|
-
/** On-demand spawn: start a role instance if it's not currently running. */
|
|
693
|
-
async ensureRoleStarted(swarm, instanceId, router) {
|
|
694
|
-
// Already running?
|
|
695
|
-
if (router.has(instanceId))
|
|
696
|
-
return;
|
|
697
|
-
// Determine role name from instanceId prefix (e.g. "dev-1" → "developer")
|
|
698
|
-
const prefix = instanceId.replace(/-\d+$/, "");
|
|
699
|
-
const pendingSpecs = swarm._pendingRoleSpecs;
|
|
700
|
-
// Try to find matching role spec from the pending specs
|
|
701
|
-
let roleName;
|
|
702
|
-
let customPrompt;
|
|
703
|
-
let customDefinition;
|
|
704
|
-
if (pendingSpecs) {
|
|
705
|
-
for (const spec of pendingSpecs) {
|
|
706
|
-
const def = spec.customDefinition
|
|
707
|
-
? { shortName: spec.customDefinition.shortName }
|
|
708
|
-
: (() => { try {
|
|
709
|
-
return loadRole(spec.roleName);
|
|
710
|
-
}
|
|
711
|
-
catch {
|
|
712
|
-
return null;
|
|
713
|
-
} })();
|
|
714
|
-
if (def && def.shortName === prefix) {
|
|
715
|
-
roleName = spec.roleName;
|
|
716
|
-
customPrompt = spec.customPrompt;
|
|
717
|
-
customDefinition = spec.customDefinition;
|
|
718
|
-
break;
|
|
719
|
-
}
|
|
720
|
-
}
|
|
721
|
-
}
|
|
722
|
-
if (!roleName) {
|
|
723
|
-
log.warn({ swarmId: swarm.id, instanceId }, "cannot on-demand spawn: unknown role for instanceId");
|
|
724
|
-
return;
|
|
725
|
-
}
|
|
726
|
-
log.info({ swarmId: swarm.id, instanceId, roleName }, "on-demand spawning role");
|
|
727
|
-
await this.spawnRole(swarm.id, roleName, router, undefined, customPrompt, customDefinition);
|
|
728
|
-
}
|
|
729
|
-
/** Send a message with exponential backoff retry. Fire-and-forget (async). */
|
|
730
|
-
async sendWithRetry(swarm, fromRole, router, targetInstanceId, msg) {
|
|
731
|
-
for (let attempt = 0; attempt <= SEND_MAX_RETRIES; attempt++) {
|
|
732
|
-
const result = router.send(targetInstanceId, msg);
|
|
733
|
-
if (result.delivered) {
|
|
734
|
-
appendMessageLog(swarm.id, {
|
|
735
|
-
type: "action", action: "send",
|
|
736
|
-
from: fromRole.instanceId, to: targetInstanceId,
|
|
737
|
-
data: msg.data, messageId: msg.messageId,
|
|
738
|
-
timestamp: Date.now(),
|
|
739
|
-
});
|
|
740
|
-
return;
|
|
741
|
-
}
|
|
742
|
-
log.warn({ swarmId: swarm.id, to: targetInstanceId, attempt, reason: result.reason, messageId: msg.messageId }, "message delivery failed");
|
|
743
|
-
if (attempt < SEND_MAX_RETRIES) {
|
|
744
|
-
await sleep(SEND_RETRY_BASE_MS * 2 ** attempt);
|
|
745
|
-
}
|
|
746
|
-
}
|
|
747
|
-
// All retries exhausted — notify Queen
|
|
748
|
-
log.error({ swarmId: swarm.id, to: targetInstanceId, messageId: msg.messageId }, "message delivery failed after all retries");
|
|
749
|
-
appendMessageLog(swarm.id, {
|
|
750
|
-
type: "action", action: "send",
|
|
751
|
-
from: fromRole.instanceId, to: targetInstanceId,
|
|
752
|
-
data: msg.data, messageId: msg.messageId,
|
|
753
|
-
delivered: false,
|
|
754
|
-
timestamp: Date.now(),
|
|
755
|
-
});
|
|
756
|
-
const queen = this.findQueen(swarm);
|
|
757
|
-
if (queen) {
|
|
758
|
-
router.send(queen.instanceId, {
|
|
759
|
-
from: "system",
|
|
760
|
-
type: "system",
|
|
761
|
-
data: `[系统] 消息发送给 ${targetInstanceId} 失败(已重试 ${SEND_MAX_RETRIES} 次)。原始消息来自 ${fromRole.instanceId}。`,
|
|
762
|
-
timestamp: Date.now(),
|
|
763
|
-
});
|
|
764
|
-
}
|
|
765
|
-
}
|
|
766
|
-
startQueenCheck(swarmId) {
|
|
767
|
-
const swarm = this.swarms.get(swarmId);
|
|
768
|
-
if (!swarm)
|
|
769
|
-
return;
|
|
770
|
-
const tick = () => {
|
|
771
|
-
// Stop ticking if swarm is no longer active
|
|
772
|
-
if (swarm.status === "completed" || swarm.status === "failed" || swarm.status === "paused") {
|
|
773
|
-
return;
|
|
774
|
-
}
|
|
775
|
-
const queen = this.findQueen(swarm);
|
|
776
|
-
if (!queen || queen.status === "stopped")
|
|
777
|
-
return;
|
|
778
|
-
const router = swarm._router;
|
|
779
|
-
// Adaptive interval: slow down when only queen is left or all roles idle
|
|
780
|
-
const activeWorkers = [...swarm.roles.values()].filter((r) => r.definition.type !== "queen" && r.status === "active");
|
|
781
|
-
// Track consecutive idle checks
|
|
782
|
-
if (activeWorkers.length > 0) {
|
|
783
|
-
swarm.idleCheckCount = 0;
|
|
784
|
-
}
|
|
785
|
-
else {
|
|
786
|
-
swarm.idleCheckCount++;
|
|
787
|
-
}
|
|
788
|
-
// Auto-pause if idle too long
|
|
789
|
-
if (swarm.idleCheckCount >= swarm.maxIdleChecks) {
|
|
790
|
-
swarm.isPaused = true;
|
|
791
|
-
swarm.status = "paused";
|
|
792
|
-
saveSwarmSnapshot(swarm);
|
|
793
|
-
this.sendStatusUpdate(swarm);
|
|
794
|
-
router.send(queen.instanceId, {
|
|
1290
|
+
const statusJson = JSON.stringify(this.buildStatusPayload(swarm), null, 2);
|
|
1291
|
+
await this.deliverInbox(swarm, queen.instanceId, {
|
|
795
1292
|
from: "system",
|
|
796
1293
|
type: "system",
|
|
797
|
-
data: `[系统]
|
|
798
|
-
timestamp: Date.now(),
|
|
1294
|
+
data: `[系统] 定时巡查触发。请查看当前蜂群状态,判断是否需要采取行动。\n\n当前蜂群成员及状态:\n${statusJson}`,
|
|
799
1295
|
});
|
|
800
|
-
|
|
801
|
-
|
|
1296
|
+
const nextInterval = activeWorkers.length > 0
|
|
1297
|
+
? QUEEN_CHECK_INTERVAL_MS
|
|
1298
|
+
: QUEEN_CHECK_IDLE_INTERVAL_MS;
|
|
1299
|
+
swarm.checkTimer = setTimeout(() => { void tick().catch((err) => log.error({ err, swarmId }, "queen tick failed (best-effort)")); }, nextInterval);
|
|
1300
|
+
}
|
|
1301
|
+
catch (err) {
|
|
1302
|
+
// deliverInbox / disk-IO failures must not surface as unhandled
|
|
1303
|
+
// rejection from the setTimeout fire-and-forget. Log and swallow;
|
|
1304
|
+
// the next tick (if any) will retry on its own schedule.
|
|
1305
|
+
log.error({ err, swarmId }, "queen tick failed (best-effort)");
|
|
802
1306
|
}
|
|
803
|
-
const statusJson = JSON.stringify(this.buildStatusPayload(swarm), null, 2);
|
|
804
|
-
router.send(queen.instanceId, {
|
|
805
|
-
from: "system",
|
|
806
|
-
type: "system",
|
|
807
|
-
data: `[系统] 定时巡查触发。请查看当前蜂群状态,判断是否需要采取行动。\n\n当前蜂群成员及状态:\n${statusJson}`,
|
|
808
|
-
timestamp: Date.now(),
|
|
809
|
-
});
|
|
810
|
-
const nextInterval = activeWorkers.length > 0
|
|
811
|
-
? QUEEN_CHECK_INTERVAL_MS
|
|
812
|
-
: QUEEN_CHECK_IDLE_INTERVAL_MS;
|
|
813
|
-
swarm.checkTimer = setTimeout(tick, nextInterval);
|
|
814
1307
|
};
|
|
815
|
-
swarm.checkTimer = setTimeout(tick, QUEEN_CHECK_INTERVAL_MS);
|
|
1308
|
+
swarm.checkTimer = setTimeout(() => { void tick().catch((err) => log.error({ err, swarmId }, "queen tick failed (best-effort)")); }, QUEEN_CHECK_INTERVAL_MS);
|
|
816
1309
|
}
|
|
817
1310
|
buildRoleListString(swarm) {
|
|
818
1311
|
const lines = [];
|