opencode-swarm-plugin 0.38.0 → 0.40.0
This diff shows the changes between publicly released versions of this package, as published to its public registry. It is provided for informational purposes only.
- package/.env +2 -0
- package/.hive/eval-results.json +26 -0
- package/.hive/issues.jsonl +27 -0
- package/.hive/memories.jsonl +23 -1
- package/.opencode/eval-history.jsonl +12 -0
- package/CHANGELOG.md +182 -0
- package/README.md +29 -12
- package/bin/swarm.test.ts +881 -0
- package/bin/swarm.ts +686 -0
- package/dist/compaction-hook.d.ts +8 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-observability.d.ts +173 -0
- package/dist/compaction-observability.d.ts.map +1 -0
- package/dist/compaction-prompt-scoring.d.ts +124 -0
- package/dist/compaction-prompt-scoring.d.ts.map +1 -0
- package/dist/eval-capture.d.ts +174 -1
- package/dist/eval-capture.d.ts.map +1 -1
- package/dist/eval-gates.d.ts +84 -0
- package/dist/eval-gates.d.ts.map +1 -0
- package/dist/eval-history.d.ts +117 -0
- package/dist/eval-history.d.ts.map +1 -0
- package/dist/eval-learning.d.ts +216 -0
- package/dist/eval-learning.d.ts.map +1 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +80 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +16098 -651
- package/dist/plugin.js +16012 -756
- package/dist/post-compaction-tracker.d.ts +133 -0
- package/dist/post-compaction-tracker.d.ts.map +1 -0
- package/dist/schemas/task.d.ts +3 -3
- package/dist/swarm-orchestrate.d.ts +23 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +25 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm.d.ts +4 -0
- package/dist/swarm.d.ts.map +1 -1
- package/evals/README.md +702 -105
- package/evals/compaction-prompt.eval.ts +149 -0
- package/evals/coordinator-behavior.eval.ts +8 -8
- package/evals/fixtures/compaction-prompt-cases.ts +305 -0
- package/evals/lib/compaction-loader.test.ts +248 -0
- package/evals/lib/compaction-loader.ts +320 -0
- package/evals/lib/data-loader.test.ts +345 -0
- package/evals/lib/data-loader.ts +107 -6
- package/evals/scorers/compaction-prompt-scorers.ts +145 -0
- package/evals/scorers/compaction-scorers.ts +13 -13
- package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
- package/evals/scorers/coordinator-discipline.ts +348 -15
- package/evals/scorers/index.test.ts +146 -0
- package/evals/scorers/index.ts +104 -0
- package/evals/swarm-decomposition.eval.ts +9 -2
- package/examples/commands/swarm.md +291 -21
- package/examples/plugin-wrapper-template.ts +117 -0
- package/package.json +7 -5
- package/scripts/migrate-unknown-sessions.ts +349 -0
- package/src/compaction-capture.integration.test.ts +257 -0
- package/src/compaction-hook.test.ts +42 -0
- package/src/compaction-hook.ts +315 -86
- package/src/compaction-observability.integration.test.ts +139 -0
- package/src/compaction-observability.test.ts +187 -0
- package/src/compaction-observability.ts +324 -0
- package/src/compaction-prompt-scorers.test.ts +299 -0
- package/src/compaction-prompt-scoring.ts +298 -0
- package/src/eval-capture.test.ts +626 -1
- package/src/eval-capture.ts +286 -2
- package/src/eval-gates.test.ts +306 -0
- package/src/eval-gates.ts +218 -0
- package/src/eval-history.test.ts +508 -0
- package/src/eval-history.ts +214 -0
- package/src/eval-learning.test.ts +378 -0
- package/src/eval-learning.ts +360 -0
- package/src/eval-runner.test.ts +96 -0
- package/src/eval-runner.ts +356 -0
- package/src/hive.ts +34 -0
- package/src/index.ts +115 -2
- package/src/memory.test.ts +110 -0
- package/src/memory.ts +34 -0
- package/src/post-compaction-tracker.test.ts +251 -0
- package/src/post-compaction-tracker.ts +237 -0
- package/src/swarm-decompose.ts +2 -2
- package/src/swarm-orchestrate.ts +2 -2
- package/src/swarm-prompts.ts +2 -2
- package/src/swarm-review.ts +3 -3
- package/dist/beads.d.ts +0 -386
- package/dist/beads.d.ts.map +0 -1
- package/dist/schemas/bead-events.d.ts +0 -698
- package/dist/schemas/bead-events.d.ts.map +0 -1
- package/dist/schemas/bead.d.ts +0 -255
- package/dist/schemas/bead.d.ts.map +0 -1
- /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
package/evals/scorers/index.test.ts
ADDED
@@ -0,0 +1,146 @@
+/**
+ * Tests for decomposition scorers
+ *
+ * Uses Vitest (evalite's test runner), not Bun's test runner.
+ *
+ * Note: evalite's Score type only exposes `score`, not `message`.
+ * We test scores only - message testing requires accessing internal scorer.
+ */
+import { describe, expect, test } from "vitest";
+import {
+  coverageCompleteness,
+  decompositionCoherence,
+  instructionClarity,
+  subtaskIndependence,
+} from "./index.js";
+
+describe("Heuristic Scorers", () => {
+  const goodDecomposition = JSON.stringify({
+    epic: { title: "Add auth", description: "Add authentication" },
+    subtasks: [
+      {
+        title: "Add login form component",
+        description: "Create React component for login with email/password",
+        files: ["src/components/LoginForm.tsx"],
+      },
+      {
+        title: "Add auth API routes",
+        description: "Create API endpoints for login/logout/session",
+        files: ["src/api/auth.ts"],
+      },
+      {
+        title: "Add auth middleware",
+        description: "Create middleware to protect routes",
+        files: ["src/middleware/auth.ts"],
+      },
+    ],
+  });
+
+  const conflictingDecomposition = JSON.stringify({
+    epic: { title: "Add auth", description: "Add authentication" },
+    subtasks: [
+      {
+        title: "Add login",
+        files: ["src/auth.ts"],
+      },
+      {
+        title: "Add logout",
+        files: ["src/auth.ts"], // Same file - conflict!
+      },
+    ],
+  });
+
+  test("subtaskIndependence scores 1.0 for no conflicts", async () => {
+    const result = await subtaskIndependence({
+      output: goodDecomposition,
+      expected: undefined,
+      input: {},
+    });
+    expect(result.score).toBe(1);
+  });
+
+  test("subtaskIndependence scores 0 for file conflicts", async () => {
+    const result = await subtaskIndependence({
+      output: conflictingDecomposition,
+      expected: undefined,
+      input: {},
+    });
+    expect(result.score).toBe(0);
+  });
+
+  test("instructionClarity scores higher for detailed subtasks", async () => {
+    const result = await instructionClarity({
+      output: goodDecomposition,
+      expected: undefined,
+      input: {},
+    });
+    expect(result.score).toBeGreaterThan(0.7);
+  });
+
+  test("coverageCompleteness checks subtask count", async () => {
+    const result = await coverageCompleteness({
+      output: goodDecomposition,
+      expected: { minSubtasks: 2, maxSubtasks: 5 },
+      input: {},
+    });
+    expect(result.score).toBe(1);
+  });
+});
+
+describe("LLM-as-Judge Scorer", () => {
+  // Skip LLM test in CI - requires API key
+  const hasApiKey = !!process.env.AI_GATEWAY_API_KEY;
+
+  test(
+    "decompositionCoherence returns valid score",
+    async () => {
+      if (!hasApiKey) {
+        console.log("Skipping LLM test - no AI_GATEWAY_API_KEY");
+        return;
+      }
+
+      const decomposition = JSON.stringify({
+        epic: { title: "Add auth", description: "Add authentication" },
+        subtasks: [
+          {
+            title: "Add login form",
+            description: "Create login UI",
+            files: ["src/LoginForm.tsx"],
+          },
+          {
+            title: "Add auth API",
+            description: "Create auth endpoints",
+            files: ["src/api/auth.ts"],
+          },
+        ],
+      });
+
+      const result = await decompositionCoherence({
+        output: decomposition,
+        expected: undefined,
+        input: { task: "Add user authentication with login/logout" },
+      });
+
+      expect(result.score).toBeGreaterThanOrEqual(0);
+      expect(result.score).toBeLessThanOrEqual(1);
+    },
+    30000,
+  );
+
+  test("decompositionCoherence scores invalid decomposition low", async () => {
+    if (!process.env.AI_GATEWAY_API_KEY) {
+      console.log("Skipping LLM test - no AI_GATEWAY_API_KEY");
+      return;
+    }
+
+    const result = await decompositionCoherence({
+      output: "not valid json at all {{{",
+      expected: undefined,
+      input: {},
+    });
+
+    // LLM should recognize garbage input and score it very low
+    // (0 or close to 0, not 0.5 fallback)
+    expect(result.score).toBeLessThanOrEqual(0.2);
+  }, 30000);
+});
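The heuristic scorers exercised above are imported from `evals/scorers/index.ts`, but their bodies are not part of this diff. The tests do pin down the contract, though: `subtaskIndependence` returns 1 when no two subtasks claim the same file and 0 on any overlap. A minimal sketch of that contract using evalite's `createScorer`, illustrating the tested behavior rather than the package's actual implementation:

```ts
import { createScorer } from "evalite";

// Sketch only - the shipped subtaskIndependence may be implemented differently.
export const subtaskIndependenceSketch = createScorer({
  name: "Subtask Independence (sketch)",
  description: "1 if no two subtasks share a file, 0 on any overlap",
  scorer: async ({ output }) => {
    try {
      const parsed = JSON.parse(String(output)) as {
        subtasks: Array<{ files?: string[] }>;
      };
      const seen = new Set<string>();
      for (const subtask of parsed.subtasks) {
        for (const file of subtask.files ?? []) {
          // Two subtasks editing the same file cannot run in parallel.
          if (seen.has(file)) {
            return { score: 0, message: `File conflict: ${file}` };
          }
          seen.add(file);
        }
      }
      return { score: 1, message: "No file conflicts" };
    } catch {
      return { score: 0, message: "Output is not valid JSON" };
    }
  },
});
```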
package/evals/scorers/index.ts
CHANGED
@@ -1,6 +1,10 @@
 import { createScorer } from "evalite";
+import { generateText, gateway } from "ai";
+import type { GatewayModelId } from "ai";
 import type { CellTree } from "../../src/schemas/index.js";

+const JUDGE_MODEL: GatewayModelId = "anthropic/claude-haiku-4-5";
+
 /**
  * Custom scorers for evaluating swarm task decomposition quality
  */
@@ -222,3 +226,103 @@ export const instructionClarity = createScorer({
     }
   },
 });
+
+// ============================================================================
+// LLM-as-Judge Scorers
+// ============================================================================
+
+/**
+ * LLM-as-judge scorer for decomposition coherence
+ *
+ * Uses Claude Haiku to evaluate whether subtasks are truly independent,
+ * well-scoped, and complete. This catches nuances that heuristics miss:
+ * - Semantic dependencies between subtasks
+ * - Scope that's too big or too trivial
+ * - Missing pieces that would block completion
+ *
+ * Only use for decomposition evals - this is where it matters.
+ */
+export const decompositionCoherence = createScorer({
+  name: "Decomposition Coherence (LLM Judge)",
+  description:
+    "LLM evaluates whether subtasks are truly independent and well-scoped",
+  scorer: async ({ output, input }) => {
+    try {
+      const decomposition =
+        typeof output === "string" ? output : JSON.stringify(output, null, 2);
+
+      // Get original task from input if available
+      const originalTask =
+        typeof input === "object" && input !== null && "task" in input
+          ? String((input as { task: string }).task)
+          : "Unknown task";
+
+      const { text } = await generateText({
+        model: gateway(JUDGE_MODEL),
+        prompt: `You are evaluating a task decomposition for parallel agent execution.
+
+ORIGINAL TASK:
+${originalTask}
+
+DECOMPOSITION:
+${decomposition}
+
+Evaluate on these criteria (be harsh - bad decompositions waste expensive parallel work):
+
+1. INDEPENDENCE (25%): Can subtasks truly run in parallel? Look for:
+   - Shared state dependencies (one writes, another reads)
+   - Ordering requirements hidden in the task descriptions
+   - Shared files that will cause merge conflicts
+
+2. SCOPE (25%): Is each subtask right-sized?
+   - Too big: Should be split further (>2 hours of work)
+   - Too small: Trivial tasks that waste agent spawn overhead
+   - Goldilocks: 30min-2hr of focused work
+
+3. COMPLETENESS (25%): Does the sum equal the whole?
+   - Missing pieces that would leave the task incomplete
+   - Gaps between subtasks (who handles X?)
+   - Implicit work not captured in any subtask
+
+4. CLARITY (25%): Would an agent know what to do?
+   - Vague descriptions that invite interpretation
+   - Missing context needed to start work
+   - Ambiguous boundaries between subtasks
+
+Return ONLY valid JSON (no markdown, no explanation):
+{"score": <0-100>, "issues": ["issue1", "issue2"], "strengths": ["strength1"]}`,
+        maxOutputTokens: 512,
+      });
+
+      // Parse JSON response - handle potential markdown wrapping
+      let jsonText = text.trim();
+      if (jsonText.startsWith("```")) {
+        jsonText = jsonText.replace(/```json?\n?/g, "").replace(/```$/g, "");
+      }
+
+      const result = JSON.parse(jsonText) as {
+        score: number;
+        issues: string[];
+        strengths?: string[];
+      };
+
+      const issueText =
+        result.issues.length > 0 ? result.issues.join("; ") : "No issues";
+      const strengthText =
+        result.strengths && result.strengths.length > 0
+          ? ` | Strengths: ${result.strengths.join("; ")}`
+          : "";
+
+      return {
+        score: result.score / 100,
+        message: `${issueText}${strengthText}`,
+      };
+    } catch (error) {
+      // Don't fail the eval if judge fails - return neutral score
+      return {
+        score: 0.5,
+        message: `LLM judge error: ${error instanceof Error ? error.message : String(error)}`,
+      };
+    }
+  },
+});
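The new tests above drive this scorer directly, which also works outside an eval run. A standalone invocation sketch, assuming `AI_GATEWAY_API_KEY` is set (the same variable the tests gate on) so the `ai` gateway call can reach the judge model:

```ts
import { decompositionCoherence } from "./evals/scorers/index.js";

// The judge replies {"score": <0-100>, "issues": [...]}; the scorer
// normalizes to 0-1 (score / 100) and falls back to 0.5 on judge errors.
const result = await decompositionCoherence({
  input: { task: "Add user authentication with login/logout" },
  output: JSON.stringify({
    epic: { title: "Add auth", description: "Add authentication" },
    subtasks: [
      {
        title: "Add login form",
        description: "Create login UI",
        files: ["src/LoginForm.tsx"],
      },
    ],
  }),
  expected: undefined,
});

console.log(result.score); // e.g. 0.82 for a judge response of {"score": 82}
```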
package/evals/swarm-decomposition.eval.ts
CHANGED
@@ -19,6 +19,7 @@ import {
   subtaskIndependence,
   coverageCompleteness,
   instructionClarity,
+  decompositionCoherence,
 } from "./scorers/index.js";
 import { decompositionCases } from "./fixtures/decomposition-cases.js";
 import {
@@ -82,7 +83,13 @@ evalite("Swarm Decomposition Quality", {
   },

   // Scorers evaluate decomposition quality
-  scorers: [subtaskIndependence, coverageCompleteness, instructionClarity],
+  // decompositionCoherence uses LLM-as-judge for nuanced evaluation
+  scorers: [
+    subtaskIndependence,
+    coverageCompleteness,
+    instructionClarity,
+    decompositionCoherence,
+  ],
 });

 /**
@@ -108,5 +115,5 @@ evalite("Decomposition Edge Cases", {
     return extractJson(response);
   },

-  scorers: [subtaskIndependence, coverageCompleteness],
+  scorers: [subtaskIndependence, coverageCompleteness, decompositionCoherence],
 });
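For orientation, the hunks above slot the LLM judge into two existing evalite suites. A skeletal suite in the same shape (the `data` and `task` bodies here are placeholders, not the package's):

```ts
import { evalite } from "evalite";
import {
  subtaskIndependence,
  coverageCompleteness,
  instructionClarity,
  decompositionCoherence,
} from "./scorers/index.js";

evalite("Decomposition Suite (sketch)", {
  // Each case supplies an input; expected is optional.
  data: async () => [
    { input: { task: "Add user authentication with login/logout" } },
  ],
  // The system under test; the real suites call the decomposition prompt.
  task: async (input) => JSON.stringify({ epic: {}, subtasks: [] }),
  // Heuristic scorers run alongside the LLM judge added in this release.
  scorers: [
    subtaskIndependence,
    coverageCompleteness,
    instructionClarity,
    decompositionCoherence,
  ],
});
```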
package/examples/commands/swarm.md
CHANGED
@@ -34,6 +34,77 @@ $ARGUMENTS
 /swarm --fast --to-main "quick fix" # Fast mode + push to main
 ```

+## What Good Looks Like 🎯
+
+**Coordinators orchestrate, workers execute.** You're a conductor, not a performer.
+
+### ✅ GOOD Coordinator Behavior
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                   COORDINATOR EXCELLENCE                     │
+├─────────────────────────────────────────────────────────────┤
+│                                                             │
+│ ✅ Spawned researcher for Next.js 16 Cache Components        │
+│    → Got condensed summary, stored full findings in         │
+│      semantic-memory for future agents                      │
+│                                                             │
+│ ✅ Loaded testing-patterns skill BEFORE spawning workers     │
+│    → Included skill recommendations in shared_context       │
+│    → Workers knew exactly which skills to use               │
+│                                                             │
+│ ✅ Checked swarmmail_inbox every 5 minutes                   │
+│    → Caught worker blocked on database schema               │
+│    → Unblocked by coordinating with upstream worker         │
+│                                                             │
+│ ✅ Delegated planning to swarm/planner subagent              │
+│    → Main context stayed clean (only received JSON)         │
+│    → Scaled to 7 workers without context exhaustion         │
+│                                                             │
+│ ✅ Workers reserved their OWN files                          │
+│    → Coordinator never called swarmmail_reserve             │
+│    → Conflict detection worked, no edit collisions          │
+│                                                             │
+│ ✅ Reviewed worker output with swarm_review                  │
+│    → Sent specific feedback via swarm_review_feedback       │
+│    → Caught integration issue before merge                  │
+│                                                             │
+└─────────────────────────────────────────────────────────────┘
+```
+
+### ❌ COMMON MISTAKES (Avoid These)
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                  COORDINATOR ANTI-PATTERNS                   │
+├─────────────────────────────────────────────────────────────┤
+│                                                             │
+│ ❌ Called context7 directly → dumped 50KB of docs into       │
+│    main thread → context exhaustion before workers spawned  │
+│                                                             │
+│ ❌ Skipped skill loading → workers didn't know about         │
+│    testing-patterns → reinvented dependency-breaking        │
+│    techniques already documented in skills                  │
+│                                                             │
+│ ❌ Never checked inbox → worker stuck for 15 minutes on      │
+│    blocker → silent failure, wasted time                    │
+│                                                             │
+│ ❌ Decomposed task inline in main thread → read 12 files,    │
+│    ran CASS queries, reasoned for 100 messages → burned     │
+│    50% of context budget BEFORE spawning workers            │
+│                                                             │
+│ ❌ Reserved files as coordinator → workers blocked trying    │
+│    to reserve same files → swarm stalled, manual cleanup    │
+│                                                             │
+│ ❌ Edited worker's code directly → no swarm_complete call    │
+│    → learning signals lost, reservations not released       │
+│                                                             │
+│ ❌ Closed cells manually when workers said "done"            │
+│    → Skipped swarm_review → shipped broken integration      │
+│                                                             │
+└─────────────────────────────────────────────────────────────┘
+```
+
 ## MANDATORY: Swarm Mail

 **ALL coordination MUST use `swarmmail_*` tools.** This is non-negotiable.
@@ -97,9 +168,11 @@ swarmmail_init(project_path="$PWD", task_description="Swarm: <task summary>")

 This registers you as the coordinator agent.

+**Event tracked:** `session_initialized`
+
 ### 2. Knowledge Gathering (MANDATORY)

-**Before decomposing, query
+**Before decomposing, query these knowledge sources:**

 ```bash
 # Past learnings from this project
@@ -108,14 +181,11 @@ semantic-memory_find(query="<task keywords>", limit=5)
 # How similar tasks were solved before
 cass_search(query="<task description>", limit=5)

-# Design patterns and prior art
-pdf-brain_search(query="<domain concepts>", limit=5)
-
 # Available skills to inject into workers
 skills_list()
 ```

-**Load coordinator skills based on task type:**
+**Load coordinator skills based on task type (MANDATORY):**

 ```bash
 # For swarm coordination (ALWAYS load this)
@@ -131,12 +201,87 @@ skills_use(name="testing-patterns")
 skills_use(name="cli-builder")
 ```

-
+**Event tracked:** `skill_loaded` (for each skill)
+
+**✅ GOOD:**
+- Load skills_use(name="swarm-coordination") at start of every swarm
+- Load task-specific skills based on keywords in task description
+- Include skill recommendations in shared_context for workers

-
--
--
--
+**❌ BAD:**
+- Skip skill loading → workers reinvent patterns
+- Load skills inline during decomposition → burns context
+- Forget to mention skills in shared_context → workers don't know they exist
+
+Synthesize findings into shared context for workers.
+
+### 2.5. Research Phase (SPAWN RESEARCHER IF NEEDED - MANDATORY CHECK)
+
+**⚠️ Coordinators CANNOT call pdf-brain, context7, or webfetch directly.** These dump massive context into your expensive Sonnet thread. Instead, spawn a researcher.
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                  WHEN TO SPAWN A RESEARCHER                  │
+├─────────────────────────────────────────────────────────────┤
+│                                                             │
+│ ✅ SPAWN RESEARCHER WHEN:                                    │
+│   • Task involves unfamiliar framework/library              │
+│   • Need version-specific API docs (Next.js 16 vs 14)       │
+│   • Working with experimental/preview features              │
+│   • Need architectural guidance from pdf-brain              │
+│   • Want quotes from pdf-brain for changesets               │
+│                                                             │
+│ ❌ DON'T SPAWN WHEN:                                         │
+│   • Using well-known stable APIs                            │
+│   • Pure refactoring of existing code                       │
+│   • semantic-memory already has the answer                  │
+│                                                             │
+└─────────────────────────────────────────────────────────────┘
+```
+
+**How to spawn a researcher:**
+
+```bash
+Task(
+  subagent_type="swarm-researcher",
+  description="Research: <topic>",
+  prompt="Research <topic> for the swarm task '<task>'.
+
+Use these tools:
+- pdf-brain_search(query='<domain concepts>', limit=5) - software literature
+- context7_get-library-docs - library-specific docs
+- webfetch - official documentation sites
+
+Store full findings in semantic-memory for future agents.
+Return a 3-5 bullet summary for shared_context.
+If writing a changeset, include a thematic quote from pdf-brain."
+)
+```
+
+**Event tracked:** `researcher_spawned`
+
+**Researcher outputs:**
+- Full findings stored in semantic-memory (searchable forever)
+- Condensed summary returned for coordinator's shared_context
+- Quotes for changesets if requested
+
+**Example triggers:**
+| Task Contains | Spawn Researcher For |
+|---------------|----------------------|
+| "Next.js 16", "cache components" | Next.js 16 Cache Components API |
+| "Effect-TS", "Layer" | Effect-TS service patterns |
+| "event sourcing" | Event sourcing patterns from pdf-brain |
+| "OAuth", "PKCE" | OAuth 2.0 PKCE flow specifics |
+
+**✅ GOOD:**
+- Spawn researcher for Next.js 16 Cache Components → got API patterns, stored in semantic-memory
+- Researcher returned 3-bullet summary → added to shared_context → workers had key guidance
+- No context pollution in coordinator thread
+
+**❌ BAD:**
+- Called context7 directly → 50KB of Next.js docs dumped into main thread → context exhaustion
+- Skipped researcher "because task seemed simple" → workers hit undocumented API quirks → 30min debugging
+- Spawned researcher but didn't use the summary → wasted researcher's work

 ### 3. Create Feature Branch (unless --to-main)

@@ -272,6 +417,8 @@ Rules:
 - 3-7 cells per swarm
 - No file overlap between subtasks

+**Event tracked:** `decomposition_complete`
+
 ### 6. Spawn Agents (Workers Reserve Their Own Files)

 > **⚠️ CRITICAL: Coordinator NEVER reserves files.**
@@ -315,20 +462,48 @@ Then spawn:
 Task(subagent_type="swarm/worker", description="<bead-title>", prompt="<from swarm_spawn_subtask>")
 ```

-
+**Event tracked:** `worker_spawned` (for each worker)
+
+**✅ GOOD:**
+- Spawned all 5 workers in single message → parallel execution
+- Included researcher findings in shared_context → workers had domain knowledge
+- Included skill recommendations → workers loaded testing-patterns before TDD work
+- Coordinator DID NOT reserve files → workers reserved their own → no conflicts
+
+**❌ BAD:**
+- Spawned workers one-by-one in separate messages → sequential, slow
+- Forgot to include researcher summary in shared_context → workers lacked API knowledge
+- Coordinator reserved files before spawning workers → workers blocked → manual cleanup
+- Skipped skill recommendations → workers reinvented patterns
+
+### 7. Monitor Inbox (MANDATORY - unless --no-sync)
+
+> **⚠️ CRITICAL: Active monitoring is NOT optional.**
+>
+> Check `swarmmail_inbox()` **every 5-10 minutes** during swarm execution.
+> Workers get blocked. Files conflict. Scope changes. You must intervene.
+
+**Monitoring pattern:**

 ```bash
-
-swarmmail_inbox() # Check for worker messages
+# Every 5-10 minutes while workers are active
+swarmmail_inbox() # Check for worker messages (max 5, no bodies)
+
+# If urgent messages appear
 swarmmail_read_message(message_id=N) # Read specific message
+
+# Check overall status
+swarm_status(epic_id="<epic-id>", project_key="$PWD")
 ```

+**Event tracked:** `inbox_checked` (each check)
+
 **Intervention triggers:**

-- Worker blocked >5 min → Check inbox, offer guidance
-- File conflict → Mediate, reassign files
-- Worker asking questions → Answer directly
-- Scope creep → Redirect, create new cell for extras
+- **Worker blocked >5 min** → Check inbox, offer guidance → **Event:** `blocker_resolved`
+- **File conflict** → Mediate, reassign files → **Event:** `file_conflict_mediated`
+- **Worker asking questions** → Answer directly
+- **Scope creep** → Redirect, create new cell for extras → **Event:** `scope_change_approved` or `scope_change_rejected`

 If incompatibilities spotted, broadcast:

@@ -336,6 +511,76 @@ If incompatibilities spotted, broadcast:
 swarmmail_send(to=["*"], subject="Coordinator Update", body="<guidance>", importance="high", thread_id="<epic-id>")
 ```

+**✅ GOOD:**
+- Checked inbox every 5 minutes → caught worker blocked on database schema at 8min mark
+- Read message, coordinated with upstream worker → blocker resolved in 2min
+- Worker unblocked, continued work → minimal delay
+- Approved scope change request → created new cell for extra feature → **Event:** `scope_change_approved`
+
+**❌ BAD:**
+- Never checked inbox → worker stuck for 25 minutes waiting for coordinator
+- Silent failure → worker gave up, closed cell incomplete
+- Rejected scope change without creating follow-up cell → worker's valid concern lost → **Event:** `scope_change_rejected` (missing follow-up)
+
+**Minimum monitoring frequency:**
+- Check inbox **at least every 10 minutes** while workers active
+- Immediately after spawning workers (catch quick blockers)
+- After any worker completes (check for downstream dependencies)
+
+### 8. Review Worker Output (MANDATORY)
+
+> **⚠️ CRITICAL: Never skip review.**
+>
+> Workers saying "done" doesn't mean "correct" or "integrated".
+> Use `swarm_review` to generate the review prompt, then `swarm_review_feedback` to approve/reject.
+
+**Review workflow:**
+
+```bash
+# 1. Generate review prompt with epic context + diff
+swarm_review(
+  project_key="$PWD",
+  epic_id="<epic-id>",
+  task_id="<cell-id>",
+  files_touched=["src/auth.ts", "src/schema.ts"]
+)
+
+# 2. Review the output (check for integration, type safety, tests)
+
+# 3. Send feedback
+swarm_review_feedback(
+  project_key="$PWD",
+  task_id="<cell-id>",
+  worker_id="<agent-name>",
+  status="approved", # or "needs_changes"
+  summary="LGTM - auth service integrates correctly with existing schema",
+  issues="" # or JSON array of specific issues
+)
+```
+
+**Event tracked:** `review_completed` (for each review)
+
+**Review criteria:**
+- Does work fulfill subtask requirements?
+- Does it serve the overall epic goal?
+- Does it enable downstream tasks?
+- Type safety maintained?
+- Tests added/passing?
+- No obvious bugs or security issues?
+
+**3-Strike Rule:** After 3 review rejections, the task is marked blocked. This signals an architectural problem, not "try harder."
+
+**✅ GOOD:**
+- Reviewed all 5 workers' output before merge
+- Caught integration issue in worker 3 → sent specific feedback → worker fixed in 5min
+- Approved 4/5 on first review, 1/5 needed minor fixes
+- Used swarm_review to get epic context + diff → comprehensive review
+
+**❌ BAD:**
+- Workers said "done", coordinator just closed cells → shipped broken integration
+- Skipped review "to save time" → broke production
+- Rejected worker output 3 times without guidance → worker stuck, no architectural input
+
 ### 9. Complete

 ```bash
@@ -385,6 +630,26 @@ gh pr create --title "feat: <epic title>" --body "## Summary\n<bullets>\n\n## Be
 | Architecture decisions | `skills_use(name="system-design")` |
 | Breaking dependencies | `skills_use(name="testing-patterns")` |

+## Event Tracking Reference (for eval visibility)
+
+These events are now tracked for coordinator evaluation:
+
+| Event Type               | When Fired                                |
+| ------------------------ | ----------------------------------------- |
+| `session_initialized`    | swarmmail_init called                     |
+| `skill_loaded`           | skills_use called                         |
+| `researcher_spawned`     | Task(subagent_type="swarm-researcher")    |
+| `worker_spawned`         | Task(subagent_type="swarm/worker")        |
+| `decomposition_complete` | hive_create_epic called                   |
+| `inbox_checked`          | swarmmail_inbox called                    |
+| `blocker_resolved`       | Coordinator unblocked stuck worker        |
+| `scope_change_approved`  | Coordinator approved scope expansion      |
+| `scope_change_rejected`  | Coordinator rejected scope expansion      |
+| `review_completed`       | swarm_review_feedback called              |
+| `epic_complete`          | swarm_complete called for epic            |
+
+**These events drive eval scoring.** Good coordinators fire the right events at the right times.
+
 ## Context Preservation Rules

 **These are NON-NEGOTIABLE. Violating them burns context and kills long swarms.**
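Read as a schema, the event table in the hunk above defines a closed vocabulary. A hypothetical TypeScript rendering (illustrative only; the plugin's actual event types live in its eval-capture modules and may differ):

```ts
// Hypothetical sketch of the coordinator event vocabulary tabled above.
type CoordinatorEventType =
  | "session_initialized"    // swarmmail_init called
  | "skill_loaded"           // skills_use called
  | "researcher_spawned"     // Task(subagent_type="swarm-researcher")
  | "worker_spawned"         // Task(subagent_type="swarm/worker")
  | "decomposition_complete" // hive_create_epic called
  | "inbox_checked"          // swarmmail_inbox called
  | "blocker_resolved"       // coordinator unblocked a stuck worker
  | "scope_change_approved"  // coordinator approved scope expansion
  | "scope_change_rejected"  // coordinator rejected scope expansion
  | "review_completed"       // swarm_review_feedback called
  | "epic_complete";         // swarm_complete called for the epic

interface CoordinatorEvent {
  type: CoordinatorEventType;
  timestamp: string; // assumed ISO-8601; the shipped schema may differ
}
```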
@@ -403,14 +668,19 @@ Not: Do Everything Inline → Run Out of Context → Fail

 ## Quick Checklist

-- [ ] **swarmmail_init** called FIRST
+- [ ] **swarmmail_init** called FIRST → Event: `session_initialized`
 - [ ] Knowledge gathered (semantic-memory, CASS, pdf-brain, skills)
+- [ ] **Skills loaded** → Event: `skill_loaded` (per skill)
+- [ ] **Researcher spawned if needed** → Event: `researcher_spawned`
 - [ ] **Planning delegated to swarm/planner subagent** (NOT inline)
 - [ ] CellTree validated (no file conflicts)
-- [ ] Epic + subtasks created
+- [ ] Epic + subtasks created → Event: `decomposition_complete`
 - [ ] **Coordinator did NOT reserve files** (workers do this themselves)
-- [ ] Workers spawned in parallel
-- [ ]
+- [ ] Workers spawned in parallel → Event: `worker_spawned` (per worker)
+- [ ] **Inbox monitored every 5-10 min** → Event: `inbox_checked` (multiple)
+- [ ] **Blockers resolved** → Event: `blocker_resolved` (if any)
+- [ ] **Scope changes handled** → Event: `scope_change_approved/rejected` (if any)
+- [ ] **All workers reviewed** → Event: `review_completed` (per worker)
 - [ ] PR created (or pushed to main)
 - [ ] **ASCII art session summary** (MANDATORY - see below)