opencode-swarm-plugin 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +27 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +182 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +881 -0
  9. package/bin/swarm.ts +686 -0
  10. package/dist/compaction-hook.d.ts +8 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-observability.d.ts +173 -0
  13. package/dist/compaction-observability.d.ts.map +1 -0
  14. package/dist/compaction-prompt-scoring.d.ts +124 -0
  15. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  16. package/dist/eval-capture.d.ts +174 -1
  17. package/dist/eval-capture.d.ts.map +1 -1
  18. package/dist/eval-gates.d.ts +84 -0
  19. package/dist/eval-gates.d.ts.map +1 -0
  20. package/dist/eval-history.d.ts +117 -0
  21. package/dist/eval-history.d.ts.map +1 -0
  22. package/dist/eval-learning.d.ts +216 -0
  23. package/dist/eval-learning.d.ts.map +1 -0
  24. package/dist/hive.d.ts.map +1 -1
  25. package/dist/index.d.ts +80 -1
  26. package/dist/index.d.ts.map +1 -1
  27. package/dist/index.js +16098 -651
  28. package/dist/plugin.js +16012 -756
  29. package/dist/post-compaction-tracker.d.ts +133 -0
  30. package/dist/post-compaction-tracker.d.ts.map +1 -0
  31. package/dist/schemas/task.d.ts +3 -3
  32. package/dist/swarm-orchestrate.d.ts +23 -0
  33. package/dist/swarm-orchestrate.d.ts.map +1 -1
  34. package/dist/swarm-prompts.d.ts +25 -1
  35. package/dist/swarm-prompts.d.ts.map +1 -1
  36. package/dist/swarm.d.ts +4 -0
  37. package/dist/swarm.d.ts.map +1 -1
  38. package/evals/README.md +702 -105
  39. package/evals/compaction-prompt.eval.ts +149 -0
  40. package/evals/coordinator-behavior.eval.ts +8 -8
  41. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  42. package/evals/lib/compaction-loader.test.ts +248 -0
  43. package/evals/lib/compaction-loader.ts +320 -0
  44. package/evals/lib/data-loader.test.ts +345 -0
  45. package/evals/lib/data-loader.ts +107 -6
  46. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  47. package/evals/scorers/compaction-scorers.ts +13 -13
  48. package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
  49. package/evals/scorers/coordinator-discipline.ts +348 -15
  50. package/evals/scorers/index.test.ts +146 -0
  51. package/evals/scorers/index.ts +104 -0
  52. package/evals/swarm-decomposition.eval.ts +9 -2
  53. package/examples/commands/swarm.md +291 -21
  54. package/examples/plugin-wrapper-template.ts +117 -0
  55. package/package.json +7 -5
  56. package/scripts/migrate-unknown-sessions.ts +349 -0
  57. package/src/compaction-capture.integration.test.ts +257 -0
  58. package/src/compaction-hook.test.ts +42 -0
  59. package/src/compaction-hook.ts +315 -86
  60. package/src/compaction-observability.integration.test.ts +139 -0
  61. package/src/compaction-observability.test.ts +187 -0
  62. package/src/compaction-observability.ts +324 -0
  63. package/src/compaction-prompt-scorers.test.ts +299 -0
  64. package/src/compaction-prompt-scoring.ts +298 -0
  65. package/src/eval-capture.test.ts +626 -1
  66. package/src/eval-capture.ts +286 -2
  67. package/src/eval-gates.test.ts +306 -0
  68. package/src/eval-gates.ts +218 -0
  69. package/src/eval-history.test.ts +508 -0
  70. package/src/eval-history.ts +214 -0
  71. package/src/eval-learning.test.ts +378 -0
  72. package/src/eval-learning.ts +360 -0
  73. package/src/eval-runner.test.ts +96 -0
  74. package/src/eval-runner.ts +356 -0
  75. package/src/hive.ts +34 -0
  76. package/src/index.ts +115 -2
  77. package/src/memory.test.ts +110 -0
  78. package/src/memory.ts +34 -0
  79. package/src/post-compaction-tracker.test.ts +251 -0
  80. package/src/post-compaction-tracker.ts +237 -0
  81. package/src/swarm-decompose.ts +2 -2
  82. package/src/swarm-orchestrate.ts +2 -2
  83. package/src/swarm-prompts.ts +2 -2
  84. package/src/swarm-review.ts +3 -3
  85. package/dist/beads.d.ts +0 -386
  86. package/dist/beads.d.ts.map +0 -1
  87. package/dist/schemas/bead-events.d.ts +0 -698
  88. package/dist/schemas/bead-events.d.ts.map +0 -1
  89. package/dist/schemas/bead.d.ts +0 -255
  90. package/dist/schemas/bead.d.ts.map +0 -1
  91. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
package/evals/scorers/index.test.ts
@@ -0,0 +1,146 @@
+ /**
+  * Tests for decomposition scorers
+  *
+  * Uses Vitest (evalite's test runner), not Bun's test runner.
+  *
+  * Note: evalite's Score type only exposes `score`, not `message`.
+  * We test scores only - message testing requires accessing internal scorer.
+  */
+ import { describe, expect, test } from "vitest";
+ import {
+   coverageCompleteness,
+   decompositionCoherence,
+   instructionClarity,
+   subtaskIndependence,
+ } from "./index.js";
+
+ describe("Heuristic Scorers", () => {
+   const goodDecomposition = JSON.stringify({
+     epic: { title: "Add auth", description: "Add authentication" },
+     subtasks: [
+       {
+         title: "Add login form component",
+         description: "Create React component for login with email/password",
+         files: ["src/components/LoginForm.tsx"],
+       },
+       {
+         title: "Add auth API routes",
+         description: "Create API endpoints for login/logout/session",
+         files: ["src/api/auth.ts"],
+       },
+       {
+         title: "Add auth middleware",
+         description: "Create middleware to protect routes",
+         files: ["src/middleware/auth.ts"],
+       },
+     ],
+   });
+
+   const conflictingDecomposition = JSON.stringify({
+     epic: { title: "Add auth", description: "Add authentication" },
+     subtasks: [
+       {
+         title: "Add login",
+         files: ["src/auth.ts"],
+       },
+       {
+         title: "Add logout",
+         files: ["src/auth.ts"], // Same file - conflict!
+       },
+     ],
+   });
+
+   test("subtaskIndependence scores 1.0 for no conflicts", async () => {
+     const result = await subtaskIndependence({
+       output: goodDecomposition,
+       expected: undefined,
+       input: {},
+     });
+     expect(result.score).toBe(1);
+   });
+
+   test("subtaskIndependence scores 0 for file conflicts", async () => {
+     const result = await subtaskIndependence({
+       output: conflictingDecomposition,
+       expected: undefined,
+       input: {},
+     });
+     expect(result.score).toBe(0);
+   });
+
+   test("instructionClarity scores higher for detailed subtasks", async () => {
+     const result = await instructionClarity({
+       output: goodDecomposition,
+       expected: undefined,
+       input: {},
+     });
+     expect(result.score).toBeGreaterThan(0.7);
+   });
+
+   test("coverageCompleteness checks subtask count", async () => {
+     const result = await coverageCompleteness({
+       output: goodDecomposition,
+       expected: { minSubtasks: 2, maxSubtasks: 5 },
+       input: {},
+     });
+     expect(result.score).toBe(1);
+   });
+ });
+
+ describe("LLM-as-Judge Scorer", () => {
+   // Skip LLM test in CI - requires API key
+   const hasApiKey = !!process.env.AI_GATEWAY_API_KEY;
+
+   test(
+     "decompositionCoherence returns valid score",
+     async () => {
+       if (!hasApiKey) {
+         console.log("Skipping LLM test - no AI_GATEWAY_API_KEY");
+         return;
+       }
+
+       const decomposition = JSON.stringify({
+         epic: { title: "Add auth", description: "Add authentication" },
+         subtasks: [
+           {
+             title: "Add login form",
+             description: "Create login UI",
+             files: ["src/LoginForm.tsx"],
+           },
+           {
+             title: "Add auth API",
+             description: "Create auth endpoints",
+             files: ["src/api/auth.ts"],
+           },
+         ],
+       });
+
+       const result = await decompositionCoherence({
+         output: decomposition,
+         expected: undefined,
+         input: { task: "Add user authentication with login/logout" },
+       });
+
+       expect(result.score).toBeGreaterThanOrEqual(0);
+       expect(result.score).toBeLessThanOrEqual(1);
+     },
+     30000,
+   );
+
+   test("decompositionCoherence scores invalid decomposition low", async () => {
+     if (!process.env.AI_GATEWAY_API_KEY) {
+       console.log("Skipping LLM test - no AI_GATEWAY_API_KEY");
+       return;
+     }
+
+     const result = await decompositionCoherence({
+       output: "not valid json at all {{{",
+       expected: undefined,
+       input: {},
+     });
+
+     // LLM should recognize garbage input and score it very low
+     // (0 or close to 0, not 0.5 fallback)
+     expect(result.score).toBeLessThanOrEqual(0.2);
+   }, 30000);
+ });
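The heuristic contract these tests pin down is simple: `subtaskIndependence` returns 1.0 when no two subtasks claim the same file, and 0 on any overlap. A minimal sketch of that check, assuming the `createScorer` shape visible in the `index.ts` hunk below; the shipped implementation lives in `package/evals/scorers/index.ts` and is not reproduced in this diff:

```ts
import { createScorer } from "evalite";

// Sketch only - illustrates the set-based file-conflict check the tests
// above exercise, not the exact code shipped in index.ts.
export const subtaskIndependenceSketch = createScorer({
  name: "Subtask Independence (sketch)",
  description: "Scores 0 if any two subtasks reserve the same file",
  scorer: async ({ output }) => {
    try {
      const parsed = JSON.parse(String(output)) as {
        subtasks?: Array<{ files?: string[] }>;
      };
      const seen = new Set<string>();
      for (const subtask of parsed.subtasks ?? []) {
        for (const file of subtask.files ?? []) {
          if (seen.has(file)) return { score: 0 }; // same file claimed twice
          seen.add(file);
        }
      }
      return { score: 1 };
    } catch {
      return { score: 0 }; // unparseable output cannot be verified
    }
  },
});
```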
package/evals/scorers/index.ts
@@ -1,6 +1,10 @@
  import { createScorer } from "evalite";
+ import { generateText, gateway } from "ai";
+ import type { GatewayModelId } from "ai";
  import type { CellTree } from "../../src/schemas/index.js";

+ const JUDGE_MODEL: GatewayModelId = "anthropic/claude-haiku-4-5";
+
  /**
   * Custom scorers for evaluating swarm task decomposition quality
   */
@@ -222,3 +226,103 @@ export const instructionClarity = createScorer({
      }
    },
  });
+
+ // ============================================================================
+ // LLM-as-Judge Scorers
+ // ============================================================================
+
+ /**
+  * LLM-as-judge scorer for decomposition coherence
+  *
+  * Uses Claude Haiku to evaluate whether subtasks are truly independent,
+  * well-scoped, and complete. This catches nuances that heuristics miss:
+  * - Semantic dependencies between subtasks
+  * - Scope that's too big or too trivial
+  * - Missing pieces that would block completion
+  *
+  * Only use for decomposition evals - this is where it matters.
+  */
+ export const decompositionCoherence = createScorer({
+   name: "Decomposition Coherence (LLM Judge)",
+   description:
+     "LLM evaluates whether subtasks are truly independent and well-scoped",
+   scorer: async ({ output, input }) => {
+     try {
+       const decomposition =
+         typeof output === "string" ? output : JSON.stringify(output, null, 2);
+
+       // Get original task from input if available
+       const originalTask =
+         typeof input === "object" && input !== null && "task" in input
+           ? String((input as { task: string }).task)
+           : "Unknown task";
+
+       const { text } = await generateText({
+         model: gateway(JUDGE_MODEL),
+         prompt: `You are evaluating a task decomposition for parallel agent execution.
+
+ ORIGINAL TASK:
+ ${originalTask}
+
+ DECOMPOSITION:
+ ${decomposition}
+
+ Evaluate on these criteria (be harsh - bad decompositions waste expensive parallel work):
+
+ 1. INDEPENDENCE (25%): Can subtasks truly run in parallel? Look for:
+    - Shared state dependencies (one writes, another reads)
+    - Ordering requirements hidden in the task descriptions
+    - Shared files that will cause merge conflicts
+
+ 2. SCOPE (25%): Is each subtask right-sized?
+    - Too big: Should be split further (>2 hours of work)
+    - Too small: Trivial tasks that waste agent spawn overhead
+    - Goldilocks: 30min-2hr of focused work
+
+ 3. COMPLETENESS (25%): Does the sum equal the whole?
+    - Missing pieces that would leave the task incomplete
+    - Gaps between subtasks (who handles X?)
+    - Implicit work not captured in any subtask
+
+ 4. CLARITY (25%): Would an agent know what to do?
+    - Vague descriptions that invite interpretation
+    - Missing context needed to start work
+    - Ambiguous boundaries between subtasks
+
+ Return ONLY valid JSON (no markdown, no explanation):
+ {"score": <0-100>, "issues": ["issue1", "issue2"], "strengths": ["strength1"]}`,
+         maxOutputTokens: 512,
+       });
+
+       // Parse JSON response - handle potential markdown wrapping
+       let jsonText = text.trim();
+       if (jsonText.startsWith("```")) {
+         jsonText = jsonText.replace(/```json?\n?/g, "").replace(/```$/g, "");
+       }
+
+       const result = JSON.parse(jsonText) as {
+         score: number;
+         issues: string[];
+         strengths?: string[];
+       };
+
+       const issueText =
+         result.issues.length > 0 ? result.issues.join("; ") : "No issues";
+       const strengthText =
+         result.strengths && result.strengths.length > 0
+           ? ` | Strengths: ${result.strengths.join("; ")}`
+           : "";
+
+       return {
+         score: result.score / 100,
+         message: `${issueText}${strengthText}`,
+       };
+     } catch (error) {
+       // Don't fail the eval if judge fails - return neutral score
+       return {
+         score: 0.5,
+         message: `LLM judge error: ${error instanceof Error ? error.message : String(error)}`,
+       };
+     }
+   },
+ });
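To make the normalization concrete: a judge reply like the hypothetical one below parses into `score: 0.85`, with issues and strengths folded into `message`; a reply that fails `JSON.parse` instead lands in the catch branch and returns the neutral 0.5.

```ts
// Hypothetical judge reply (illustration only, not captured output):
const reply = `{"score": 85, "issues": ["Subtask 2 reads the schema subtask 1 writes"], "strengths": ["Clear file boundaries"]}`;

// decompositionCoherence would then return:
// score: 0.85  (result.score / 100)
// message: "Subtask 2 reads the schema subtask 1 writes | Strengths: Clear file boundaries"
```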
package/evals/swarm-decomposition.eval.ts
@@ -19,6 +19,7 @@ import {
    subtaskIndependence,
    coverageCompleteness,
    instructionClarity,
+   decompositionCoherence,
  } from "./scorers/index.js";
  import { decompositionCases } from "./fixtures/decomposition-cases.js";
  import {
@@ -82,7 +83,13 @@ evalite("Swarm Decomposition Quality", {
    },

    // Scorers evaluate decomposition quality
-   scorers: [subtaskIndependence, coverageCompleteness, instructionClarity],
+   // decompositionCoherence uses LLM-as-judge for nuanced evaluation
+   scorers: [
+     subtaskIndependence,
+     coverageCompleteness,
+     instructionClarity,
+     decompositionCoherence,
+   ],
  });

  /**
@@ -108,5 +115,5 @@ evalite("Decomposition Edge Cases", {
      return extractJson(response);
    },

-   scorers: [subtaskIndependence, coverageCompleteness],
+   scorers: [subtaskIndependence, coverageCompleteness, decompositionCoherence],
  });
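Note the asymmetry with the unit tests: the tests skip the judge when `AI_GATEWAY_API_KEY` is absent, while both evals above include `decompositionCoherence` unconditionally, so key-less runs will record the judge's 0.5 error fallback. If that noise is unwanted, one option (a sketch, not part of this release) is to gate the judge on the key:

```ts
// Sketch: only include the LLM judge when the gateway key is configured,
// so heuristic-only environments don't record 0.5 fallback scores.
const scorers = [subtaskIndependence, coverageCompleteness, instructionClarity];
if (process.env.AI_GATEWAY_API_KEY) {
  scorers.push(decompositionCoherence);
}
```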
package/examples/commands/swarm.md
@@ -34,6 +34,77 @@ $ARGUMENTS
  /swarm --fast --to-main "quick fix"   # Fast mode + push to main
  ```

+ ## What Good Looks Like 🎯
+
+ **Coordinators orchestrate, workers execute.** You're a conductor, not a performer.
+
+ ### ✅ GOOD Coordinator Behavior
+
+ ```
+ ┌─────────────────────────────────────────────────────────────┐
+ │                   COORDINATOR EXCELLENCE                     │
+ ├─────────────────────────────────────────────────────────────┤
+ │                                                             │
+ │ ✅ Spawned researcher for Next.js 16 Cache Components        │
+ │    → Got condensed summary, stored full findings in         │
+ │      semantic-memory for future agents                      │
+ │                                                             │
+ │ ✅ Loaded testing-patterns skill BEFORE spawning workers     │
+ │    → Included skill recommendations in shared_context       │
+ │    → Workers knew exactly which skills to use               │
+ │                                                             │
+ │ ✅ Checked swarmmail_inbox every 5 minutes                   │
+ │    → Caught worker blocked on database schema               │
+ │    → Unblocked by coordinating with upstream worker         │
+ │                                                             │
+ │ ✅ Delegated planning to swarm/planner subagent              │
+ │    → Main context stayed clean (only received JSON)         │
+ │    → Scaled to 7 workers without context exhaustion         │
+ │                                                             │
+ │ ✅ Workers reserved their OWN files                          │
+ │    → Coordinator never called swarmmail_reserve             │
+ │    → Conflict detection worked, no edit collisions          │
+ │                                                             │
+ │ ✅ Reviewed worker output with swarm_review                  │
+ │    → Sent specific feedback via swarm_review_feedback       │
+ │    → Caught integration issue before merge                  │
+ │                                                             │
+ └─────────────────────────────────────────────────────────────┘
+ ```
+
+ ### ❌ COMMON MISTAKES (Avoid These)
+
+ ```
+ ┌─────────────────────────────────────────────────────────────┐
+ │                  COORDINATOR ANTI-PATTERNS                   │
+ ├─────────────────────────────────────────────────────────────┤
+ │                                                             │
+ │ ❌ Called context7 directly → dumped 50KB of docs into       │
+ │    main thread → context exhaustion before workers spawned  │
+ │                                                             │
+ │ ❌ Skipped skill loading → workers didn't know about         │
+ │    testing-patterns → reinvented dependency-breaking        │
+ │    techniques already documented in skills                  │
+ │                                                             │
+ │ ❌ Never checked inbox → worker stuck for 15 minutes on      │
+ │    blocker → silent failure, wasted time                    │
+ │                                                             │
+ │ ❌ Decomposed task inline in main thread → read 12 files,    │
+ │    ran CASS queries, reasoned for 100 messages → burned     │
+ │    50% of context budget BEFORE spawning workers            │
+ │                                                             │
+ │ ❌ Reserved files as coordinator → workers blocked trying    │
+ │    to reserve same files → swarm stalled, manual cleanup    │
+ │                                                             │
+ │ ❌ Edited worker's code directly → no swarm_complete call    │
+ │    → learning signals lost, reservations not released       │
+ │                                                             │
+ │ ❌ Closed cells manually when workers said "done"            │
+ │    → Skipped swarm_review → shipped broken integration      │
+ │                                                             │
+ └─────────────────────────────────────────────────────────────┘
+ ```
+
  ## MANDATORY: Swarm Mail

  **ALL coordination MUST use `swarmmail_*` tools.** This is non-negotiable.
@@ -97,9 +168,11 @@ swarmmail_init(project_path="$PWD", task_description="Swarm: <task summary>")

  This registers you as the coordinator agent.

+ **Event tracked:** `session_initialized`
+
  ### 2. Knowledge Gathering (MANDATORY)

- **Before decomposing, query ALL knowledge sources:**
+ **Before decomposing, query these knowledge sources:**

  ```bash
  # Past learnings from this project
@@ -108,14 +181,11 @@ semantic-memory_find(query="<task keywords>", limit=5)

  # How similar tasks were solved before
  cass_search(query="<task description>", limit=5)
- # Design patterns and prior art
- pdf-brain_search(query="<domain concepts>", limit=5)
-
  # Available skills to inject into workers
  skills_list()
  ```

- **Load coordinator skills based on task type:**
+ **Load coordinator skills based on task type (MANDATORY):**

  ```bash
  # For swarm coordination (ALWAYS load this)
@@ -131,12 +201,87 @@ skills_use(name="testing-patterns")
  skills_use(name="cli-builder")
  ```

- Synthesize findings into shared context for workers. Note:
+ **Event tracked:** `skill_loaded` (for each skill)
+
+ **✅ GOOD:**
+ - Load skills_use(name="swarm-coordination") at start of every swarm
+ - Load task-specific skills based on keywords in task description
+ - Include skill recommendations in shared_context for workers

- - Relevant patterns from pdf-brain
- - Similar past approaches from CASS
- - Project-specific learnings from semantic-memory
- - **Skills to recommend for each subtask** (critical for worker effectiveness)
+ **❌ BAD:**
+ - Skip skill loading → workers reinvent patterns
+ - Load skills inline during decomposition → burns context
+ - Forget to mention skills in shared_context → workers don't know they exist
+
+ Synthesize findings into shared context for workers.
+
+ ### 2.5. Research Phase (SPAWN RESEARCHER IF NEEDED - MANDATORY CHECK)
+
+ **⚠️ Coordinators CANNOT call pdf-brain, context7, or webfetch directly.** These dump massive context into your expensive Sonnet thread. Instead, spawn a researcher.
+
+ ```
+ ┌─────────────────────────────────────────────────────────────┐
+ │                  WHEN TO SPAWN A RESEARCHER                  │
+ ├─────────────────────────────────────────────────────────────┤
+ │                                                             │
+ │ ✅ SPAWN RESEARCHER WHEN:                                    │
+ │    • Task involves unfamiliar framework/library             │
+ │    • Need version-specific API docs (Next.js 16 vs 14)      │
+ │    • Working with experimental/preview features             │
+ │    • Need architectural guidance from pdf-brain             │
+ │    • Want quotes from pdf-brain for changesets              │
+ │                                                             │
+ │ ❌ DON'T SPAWN WHEN:                                         │
+ │    • Using well-known stable APIs                           │
+ │    • Pure refactoring of existing code                      │
+ │    • semantic-memory already has the answer                 │
+ │                                                             │
+ └─────────────────────────────────────────────────────────────┘
+ ```
+
+ **How to spawn a researcher:**
+
+ ```bash
+ Task(
+   subagent_type="swarm-researcher",
+   description="Research: <topic>",
+   prompt="Research <topic> for the swarm task '<task>'.
+
+ Use these tools:
+ - pdf-brain_search(query='<domain concepts>', limit=5) - software literature
+ - context7_get-library-docs - library-specific docs
+ - webfetch - official documentation sites
+
+ Store full findings in semantic-memory for future agents.
+ Return a 3-5 bullet summary for shared_context.
+ If writing a changeset, include a thematic quote from pdf-brain."
+ )
+ ```
+
+ **Event tracked:** `researcher_spawned`
+
+ **Researcher outputs:**
+ - Full findings stored in semantic-memory (searchable forever)
+ - Condensed summary returned for coordinator's shared_context
+ - Quotes for changesets if requested
+
+ **Example triggers:**
+ | Task Contains | Spawn Researcher For |
+ |---------------|----------------------|
+ | "Next.js 16", "cache components" | Next.js 16 Cache Components API |
+ | "Effect-TS", "Layer" | Effect-TS service patterns |
+ | "event sourcing" | Event sourcing patterns from pdf-brain |
+ | "OAuth", "PKCE" | OAuth 2.0 PKCE flow specifics |
+
+ **✅ GOOD:**
+ - Spawned researcher for Next.js 16 Cache Components → got API patterns, stored in semantic-memory
+ - Researcher returned 3-bullet summary → added to shared_context → workers had key guidance
+ - No context pollution in coordinator thread
+
+ **❌ BAD:**
+ - Called context7 directly → 50KB of Next.js docs dumped into main thread → context exhaustion
+ - Skipped researcher "because task seemed simple" → workers hit undocumented API quirks → 30min debugging
+ - Spawned researcher but didn't use the summary → wasted researcher's work

  ### 3. Create Feature Branch (unless --to-main)

@@ -272,6 +417,8 @@ Rules:
  - 3-7 cells per swarm
  - No file overlap between subtasks

+ **Event tracked:** `decomposition_complete`
+
  ### 6. Spawn Agents (Workers Reserve Their Own Files)

  > **⚠️ CRITICAL: Coordinator NEVER reserves files.**
@@ -315,20 +462,48 @@ Then spawn:
  Task(subagent_type="swarm/worker", description="<bead-title>", prompt="<from swarm_spawn_subtask>")
  ```

- ### 8. Monitor (unless --no-sync)
+ **Event tracked:** `worker_spawned` (for each worker)
+
+ **✅ GOOD:**
+ - Spawned all 5 workers in single message → parallel execution
+ - Included researcher findings in shared_context → workers had domain knowledge
+ - Included skill recommendations → workers loaded testing-patterns before TDD work
+ - Coordinator DID NOT reserve files → workers reserved their own → no conflicts
+
+ **❌ BAD:**
+ - Spawned workers one-by-one in separate messages → sequential, slow
+ - Forgot to include researcher summary in shared_context → workers lacked API knowledge
+ - Coordinator reserved files before spawning workers → workers blocked → manual cleanup
+ - Skipped skill recommendations → workers reinvented patterns
+
+ ### 7. Monitor Inbox (MANDATORY - unless --no-sync)
+
+ > **⚠️ CRITICAL: Active monitoring is NOT optional.**
+ >
+ > Check `swarmmail_inbox()` **every 5-10 minutes** during swarm execution.
+ > Workers get blocked. Files conflict. Scope changes. You must intervene.
+
+ **Monitoring pattern:**

  ```bash
- swarm_status(epic_id="<epic-id>", project_key="$PWD")
- swarmmail_inbox()  # Check for worker messages
+ # Every 5-10 minutes while workers are active
+ swarmmail_inbox()  # Check for worker messages (max 5, no bodies)
+
+ # If urgent messages appear
  swarmmail_read_message(message_id=N)  # Read specific message
+
+ # Check overall status
+ swarm_status(epic_id="<epic-id>", project_key="$PWD")
  ```

+ **Event tracked:** `inbox_checked` (each check)
+
  **Intervention triggers:**

- - Worker blocked >5 min → Check inbox, offer guidance
- - File conflict → Mediate, reassign files
- - Worker asking questions → Answer directly
- - Scope creep → Redirect, create new cell for extras
+ - **Worker blocked >5 min** → Check inbox, offer guidance → **Event:** `blocker_resolved`
+ - **File conflict** → Mediate, reassign files → **Event:** `file_conflict_mediated`
+ - **Worker asking questions** → Answer directly
+ - **Scope creep** → Redirect, create new cell for extras → **Event:** `scope_change_approved` or `scope_change_rejected`

  If incompatibilities spotted, broadcast:

@@ -336,6 +511,76 @@ If incompatibilities spotted, broadcast:
  swarmmail_send(to=["*"], subject="Coordinator Update", body="<guidance>", importance="high", thread_id="<epic-id>")
  ```

+ **✅ GOOD:**
+ - Checked inbox every 5 minutes → caught worker blocked on database schema at 8min mark
+ - Read message, coordinated with upstream worker → blocker resolved in 2min
+ - Worker unblocked, continued work → minimal delay
+ - Approved scope change request → created new cell for extra feature → **Event:** `scope_change_approved`
+
+ **❌ BAD:**
+ - Never checked inbox → worker stuck for 25 minutes waiting for coordinator
+ - Silent failure → worker gave up, closed cell incomplete
+ - Rejected scope change without creating follow-up cell → worker's valid concern lost → **Event:** `scope_change_rejected` (missing follow-up)
+
+ **Minimum monitoring frequency:**
+ - Check inbox **at least every 10 minutes** while workers active
+ - Immediately after spawning workers (catch quick blockers)
+ - After any worker completes (check for downstream dependencies)
+
+ ### 8. Review Worker Output (MANDATORY)
+
+ > **⚠️ CRITICAL: Never skip review.**
+ >
+ > A worker saying "done" doesn't mean "correct" or "integrated".
+ > Use `swarm_review` to generate a review prompt, then `swarm_review_feedback` to approve/reject.
+
+ **Review workflow:**
+
+ ```bash
+ # 1. Generate review prompt with epic context + diff
+ swarm_review(
+   project_key="$PWD",
+   epic_id="<epic-id>",
+   task_id="<cell-id>",
+   files_touched=["src/auth.ts", "src/schema.ts"]
+ )
+
+ # 2. Review the output (check for integration, type safety, tests)
+
+ # 3. Send feedback
+ swarm_review_feedback(
+   project_key="$PWD",
+   task_id="<cell-id>",
+   worker_id="<agent-name>",
+   status="approved",  # or "needs_changes"
+   summary="LGTM - auth service integrates correctly with existing schema",
+   issues=""  # or JSON array of specific issues
+ )
+ ```
+
+ **Event tracked:** `review_completed` (for each review)
+
+ **Review criteria:**
+ - Does work fulfill subtask requirements?
+ - Does it serve the overall epic goal?
+ - Does it enable downstream tasks?
+ - Type safety maintained?
+ - Tests added/passing?
+ - No obvious bugs or security issues?
+
+ **3-Strike Rule:** After 3 review rejections, the task is marked blocked. This signals an architectural problem, not "try harder."
+
+ **✅ GOOD:**
+ - Reviewed all 5 workers' output before merge
+ - Caught integration issue in worker 3 → sent specific feedback → worker fixed in 5min
+ - Approved 4/5 on first review, 1/5 needed minor fixes
+ - Used swarm_review to get epic context + diff → comprehensive review
+
+ **❌ BAD:**
+ - Workers said "done", coordinator just closed cells → shipped broken integration
+ - Skipped review "to save time" → broke production
+ - Rejected worker output 3 times without guidance → worker stuck, no architectural input
+
  ### 9. Complete

  ```bash
@@ -385,6 +630,26 @@ gh pr create --title "feat: <epic title>" --body "## Summary\n<bullets>\n\n## Be
  | Architecture decisions | `skills_use(name="system-design")` |
  | Breaking dependencies | `skills_use(name="testing-patterns")` |

+ ## Event Tracking Reference (for eval visibility)
+
+ These events are now tracked for coordinator evaluation:
+
+ | Event Type | When Fired |
+ | ------------------------ | ----------------------------------------- |
+ | `session_initialized` | swarmmail_init called |
+ | `skill_loaded` | skills_use called |
+ | `researcher_spawned` | Task(subagent_type="swarm-researcher") |
+ | `worker_spawned` | Task(subagent_type="swarm/worker") |
+ | `decomposition_complete` | hive_create_epic called |
+ | `inbox_checked` | swarmmail_inbox called |
+ | `blocker_resolved` | Coordinator unblocked stuck worker |
+ | `scope_change_approved` | Coordinator approved scope expansion |
+ | `scope_change_rejected` | Coordinator rejected scope expansion |
+ | `review_completed` | swarm_review_feedback called |
+ | `epic_complete` | swarm_complete called for epic |
+
+ **These events drive eval scoring.** Good coordinators fire the right events at the right times.
+
  ## Context Preservation Rules

  **These are NON-NEGOTIABLE. Violating them burns context and kills long swarms.**
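The event schema itself isn't shown in this diff (it lives in `package/src/eval-capture.ts`), but the table implies records roughly like the sketch below; every field name here is an assumption for illustration, not taken from the source:

```ts
// Hypothetical event record shape (assumed - see src/eval-capture.ts for
// the real schema). Enough structure to score "right events, right times".
type CoordinatorEventType =
  | "session_initialized"
  | "skill_loaded"
  | "researcher_spawned"
  | "worker_spawned"
  | "decomposition_complete"
  | "inbox_checked"
  | "blocker_resolved"
  | "scope_change_approved"
  | "scope_change_rejected"
  | "review_completed"
  | "epic_complete";

interface CoordinatorEvent {
  type: CoordinatorEventType;
  sessionId: string; // assumed: ties the event to a swarmmail session
  timestamp: string; // assumed: ISO-8601
  payload?: Record<string, unknown>; // assumed: e.g. skill name, worker id
}
```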
@@ -403,14 +668,19 @@ Not: Do Everything Inline → Run Out of Context → Fail

  ## Quick Checklist

- - [ ] **swarmmail_init** called FIRST
+ - [ ] **swarmmail_init** called FIRST → Event: `session_initialized`
  - [ ] Knowledge gathered (semantic-memory, CASS, pdf-brain, skills)
+ - [ ] **Skills loaded** → Event: `skill_loaded` (per skill)
+ - [ ] **Researcher spawned if needed** → Event: `researcher_spawned`
  - [ ] **Planning delegated to swarm/planner subagent** (NOT inline)
  - [ ] CellTree validated (no file conflicts)
- - [ ] Epic + subtasks created
+ - [ ] Epic + subtasks created → Event: `decomposition_complete`
  - [ ] **Coordinator did NOT reserve files** (workers do this themselves)
- - [ ] Workers spawned in parallel
- - [ ] Progress monitored via **swarmmail_inbox** (limit=5, no bodies)
+ - [ ] Workers spawned in parallel → Event: `worker_spawned` (per worker)
+ - [ ] **Inbox monitored every 5-10 min** → Event: `inbox_checked` (multiple)
+ - [ ] **Blockers resolved** → Event: `blocker_resolved` (if any)
+ - [ ] **Scope changes handled** → Event: `scope_change_approved/rejected` (if any)
+ - [ ] **All workers reviewed** → Event: `review_completed` (per worker)
  - [ ] PR created (or pushed to main)
  - [ ] **ASCII art session summary** (MANDATORY - see below)