opencode-swarm-plugin 0.44.0 → 0.44.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215)
  1. package/bin/swarm.serve.test.ts +6 -4
  2. package/bin/swarm.ts +18 -12
  3. package/dist/compaction-prompt-scoring.js +139 -0
  4. package/dist/eval-capture.js +12811 -0
  5. package/dist/hive.d.ts.map +1 -1
  6. package/dist/hive.js +14834 -0
  7. package/dist/index.d.ts +18 -0
  8. package/dist/index.d.ts.map +1 -1
  9. package/dist/index.js +7743 -62593
  10. package/dist/plugin.js +24052 -78907
  11. package/dist/swarm-orchestrate.d.ts.map +1 -1
  12. package/dist/swarm-prompts.d.ts.map +1 -1
  13. package/dist/swarm-prompts.js +39407 -0
  14. package/dist/swarm-review.d.ts.map +1 -1
  15. package/dist/swarm-validation.d.ts +127 -0
  16. package/dist/swarm-validation.d.ts.map +1 -0
  17. package/dist/validators/index.d.ts +7 -0
  18. package/dist/validators/index.d.ts.map +1 -0
  19. package/dist/validators/schema-validator.d.ts +58 -0
  20. package/dist/validators/schema-validator.d.ts.map +1 -0
  21. package/package.json +17 -5
  22. package/.changeset/swarm-insights-data-layer.md +0 -63
  23. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
  24. package/.hive/analysis/session-data-quality-audit.md +0 -320
  25. package/.hive/eval-results.json +0 -483
  26. package/.hive/issues.jsonl +0 -138
  27. package/.hive/memories.jsonl +0 -729
  28. package/.opencode/eval-history.jsonl +0 -327
  29. package/.turbo/turbo-build.log +0 -9
  30. package/CHANGELOG.md +0 -2286
  31. package/SCORER-ANALYSIS.md +0 -598
  32. package/docs/analysis/subagent-coordination-patterns.md +0 -902
  33. package/docs/analysis-socratic-planner-pattern.md +0 -504
  34. package/docs/planning/ADR-001-monorepo-structure.md +0 -171
  35. package/docs/planning/ADR-002-package-extraction.md +0 -393
  36. package/docs/planning/ADR-003-performance-improvements.md +0 -451
  37. package/docs/planning/ADR-004-message-queue-features.md +0 -187
  38. package/docs/planning/ADR-005-devtools-observability.md +0 -202
  39. package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
  40. package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
  41. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
  42. package/docs/planning/ADR-010-cass-inhousing.md +0 -1215
  43. package/docs/planning/ROADMAP.md +0 -368
  44. package/docs/semantic-memory-cli-syntax.md +0 -123
  45. package/docs/swarm-mail-architecture.md +0 -1147
  46. package/docs/testing/context-recovery-test.md +0 -470
  47. package/evals/ARCHITECTURE.md +0 -1189
  48. package/evals/README.md +0 -768
  49. package/evals/compaction-prompt.eval.ts +0 -149
  50. package/evals/compaction-resumption.eval.ts +0 -289
  51. package/evals/coordinator-behavior.eval.ts +0 -307
  52. package/evals/coordinator-session.eval.ts +0 -154
  53. package/evals/evalite.config.ts.bak +0 -15
  54. package/evals/example.eval.ts +0 -31
  55. package/evals/fixtures/cass-baseline.ts +0 -217
  56. package/evals/fixtures/compaction-cases.ts +0 -350
  57. package/evals/fixtures/compaction-prompt-cases.ts +0 -311
  58. package/evals/fixtures/coordinator-sessions.ts +0 -328
  59. package/evals/fixtures/decomposition-cases.ts +0 -105
  60. package/evals/lib/compaction-loader.test.ts +0 -248
  61. package/evals/lib/compaction-loader.ts +0 -320
  62. package/evals/lib/data-loader.evalite-test.ts +0 -289
  63. package/evals/lib/data-loader.test.ts +0 -345
  64. package/evals/lib/data-loader.ts +0 -281
  65. package/evals/lib/llm.ts +0 -115
  66. package/evals/scorers/compaction-prompt-scorers.ts +0 -145
  67. package/evals/scorers/compaction-scorers.ts +0 -305
  68. package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
  69. package/evals/scorers/coordinator-discipline.ts +0 -325
  70. package/evals/scorers/index.test.ts +0 -146
  71. package/evals/scorers/index.ts +0 -328
  72. package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
  73. package/evals/scorers/outcome-scorers.ts +0 -349
  74. package/evals/swarm-decomposition.eval.ts +0 -121
  75. package/examples/commands/swarm.md +0 -745
  76. package/examples/plugin-wrapper-template.ts +0 -2515
  77. package/examples/skills/hive-workflow/SKILL.md +0 -212
  78. package/examples/skills/skill-creator/SKILL.md +0 -223
  79. package/examples/skills/swarm-coordination/SKILL.md +0 -292
  80. package/global-skills/cli-builder/SKILL.md +0 -344
  81. package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
  82. package/global-skills/learning-systems/SKILL.md +0 -644
  83. package/global-skills/skill-creator/LICENSE.txt +0 -202
  84. package/global-skills/skill-creator/SKILL.md +0 -352
  85. package/global-skills/skill-creator/references/output-patterns.md +0 -82
  86. package/global-skills/skill-creator/references/workflows.md +0 -28
  87. package/global-skills/swarm-coordination/SKILL.md +0 -995
  88. package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
  89. package/global-skills/swarm-coordination/references/strategies.md +0 -138
  90. package/global-skills/system-design/SKILL.md +0 -213
  91. package/global-skills/testing-patterns/SKILL.md +0 -430
  92. package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
  93. package/opencode-swarm-plugin-0.30.7.tgz +0 -0
  94. package/opencode-swarm-plugin-0.31.0.tgz +0 -0
  95. package/scripts/cleanup-test-memories.ts +0 -346
  96. package/scripts/init-skill.ts +0 -222
  97. package/scripts/migrate-unknown-sessions.ts +0 -349
  98. package/scripts/validate-skill.ts +0 -204
  99. package/src/agent-mail.ts +0 -1724
  100. package/src/anti-patterns.test.ts +0 -1167
  101. package/src/anti-patterns.ts +0 -448
  102. package/src/compaction-capture.integration.test.ts +0 -257
  103. package/src/compaction-hook.test.ts +0 -838
  104. package/src/compaction-hook.ts +0 -1204
  105. package/src/compaction-observability.integration.test.ts +0 -139
  106. package/src/compaction-observability.test.ts +0 -187
  107. package/src/compaction-observability.ts +0 -324
  108. package/src/compaction-prompt-scorers.test.ts +0 -475
  109. package/src/compaction-prompt-scoring.ts +0 -300
  110. package/src/contributor-tools.test.ts +0 -133
  111. package/src/contributor-tools.ts +0 -201
  112. package/src/dashboard.test.ts +0 -611
  113. package/src/dashboard.ts +0 -462
  114. package/src/error-enrichment.test.ts +0 -403
  115. package/src/error-enrichment.ts +0 -219
  116. package/src/eval-capture.test.ts +0 -1015
  117. package/src/eval-capture.ts +0 -929
  118. package/src/eval-gates.test.ts +0 -306
  119. package/src/eval-gates.ts +0 -218
  120. package/src/eval-history.test.ts +0 -508
  121. package/src/eval-history.ts +0 -214
  122. package/src/eval-learning.test.ts +0 -378
  123. package/src/eval-learning.ts +0 -360
  124. package/src/eval-runner.test.ts +0 -223
  125. package/src/eval-runner.ts +0 -402
  126. package/src/export-tools.test.ts +0 -476
  127. package/src/export-tools.ts +0 -257
  128. package/src/hive.integration.test.ts +0 -2241
  129. package/src/hive.ts +0 -1628
  130. package/src/index.ts +0 -940
  131. package/src/learning.integration.test.ts +0 -1815
  132. package/src/learning.ts +0 -1079
  133. package/src/logger.test.ts +0 -189
  134. package/src/logger.ts +0 -135
  135. package/src/mandate-promotion.test.ts +0 -473
  136. package/src/mandate-promotion.ts +0 -239
  137. package/src/mandate-storage.integration.test.ts +0 -601
  138. package/src/mandate-storage.test.ts +0 -578
  139. package/src/mandate-storage.ts +0 -794
  140. package/src/mandates.ts +0 -540
  141. package/src/memory-tools.test.ts +0 -195
  142. package/src/memory-tools.ts +0 -344
  143. package/src/memory.integration.test.ts +0 -334
  144. package/src/memory.test.ts +0 -158
  145. package/src/memory.ts +0 -527
  146. package/src/model-selection.test.ts +0 -188
  147. package/src/model-selection.ts +0 -68
  148. package/src/observability-tools.test.ts +0 -359
  149. package/src/observability-tools.ts +0 -871
  150. package/src/output-guardrails.test.ts +0 -438
  151. package/src/output-guardrails.ts +0 -381
  152. package/src/pattern-maturity.test.ts +0 -1160
  153. package/src/pattern-maturity.ts +0 -525
  154. package/src/planning-guardrails.test.ts +0 -491
  155. package/src/planning-guardrails.ts +0 -438
  156. package/src/plugin.ts +0 -23
  157. package/src/post-compaction-tracker.test.ts +0 -251
  158. package/src/post-compaction-tracker.ts +0 -237
  159. package/src/query-tools.test.ts +0 -636
  160. package/src/query-tools.ts +0 -324
  161. package/src/rate-limiter.integration.test.ts +0 -466
  162. package/src/rate-limiter.ts +0 -774
  163. package/src/replay-tools.test.ts +0 -496
  164. package/src/replay-tools.ts +0 -240
  165. package/src/repo-crawl.integration.test.ts +0 -441
  166. package/src/repo-crawl.ts +0 -610
  167. package/src/schemas/cell-events.test.ts +0 -347
  168. package/src/schemas/cell-events.ts +0 -807
  169. package/src/schemas/cell.ts +0 -257
  170. package/src/schemas/evaluation.ts +0 -166
  171. package/src/schemas/index.test.ts +0 -199
  172. package/src/schemas/index.ts +0 -286
  173. package/src/schemas/mandate.ts +0 -232
  174. package/src/schemas/swarm-context.ts +0 -115
  175. package/src/schemas/task.ts +0 -161
  176. package/src/schemas/worker-handoff.test.ts +0 -302
  177. package/src/schemas/worker-handoff.ts +0 -131
  178. package/src/sessions/agent-discovery.test.ts +0 -137
  179. package/src/sessions/agent-discovery.ts +0 -112
  180. package/src/sessions/index.ts +0 -15
  181. package/src/skills.integration.test.ts +0 -1192
  182. package/src/skills.test.ts +0 -643
  183. package/src/skills.ts +0 -1549
  184. package/src/storage.integration.test.ts +0 -341
  185. package/src/storage.ts +0 -884
  186. package/src/structured.integration.test.ts +0 -817
  187. package/src/structured.test.ts +0 -1046
  188. package/src/structured.ts +0 -762
  189. package/src/swarm-decompose.test.ts +0 -188
  190. package/src/swarm-decompose.ts +0 -1302
  191. package/src/swarm-deferred.integration.test.ts +0 -157
  192. package/src/swarm-deferred.test.ts +0 -38
  193. package/src/swarm-insights.test.ts +0 -214
  194. package/src/swarm-insights.ts +0 -459
  195. package/src/swarm-mail.integration.test.ts +0 -970
  196. package/src/swarm-mail.ts +0 -739
  197. package/src/swarm-orchestrate.integration.test.ts +0 -282
  198. package/src/swarm-orchestrate.test.ts +0 -548
  199. package/src/swarm-orchestrate.ts +0 -3084
  200. package/src/swarm-prompts.test.ts +0 -1270
  201. package/src/swarm-prompts.ts +0 -2077
  202. package/src/swarm-research.integration.test.ts +0 -701
  203. package/src/swarm-research.test.ts +0 -698
  204. package/src/swarm-research.ts +0 -472
  205. package/src/swarm-review.integration.test.ts +0 -285
  206. package/src/swarm-review.test.ts +0 -879
  207. package/src/swarm-review.ts +0 -709
  208. package/src/swarm-strategies.ts +0 -407
  209. package/src/swarm-worktree.test.ts +0 -501
  210. package/src/swarm-worktree.ts +0 -575
  211. package/src/swarm.integration.test.ts +0 -2377
  212. package/src/swarm.ts +0 -38
  213. package/src/tool-adapter.integration.test.ts +0 -1221
  214. package/src/tool-availability.ts +0 -461
  215. package/tsconfig.json +0 -28
package/evals/README.md DELETED
@@ -1,768 +0,0 @@
1
- # Eval-Driven Development with Progressive Gates
2
-
3
- ```
4
- ┌──────────────────────────────────────────────────────────────┐
5
- │ EVAL PIPELINE │
6
- │ │
7
- │ CAPTURE → SCORE → STORE → GATE → LEARN → IMPROVE │
8
- │ │
9
- │ Real execution data feeds back into prompt generation │
10
- └──────────────────────────────────────────────────────────────┘
11
- ```
12
-
13
- TypeScript-native evaluation framework for testing swarm task decomposition quality and coordinator discipline. Built on [Evalite](https://evalite.dev), powered by captured real-world execution data.
14
-
15
- ---
16
-
17
- ## Quick Start
18
-
19
- ```bash
20
- # Run all evals once
21
- bun run eval:run
22
-
23
- # Run specific eval suite
24
- bun run eval:decomposition # Task decomposition quality
25
- bun run eval:coordinator # Coordinator protocol compliance
26
- bun run eval:compaction # Compaction prompt quality
27
-
28
- # Check eval status (progressive gates)
29
- swarm eval status
30
-
31
- # View eval history with trends
32
- swarm eval history
33
- ```
34
-
35
- ---
36
-
37
- ## Architecture
38
-
39
- ### The Pipeline
40
-
41
- ```
42
- ┌─────────────────────────────────────────────────────────────────┐
43
- │ │
44
- │ 1. CAPTURE (Real Execution) │
45
- │ ├─ Decomposition: task, strategy, subtasks │
46
- │ ├─ Outcomes: duration, errors, retries, success │
47
- │ ├─ Coordinator Events: decisions, violations, compaction │
48
- │ └─ Store to: .opencode/eval-data.jsonl, sessions/*.jsonl │
49
- │ │
50
- │ 2. SCORE (Quality Metrics) │
51
- │ ├─ Subtask Independence (file conflicts) │
52
- │ ├─ Complexity Balance (fair work distribution) │
53
- │ ├─ Coverage Completeness (files + scope) │
54
- │ ├─ Instruction Clarity (actionable descriptions) │
55
- │ └─ Coordinator Discipline (protocol adherence) │
56
- │ │
57
- │ 3. STORE (History Tracking) │
58
- │ ├─ Record to: .opencode/eval-history.jsonl │
59
- │ ├─ Track: score, timestamp, run_count │
60
- │ └─ Calculate: phase, variance, baseline │
61
- │ │
62
- │ 4. GATE (Progressive Quality Control) │
63
- │ ├─ Bootstrap (<10 runs): Always pass, collect data │
64
- │ ├─ Stabilization (10-50 runs): Warn on >10% regression │
65
- │ └─ Production (>50 runs, variance <0.1): Fail on >5% drop │
66
- │ │
67
- │ 5. LEARN (Failure Feedback) │
68
- │ ├─ Detect: Significant score drops (>15% from baseline) │
69
- │ ├─ Store to: Semantic memory with tags │
70
- │ └─ Query: Before generating future prompts │
71
- │ │
72
- │ 6. IMPROVE (Continuous Refinement) │
73
- │ └─ Future prompts query past failures for context │
74
- │ │
75
- └─────────────────────────────────────────────────────────────────┘
76
- ```
77
-
78
- ### Progressive Gates (Phase-Based Quality Control)
79
-
80
- The eval system uses **progressive gates** that adapt based on data maturity:
81
-
82
- ```
83
- Phase Runs Variance Gate Behavior
84
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
85
- Bootstrap <10 N/A ✅ Always pass (collect data)
86
- Stabilization 10-50 N/A ⚠️ Warn on >10% regression (pass)
87
- Production >50 <0.1 ❌ Fail on >5% regression
88
- (High Variance) >50 ≥0.1 ⚠️ Stay in stabilization
89
- ```
90
-
91
- **Why progressive?**
92
-
93
- - **Bootstrap**: No baseline yet, focus on data collection
94
- - **Stabilization**: Baseline forming, tolerate noise while learning
95
- - **Production**: Stable baseline, strict quality enforcement
96
-
97
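- **Variance threshold (0.1)**: Variance measures how consistent an eval's scores are across runs. An eval whose variance is ≥0.1 is considered unstable and stays in stabilization, even past 50 runs, until its scores settle below the threshold.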
98
-
99
- **Regression calculation**:
100
- ```
101
- baseline = mean(historical_scores)
102
- regression = (baseline - current_score) / baseline
103
- ```
104
-
105
- ---
106
-
107
- ## Eval Suites
108
-
109
- ### Swarm Decomposition (`swarm-decomposition.eval.ts`)
110
-
111
- **What it measures:** Quality of task decomposition into parallel subtasks
112
-
113
- **Data sources:**
114
- - Fixtures: `fixtures/decomposition-cases.ts`
115
- - Real captures: `.opencode/eval-data.jsonl`
116
-
117
- **Scorers:**
118
-
119
- | Scorer | Weight | What It Checks | Perfect Score |
120
- | ------------------------ | ------ | ------------------------------------------------------- | ---------------------------------- |
121
- | **Subtask Independence** | 0.25 | No file overlaps between subtasks (prevents conflicts) | 0 files in multiple subtasks |
122
- | **Complexity Balance** | 0.25 | Work distributed evenly (coefficient of variation <0.3) | CV <0.3 (std dev / mean of subtask complexity) |
123
- | **Coverage** | 0.25 | Required files covered, subtask count in range | All required files + 3-6 subtasks |
124
- | **Instruction Clarity** | 0.25 | Descriptions actionable, files specified, titles clear | >20 chars, files listed, specific |
125
-
126
- **Example output:**
127
- ```
128
- swarm-decomposition
129
- ├─ subtaskIndependence: 1.0 (no conflicts)
130
- ├─ complexityBalance: 0.85 (CV: 0.22)
131
- ├─ coverageCompleteness: 1.0 (all files covered)
132
- └─ instructionClarity: 0.90 (clear, actionable)
133
- → Overall: 0.94 ✅ PASS (stabilization phase)
134
- ```
135
-
136
- ### Coordinator Session (`coordinator-session.eval.ts`)
137
-
138
- **What it measures:** Coordinator protocol adherence during swarm runs
139
-
140
- **Data sources:**
141
- - Real sessions: `~/.config/swarm-tools/sessions/*.jsonl`
142
- - Fixtures: `fixtures/coordinator-sessions.ts`
143
-
144
- **Scorers:**
145
-
146
- | Scorer | Weight | What It Checks | Perfect Score |
147
- | ---------------------------- | ------ | -------------------------------------------------- | -------------------- |
148
- | **Violation Count** | 0.30 | Protocol violations (edit files, run tests, etc.) | 0 violations |
149
- | **Spawn Efficiency** | 0.25 | Workers spawned / subtasks planned | 100% (all delegated) |
150
- | **Review Thoroughness** | 0.25 | Reviews completed / workers finished | 100% (all reviewed) |
151
- | **Time to First Spawn** | 0.20 | Speed from decomposition to first worker spawn | <60 seconds |
152
- | **Overall Discipline** (composite) | 1.00 | Weighted composite of above | 1.0 (perfect) |
153
-
154
- **Violations tracked:**
155
- - `coordinator_edited_file` - Coordinator should NEVER edit directly
156
- - `coordinator_ran_tests` - Workers run tests, not coordinator
157
- - `coordinator_reserved_files` - Only workers reserve files
158
- - `no_worker_spawned` - Coordinator must delegate, not do work itself
159
-
160
- **Example output:**
161
- ```
162
- coordinator-behavior
163
- ├─ violationCount: 1.0 (0 violations)
164
- ├─ spawnEfficiency: 1.0 (3/3 workers spawned)
165
- ├─ reviewThoroughness: 0.67 (2/3 reviewed)
166
- └─ timeToFirstSpawn: 0.90 (45 seconds)
167
- → overallDiscipline: 0.90 ✅ PASS (bootstrap phase, collecting data)
168
- ```
169
-
170
- #### Coordinator Session Capture (Deep Dive)
171
-
172
- **How it works:** Session capture is fully automatic when coordinator tools are used. No manual instrumentation needed.
173
-
174
- **Capture flow:**
175
-
176
- ```
177
- ┌─────────────────────────────────────────────────────────────┐
178
- │ SESSION CAPTURE FLOW │
179
- │ │
180
- │ 1. Coordinator tool call detected │
181
- │ ├─ swarm_decompose, hive_create_epic, etc. │
182
- │ └─ Tool name + args inspected in real-time │
183
- │ │
184
- │ 2. Violation detection (planning-guardrails.ts) │
185
- │ ├─ detectCoordinatorViolation() checks patterns │
186
- │ ├─ Edit/Write tools → coordinator_edited_file │
187
- │ ├─ bash with test patterns → coordinator_ran_tests │
188
- │ └─ swarmmail_reserve → coordinator_reserved_files │
189
- │ │
190
- │ 3. Event emission (eval-capture.ts) │
191
- │ ├─ captureCoordinatorEvent() validates via Zod │
192
- │ ├─ Appends JSONL line to session file │
193
- │ └─ ~/.config/swarm-tools/sessions/{session_id}.jsonl │
194
- │ │
195
- │ 4. Eval consumption (coordinator-session.eval.ts) │
196
- │ ├─ loadCapturedSessions() reads all *.jsonl files │
197
- │ ├─ Parses events, reconstructs sessions │
198
- │ └─ Scorers analyze event sequences │
199
- │ │
200
- └─────────────────────────────────────────────────────────────┘
201
- ```
202
-
203
- **Event types:**
204
-
205
- | Event Type | Subtypes | When Captured |
206
- | -------------- | --------------------------------------------------------------------- | ------------------------------------ |
207
- | `DECISION` | strategy_selected, worker_spawned, review_completed, decomposition_complete | Coordinator makes decision |
208
- | `VIOLATION` | coordinator_edited_file, coordinator_ran_tests, coordinator_reserved_files, no_worker_spawned | Protocol violation detected |
209
- | `OUTCOME` | subtask_success, subtask_retry, subtask_failed, epic_complete | Worker completes or epic finishes |
210
- | `COMPACTION` | detection_complete, prompt_generated, context_injected, resumption_started, tool_call_tracked | Compaction lifecycle events |
211
-
212
- **Violation detection patterns** (from `planning-guardrails.ts`):
213
-
214
- ```typescript
215
- // File modification detection
216
- VIOLATION_PATTERNS.FILE_MODIFICATION_TOOLS = ["edit", "write"];
217
-
218
- // Test execution detection (regex patterns in bash commands)
219
- VIOLATION_PATTERNS.TEST_EXECUTION_PATTERNS = [
220
- /\bbun\s+test\b/i,
221
- /\bnpm\s+(run\s+)?test/i,
222
- /\bjest\b/i,
223
- /\bvitest\b/i,
224
- // ... and 6 more patterns
225
- ];
226
-
227
- // File reservation detection
228
- VIOLATION_PATTERNS.RESERVATION_TOOLS = ["swarmmail_reserve", "agentmail_reserve"];
229
- ```
230
-
231
- **Example session file** (`~/.config/swarm-tools/sessions/session-abc123.jsonl`):
232
-
233
- ```jsonl
234
- {"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:00:00Z","event_type":"DECISION","decision_type":"strategy_selected","payload":{"strategy":"feature-based"}}
235
- {"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:01:00Z","event_type":"DECISION","decision_type":"decomposition_complete","payload":{"subtask_count":3}}
236
- {"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:02:00Z","event_type":"DECISION","decision_type":"worker_spawned","payload":{"worker_id":"SwiftFire","bead_id":"mjkw81rkq4c.1"}}
237
- {"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:05:00Z","event_type":"VIOLATION","violation_type":"coordinator_edited_file","payload":{"tool":"edit","file":"src/auth.ts"}}
238
- {"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:10:00Z","event_type":"OUTCOME","outcome_type":"subtask_success","payload":{"bead_id":"mjkw81rkq4c.1","duration_ms":480000}}
239
- ```
240
-
241
- **Viewing sessions:**
242
-
243
- ```bash
244
- # List all captured sessions (coming soon)
245
- swarm log sessions
246
-
247
- # View specific session events
248
- cat ~/.config/swarm-tools/sessions/session-abc123.jsonl | jq .
249
-
250
- # Filter to violations only
251
- cat ~/.config/swarm-tools/sessions/*.jsonl | jq 'select(.event_type == "VIOLATION")'
252
-
253
- # Count violations by type
254
- cat ~/.config/swarm-tools/sessions/*.jsonl | jq -r 'select(.event_type == "VIOLATION") | .violation_type' | sort | uniq -c
255
- ```
256
-
257
- **Why JSONL format?**
258
-
259
- - **Append-only**: No file locking, safe for concurrent writes
260
- - **Streamable**: Process events one-by-one without loading full file
261
- - **Line-oriented**: Easy to `grep`, `jq`, `tail -f` for live monitoring
262
- - **Fault-tolerant**: Corrupted line doesn't break entire file
263
-
264
- **Integration points:**
265
-
266
- | Where | What Gets Captured | File |
267
- | -------------------------- | ----------------------------------------- | ----------------------- |
268
- | `swarm_decompose` | DECISION: strategy_selected, decomposition_complete | sessions/*.jsonl |
269
- | `swarm_spawn_subtask` | DECISION: worker_spawned | sessions/*.jsonl |
270
- | `swarm_review` | DECISION: review_completed | sessions/*.jsonl |
271
- | `swarm_complete` | OUTCOME: subtask_success/failed | sessions/*.jsonl |
272
- | Tool call inspection | VIOLATION: (real-time pattern matching) | sessions/*.jsonl |
273
- | Compaction hook | COMPACTION: (all lifecycle stages) | sessions/*.jsonl |
274
-
275
- **Source files:**
276
-
277
- - **Schema**: `src/eval-capture.ts` - CoordinatorEventSchema (Zod discriminated union)
278
- - **Violation detection**: `src/planning-guardrails.ts` - detectCoordinatorViolation()
279
- - **Capture**: `src/eval-capture.ts` - captureCoordinatorEvent()
280
- - **Scorers**: `evals/scorers/coordinator-discipline.ts` - violationCount, spawnEfficiency, etc.
281
- - **Eval**: `evals/coordinator-session.eval.ts` - Real sessions + fixtures
282
-
283
- ### Compaction Prompt (`compaction-prompt.eval.ts`)
284
-
285
- **What it measures:** Quality of continuation prompts after context compaction
286
-
287
- **Data sources:**
288
- - Captured compaction events from session files
289
- - Test fixtures with known-good/bad prompts
290
-
291
- **Scorers:**
292
-
293
- | Scorer | Weight | What It Checks | Perfect Score |
294
- | -------------------------------- | ------ | --------------------------------------------------------- | -------------------------------- |
295
- | **Epic ID Specificity** | 0.20 | Real IDs (`mjkw...`) not placeholders (`<epic-id>`, `bd-xxx`) | Real epic ID present |
296
- | **Actionability** | 0.20 | Tool calls with real values (swarm_status with epic ID) | Actionable tool with real values |
297
- | **Coordinator Identity** | 0.25 | ASCII header + strong mandates (NEVER/ALWAYS) | ASCII box + strong language |
298
- | **Forbidden Tools Listed** | 0.15 | Lists Edit, Write, swarmmail_reserve, git commit by name | 4/4 forbidden tools listed |
299
- | **Post-Compaction Discipline** | 0.20 | First suggested tool is swarm_status or inbox (not Edit) | First tool correct |
300
-
301
- **Why these metrics?**
302
-
303
- Post-compaction coordinators often "wake up" confused:
304
- - Forget they're coordinators → start editing files
305
- - Use placeholders → can't check actual status
306
- - Weak language → ignore mandates
307
- - Wrong first tool → dive into code instead of checking workers
308
-
309
- **Example output:**
310
- ```
311
- compaction-prompt
312
- ├─ epicIdSpecificity: 1.0 (real ID: mjkw81rkq4c)
313
- ├─ actionability: 1.0 (swarm_status with real epic ID)
314
- ├─ coordinatorIdentity: 1.0 (ASCII header + NEVER/ALWAYS)
315
- ├─ forbiddenToolsPresent: 1.0 (4/4 tools listed)
316
- └─ postCompactionDiscipline: 1.0 (first tool: swarm_status)
317
- → Overall: 1.0 ✅ PASS (production phase)
318
- ```
319
-
320
- ---
321
-
322
- ## Data Capture
323
-
324
- ### What Gets Captured
325
-
326
- **Decomposition Events** (`.opencode/eval-data.jsonl`):
327
- ```jsonl
328
- {
329
- "id": "mjkw81rkq4c",
330
- "timestamp": "2025-01-01T12:00:00Z",
331
- "task": "Add OAuth authentication",
332
- "strategy": "feature-based",
333
- "epic_title": "OAuth Implementation",
334
- "subtasks": [...],
335
- "outcomes": [...]
336
- }
337
- ```
338
-
339
- **Coordinator Sessions** (`~/.config/swarm-tools/sessions/<session-id>.jsonl`):
340
- ```jsonl
341
- {"event_type": "DECISION", "decision_type": "strategy_selected", ...}
342
- {"event_type": "DECISION", "decision_type": "worker_spawned", ...}
343
- {"event_type": "VIOLATION", "violation_type": "coordinator_edited_file", ...}
344
- {"event_type": "COMPACTION", "compaction_type": "prompt_generated", ...}
345
- ```
346
-
347
- **Eval History** (`.opencode/eval-history.jsonl`):
348
- ```jsonl
349
- {"timestamp": "...", "eval_name": "swarm-decomposition", "score": 0.92, "run_count": 15}
350
- ```
351
-
352
- ### Capture Points (Automatic)
353
-
354
- | Integration Point | What Gets Captured | File |
355
- | -------------------------- | ------------------------------------- | ----------------------- |
356
- | `swarm_decompose` | Task, strategy, subtasks | eval-data.jsonl |
357
- | `swarm_complete` | Outcome signals (duration, errors) | eval-data.jsonl |
358
- | `swarm_record_outcome` | Learning signals | swarm-mail database |
359
- | Coordinator spawn | Worker spawn event | sessions/*.jsonl |
360
- | Coordinator review | Review decision | sessions/*.jsonl |
361
- | Compaction hook | Prompt content, detection results | sessions/*.jsonl |
362
- | Evalite runner | Score, baseline, phase | eval-history.jsonl |
363
-
364
- ---
365
-
366
- ## CLI Commands
367
-
368
- ### `swarm eval status [eval-name]`
369
-
370
- Shows current phase, gate thresholds, and recent scores with sparklines.
371
-
372
- ```bash
373
- $ swarm eval status swarm-decomposition
374
-
375
- ┌─────────────────────────────────────────────────────────────┐
376
- │ Eval: swarm-decomposition │
377
- │ Phase: 🚀 Production (53 runs, variance: 0.08) │
378
- │ │
379
- │ Gate Thresholds: │
380
- │ ├─ Stabilization: >10% regression (warn) │
381
- │ └─ Production: >5% regression (fail) │
382
- │ │
383
- │ Recent Scores (last 10 runs): │
384
- │ 0.92 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 2025-01-01 12:00 │
385
- │ 0.89 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 2025-01-01 11:30 │
386
- │ 0.94 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 2025-01-01 11:00 │
387
- │ Baseline: 0.91 | Variance: 0.08 | Trend: ↗ │
388
- └─────────────────────────────────────────────────────────────┘
389
- ```
390
-
391
- **Phase indicators:**
392
- - 🌱 Bootstrap - Collecting data
393
- - ⚙️ Stabilization - Learning baseline
394
- - 🚀 Production - Enforcing quality
395
-
396
- ### `swarm eval history`
397
-
398
- Shows eval run history grouped by eval name with trends and color-coded scores.
399
-
400
- ```bash
401
- $ swarm eval history
402
-
403
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
404
- swarm-decomposition (53 runs)
405
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
406
- Run #53 0.92 2025-01-01 12:00:00 ✅ PASS
407
- Run #52 0.89 2025-01-01 11:30:00 ✅ PASS
408
- Run #51 0.94 2025-01-01 11:00:00 ✅ PASS
409
- ...
410
- Sparkline: ▁▂▃▄▅▆▇█▇▆▅▄▃▂▁▂▃▄▅▆▇█
411
- Trend: ↗ (improving)
412
-
413
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
414
- coordinator-behavior (8 runs)
415
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
416
- Run #8 0.85 2025-01-01 10:00:00 ⚠️ WARN
417
- Run #7 0.91 2025-01-01 09:30:00 ✅ PASS
418
- ...
419
- ```
420
-
421
- **Color coding:**
422
- - 🟢 Green: ≥0.8 (pass/high score)
423
- - 🟡 Yellow: ≥0.6 and <0.8 (warning/medium score)
424
- - 🔴 Red: <0.6 (fail/low score)
425
-
426
- ### `swarm eval run` (Stub)
427
-
428
- Placeholder: running evals directly from the CLI is not implemented yet. Use `bun run eval:run` in the meantime.
429
-
430
- ---
431
-
432
- ## CI Integration
433
-
434
- ### GitHub Actions Workflow
435
-
436
- Progressive gates integrate with CI for PR checks:
437
-
438
- ```yaml
439
- # .github/workflows/eval-check.yml
440
- name: Eval Quality Gate
441
-
442
- on:
443
- pull_request:
444
- branches: [main]
445
-
446
- jobs:
447
- eval-gate:
448
- runs-on: ubuntu-latest
449
- steps:
450
- - uses: actions/checkout@v4
451
-
452
- - name: Setup Bun
453
- uses: oven-sh/setup-bun@v1
454
-
455
- - name: Install deps
456
- run: bun install
457
-
458
- - name: Run evals
459
- run: bun run eval:run
460
-
461
- - name: Check gates
462
- run: |
463
- # Check if any eval failed production gate
464
- swarm eval status | grep "FAIL" && exit 1 || exit 0
465
-
466
- - name: Post PR comment
467
- if: failure()
468
- uses: actions/github-script@v7
469
- with:
470
- script: |
471
- // Post detailed gate failure to PR
472
- const status = await exec.getExecOutput('swarm eval status');
473
- github.rest.issues.createComment({
474
- issue_number: context.issue.number,
475
- body: `## ❌ Eval Gate Failed\n\n\`\`\`\n${status.stdout}\n\`\`\``
476
- });
477
- ```
478
-
479
- **Gate behavior in CI:**
480
- - Bootstrap: Always pass (collecting data)
481
- - Stabilization: Pass but warn on >10% regression
482
- - Production: **FAIL PR** on >5% regression
483
-
484
- ---
485
-
486
- ## Writing New Evals
487
-
488
- ### 1. Create Eval File
489
-
490
- ```typescript
491
- // evals/my-feature.eval.ts
492
- import { evalite } from "evalite";
493
- import { createScorer } from "evalite";
494
-
495
- // Define your scorer
496
- const myScorer = createScorer({
497
- name: "My Quality Metric",
498
- description: "Checks if feature meets quality bar",
499
- scorer: async ({ output, expected, input }) => {
500
- // Implement scoring logic
501
- const score = /* calculate 0-1 score */;
502
- return {
503
- score,
504
- message: "Details about score"
505
- };
506
- },
507
- });
508
-
509
- // Define the eval
510
- evalite("My Feature Quality", {
511
- data: async () => {
512
- // Load test cases
513
- return [
514
- {
515
- input: "test input",
516
- expected: { /* expected structure */ },
517
- },
518
- ];
519
- },
520
- task: async (input) => {
521
- // Call your system under test
522
- const output = await myFeature(input);
523
- return output;
524
- },
525
- scorers: [myScorer],
526
- });
527
- ```
528
-
529
- ### 2. Add to Package Scripts
530
-
531
- ```json
532
- {
533
- "scripts": {
534
- "eval:my-feature": "bunx evalite run evals/my-feature.eval.ts"
535
- }
536
- }
537
- ```
538
-
539
- ### 3. Add Capture Points
540
-
541
- Wire your feature to capture real execution data:
542
-
543
- ```typescript
544
- import { captureMyFeatureEvent } from "./eval-capture.js";
545
-
546
- async function myFeature(input) {
547
- const startTime = Date.now();
548
-
549
- try {
550
- const result = await doWork(input);
551
-
552
- // Capture success
553
- captureMyFeatureEvent({
554
- input,
555
- output: result,
556
- duration_ms: Date.now() - startTime,
557
- success: true,
558
- });
559
-
560
- return result;
561
- } catch (error) {
562
- // Capture failure
563
- captureMyFeatureEvent({
564
- input,
565
- error: error.message,
566
- duration_ms: Date.now() - startTime,
567
- success: false,
568
- });
569
- throw error;
570
- }
571
- }
572
- ```
573
-
574
- ### 4. Test Locally
575
-
576
- ```bash
577
- # Run your eval
578
- bun run eval:my-feature
579
-
580
- # Check status
581
- swarm eval status my-feature
582
-
583
- # View history
584
- swarm eval history
585
- ```
586
-
587
- ---
588
-
589
- ## Scorer Reference
590
-
591
- ### Scorer Pattern (Evalite v1.0)
592
-
593
- **IMPORTANT**: Evalite scorers are **async functions**, not objects with a `.scorer` property.
594
-
595
- ```typescript
596
- import { createScorer } from "evalite";
597
-
598
- // CORRECT ✅
599
- const myScorer = createScorer({
600
- name: "My Scorer",
601
- description: "What it measures",
602
- scorer: async ({ output, expected, input }) => {
603
- return { score: 0.8, message: "Details" };
604
- },
605
- });
606
-
607
- // Use in eval
608
- evalite("test", {
609
- scorers: [myScorer], // Pass the scorer directly
610
- });
611
-
612
- // In composite scorers
613
- const result = await childScorer({ output, expected, input });
614
- const score = result.score ?? 0;
615
- ```
616
-
617
- **WRONG ❌**:
618
- ```typescript
619
- // Don't do this - .scorer property doesn't exist
620
- const result = childScorer.scorer({ output, expected }); // ❌
621
- ```
622
-
623
- ### Custom Scorer Template
624
-
625
- ```typescript
626
- export const myCustomScorer = createScorer({
627
- name: "My Custom Metric",
628
- description: "Detailed description of what this measures and why it matters",
629
- scorer: async ({ output, expected, input }) => {
630
- // 1. Parse output
631
- let data;
632
- try {
633
- data = typeof output === "string" ? JSON.parse(output) : output;
634
- } catch {
635
- return { score: 0, message: "Invalid output format" };
636
- }
637
-
638
- // 2. Calculate score (0-1 range)
639
- const score = calculateYourMetric(data, expected);
640
-
641
- // 3. Return with detailed message
642
- return {
643
- score,
644
- message: `Score: ${score.toFixed(2)} - ${getExplanation(score)}`,
645
- };
646
- },
647
- });
648
- ```
649
-
650
- ---
651
-
652
- ## Troubleshooting
653
-
654
- ### "No eval history found"
655
-
656
- **Cause:** No evals have been run yet, or `.opencode/eval-history.jsonl` is missing.
657
-
658
- **Fix:**
659
- ```bash
660
- # Run an eval to create history
661
- bun run eval:decomposition
662
- swarm eval status # Should show bootstrap phase
663
- ```
664
-
665
- ### "Phase stuck in stabilization despite >50 runs"
666
-
667
- **Cause:** High variance (≥0.1). Scores are not consistent enough for the production phase.
668
-
669
- **Fix:** Investigate why scores fluctuate:
670
- ```bash
671
- # Check variance
672
- swarm eval status my-eval # Shows variance value
673
-
674
- # View score history to spot outliers
675
- swarm eval history
676
-
677
- # Common causes:
678
- # - Eval depends on external state (network, filesystem)
679
- # - Non-deterministic scoring logic
680
- # - Input data changing between runs
681
- ```
682
-
683
- ### "Gate failing on minor changes"
684
-
685
- **Cause:** Production phase threshold (5%) too strict for your use case.
686
-
687
- **Fix:** Adjust threshold in eval code:
688
- ```typescript
689
- import { checkGate } from "../src/eval-gates.js";
690
-
691
- const result = checkGate(projectPath, evalName, score, {
692
- productionThreshold: 0.10, // 10% instead of 5%
693
- });
694
- ```
695
-
696
- ### "Evalite not finding my eval file"
697
-
698
- **Cause:** The file does not match the `*.eval.ts` pattern or is not in the `evals/` directory.
699
-
700
- **Fix:**
701
- ```bash
702
- # Ensure file is named correctly
703
- mv evals/my-test.ts evals/my-test.eval.ts
704
-
705
- # Verify discovery
706
- bunx evalite run evals/ # Should list your eval
707
- ```
708
-
709
- ### "Scorers returning undefined"
710
-
711
- **Cause:** An async scorer was called without `await`, or the code accesses a `.scorer` property (which doesn't exist).
712
-
713
- **Fix:**
714
- ```typescript
715
- // CORRECT ✅
716
- const result = await myScorer({ output, expected, input });
717
- const score = result.score ?? 0;
718
-
719
- // WRONG ❌
720
- const result = myScorer.scorer({ output, expected }); // .scorer doesn't exist
721
- ```
722
-
723
- ---
724
-
725
- ## File Structure
726
-
727
- ```
728
- evals/
729
- ├── README.md # This file
730
- ├── evalite.config.ts # Evalite configuration
731
-
732
- ├── fixtures/
733
- │ ├── decomposition-cases.ts # Test cases for decomposition
734
- │ ├── coordinator-sessions.ts # Known good/bad coordinator sessions
735
- │ └── compaction-prompts.ts # Sample compaction prompts
736
-
737
- ├── lib/
738
- │ ├── data-loader.ts # Load eval data from JSONL files
739
- │ └── test-helpers.ts # Shared test utilities
740
-
741
- ├── scorers/
742
- │ ├── decomposition-scorers.ts # Subtask quality scorers
743
- │ ├── coordinator-scorers.ts # Protocol adherence scorers
744
- │ └── compaction-prompt-scorers.ts # Prompt quality scorers
745
-
746
- ├── swarm-decomposition.eval.ts # Decomposition quality eval
747
- ├── coordinator-session.eval.ts # Coordinator discipline eval
748
- ├── compaction-prompt.eval.ts # Compaction prompt quality eval
749
- └── example.eval.ts # Sanity check / template
750
- ```
751
-
752
- **Data locations:**
753
- - `.opencode/eval-data.jsonl` - Decomposition captures
754
- - `.opencode/eval-history.jsonl` - Score history
755
- - `~/.config/swarm-tools/sessions/*.jsonl` - Coordinator sessions
756
-
757
- ---
758
-
759
- ## Further Reading
760
-
761
- - **[Evalite Docs](https://evalite.dev)** - Evaluation framework
762
- - **[Progressive Gates Implementation](../src/eval-gates.ts)** - Phase-based quality control
763
- - **[Learning Feedback Loop](../src/eval-learning.ts)** - Auto-store failures to memory
764
- - **[Data Capture](../src/eval-capture.ts)** - Real execution tracking
765
- - **[Compaction Scorers](../src/compaction-prompt-scoring.ts)** - Pure scoring functions
766
-
767
- > _"Measure outcomes, not outputs. The system that learns from failure beats the system that avoids it."_
768
- > — Inspired by Site Reliability Engineering principles