opencode-swarm-plugin 0.44.0 → 0.44.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. package/bin/swarm.serve.test.ts +6 -4
  2. package/bin/swarm.ts +18 -12
  3. package/dist/compaction-prompt-scoring.js +139 -0
  4. package/dist/eval-capture.js +12811 -0
  5. package/dist/hive.d.ts.map +1 -1
  6. package/dist/hive.js +14834 -0
  7. package/dist/index.d.ts +18 -0
  8. package/dist/index.d.ts.map +1 -1
  9. package/dist/index.js +7743 -62593
  10. package/dist/plugin.js +24052 -78907
  11. package/dist/swarm-orchestrate.d.ts.map +1 -1
  12. package/dist/swarm-prompts.d.ts.map +1 -1
  13. package/dist/swarm-prompts.js +39407 -0
  14. package/dist/swarm-review.d.ts.map +1 -1
  15. package/dist/swarm-validation.d.ts +127 -0
  16. package/dist/swarm-validation.d.ts.map +1 -0
  17. package/dist/validators/index.d.ts +7 -0
  18. package/dist/validators/index.d.ts.map +1 -0
  19. package/dist/validators/schema-validator.d.ts +58 -0
  20. package/dist/validators/schema-validator.d.ts.map +1 -0
  21. package/package.json +17 -5
  22. package/.changeset/swarm-insights-data-layer.md +0 -63
  23. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
  24. package/.hive/analysis/session-data-quality-audit.md +0 -320
  25. package/.hive/eval-results.json +0 -483
  26. package/.hive/issues.jsonl +0 -138
  27. package/.hive/memories.jsonl +0 -729
  28. package/.opencode/eval-history.jsonl +0 -327
  29. package/.turbo/turbo-build.log +0 -9
  30. package/CHANGELOG.md +0 -2286
  31. package/SCORER-ANALYSIS.md +0 -598
  32. package/docs/analysis/subagent-coordination-patterns.md +0 -902
  33. package/docs/analysis-socratic-planner-pattern.md +0 -504
  34. package/docs/planning/ADR-001-monorepo-structure.md +0 -171
  35. package/docs/planning/ADR-002-package-extraction.md +0 -393
  36. package/docs/planning/ADR-003-performance-improvements.md +0 -451
  37. package/docs/planning/ADR-004-message-queue-features.md +0 -187
  38. package/docs/planning/ADR-005-devtools-observability.md +0 -202
  39. package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
  40. package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
  41. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
  42. package/docs/planning/ADR-010-cass-inhousing.md +0 -1215
  43. package/docs/planning/ROADMAP.md +0 -368
  44. package/docs/semantic-memory-cli-syntax.md +0 -123
  45. package/docs/swarm-mail-architecture.md +0 -1147
  46. package/docs/testing/context-recovery-test.md +0 -470
  47. package/evals/ARCHITECTURE.md +0 -1189
  48. package/evals/README.md +0 -768
  49. package/evals/compaction-prompt.eval.ts +0 -149
  50. package/evals/compaction-resumption.eval.ts +0 -289
  51. package/evals/coordinator-behavior.eval.ts +0 -307
  52. package/evals/coordinator-session.eval.ts +0 -154
  53. package/evals/evalite.config.ts.bak +0 -15
  54. package/evals/example.eval.ts +0 -31
  55. package/evals/fixtures/cass-baseline.ts +0 -217
  56. package/evals/fixtures/compaction-cases.ts +0 -350
  57. package/evals/fixtures/compaction-prompt-cases.ts +0 -311
  58. package/evals/fixtures/coordinator-sessions.ts +0 -328
  59. package/evals/fixtures/decomposition-cases.ts +0 -105
  60. package/evals/lib/compaction-loader.test.ts +0 -248
  61. package/evals/lib/compaction-loader.ts +0 -320
  62. package/evals/lib/data-loader.evalite-test.ts +0 -289
  63. package/evals/lib/data-loader.test.ts +0 -345
  64. package/evals/lib/data-loader.ts +0 -281
  65. package/evals/lib/llm.ts +0 -115
  66. package/evals/scorers/compaction-prompt-scorers.ts +0 -145
  67. package/evals/scorers/compaction-scorers.ts +0 -305
  68. package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
  69. package/evals/scorers/coordinator-discipline.ts +0 -325
  70. package/evals/scorers/index.test.ts +0 -146
  71. package/evals/scorers/index.ts +0 -328
  72. package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
  73. package/evals/scorers/outcome-scorers.ts +0 -349
  74. package/evals/swarm-decomposition.eval.ts +0 -121
  75. package/examples/commands/swarm.md +0 -745
  76. package/examples/plugin-wrapper-template.ts +0 -2515
  77. package/examples/skills/hive-workflow/SKILL.md +0 -212
  78. package/examples/skills/skill-creator/SKILL.md +0 -223
  79. package/examples/skills/swarm-coordination/SKILL.md +0 -292
  80. package/global-skills/cli-builder/SKILL.md +0 -344
  81. package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
  82. package/global-skills/learning-systems/SKILL.md +0 -644
  83. package/global-skills/skill-creator/LICENSE.txt +0 -202
  84. package/global-skills/skill-creator/SKILL.md +0 -352
  85. package/global-skills/skill-creator/references/output-patterns.md +0 -82
  86. package/global-skills/skill-creator/references/workflows.md +0 -28
  87. package/global-skills/swarm-coordination/SKILL.md +0 -995
  88. package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
  89. package/global-skills/swarm-coordination/references/strategies.md +0 -138
  90. package/global-skills/system-design/SKILL.md +0 -213
  91. package/global-skills/testing-patterns/SKILL.md +0 -430
  92. package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
  93. package/opencode-swarm-plugin-0.30.7.tgz +0 -0
  94. package/opencode-swarm-plugin-0.31.0.tgz +0 -0
  95. package/scripts/cleanup-test-memories.ts +0 -346
  96. package/scripts/init-skill.ts +0 -222
  97. package/scripts/migrate-unknown-sessions.ts +0 -349
  98. package/scripts/validate-skill.ts +0 -204
  99. package/src/agent-mail.ts +0 -1724
  100. package/src/anti-patterns.test.ts +0 -1167
  101. package/src/anti-patterns.ts +0 -448
  102. package/src/compaction-capture.integration.test.ts +0 -257
  103. package/src/compaction-hook.test.ts +0 -838
  104. package/src/compaction-hook.ts +0 -1204
  105. package/src/compaction-observability.integration.test.ts +0 -139
  106. package/src/compaction-observability.test.ts +0 -187
  107. package/src/compaction-observability.ts +0 -324
  108. package/src/compaction-prompt-scorers.test.ts +0 -475
  109. package/src/compaction-prompt-scoring.ts +0 -300
  110. package/src/contributor-tools.test.ts +0 -133
  111. package/src/contributor-tools.ts +0 -201
  112. package/src/dashboard.test.ts +0 -611
  113. package/src/dashboard.ts +0 -462
  114. package/src/error-enrichment.test.ts +0 -403
  115. package/src/error-enrichment.ts +0 -219
  116. package/src/eval-capture.test.ts +0 -1015
  117. package/src/eval-capture.ts +0 -929
  118. package/src/eval-gates.test.ts +0 -306
  119. package/src/eval-gates.ts +0 -218
  120. package/src/eval-history.test.ts +0 -508
  121. package/src/eval-history.ts +0 -214
  122. package/src/eval-learning.test.ts +0 -378
  123. package/src/eval-learning.ts +0 -360
  124. package/src/eval-runner.test.ts +0 -223
  125. package/src/eval-runner.ts +0 -402
  126. package/src/export-tools.test.ts +0 -476
  127. package/src/export-tools.ts +0 -257
  128. package/src/hive.integration.test.ts +0 -2241
  129. package/src/hive.ts +0 -1628
  130. package/src/index.ts +0 -940
  131. package/src/learning.integration.test.ts +0 -1815
  132. package/src/learning.ts +0 -1079
  133. package/src/logger.test.ts +0 -189
  134. package/src/logger.ts +0 -135
  135. package/src/mandate-promotion.test.ts +0 -473
  136. package/src/mandate-promotion.ts +0 -239
  137. package/src/mandate-storage.integration.test.ts +0 -601
  138. package/src/mandate-storage.test.ts +0 -578
  139. package/src/mandate-storage.ts +0 -794
  140. package/src/mandates.ts +0 -540
  141. package/src/memory-tools.test.ts +0 -195
  142. package/src/memory-tools.ts +0 -344
  143. package/src/memory.integration.test.ts +0 -334
  144. package/src/memory.test.ts +0 -158
  145. package/src/memory.ts +0 -527
  146. package/src/model-selection.test.ts +0 -188
  147. package/src/model-selection.ts +0 -68
  148. package/src/observability-tools.test.ts +0 -359
  149. package/src/observability-tools.ts +0 -871
  150. package/src/output-guardrails.test.ts +0 -438
  151. package/src/output-guardrails.ts +0 -381
  152. package/src/pattern-maturity.test.ts +0 -1160
  153. package/src/pattern-maturity.ts +0 -525
  154. package/src/planning-guardrails.test.ts +0 -491
  155. package/src/planning-guardrails.ts +0 -438
  156. package/src/plugin.ts +0 -23
  157. package/src/post-compaction-tracker.test.ts +0 -251
  158. package/src/post-compaction-tracker.ts +0 -237
  159. package/src/query-tools.test.ts +0 -636
  160. package/src/query-tools.ts +0 -324
  161. package/src/rate-limiter.integration.test.ts +0 -466
  162. package/src/rate-limiter.ts +0 -774
  163. package/src/replay-tools.test.ts +0 -496
  164. package/src/replay-tools.ts +0 -240
  165. package/src/repo-crawl.integration.test.ts +0 -441
  166. package/src/repo-crawl.ts +0 -610
  167. package/src/schemas/cell-events.test.ts +0 -347
  168. package/src/schemas/cell-events.ts +0 -807
  169. package/src/schemas/cell.ts +0 -257
  170. package/src/schemas/evaluation.ts +0 -166
  171. package/src/schemas/index.test.ts +0 -199
  172. package/src/schemas/index.ts +0 -286
  173. package/src/schemas/mandate.ts +0 -232
  174. package/src/schemas/swarm-context.ts +0 -115
  175. package/src/schemas/task.ts +0 -161
  176. package/src/schemas/worker-handoff.test.ts +0 -302
  177. package/src/schemas/worker-handoff.ts +0 -131
  178. package/src/sessions/agent-discovery.test.ts +0 -137
  179. package/src/sessions/agent-discovery.ts +0 -112
  180. package/src/sessions/index.ts +0 -15
  181. package/src/skills.integration.test.ts +0 -1192
  182. package/src/skills.test.ts +0 -643
  183. package/src/skills.ts +0 -1549
  184. package/src/storage.integration.test.ts +0 -341
  185. package/src/storage.ts +0 -884
  186. package/src/structured.integration.test.ts +0 -817
  187. package/src/structured.test.ts +0 -1046
  188. package/src/structured.ts +0 -762
  189. package/src/swarm-decompose.test.ts +0 -188
  190. package/src/swarm-decompose.ts +0 -1302
  191. package/src/swarm-deferred.integration.test.ts +0 -157
  192. package/src/swarm-deferred.test.ts +0 -38
  193. package/src/swarm-insights.test.ts +0 -214
  194. package/src/swarm-insights.ts +0 -459
  195. package/src/swarm-mail.integration.test.ts +0 -970
  196. package/src/swarm-mail.ts +0 -739
  197. package/src/swarm-orchestrate.integration.test.ts +0 -282
  198. package/src/swarm-orchestrate.test.ts +0 -548
  199. package/src/swarm-orchestrate.ts +0 -3084
  200. package/src/swarm-prompts.test.ts +0 -1270
  201. package/src/swarm-prompts.ts +0 -2077
  202. package/src/swarm-research.integration.test.ts +0 -701
  203. package/src/swarm-research.test.ts +0 -698
  204. package/src/swarm-research.ts +0 -472
  205. package/src/swarm-review.integration.test.ts +0 -285
  206. package/src/swarm-review.test.ts +0 -879
  207. package/src/swarm-review.ts +0 -709
  208. package/src/swarm-strategies.ts +0 -407
  209. package/src/swarm-worktree.test.ts +0 -501
  210. package/src/swarm-worktree.ts +0 -575
  211. package/src/swarm.integration.test.ts +0 -2377
  212. package/src/swarm.ts +0 -38
  213. package/src/tool-adapter.integration.test.ts +0 -1221
  214. package/src/tool-availability.ts +0 -461
  215. package/tsconfig.json +0 -28
@@ -1,1189 +0,0 @@
1
- # Eval Infrastructure Architecture Analysis
2
-
3
- ```
4
- ┌─────────────────────────────────────────────────────────────────────┐
5
- │ EVAL INFRASTRUCTURE FLOW │
6
- │ │
7
- │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌────────┐ │
8
- │ │ CAPTURE │─────▶│ STORE │─────▶│ LOAD │─────▶│ EVAL │ │
9
- │ └──────────┘ └──────────┘ └──────────┘ └────────┘ │
10
- │ │ │ │ │
11
- │ │ Tool calls │ Data loaders │ │
12
- │ │ Violations │ Fixtures │ │
13
- │ │ Outcomes │ │ │
14
- │ ▼ ▼ ▼ │
15
- │ [sessions/*.jsonl] [PGlite eval_records] [Scorers]│
16
- │ [eval-data.jsonl] [Fixtures] [Gates] │
17
- │ │
18
- │ ┌──────────────────┐ │
19
- │ │ FEEDBACK LOOP │ │
20
- │ ├──────────────────┤ │
21
- │ │ Gate Check │ │
22
- │ │ Learn from Fail │ │
23
- │ │ Store Memory │ │
24
- │ └──────────────────┘ │
25
- └─────────────────────────────────────────────────────────────────────┘
26
- ```
27
-
28
- **Date:** 2025-12-25
29
- **Agent:** BlueForest
30
- **Cell:** opencode-swarm-plugin--ys7z8-mjlk7jsilk9
31
-
32
- ---
33
-
34
- ## Executive Summary
35
-
36
- The eval infrastructure is a **progressive quality control system** that captures real execution data, scores it against quality criteria, and enforces adaptive gates based on data maturity. The architecture follows a clean pipeline: **CAPTURE → STORE → LOAD → EVAL → GATE → LEARN**.
37
-
38
- **Key strengths:**
39
- - Clear separation of concerns (loaders, scorers, evals)
40
- - Progressive gates prevent premature failures
41
- - Real data integration (not just synthetic fixtures)
42
- - Learning feedback loop (regressions → semantic memory)
43
-
44
- **Key issues identified:**
45
- 1. **Data loader abstraction leak** - Loaders know too much about storage format
46
- 2. **Scorer composition complexity** - Composite scorers have brittle async patterns
47
- 3. **Fixture vs real data switching** - Implicit fallback logic scattered in eval files
48
- 4. **Session filtering buried in loader** - Quality criteria hardcoded in data-loader.ts
49
- 5. **No eval versioning** - Schema changes could break historical data
50
-
51
- ---
52
-
53
- ## Component Architecture
54
-
55
- ### 1. Data Capture (`src/eval-capture.ts`)
56
-
57
- **Purpose:** Automatically capture real execution data during swarm runs.
58
-
59
- **Event Types:**
60
- - `DECISION` - Coordinator decisions (strategy selected, worker spawned, review completed)
61
- - `VIOLATION` - Protocol violations (edited files, ran tests, reserved files)
62
- - `OUTCOME` - Task outcomes (success, retry, failure, epic complete)
63
- - `COMPACTION` - Context compaction lifecycle (detection, prompt generation, resumption)
64
-
65
- **Storage:**
66
- - **Sessions:** `~/.config/swarm-tools/sessions/{session-id}.jsonl` (append-only JSONL)
67
- - **Eval Records:** PGlite `eval_records` table (via swarm-mail)
68
- - **History:** `.opencode/eval-history.jsonl` (local project)
69
-
70
- **Schema:** Zod discriminated union (`CoordinatorEventSchema`) - type-safe with exhaustive checks.
71
-
72
- **Capture points:**
73
- - `swarm_decompose` - Captures strategy, decomposition
74
- - `swarm_complete` - Captures outcomes (duration, errors, retries)
75
- - Tool call inspection - Real-time violation detection via pattern matching
76
- - Compaction hook - Lifecycle tracking
77
-
78
- **Strengths:**
79
- - Zod validation prevents garbage data
80
- - JSONL format is append-only, fault-tolerant, streamable
81
- - Discriminated union makes event types exhaustive
82
-
83
- **Issues:**
84
- - **No schema versioning** - Future schema changes could break old data
85
- - **Session directory hardcoded** - `~/.config/swarm-tools/sessions/` not configurable per project
86
-
87
- ---
88
-
89
- ### 2. Data Loaders (`evals/lib/`)
90
-
91
- #### `data-loader.ts` - PGlite + Session Loader
92
-
93
- **Purpose:** Load real data from PGlite (`eval_records`) and session JSONL files.
94
-
95
- **Key functions:**
96
- - `loadEvalCases()` - Query PGlite for decomposition eval records
97
- - `loadCapturedSessions()` - Read coordinator sessions from JSONL
98
- - `hasRealEvalData()` - Check if enough data exists for real eval
99
- - `getEvalDataSummary()` - Stats for reporting
100
-
101
- **Session Quality Filters:**
102
- ```typescript
103
- {
104
- minEvents: 3, // Filter incomplete sessions
105
- requireWorkerSpawn: true, // Ensure delegation happened
106
- requireReview: true, // Ensure coordinator reviewed work
107
- }
108
- ```
109
-
110
- **Strengths:**
111
- - Quality filters reduce noise (only 3/100 sessions passed in coordinator-session eval)
112
- - Stats functions provide transparency (logs which data source is used)
113
-
114
- **Issues:**
115
- 1. **Abstraction leak** - Loader knows about PGlite internals AND JSONL format
116
- - Should have separate `PGliteEvalSource` and `JsonlEvalSource` adapters
117
- 2. **Quality criteria hardcoded** - Filters baked into loader, not configurable at call site
118
- - `requireReview: true` prevents testing coordinators who skip reviews
119
- 3. **Transform logic mixed with loading** - `meetsQualityCriteria()` is business logic, not I/O
120
- 4. **No data versioning** - Can't handle schema evolution (what if event types change?)
121
-
122
- **Recommendation:**
123
- ```typescript
124
- // Separate concerns
125
- interface EvalSource {
126
- load(filters: EvalFilters): Promise<EvalCase[]>;
127
- stats(): Promise<EvalStats>;
128
- }
129
-
130
- class PGliteEvalSource implements EvalSource { /* ... */ }
131
- class JsonlSessionSource implements EvalSource { /* ... */ }
132
-
133
- // Make filters first-class
134
- type SessionFilter = (session: CoordinatorSession) => boolean;
135
- const filters = {
136
- minEvents: (n: number) => (s) => s.events.length >= n,
137
- requireWorkerSpawn: (s) => s.events.some(e => e.decision_type === "worker_spawned"),
138
- compose: (...fns) => (s) => fns.every(f => f(s)),
139
- };
140
- ```
141
-
142
- #### `compaction-loader.ts` - COMPACTION Event Loader
143
-
144
- **Purpose:** Load COMPACTION events from session JSONL files for compaction-prompt eval.
145
-
146
- **Key functions:**
147
- - `loadCompactionEvents()` - Stream COMPACTION events with early termination
148
- - `loadCompactionSessions()` - Group events by session_id
149
- - `loadDefaultCompaction*()` - Convenience wrappers for default session dir
150
-
151
- **Features:**
152
- - **Lazy loading** - Streams large files line-by-line (avoids memory bloat)
153
- - **Early termination** - Stops reading when limit reached
154
- - **Graceful errors** - Skips invalid lines, logs warnings
155
-
156
- **Strengths:**
157
- - Clean single-responsibility (only COMPACTION events)
158
- - Performance-conscious (streaming for large datasets)
159
- - Type-safe with discriminated union extraction
160
-
161
- **Issues:**
162
- 1. **Streaming threshold arbitrary** - `limit < 100` triggers streaming - why 100?
163
- - Should stream by file size, not result limit
164
- 2. **Duplicate logic** - `parseLine()` duplicated between loaders
165
- - Should be shared utility in `eval-capture.ts`
166
- 3. **No pagination** - Returns all matches up to limit, can't resume
167
- - Real-world use case: "Load next 10 sessions" for UI
168
-
169
- **Recommendation:**
170
- ```typescript
171
- // Shared utilities in eval-capture.ts
172
- export function parseEventLine(line: string): CoordinatorEvent | null;
173
- export function* streamEvents(filePath: string): Generator<CoordinatorEvent>;
174
-
175
- // Pagination support
176
- interface PaginatedResult<T> {
177
- data: T[];
178
- cursor: string | null; // file:line for resumption
179
- hasMore: boolean;
180
- }
181
- ```
182
-
183
- #### `llm.ts` - LLM Client for Evals
184
-
185
- **Purpose:** Generate decompositions via LLM for testing (swarm-decomposition eval).
186
-
187
- **Key functions:**
188
- - `generateDecomposition()` - Call Claude via AI SDK + Vercel Gateway
189
- - `formatDecompositionPrompt()` - Template prompt for decomposition
190
- - `extractJson()` - Parse JSON from LLM responses (handles markdown wrapping)
191
-
192
- **Gateway pattern:**
193
- ```typescript
194
- const { text } = await generateText({
195
- model: gateway("anthropic/claude-sonnet-4-5"),
196
- prompt,
197
- maxOutputTokens: 4096,
198
- });
199
- ```
200
-
201
- **Strengths:**
202
- - Gateway abstraction hides provider details (just pass "provider/model")
203
- - JSON extraction handles markdown code blocks (common LLM quirk)
204
- - Prompt template matches production `swarm_plan_prompt`
205
-
206
- **Issues:**
207
- 1. **No retry logic** - Single LLM call, no fallback on failure
208
- - Network errors or rate limits fail entire eval run
209
- 2. **Hardcoded model** - `DEFAULT_MODEL` not overridable at runtime
210
- - Can't test with different models without code change
211
- 3. **No response caching** - Repeated eval runs re-generate same decompositions
212
- - Wastes $ and time for deterministic inputs
213
-
214
- **Recommendation:**
215
- ```typescript
216
- // Retry wrapper
217
- export async function generateWithRetry(
218
- prompt: string,
219
- options?: { model?: GatewayModelId; retries?: number; cache?: boolean }
220
- ): Promise<string>;
221
-
222
- // Cache layer
223
- const cacheKey = hash(prompt + model);
224
- if (cache.has(cacheKey)) return cache.get(cacheKey);
225
- ```
226
-
227
- ---
228
-
229
- ### 3. Scorers (`evals/scorers/`)
230
-
231
- **Purpose:** Score eval outputs against quality criteria. Return `{ score: 0-1, message: string }`.
232
-
233
- **Evalite pattern:**
234
- ```typescript
235
- export const myScorer = createScorer({
236
- name: "My Scorer",
237
- description: "What it measures",
238
- scorer: async ({ output, expected, input }) => {
239
- return { score: 0.8, message: "Details" };
240
- },
241
- });
242
- ```
243
-
244
- #### Scorer Categories
245
-
246
- | File | Scorers | What They Measure |
247
- |------|---------|-------------------|
248
- | `index.ts` | `subtaskIndependence`, `coverageCompleteness`, `instructionClarity`, `decompositionCoherence` | Decomposition quality |
249
- | `coordinator-discipline.ts` | `violationCount`, `spawnEfficiency`, `reviewThoroughness`, `timeToFirstSpawn`, `overallDiscipline` | Coordinator protocol adherence |
250
- | `compaction-scorers.ts` | `confidenceAccuracy`, `contextInjectionCorrectness`, `requiredPatternsPresent`, `forbiddenPatternsAbsent`, `compactionQuality` | Compaction correctness |
251
- | `compaction-prompt-scorers.ts` | `epicIdSpecificity`, `actionability`, `coordinatorIdentity`, `forbiddenToolsPresent`, `postCompactionDiscipline` | Continuation prompt quality |
252
- | `outcome-scorers.ts` | `executionSuccess`, `timeBalance`, `scopeAccuracy`, `scopeDrift`, `noRework` | Real execution outcomes |
253
-
254
- **Composite Scorer Pattern:**
255
- ```typescript
256
- export const overallDiscipline = createScorer({
257
- name: "Overall Discipline",
258
- description: "Weighted composite of all discipline scorers",
259
- scorer: async ({ output, expected, input }) => {
260
- // Call child scorers
261
- const violations = await violationCount({ output, expected, input });
262
- const spawn = await spawnEfficiency({ output, expected, input });
263
- const review = await reviewThoroughness({ output, expected, input });
264
- const time = await timeToFirstSpawn({ output, expected, input });
265
-
266
- // Weighted average
267
- const score =
268
- (violations.score ?? 0) * 0.30 +
269
- (spawn.score ?? 0) * 0.25 +
270
- (review.score ?? 0) * 0.25 +
271
- (time.score ?? 0) * 0.20;
272
-
273
- return { score, message: "Composite score" };
274
- },
275
- });
276
- ```
277
-
278
- **Strengths:**
279
- - Clear single-responsibility (each scorer tests one thing)
280
- - Composite scorers enable weighted evaluation
281
- - Type-safe with Zod schemas for output parsing
282
- - Null-safe scoring (`score ?? 0` handles scorer failures gracefully)
283
-
284
- **Issues:**
285
- 1. **Async composition fragility** - Must `await` each child scorer
286
- - Easy to forget, causes `Promise<Score>` type errors
287
- - Semantic memory shows this bit TWO files recently
288
- 2. **No scorer versioning** - Scorer logic changes invalidate historical comparisons
289
- - Can't tell if score dropped due to regression or scorer change
290
- 3. **Hardcoded weights** - `0.30`, `0.25`, etc. not configurable
291
- - Can't experiment with different weight profiles
292
- 4. **LLM-as-judge cost** - `decompositionCoherence` calls Claude for each test case
293
- - No cost controls or budgets
294
- - No fallback if LLM fails
295
-
296
- **Recommendation:**
297
- ```typescript
298
- // Versioned scorers
299
- export const violationCount_v1 = createScorer({ /* ... */ });
300
- export const violationCount_v2 = createScorer({ /* ... */ });
301
-
302
- // Configurable weights
303
- export function createOverallDiscipline(weights: {
304
- violations: number;
305
- spawn: number;
306
- review: number;
307
- time: number;
308
- }) { /* ... */ }
309
-
310
- // LLM budget
311
- const JUDGE_BUDGET = { maxCalls: 100, maxCost: 1.00 };
312
- ```
313
-
314
- ---
315
-
316
- ### 4. Eval Files (`evals/*.eval.ts`)
317
-
318
- **Purpose:** Define eval test suites using Evalite framework.
319
-
320
- **Pattern:**
321
- ```typescript
322
- evalite("Eval Name", {
323
- data: async () => [...testCases],
324
- task: async (input) => /* generate output */,
325
- scorers: [scorer1, scorer2, ...],
326
- });
327
- ```
328
-
329
- #### Eval Suites
330
-
331
- | File | Data Source | Task | Scorers |
332
- |------|-------------|------|---------|
333
- | `swarm-decomposition.eval.ts` | PGlite or fixtures | LLM generates decomposition | Independence, coverage, clarity, coherence |
334
- | `coordinator-session.eval.ts` | Session JSONL or fixtures | Identity (session as JSON) | Violations, spawn, review, time, discipline |
335
- | `compaction-prompt.eval.ts` | Fixtures only | Identity (fixture prompts) | Epic ID, actionability, identity, tools, discipline |
336
- | `compaction-resumption.eval.ts` | Compaction events | Compaction logic | Confidence, injection, patterns, quality |
337
-
338
- **Data Source Switching:**
339
- ```typescript
340
- const useRealData = await hasRealEvalData(PROJECT_KEY, 5, PROJECT_PATH);
341
- const evalCases = useRealData
342
- ? await loadEvalCases(PROJECT_KEY, { limit: 20, projectPath: PROJECT_PATH })
343
- : decompositionCases.map((testCase) => ({ input: testCase.input, expected: testCase.expected }));
344
- ```
345
-
346
- **Strengths:**
347
- - Progressive data source (fixtures → real data as it accumulates)
348
- - Transparency (logs which source is used)
349
- - Multiple test suites per eval file (edge cases, perfect vs bad, etc.)
350
-
351
- **Issues:**
352
- 1. **Data source logic duplicated** - Every eval file has same `hasRealEvalData` check
353
- - Should be abstracted into data loader
354
- 2. **Hard limit of 20 cases** - `limit: 20` hardcoded
355
- - No way to run full dataset locally
356
- 3. **No eval parameterization** - Can't run same eval with different configs
357
- - E.g., "test with max_subtasks=4" vs "max_subtasks=8"
358
- 4. **Identity task for fixtures** - `task: async (input) => JSON.stringify(input)` is wasteful
359
- - Fixtures already have output, no need to "generate" it
360
- - Should have `FixtureEval` vs `GenerativeEval` types
361
-
362
- **Recommendation:**
363
- ```typescript
364
- // Data source abstraction
365
- const dataSource = await selectDataSource(PROJECT_KEY, {
366
- preferReal: true,
367
- fallbackToFixtures: true,
368
- limit: process.env.CI ? 5 : undefined, // Full dataset locally, sample in CI
369
- });
370
-
371
- // Eval parameterization
372
- evalite.parameterize("Decomposition Quality", {
373
- params: [
374
- { maxSubtasks: 4, strategy: "file-based" },
375
- { maxSubtasks: 8, strategy: "feature-based" },
376
- ],
377
- data: async ({ maxSubtasks, strategy }) => /* ... */,
378
- });
379
- ```
380
-
381
- ---
382
-
383
- ### 5. Progressive Gates (`src/eval-gates.ts`)
384
-
385
- **Purpose:** Enforce quality gates based on eval maturity phase.
386
-
387
- **Phases:**
388
- - **Bootstrap (<10 runs):** Always pass, collect baseline data
389
- - **Stabilization (10-50 runs):** Warn on >10% regression (default), but pass
390
- - **Production (>50 runs + variance <0.1):** Fail on >5% regression (default)
391
-
392
- **Gate Logic:**
393
- ```typescript
394
- export function checkGate(
395
- projectPath: string,
396
- evalName: string,
397
- currentScore: number,
398
- config?: GateConfig
399
- ): GateResult {
400
- const phase = getPhase(projectPath, evalName);
401
- const history = getScoreHistory(projectPath, evalName);
402
- const baseline = calculateBaseline(history, currentScore);
403
- const regressionPercent = (baseline - currentScore) / baseline;
404
-
405
- // Phase-specific thresholds
406
- if (phase === "bootstrap") return { passed: true, ... };
407
- if (phase === "stabilization") return { passed: true, warn: regressionPercent > 0.10, ... };
408
- if (phase === "production") return { passed: regressionPercent <= 0.05, ... };
409
- }
410
- ```
411
-
412
- **Variance Threshold:**
413
- - High variance (≥0.1) keeps eval in stabilization even with >50 runs
414
- - Prevents premature production gates when scores unstable
415
- - Current issue: coordinator-session has high variance (only 3/100 sessions pass filters)
416
-
417
- **Strengths:**
418
- - Adaptive thresholds prevent premature failures
419
- - Variance check prevents false confidence
420
- - Configurable thresholds per eval
421
-
422
- **Issues:**
423
- 1. **Baseline calculation naive** - Simple mean of all scores
424
- - Doesn't handle outliers or trends
425
- - Early bad runs drag down baseline forever
426
- 2. **No time-based decay** - Old scores weighted equally with new
427
- - Eval improvements don't raise baseline fast enough
428
- 3. **No CI/PR integration hooks** - Gates check but don't post results
429
- - Documented in README but not implemented
430
- 4. **Variance threshold magic number** - 0.1 chosen arbitrarily
431
- - Should be configurable or derived from data
432
-
433
- **Recommendation:**
434
- ```typescript
435
- // Weighted baseline (recent scores matter more)
436
- function calculateWeightedBaseline(
437
- history: EvalRunRecord[],
438
- decayFactor: number = 0.9 // Recent = 1.0, older = 0.9^n
439
- ): number;
440
-
441
- // Outlier-resistant baseline (median or trimmed mean)
442
- function calculateRobustBaseline(
443
- history: EvalRunRecord[],
444
- trimPercent: number = 0.1 // Trim top/bottom 10%
445
- ): number;
446
-
447
- // CI posting
448
- export function postGateResultToGitHub(
449
- result: GateResult,
450
- prNumber: number,
451
- repo: string
452
- ): Promise<void>;
453
- ```
454
-
455
- ---
456
-
457
- ### 6. Learning Feedback Loop (`src/eval-learning.ts`)
458
-
459
- **Purpose:** Automatically store eval failures to semantic memory for learning.
460
-
461
- **Trigger:** Score drops >15% (configurable) from rolling average baseline.
462
-
463
- **Flow:**
464
- ```typescript
465
- const result = await learnFromEvalFailure(
466
- evalName,
467
- currentScore,
468
- history,
469
- memoryAdapter,
470
- { config: { dropThreshold: 0.15, windowSize: 5 } }
471
- );
472
-
473
- if (result.triggered) {
474
- // Stored to semantic-memory with tags:
475
- // - "eval-failure"
476
- // - "{eval-name}"
477
- // - "regression"
478
- }
479
- ```
480
-
481
- **Stored Context:**
482
- - Eval name
483
- - Baseline score (rolling average)
484
- - Current score
485
- - Drop percentage
486
- - Timestamp
487
- - Optional scorer details (which scorer failed)
488
-
489
- **Strengths:**
490
- - Automatic detection (no manual annotation)
491
- - Rolling average baseline (more stable than last-run comparison)
492
- - Configurable sensitivity (threshold + window size)
493
- - Structured metadata for querying
494
-
495
- **Issues:**
496
- 1. **No retrieval integration** - Memories stored but not queried before eval runs
497
- - Should inject past failures into LLM prompts for context
498
- 2. **No failure analysis** - Stores "score dropped" but not "why"
499
- - Should include which test cases failed, what changed
500
- 3. **No auto-remediation** - Human must read memory and act
501
- - Could auto-generate hypotheses or suggested fixes
502
- 4. **Memory pollution risk** - Noisy evals create spam memories
503
- - Should require multiple consecutive drops before storing
504
-
505
- **Recommendation:**
506
- ```typescript
507
- // Retrieval hook
508
- export async function queryEvalFailures(
509
- evalName: string,
510
- memoryAdapter: MemoryAdapter
511
- ): Promise<Memory[]> {
512
- return memoryAdapter.find({
513
- query: evalName,
514
- tags: ["eval-failure", "regression"],
515
- limit: 5,
516
- });
517
- }
518
-
519
- // Failure analysis
520
- export function analyzeFailure(
521
- evalName: string,
522
- currentRun: EvalResult,
523
- previousRun: EvalResult
524
- ): FailureAnalysis {
525
- // Diff test cases, scorer outputs, etc.
526
- }
527
-
528
- // Spam prevention
529
- if (recentDrops.length >= 3) {
530
- // Only store if consistent regression
531
- storeMemory();
532
- }
533
- ```
534
-
535
- ---
536
-
537
- ## Data Flow Architecture
538
-
539
- ### Capture Flow
540
-
541
- ```
542
- ┌─────────────────────────────────────────────────────────────┐
543
- │ REAL-TIME CAPTURE │
544
- ├─────────────────────────────────────────────────────────────┤
545
- │ │
546
- │ 1. Coordinator calls swarm tool │
547
- │ ├─ swarm_decompose(task="Add auth") │
548
- │ ├─ swarm_spawn_subtask(bead_id="bd-123.1") │
549
- │ └─ swarm_review(task_id="bd-123.1") │
550
- │ │
551
- │ 2. Tool execution │
552
- │ ├─ planning-guardrails.ts detects violations │
553
- │ │ (pattern matching on tool name + args) │
554
- │ └─ eval-capture.ts emits events │
555
- │ │
556
- │ 3. Event storage │
557
- │ ├─ Session JSONL: ~/.config/swarm-tools/sessions/... │
558
- │ ├─ PGlite: eval_records table │
559
- │ └─ History: .opencode/eval-history.jsonl │
560
- │ │
561
- └─────────────────────────────────────────────────────────────┘
562
- ```
563
-
564
- **Key characteristic:** Capture is **passive** - no manual instrumentation needed. Tool calls are inspected in real-time.
565
-
566
- ### Load → Eval Flow
567
-
568
- ```
569
- ┌─────────────────────────────────────────────────────────────┐
570
- │ EVAL EXECUTION │
571
- ├─────────────────────────────────────────────────────────────┤
572
- │ │
573
- │ 1. Data Loading │
574
- │ ├─ Check: hasRealEvalData(projectKey, minRecords=5) │
575
- │ ├─ If true: loadEvalCases(projectKey, limit=20) │
576
- │ └─ If false: Use fixtures (decomposition-cases.ts) │
577
- │ │
578
- │ 2. Task Execution │
579
- │ ├─ Generative: LLM generates decomposition │
580
- │ └─ Identity: Fixture data as-is (JSON.stringify) │
581
- │ │
582
- │ 3. Scoring │
583
- │ ├─ Parse output (JSON, Zod validation) │
584
- │ ├─ Run scorers in parallel (async composition) │
585
- │ └─ Composite scorer: weighted average │
586
- │ │
587
- │ 4. Gate Check │
588
- │ ├─ getPhase(projectPath, evalName) │
589
- │ ├─ calculateBaseline(history) │
590
- │ ├─ calculateRegression(baseline, currentScore) │
591
- │ └─ Return GateResult { passed, phase, message } │
592
- │ │
593
- │ 5. Learning │
594
- │ ├─ isSignificantDrop(current, baseline, threshold) │
595
- │ ├─ If true: storeMemory(evalName, context, tags) │
596
- │ └─ Return LearningResult { triggered, memory_id } │
597
- │ │
598
- └─────────────────────────────────────────────────────────────┘
599
- ```
600
-
601
- **Key characteristic:** Load flow has **implicit fallback** (real data → fixtures). This is scattered across eval files, not centralized.
602
-
603
- ---
604
-
605
- ## Structural Issues & Recommendations
606
-
607
- ### Issue 1: Data Loader Abstraction Leak
608
-
609
- **Problem:** `data-loader.ts` knows about PGlite internals AND JSONL format. Violates single-responsibility.
610
-
611
- **Impact:**
612
- - Hard to test (mocking requires PGlite + file I/O)
613
- - Hard to extend (adding CSV source requires modifying data-loader.ts)
614
- - Tight coupling to storage format
615
-
616
- **Solution:**
617
- ```typescript
618
- // Define source interface
619
- interface EvalSource<T> {
620
- load(filters: FilterSpec): Promise<T[]>;
621
- stats(): Promise<SourceStats>;
622
- }
623
-
624
- // Implement sources
625
- class PGliteDecompositionSource implements EvalSource<EvalCase> { /* ... */ }
626
- class JsonlSessionSource implements EvalSource<CoordinatorSession> { /* ... */ }
627
- class FixtureSource<T> implements EvalSource<T> { /* ... */ }
628
-
629
- // Compose in eval files
630
- const source = await selectSource<EvalCase>({
631
- preferReal: new PGliteDecompositionSource(projectKey),
632
- fallback: new FixtureSource(decompositionCases),
633
- minRecords: 5,
634
- });
635
-
636
- const data = await source.load({ limit: 20 });
637
- ```
638
-
639
- **Benefits:**
640
- - Sources testable in isolation
641
- - Easy to add new sources (S3, API, etc.)
642
- - Explicit fallback strategy (not hardcoded)
643
-
644
- ### Issue 2: Session Quality Filters Hardcoded
645
-
646
- **Problem:** Quality criteria baked into `loadCapturedSessions()` - can't test coordinators who skip reviews.
647
-
648
- **Impact:**
649
- - Only 3/100 sessions passed filters in coordinator-session eval
650
- - Can't experiment with different filter profiles
651
- - Hidden filtering (caller doesn't control criteria)
652
-
653
- **Solution:**
654
- ```typescript
655
- // Make filters first-class, composable
656
- type SessionFilter = (session: CoordinatorSession) => boolean;
657
-
658
- const filters = {
659
- minEvents: (n: number): SessionFilter => (s) => s.events.length >= n,
660
- requireWorkerSpawn: (s) => s.events.some(e => e.decision_type === "worker_spawned"),
661
- requireReview: (s) => s.events.some(e => e.decision_type === "review_completed"),
662
- compose: (...fns: SessionFilter[]): SessionFilter => (s) => fns.every(f => f(s)),
663
- };
664
-
665
- // Explicit filtering at call site
666
- const sessions = await loadCapturedSessions({
667
- filter: filters.compose(
668
- filters.minEvents(3),
669
- filters.requireWorkerSpawn
670
- // Note: NOT requiring review for this test
671
- ),
672
- limit: 20,
673
- });
674
- ```
675
-
676
- **Benefits:**
677
- - Caller controls filtering (explicit, testable)
678
- - Easy to add new filters (no loader modification)
679
- - Can test partial compliance (e.g., "spawn but no review")
680
-
681
- ### Issue 3: No Scorer Versioning
682
-
683
- **Problem:** Scorer logic changes invalidate historical comparisons. Can't tell if score dropped due to regression or scorer change.
684
-
685
- **Impact:**
686
- - "Score dropped 15%" - was it code regression or stricter scoring?
687
- - Can't experiment with scorer improvements (breaks history)
688
- - No rollback if new scorer is too strict
689
-
690
- **Solution:**
691
- ```typescript
692
- // Version scorers with metadata
693
- export const subtaskIndependence_v1 = createScorer({
694
- name: "Subtask Independence",
695
- version: "1.0.0",
696
- description: "...",
697
- scorer: ({ output }) => { /* original logic */ },
698
- });
699
-
700
- export const subtaskIndependence_v2 = createScorer({
701
- name: "Subtask Independence",
702
- version: "2.0.0",
703
- description: "...",
704
- changes: "Added semantic file conflict detection",
705
- scorer: ({ output }) => { /* improved logic */ },
706
- });
707
-
708
- // Track scorer version in history
709
- interface EvalRunRecord {
710
- timestamp: string;
711
- eval_name: string;
712
- score: number;
713
- scorer_versions: Record<string, string>; // { "subtaskIndependence": "2.0.0" }
714
- }
715
-
716
- // Baseline calculation only uses compatible runs
717
- function calculateBaseline(history: EvalRunRecord[], scorerVersions: Record<string, string>): number {
718
- const compatible = history.filter(run =>
719
- Object.entries(scorerVersions).every(([name, version]) =>
720
- run.scorer_versions[name] === version
721
- )
722
- );
723
- return mean(compatible.map(r => r.score));
724
- }
725
- ```
726
-
727
- **Benefits:**
728
- - Can improve scorers without breaking history
729
- - Clear attribution of score changes
730
- - Can A/B test new scorers against old
731
-
732
- ### Issue 4: LLM-as-Judge Has No Budget
733
-
734
- **Problem:** `decompositionCoherence` calls Claude for every test case. No cost controls.
735
-
736
- **Impact:**
737
- - Eval run cost unbounded (20 cases × $0.01/call = $0.20+)
738
- - Network failures fail entire eval
739
- - Slow eval runs (LLM latency)
740
-
741
- **Solution:**
742
- ```typescript
743
- // Budget enforcement
744
- const JUDGE_BUDGET = {
745
- maxCalls: 100,
746
- maxCost: 1.00, // USD
747
- maxLatency: 5000, // ms per call
748
- };
749
-
750
- let usedCalls = 0;
751
- let usedCost = 0;
752
-
753
- export const decompositionCoherence = createScorer({
754
- scorer: async ({ output, input }) => {
755
- // Check budget
756
- if (usedCalls >= JUDGE_BUDGET.maxCalls) {
757
- return { score: null, message: "Budget exhausted (max calls)" };
758
- }
759
- if (usedCost >= JUDGE_BUDGET.maxCost) {
760
- return { score: null, message: "Budget exhausted (max cost)" };
761
- }
762
-
763
- try {
764
- const { text, usage } = await generateText({ /* ... */ });
765
-
766
- // Track usage
767
- usedCalls++;
768
- usedCost += estimateCost(usage);
769
-
770
- // ... scoring logic
771
- } catch (error) {
772
- // Fallback to heuristic score
773
- return { score: 0.5, message: "LLM judge failed, using fallback" };
774
- }
775
- },
776
- });
777
- ```
778
-
779
- **Benefits:**
780
- - Predictable costs (budget enforced)
781
- - Graceful degradation (fallback on failure)
782
- - Fast feedback (skip LLM in CI, use for deep analysis locally)
783
-
784
- ### Issue 5: Baseline Calculation Too Naive
785
-
786
- **Problem:** Simple mean of all scores. Early bad runs drag down baseline forever. No time-based decay.
787
-
788
- **Impact:**
789
- - Baseline stagnates (old scores weighted equally with new)
790
- - Improvements don't raise baseline fast enough
791
- - Outliers distort baseline
792
-
793
- **Solution:**
794
- ```typescript
795
- // Exponential moving average (recent scores matter more)
796
- function calculateEMA(
797
- history: EvalRunRecord[],
798
- alpha: number = 0.2 // Smoothing factor (0.2 = 20% weight to new value)
799
- ): number {
800
- if (history.length === 0) return 0;
801
-
802
- let ema = history[0].score;
803
- for (let i = 1; i < history.length; i++) {
804
- ema = alpha * history[i].score + (1 - alpha) * ema;
805
- }
806
- return ema;
807
- }
808
-
809
- // Trimmed mean (remove outliers)
810
- function calculateTrimmedMean(
811
- history: EvalRunRecord[],
812
- trimPercent: number = 0.1 // Trim top/bottom 10%
813
- ): number {
814
- const sorted = history.map(r => r.score).sort((a, b) => a - b);
815
- const trimCount = Math.floor(sorted.length * trimPercent);
816
- const trimmed = sorted.slice(trimCount, sorted.length - trimCount);
817
- return mean(trimmed);
818
- }
819
-
820
- // Let caller choose baseline strategy
821
- type BaselineStrategy = "mean" | "ema" | "trimmed-mean" | "median";
822
-
823
- function calculateBaseline(
824
- history: EvalRunRecord[],
825
- strategy: BaselineStrategy = "ema"
826
- ): number {
827
- switch (strategy) {
828
- case "mean": return mean(history.map(r => r.score));
829
- case "ema": return calculateEMA(history);
830
- case "trimmed-mean": return calculateTrimmedMean(history);
831
- case "median": return median(history.map(r => r.score));
832
- }
833
- }
834
- ```
835
-
836
- **Benefits:**
837
- - Baseline adapts to improvements (EMA)
838
- - Robust to outliers (trimmed mean, median)
839
- - Configurable per eval (some need stability, others need responsiveness)
840
-
841
- ### Issue 6: No Eval Parameterization
842
-
843
- **Problem:** Can't run same eval with different configs (e.g., max_subtasks=4 vs max_subtasks=8). Must copy-paste eval file.
844
-
845
- **Impact:**
846
- - Duplication (multiple eval files for slight variations)
847
- - Can't grid search optimal params
848
- - Hard to compare strategies side-by-side
849
-
850
- **Solution:**
851
- ```typescript
852
- // Parameterized evals
853
- evalite.parameterize("Decomposition Quality", {
854
- params: [
855
- { maxSubtasks: 4, strategy: "file-based" },
856
- { maxSubtasks: 4, strategy: "feature-based" },
857
- { maxSubtasks: 8, strategy: "file-based" },
858
- { maxSubtasks: 8, strategy: "feature-based" },
859
- ],
860
- data: async ({ maxSubtasks, strategy }) =>
861
- loadEvalCases(PROJECT_KEY, { strategy, limit: 20 }),
862
- task: async (input, { maxSubtasks }) => {
863
- const prompt = formatDecompositionPrompt(input.task, input.context, maxSubtasks);
864
- return await generateDecomposition(prompt);
865
- },
866
- scorers: [subtaskIndependence, coverageCompleteness],
867
- });
868
- ```
869
-
870
- **Benefits:**
871
- - Single source of truth (DRY)
872
- - Easy to add new params (no file duplication)
873
- - Results grouped for comparison
874
-
875
- ---
876
-
877
- ## Performance Characteristics
878
-
879
- ### Eval Execution Times (Estimated)
880
-
881
- | Eval | Data Source | Task | Scorers | Time/Case | Total (20 cases) |
882
- |------|-------------|------|---------|-----------|------------------|
883
- | `swarm-decomposition` | PGlite | LLM call | 4 (1 LLM judge) | ~3-5s | ~60-100s |
884
- | `coordinator-session` | JSONL | Identity | 5 | ~10ms | ~200ms |
885
- | `compaction-prompt` | Fixtures | Identity | 5 | ~5ms | ~100ms |
886
- | `compaction-resumption` | JSONL | Logic | 4 | ~20ms | ~400ms |
887
-
888
- **Bottlenecks:**
889
- 1. **LLM calls** - `decompositionCoherence` dominates swarm-decomposition time
890
- 2. **PGlite queries** - Network RTT if using remote DB
891
- 3. **JSONL parsing** - Linear scan of all session files (could be indexed)
892
-
893
- **Optimization opportunities:**
894
- 1. **Parallel LLM calls** - Run test cases concurrently (10 parallel = 10x faster)
895
- 2. **Response caching** - Cache LLM responses by prompt hash
896
- 3. **Session indexing** - SQLite index on session_id, epic_id for fast lookup
897
- 4. **Incremental evals** - Only test changed cases (git diff → affected evals)
898
-
899
- ---
900
-
901
- ## Integration Points
902
-
903
- ### 1. Swarm Tools → Capture
904
-
905
- **File:** `src/eval-capture.ts`
906
-
907
- **Hook points:**
908
- - `swarm_decompose()` → `captureDecompositionEvent()`
909
- - `swarm_complete()` → `captureOutcomeEvent()`
910
- - Tool call inspection → `detectCoordinatorViolation()` → `captureViolationEvent()`
911
- - Compaction hook → `captureCompactionEvent()`
912
-
913
- **Data validation:** Zod schemas ensure type safety at capture time.
914
-
915
- ### 2. Evalite → Loaders
916
-
917
- **File:** `evals/lib/data-loader.ts`, `evals/lib/compaction-loader.ts`
918
-
919
- **Pattern:**
920
- ```typescript
921
- evalite("Test Name", {
922
- data: async () => {
923
- const realData = await hasRealEvalData(PROJECT_KEY, 5);
924
- return realData
925
- ? await loadEvalCases(PROJECT_KEY, { limit: 20 })
926
- : fixtures;
927
- },
928
- // ...
929
- });
930
- ```
931
-
932
- **Issue:** Fallback logic duplicated across eval files. Should be abstracted.
933
-
934
- ### 3. Evalite → Gates
935
-
936
- **File:** `src/eval-gates.ts`
937
-
938
- **Pattern:**
939
- ```typescript
940
- import { checkGate } from "../src/eval-gates.js";
941
-
942
- evalite("Test", {
943
- // ... data, task, scorers
944
- onComplete: ({ score }) => {
945
- const gate = checkGate(PROJECT_PATH, "test-name", score);
946
- if (!gate.passed) {
947
- console.error(`❌ Gate failed: ${gate.message}`);
948
- process.exit(1); // Fail CI
949
- }
950
- },
951
- });
952
- ```
953
-
954
- **Issue:** No built-in integration. Must manually wire `onComplete` hook in each eval file.
955
-
956
- ### 4. Gates → Learning
957
-
958
- **File:** `src/eval-learning.ts`
959
-
960
- **Pattern:**
961
- ```typescript
962
- import { learnFromEvalFailure } from "../src/eval-learning.js";
963
-
964
- const result = await learnFromEvalFailure(
965
- evalName,
966
- currentScore,
967
- history,
968
- memoryAdapter
969
- );
970
-
971
- if (result.triggered) {
972
- console.log(`📉 Stored failure to memory: ${result.memory_id}`);
973
- }
974
- ```
975
-
976
- **Issue:** No automatic execution. Must manually call after gate check.
977
-
978
- ### 5. Learning → Prompts (Missing)
979
-
980
- **Expected flow:**
981
- ```typescript
982
- // Query failures before generating prompts
983
- const failures = await queryEvalFailures(evalName, memoryAdapter);
984
-
985
- // Inject into LLM prompt
986
- const prompt = `
987
- ${basePrompt}
988
-
989
- PAST FAILURES:
990
- ${failures.map(f => `- ${f.information}`).join("\n")}
991
-
992
- Avoid these patterns.
993
- `;
994
- ```
995
-
996
- **Status:** Not implemented. Learning loop stores but doesn't retrieve.
997
-
998
- ---
999
-
1000
- ## Testing Strategy
1001
-
1002
- ### Current Coverage
1003
-
1004
- | Component | Unit Tests | Integration Tests | E2E Tests |
1005
- |-----------|------------|-------------------|-----------|
1006
- | Data loaders | ✅ `data-loader.test.ts` | ✅ `data-loader.evalite-test.ts` | ❌ |
1007
- | Scorers | ✅ `scorers/*.evalite-test.ts` | ❌ | ❌ |
1008
- | Gates | ✅ `eval-gates.test.ts` | ❌ | ❌ |
1009
- | Learning | ✅ `eval-learning.test.ts` | ❌ | ❌ |
1010
- | Capture | ❌ | ✅ `eval-capture.integration.test.ts` | ❌ |
1011
-
1012
- **Gaps:**
1013
- - No E2E tests (full CAPTURE → EVAL → GATE → LEARN flow)
1014
- - No scorer integration tests (composition logic)
1015
- - No error path tests (what if LLM fails? PGlite down? JSONL corrupt?)
1016
-
1017
- **Recommendation:**
1018
- ```typescript
1019
- // E2E test skeleton
1020
- describe("Eval Pipeline E2E", () => {
1021
- it("should capture → load → eval → gate → learn", async () => {
1022
- // 1. Trigger capture
1023
- await swarm_decompose(task, context);
1024
-
1025
- // 2. Load data
1026
- const cases = await loadEvalCases(PROJECT_KEY);
1027
- expect(cases.length).toBeGreaterThan(0);
1028
-
1029
- // 3. Run eval
1030
- const score = await runEval(cases);
1031
-
1032
- // 4. Check gate
1033
- const gate = checkGate(PROJECT_PATH, "test", score);
1034
- expect(gate.passed).toBe(true);
1035
-
1036
- // 5. Learn from failure (if any)
1037
- const learned = await learnFromEvalFailure("test", score, history, memory);
1038
- // ... assertions
1039
- });
1040
- });
1041
- ```
1042
-
1043
- ---
1044
-
1045
- ## Improvement Roadmap
1046
-
1047
- ### Phase 1: Foundation (1-2 weeks)
1048
-
1049
- 1. **Extract data source interface** (`EvalSource<T>`)
1050
- - Refactor `data-loader.ts` into `PGliteSource`, `JsonlSource`, `FixtureSource`
1051
- - Add source selection logic to shared utility
1052
- - Update all eval files to use new interface
1053
-
1054
- 2. **Make filters first-class**
1055
- - Extract `SessionFilter` type and filter library
1056
- - Move quality criteria out of loader, into eval files
1057
- - Add filter composition utilities
1058
-
1059
- 3. **Add scorer versioning**
1060
- - Add `version` field to scorer metadata
1061
- - Track scorer versions in eval history
1062
- - Update baseline calculation to only use compatible runs
1063
-
1064
- ### Phase 2: Robustness (2-3 weeks)
1065
-
1066
- 4. **LLM judge improvements**
1067
- - Add budget enforcement (max calls, max cost)
1068
- - Add response caching (hash prompt → cache result)
1069
- - Add fallback scoring (heuristic if LLM fails)
1070
-
1071
- 5. **Baseline improvements**
1072
- - Implement EMA, trimmed mean, median strategies
1073
- - Add `BaselineStrategy` config to eval-gates
1074
- - A/B test strategies against real data
1075
-
1076
- 6. **Error handling**
1077
- - Add retry logic to LLM calls
1078
- - Graceful degradation for missing data
1079
- - Corrupt JSONL line handling (currently silent skip)
1080
-
1081
- ### Phase 3: Intelligence (3-4 weeks)
1082
-
1083
- 7. **Learning loop completion**
1084
- - Query eval failures before generating prompts
1085
- - Inject past failures into LLM context
1086
- - Auto-generate hypotheses for regressions
1087
-
1088
- 8. **Failure analysis**
1089
- - Diff scorer outputs between runs
1090
- - Identify which test cases regressed
1091
- - Surface root cause signals (scorer, data, code change)
1092
-
1093
- 9. **CI/PR integration**
1094
- - Post gate results to GitHub PR comments
1095
- - Block merge on production gate failures
1096
- - Add `swarm eval status` badge to PRs
1097
-
1098
- ### Phase 4: Scale (4-6 weeks)
1099
-
1100
- 10. **Performance optimization**
1101
- - Parallel LLM calls for test cases
1102
- - Session indexing (SQLite for fast lookup)
1103
- - Incremental evals (only run affected tests)
1104
-
1105
- 11. **Eval parameterization**
1106
- - Add `evalite.parameterize()` support
1107
- - Grid search optimal params (max_subtasks, strategy combos)
1108
- - Compare strategies side-by-side
1109
-
1110
- 12. **Observability**
1111
- - Real-time eval dashboards (Grafana + Prometheus)
1112
- - Eval run traces (OpenTelemetry)
1113
- - Cost tracking (LLM usage, storage growth)
1114
-
1115
- ---
1116
-
1117
- ## Conclusion
1118
-
1119
- The eval infrastructure is **well-designed at the macro level** (clear pipeline, progressive gates, learning loop), but has **tactical issues** that impact usability and maintainability:
1120
-
1121
- **Key strengths to preserve:**
1122
- - Progressive gates prevent premature failures
1123
- - Real data integration grounds evals in reality
1124
- - Learning loop closes the feedback cycle
1125
- - Type-safe schemas prevent garbage data
1126
-
1127
- **Critical improvements needed:**
1128
- - **Abstraction:** Extract data source interface (reduce coupling)
1129
- - **Configurability:** Make filters, baselines, budgets first-class (not hardcoded)
1130
- - **Versioning:** Track scorer versions (enable safe improvements)
1131
- - **Robustness:** Add retries, fallbacks, error handling (production-grade)
1132
-
1133
- **Impact of improvements:**
1134
- - **Developer experience:** Easier to add new evals (less boilerplate)
1135
- - **Reliability:** Evals don't fail due to transient issues (network, LLM)
1136
- - **Trust:** Score changes attributable to code (not scorer drift)
1137
- - **Cost control:** LLM budgets prevent runaway spend
1138
-
1139
- **Next steps:** Start with Phase 1 (foundation) to unblock future improvements. The architecture is sound - just needs tactical refactoring.
1140
-
1141
- ---
1142
-
1143
- ## Appendix: File Inventory
1144
-
1145
- ```
1146
- evals/
1147
- ├── README.md # User-facing docs (comprehensive)
1148
- ├── ARCHITECTURE.md # This document
1149
- ├── evalite.config.ts.bak # Minimal config (mostly defaults)
1150
-
1151
- ├── fixtures/ # Synthetic test data
1152
- │ ├── decomposition-cases.ts # Decomposition test cases
1153
- │ ├── coordinator-sessions.ts # Perfect/bad coordinator examples
1154
- │ ├── compaction-cases.ts # Compaction logic test cases
1155
- │ └── compaction-prompt-cases.ts # Continuation prompt examples
1156
-
1157
- ├── lib/ # Data loading utilities
1158
- │ ├── data-loader.ts # PGlite + JSONL session loader
1159
- │ ├── data-loader.test.ts # Unit tests
1160
- │ ├── data-loader.evalite-test.ts # Integration tests
1161
- │ ├── compaction-loader.ts # COMPACTION event loader
1162
- │ ├── compaction-loader.test.ts # Unit tests
1163
- │ └── llm.ts # LLM client (AI SDK + Gateway)
1164
-
1165
- ├── scorers/ # Quality metric implementations
1166
- │ ├── index.ts # Decomposition scorers + exports
1167
- │ ├── index.test.ts # Unit tests
1168
- │ ├── coordinator-discipline.ts # Protocol adherence scorers
1169
- │ ├── coordinator-discipline.evalite-test.ts
1170
- │ ├── compaction-scorers.ts # Compaction correctness
1171
- │ ├── compaction-prompt-scorers.ts # Prompt quality
1172
- │ ├── outcome-scorers.ts # Real execution outcomes
1173
- │ └── outcome-scorers.evalite-test.ts
1174
-
1175
- ├── swarm-decomposition.eval.ts # Decomposition quality eval
1176
- ├── coordinator-session.eval.ts # Coordinator discipline eval
1177
- ├── compaction-prompt.eval.ts # Continuation prompt quality
1178
- ├── compaction-resumption.eval.ts # Compaction correctness eval
1179
- └── example.eval.ts # Sanity check / template
1180
-
1181
- Total: 23 TypeScript files (5 evals, 4 fixtures, 6 loaders/utils, 8 scorers)
1182
- ```
1183
-
1184
- ---
1185
-
1186
- **Generated by:** BlueForest (swarm worker)
1187
- **Cell:** opencode-swarm-plugin--ys7z8-mjlk7jsilk9
1188
- **Epic:** opencode-swarm-plugin--ys7z8-mjlk7js9bt1
1189
- **Date:** 2025-12-25