opencode-swarm-plugin 0.43.0 → 0.44.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208) hide show
  1. package/bin/cass.characterization.test.ts +422 -0
  2. package/bin/swarm.serve.test.ts +6 -4
  3. package/bin/swarm.test.ts +68 -0
  4. package/bin/swarm.ts +81 -8
  5. package/dist/compaction-prompt-scoring.js +139 -0
  6. package/dist/contributor-tools.d.ts +42 -0
  7. package/dist/contributor-tools.d.ts.map +1 -0
  8. package/dist/eval-capture.js +12811 -0
  9. package/dist/hive.d.ts.map +1 -1
  10. package/dist/index.d.ts +12 -0
  11. package/dist/index.d.ts.map +1 -1
  12. package/dist/index.js +7728 -62590
  13. package/dist/plugin.js +23833 -78695
  14. package/dist/sessions/agent-discovery.d.ts +59 -0
  15. package/dist/sessions/agent-discovery.d.ts.map +1 -0
  16. package/dist/sessions/index.d.ts +10 -0
  17. package/dist/sessions/index.d.ts.map +1 -0
  18. package/dist/swarm-orchestrate.d.ts.map +1 -1
  19. package/dist/swarm-prompts.d.ts.map +1 -1
  20. package/dist/swarm-review.d.ts.map +1 -1
  21. package/package.json +17 -5
  22. package/.changeset/swarm-insights-data-layer.md +0 -63
  23. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
  24. package/.hive/analysis/session-data-quality-audit.md +0 -320
  25. package/.hive/eval-results.json +0 -483
  26. package/.hive/issues.jsonl +0 -138
  27. package/.hive/memories.jsonl +0 -729
  28. package/.opencode/eval-history.jsonl +0 -327
  29. package/.turbo/turbo-build.log +0 -9
  30. package/CHANGELOG.md +0 -2255
  31. package/SCORER-ANALYSIS.md +0 -598
  32. package/docs/analysis/subagent-coordination-patterns.md +0 -902
  33. package/docs/analysis-socratic-planner-pattern.md +0 -504
  34. package/docs/planning/ADR-001-monorepo-structure.md +0 -171
  35. package/docs/planning/ADR-002-package-extraction.md +0 -393
  36. package/docs/planning/ADR-003-performance-improvements.md +0 -451
  37. package/docs/planning/ADR-004-message-queue-features.md +0 -187
  38. package/docs/planning/ADR-005-devtools-observability.md +0 -202
  39. package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
  40. package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
  41. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
  42. package/docs/planning/ROADMAP.md +0 -368
  43. package/docs/semantic-memory-cli-syntax.md +0 -123
  44. package/docs/swarm-mail-architecture.md +0 -1147
  45. package/docs/testing/context-recovery-test.md +0 -470
  46. package/evals/ARCHITECTURE.md +0 -1189
  47. package/evals/README.md +0 -768
  48. package/evals/compaction-prompt.eval.ts +0 -149
  49. package/evals/compaction-resumption.eval.ts +0 -289
  50. package/evals/coordinator-behavior.eval.ts +0 -307
  51. package/evals/coordinator-session.eval.ts +0 -154
  52. package/evals/evalite.config.ts.bak +0 -15
  53. package/evals/example.eval.ts +0 -31
  54. package/evals/fixtures/compaction-cases.ts +0 -350
  55. package/evals/fixtures/compaction-prompt-cases.ts +0 -311
  56. package/evals/fixtures/coordinator-sessions.ts +0 -328
  57. package/evals/fixtures/decomposition-cases.ts +0 -105
  58. package/evals/lib/compaction-loader.test.ts +0 -248
  59. package/evals/lib/compaction-loader.ts +0 -320
  60. package/evals/lib/data-loader.evalite-test.ts +0 -289
  61. package/evals/lib/data-loader.test.ts +0 -345
  62. package/evals/lib/data-loader.ts +0 -281
  63. package/evals/lib/llm.ts +0 -115
  64. package/evals/scorers/compaction-prompt-scorers.ts +0 -145
  65. package/evals/scorers/compaction-scorers.ts +0 -305
  66. package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
  67. package/evals/scorers/coordinator-discipline.ts +0 -325
  68. package/evals/scorers/index.test.ts +0 -146
  69. package/evals/scorers/index.ts +0 -328
  70. package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
  71. package/evals/scorers/outcome-scorers.ts +0 -349
  72. package/evals/swarm-decomposition.eval.ts +0 -121
  73. package/examples/commands/swarm.md +0 -745
  74. package/examples/plugin-wrapper-template.ts +0 -2426
  75. package/examples/skills/hive-workflow/SKILL.md +0 -212
  76. package/examples/skills/skill-creator/SKILL.md +0 -223
  77. package/examples/skills/swarm-coordination/SKILL.md +0 -292
  78. package/global-skills/cli-builder/SKILL.md +0 -344
  79. package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
  80. package/global-skills/learning-systems/SKILL.md +0 -644
  81. package/global-skills/skill-creator/LICENSE.txt +0 -202
  82. package/global-skills/skill-creator/SKILL.md +0 -352
  83. package/global-skills/skill-creator/references/output-patterns.md +0 -82
  84. package/global-skills/skill-creator/references/workflows.md +0 -28
  85. package/global-skills/swarm-coordination/SKILL.md +0 -995
  86. package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
  87. package/global-skills/swarm-coordination/references/strategies.md +0 -138
  88. package/global-skills/system-design/SKILL.md +0 -213
  89. package/global-skills/testing-patterns/SKILL.md +0 -430
  90. package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
  91. package/opencode-swarm-plugin-0.30.7.tgz +0 -0
  92. package/opencode-swarm-plugin-0.31.0.tgz +0 -0
  93. package/scripts/cleanup-test-memories.ts +0 -346
  94. package/scripts/init-skill.ts +0 -222
  95. package/scripts/migrate-unknown-sessions.ts +0 -349
  96. package/scripts/validate-skill.ts +0 -204
  97. package/src/agent-mail.ts +0 -1724
  98. package/src/anti-patterns.test.ts +0 -1167
  99. package/src/anti-patterns.ts +0 -448
  100. package/src/compaction-capture.integration.test.ts +0 -257
  101. package/src/compaction-hook.test.ts +0 -838
  102. package/src/compaction-hook.ts +0 -1204
  103. package/src/compaction-observability.integration.test.ts +0 -139
  104. package/src/compaction-observability.test.ts +0 -187
  105. package/src/compaction-observability.ts +0 -324
  106. package/src/compaction-prompt-scorers.test.ts +0 -475
  107. package/src/compaction-prompt-scoring.ts +0 -300
  108. package/src/dashboard.test.ts +0 -611
  109. package/src/dashboard.ts +0 -462
  110. package/src/error-enrichment.test.ts +0 -403
  111. package/src/error-enrichment.ts +0 -219
  112. package/src/eval-capture.test.ts +0 -1015
  113. package/src/eval-capture.ts +0 -929
  114. package/src/eval-gates.test.ts +0 -306
  115. package/src/eval-gates.ts +0 -218
  116. package/src/eval-history.test.ts +0 -508
  117. package/src/eval-history.ts +0 -214
  118. package/src/eval-learning.test.ts +0 -378
  119. package/src/eval-learning.ts +0 -360
  120. package/src/eval-runner.test.ts +0 -223
  121. package/src/eval-runner.ts +0 -402
  122. package/src/export-tools.test.ts +0 -476
  123. package/src/export-tools.ts +0 -257
  124. package/src/hive.integration.test.ts +0 -2241
  125. package/src/hive.ts +0 -1628
  126. package/src/index.ts +0 -935
  127. package/src/learning.integration.test.ts +0 -1815
  128. package/src/learning.ts +0 -1079
  129. package/src/logger.test.ts +0 -189
  130. package/src/logger.ts +0 -135
  131. package/src/mandate-promotion.test.ts +0 -473
  132. package/src/mandate-promotion.ts +0 -239
  133. package/src/mandate-storage.integration.test.ts +0 -601
  134. package/src/mandate-storage.test.ts +0 -578
  135. package/src/mandate-storage.ts +0 -794
  136. package/src/mandates.ts +0 -540
  137. package/src/memory-tools.test.ts +0 -195
  138. package/src/memory-tools.ts +0 -344
  139. package/src/memory.integration.test.ts +0 -334
  140. package/src/memory.test.ts +0 -158
  141. package/src/memory.ts +0 -527
  142. package/src/model-selection.test.ts +0 -188
  143. package/src/model-selection.ts +0 -68
  144. package/src/observability-tools.test.ts +0 -359
  145. package/src/observability-tools.ts +0 -871
  146. package/src/output-guardrails.test.ts +0 -438
  147. package/src/output-guardrails.ts +0 -381
  148. package/src/pattern-maturity.test.ts +0 -1160
  149. package/src/pattern-maturity.ts +0 -525
  150. package/src/planning-guardrails.test.ts +0 -491
  151. package/src/planning-guardrails.ts +0 -438
  152. package/src/plugin.ts +0 -23
  153. package/src/post-compaction-tracker.test.ts +0 -251
  154. package/src/post-compaction-tracker.ts +0 -237
  155. package/src/query-tools.test.ts +0 -636
  156. package/src/query-tools.ts +0 -324
  157. package/src/rate-limiter.integration.test.ts +0 -466
  158. package/src/rate-limiter.ts +0 -774
  159. package/src/replay-tools.test.ts +0 -496
  160. package/src/replay-tools.ts +0 -240
  161. package/src/repo-crawl.integration.test.ts +0 -441
  162. package/src/repo-crawl.ts +0 -610
  163. package/src/schemas/cell-events.test.ts +0 -347
  164. package/src/schemas/cell-events.ts +0 -807
  165. package/src/schemas/cell.ts +0 -257
  166. package/src/schemas/evaluation.ts +0 -166
  167. package/src/schemas/index.test.ts +0 -199
  168. package/src/schemas/index.ts +0 -286
  169. package/src/schemas/mandate.ts +0 -232
  170. package/src/schemas/swarm-context.ts +0 -115
  171. package/src/schemas/task.ts +0 -161
  172. package/src/schemas/worker-handoff.test.ts +0 -302
  173. package/src/schemas/worker-handoff.ts +0 -131
  174. package/src/skills.integration.test.ts +0 -1192
  175. package/src/skills.test.ts +0 -643
  176. package/src/skills.ts +0 -1549
  177. package/src/storage.integration.test.ts +0 -341
  178. package/src/storage.ts +0 -884
  179. package/src/structured.integration.test.ts +0 -817
  180. package/src/structured.test.ts +0 -1046
  181. package/src/structured.ts +0 -762
  182. package/src/swarm-decompose.test.ts +0 -188
  183. package/src/swarm-decompose.ts +0 -1302
  184. package/src/swarm-deferred.integration.test.ts +0 -157
  185. package/src/swarm-deferred.test.ts +0 -38
  186. package/src/swarm-insights.test.ts +0 -214
  187. package/src/swarm-insights.ts +0 -459
  188. package/src/swarm-mail.integration.test.ts +0 -970
  189. package/src/swarm-mail.ts +0 -739
  190. package/src/swarm-orchestrate.integration.test.ts +0 -282
  191. package/src/swarm-orchestrate.test.ts +0 -548
  192. package/src/swarm-orchestrate.ts +0 -3084
  193. package/src/swarm-prompts.test.ts +0 -1270
  194. package/src/swarm-prompts.ts +0 -2077
  195. package/src/swarm-research.integration.test.ts +0 -701
  196. package/src/swarm-research.test.ts +0 -698
  197. package/src/swarm-research.ts +0 -472
  198. package/src/swarm-review.integration.test.ts +0 -285
  199. package/src/swarm-review.test.ts +0 -879
  200. package/src/swarm-review.ts +0 -709
  201. package/src/swarm-strategies.ts +0 -407
  202. package/src/swarm-worktree.test.ts +0 -501
  203. package/src/swarm-worktree.ts +0 -575
  204. package/src/swarm.integration.test.ts +0 -2377
  205. package/src/swarm.ts +0 -38
  206. package/src/tool-adapter.integration.test.ts +0 -1221
  207. package/src/tool-availability.ts +0 -461
  208. package/tsconfig.json +0 -28
@@ -1,1189 +0,0 @@
1
- # Eval Infrastructure Architecture Analysis
2
-
3
- ```
4
- ┌─────────────────────────────────────────────────────────────────────┐
5
- │ EVAL INFRASTRUCTURE FLOW │
6
- │ │
7
- │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌────────┐ │
8
- │ │ CAPTURE │─────▶│ STORE │─────▶│ LOAD │─────▶│ EVAL │ │
9
- │ └──────────┘ └──────────┘ └──────────┘ └────────┘ │
10
- │ │ │ │ │
11
- │ │ Tool calls │ Data loaders │ │
12
- │ │ Violations │ Fixtures │ │
13
- │ │ Outcomes │ │ │
14
- │ ▼ ▼ ▼ │
15
- │ [sessions/*.jsonl] [PGlite eval_records] [Scorers]│
16
- │ [eval-data.jsonl] [Fixtures] [Gates] │
17
- │ │
18
- │ ┌──────────────────┐ │
19
- │ │ FEEDBACK LOOP │ │
20
- │ ├──────────────────┤ │
21
- │ │ Gate Check │ │
22
- │ │ Learn from Fail │ │
23
- │ │ Store Memory │ │
24
- │ └──────────────────┘ │
25
- └─────────────────────────────────────────────────────────────────────┘
26
- ```
27
-
28
- **Date:** 2025-12-25
29
- **Agent:** BlueForest
30
- **Cell:** opencode-swarm-plugin--ys7z8-mjlk7jsilk9
31
-
32
- ---
33
-
34
- ## Executive Summary
35
-
36
- The eval infrastructure is a **progressive quality control system** that captures real execution data, scores it against quality criteria, and enforces adaptive gates based on data maturity. The architecture follows a clean pipeline: **CAPTURE → STORE → LOAD → EVAL → GATE → LEARN**.
37
-
38
- **Key strengths:**
39
- - Clear separation of concerns (loaders, scorers, evals)
40
- - Progressive gates prevent premature failures
41
- - Real data integration (not just synthetic fixtures)
42
- - Learning feedback loop (regressions → semantic memory)
43
-
44
- **Key issues identified:**
45
- 1. **Data loader abstraction leak** - Loaders know too much about storage format
46
- 2. **Scorer composition complexity** - Composite scorers have brittle async patterns
47
- 3. **Fixture vs real data switching** - Implicit fallback logic scattered in eval files
48
- 4. **Session filtering buried in loader** - Quality criteria hardcoded in data-loader.ts
49
- 5. **No eval versioning** - Schema changes could break historical data
50
-
51
- ---
52
-
53
- ## Component Architecture
54
-
55
- ### 1. Data Capture (`src/eval-capture.ts`)
56
-
57
- **Purpose:** Automatically capture real execution data during swarm runs.
58
-
59
- **Event Types:**
60
- - `DECISION` - Coordinator decisions (strategy selected, worker spawned, review completed)
61
- - `VIOLATION` - Protocol violations (edited files, ran tests, reserved files)
62
- - `OUTCOME` - Task outcomes (success, retry, failure, epic complete)
63
- - `COMPACTION` - Context compaction lifecycle (detection, prompt generation, resumption)
64
-
65
- **Storage:**
66
- - **Sessions:** `~/.config/swarm-tools/sessions/{session-id}.jsonl` (append-only JSONL)
67
- - **Eval Records:** PGlite `eval_records` table (via swarm-mail)
68
- - **History:** `.opencode/eval-history.jsonl` (local project)
69
-
70
- **Schema:** Zod discriminated union (`CoordinatorEventSchema`) - type-safe with exhaustive checks.
71
-
72
- **Capture points:**
73
- - `swarm_decompose` - Captures strategy, decomposition
74
- - `swarm_complete` - Captures outcomes (duration, errors, retries)
75
- - Tool call inspection - Real-time violation detection via pattern matching
76
- - Compaction hook - Lifecycle tracking
77
-
78
- **Strengths:**
79
- - Zod validation prevents garbage data
80
- - JSONL format is append-only, fault-tolerant, streamable
81
- - Discriminated union makes event types exhaustive
82
-
83
- **Issues:**
84
- - **No schema versioning** - Future schema changes could break old data
85
- - **Session directory hardcoded** - `~/.config/swarm-tools/sessions/` not configurable per project
86
-
87
- ---
88
-
89
- ### 2. Data Loaders (`evals/lib/`)
90
-
91
- #### `data-loader.ts` - PGlite + Session Loader
92
-
93
- **Purpose:** Load real data from PGlite (`eval_records`) and session JSONL files.
94
-
95
- **Key functions:**
96
- - `loadEvalCases()` - Query PGlite for decomposition eval records
97
- - `loadCapturedSessions()` - Read coordinator sessions from JSONL
98
- - `hasRealEvalData()` - Check if enough data exists for real eval
99
- - `getEvalDataSummary()` - Stats for reporting
100
-
101
- **Session Quality Filters:**
102
- ```typescript
103
- {
104
- minEvents: 3, // Filter incomplete sessions
105
- requireWorkerSpawn: true, // Ensure delegation happened
106
- requireReview: true, // Ensure coordinator reviewed work
107
- }
108
- ```
109
-
110
- **Strengths:**
111
- - Quality filters reduce noise (only 3/100 sessions passed in coordinator-session eval)
112
- - Stats functions provide transparency (logs which data source is used)
113
-
114
- **Issues:**
115
- 1. **Abstraction leak** - Loader knows about PGlite internals AND JSONL format
116
- - Should have separate `PGliteEvalSource` and `JsonlEvalSource` adapters
117
- 2. **Quality criteria hardcoded** - Filters baked into loader, not configurable at call site
118
- - `requireReview: true` prevents testing coordinators who skip reviews
119
- 3. **Transform logic mixed with loading** - `meetsQualityCriteria()` is business logic, not I/O
120
- 4. **No data versioning** - Can't handle schema evolution (what if event types change?)
121
-
122
- **Recommendation:**
123
- ```typescript
124
- // Separate concerns
125
- interface EvalSource {
126
- load(filters: EvalFilters): Promise<EvalCase[]>;
127
- stats(): Promise<EvalStats>;
128
- }
129
-
130
- class PGliteEvalSource implements EvalSource { /* ... */ }
131
- class JsonlSessionSource implements EvalSource { /* ... */ }
132
-
133
- // Make filters first-class
134
- type SessionFilter = (session: CoordinatorSession) => boolean;
135
- const filters = {
136
- minEvents: (n: number) => (s) => s.events.length >= n,
137
- requireWorkerSpawn: (s) => s.events.some(e => e.decision_type === "worker_spawned"),
138
- compose: (...fns) => (s) => fns.every(f => f(s)),
139
- };
140
- ```
141
-
142
- #### `compaction-loader.ts` - COMPACTION Event Loader
143
-
144
- **Purpose:** Load COMPACTION events from session JSONL files for compaction-prompt eval.
145
-
146
- **Key functions:**
147
- - `loadCompactionEvents()` - Stream COMPACTION events with early termination
148
- - `loadCompactionSessions()` - Group events by session_id
149
- - `loadDefaultCompaction*()` - Convenience wrappers for default session dir
150
-
151
- **Features:**
152
- - **Lazy loading** - Streams large files line-by-line (avoids memory bloat)
153
- - **Early termination** - Stops reading when limit reached
154
- - **Graceful errors** - Skips invalid lines, logs warnings
155
-
156
- **Strengths:**
157
- - Clean single-responsibility (only COMPACTION events)
158
- - Performance-conscious (streaming for large datasets)
159
- - Type-safe with discriminated union extraction
160
-
161
- **Issues:**
162
- 1. **Streaming threshold arbitrary** - `limit < 100` triggers streaming - why 100?
163
- - Should stream by file size, not result limit
164
- 2. **Duplicate logic** - `parseLine()` duplicated between loaders
165
- - Should be shared utility in `eval-capture.ts`
166
- 3. **No pagination** - Returns all matches up to limit, can't resume
167
- - Real-world use case: "Load next 10 sessions" for UI
168
-
169
- **Recommendation:**
170
- ```typescript
171
- // Shared utilities in eval-capture.ts
172
- export function parseEventLine(line: string): CoordinatorEvent | null;
173
- export function* streamEvents(filePath: string): Generator<CoordinatorEvent>;
174
-
175
- // Pagination support
176
- interface PaginatedResult<T> {
177
- data: T[];
178
- cursor: string | null; // file:line for resumption
179
- hasMore: boolean;
180
- }
181
- ```
182
-
183
- #### `llm.ts` - LLM Client for Evals
184
-
185
- **Purpose:** Generate decompositions via LLM for testing (swarm-decomposition eval).
186
-
187
- **Key functions:**
188
- - `generateDecomposition()` - Call Claude via AI SDK + Vercel Gateway
189
- - `formatDecompositionPrompt()` - Template prompt for decomposition
190
- - `extractJson()` - Parse JSON from LLM responses (handles markdown wrapping)
191
-
192
- **Gateway pattern:**
193
- ```typescript
194
- const { text } = await generateText({
195
- model: gateway("anthropic/claude-sonnet-4-5"),
196
- prompt,
197
- maxOutputTokens: 4096,
198
- });
199
- ```
200
-
201
- **Strengths:**
202
- - Gateway abstraction hides provider details (just pass "provider/model")
203
- - JSON extraction handles markdown code blocks (common LLM quirk)
204
- - Prompt template matches production `swarm_plan_prompt`
205
-
206
- **Issues:**
207
- 1. **No retry logic** - Single LLM call, no fallback on failure
208
- - Network errors or rate limits fail entire eval run
209
- 2. **Hardcoded model** - `DEFAULT_MODEL` not overridable at runtime
210
- - Can't test with different models without code change
211
- 3. **No response caching** - Repeated eval runs re-generate same decompositions
212
- - Wastes $ and time for deterministic inputs
213
-
214
- **Recommendation:**
215
- ```typescript
216
- // Retry wrapper
217
- export async function generateWithRetry(
218
- prompt: string,
219
- options?: { model?: GatewayModelId; retries?: number; cache?: boolean }
220
- ): Promise<string>;
221
-
222
- // Cache layer
223
- const cacheKey = hash(prompt + model);
224
- if (cache.has(cacheKey)) return cache.get(cacheKey);
225
- ```
226
-
227
- ---
228
-
229
- ### 3. Scorers (`evals/scorers/`)
230
-
231
- **Purpose:** Score eval outputs against quality criteria. Return `{ score: 0-1, message: string }`.
232
-
233
- **Evalite pattern:**
234
- ```typescript
235
- export const myScorer = createScorer({
236
- name: "My Scorer",
237
- description: "What it measures",
238
- scorer: async ({ output, expected, input }) => {
239
- return { score: 0.8, message: "Details" };
240
- },
241
- });
242
- ```
243
-
244
- #### Scorer Categories
245
-
246
- | File | Scorers | What They Measure |
247
- |------|---------|-------------------|
248
- | `index.ts` | `subtaskIndependence`, `coverageCompleteness`, `instructionClarity`, `decompositionCoherence` | Decomposition quality |
249
- | `coordinator-discipline.ts` | `violationCount`, `spawnEfficiency`, `reviewThoroughness`, `timeToFirstSpawn`, `overallDiscipline` | Coordinator protocol adherence |
250
- | `compaction-scorers.ts` | `confidenceAccuracy`, `contextInjectionCorrectness`, `requiredPatternsPresent`, `forbiddenPatternsAbsent`, `compactionQuality` | Compaction correctness |
251
- | `compaction-prompt-scorers.ts` | `epicIdSpecificity`, `actionability`, `coordinatorIdentity`, `forbiddenToolsPresent`, `postCompactionDiscipline` | Continuation prompt quality |
252
- | `outcome-scorers.ts` | `executionSuccess`, `timeBalance`, `scopeAccuracy`, `scopeDrift`, `noRework` | Real execution outcomes |
253
-
254
- **Composite Scorer Pattern:**
255
- ```typescript
256
- export const overallDiscipline = createScorer({
257
- name: "Overall Discipline",
258
- description: "Weighted composite of all discipline scorers",
259
- scorer: async ({ output, expected, input }) => {
260
- // Call child scorers
261
- const violations = await violationCount({ output, expected, input });
262
- const spawn = await spawnEfficiency({ output, expected, input });
263
- const review = await reviewThoroughness({ output, expected, input });
264
- const time = await timeToFirstSpawn({ output, expected, input });
265
-
266
- // Weighted average
267
- const score =
268
- (violations.score ?? 0) * 0.30 +
269
- (spawn.score ?? 0) * 0.25 +
270
- (review.score ?? 0) * 0.25 +
271
- (time.score ?? 0) * 0.20;
272
-
273
- return { score, message: "Composite score" };
274
- },
275
- });
276
- ```
277
-
278
- **Strengths:**
279
- - Clear single-responsibility (each scorer tests one thing)
280
- - Composite scorers enable weighted evaluation
281
- - Type-safe with Zod schemas for output parsing
282
- - Null-safe scoring (`score ?? 0` handles scorer failures gracefully)
283
-
284
- **Issues:**
285
- 1. **Async composition fragility** - Must `await` each child scorer
286
- - Easy to forget, causes `Promise<Score>` type errors
287
- - Semantic memory shows this bit TWO files recently
288
- 2. **No scorer versioning** - Scorer logic changes invalidate historical comparisons
289
- - Can't tell if score dropped due to regression or scorer change
290
- 3. **Hardcoded weights** - `0.30`, `0.25`, etc. not configurable
291
- - Can't experiment with different weight profiles
292
- 4. **LLM-as-judge cost** - `decompositionCoherence` calls Claude for each test case
293
- - No cost controls or budgets
294
- - No fallback if LLM fails
295
-
296
- **Recommendation:**
297
- ```typescript
298
- // Versioned scorers
299
- export const violationCount_v1 = createScorer({ /* ... */ });
300
- export const violationCount_v2 = createScorer({ /* ... */ });
301
-
302
- // Configurable weights
303
- export function createOverallDiscipline(weights: {
304
- violations: number;
305
- spawn: number;
306
- review: number;
307
- time: number;
308
- }) { /* ... */ }
309
-
310
- // LLM budget
311
- const JUDGE_BUDGET = { maxCalls: 100, maxCost: 1.00 };
312
- ```
313
-
314
- ---
315
-
316
- ### 4. Eval Files (`evals/*.eval.ts`)
317
-
318
- **Purpose:** Define eval test suites using Evalite framework.
319
-
320
- **Pattern:**
321
- ```typescript
322
- evalite("Eval Name", {
323
- data: async () => [...testCases],
324
- task: async (input) => /* generate output */,
325
- scorers: [scorer1, scorer2, ...],
326
- });
327
- ```
328
-
329
- #### Eval Suites
330
-
331
- | File | Data Source | Task | Scorers |
332
- |------|-------------|------|---------|
333
- | `swarm-decomposition.eval.ts` | PGlite or fixtures | LLM generates decomposition | Independence, coverage, clarity, coherence |
334
- | `coordinator-session.eval.ts` | Session JSONL or fixtures | Identity (session as JSON) | Violations, spawn, review, time, discipline |
335
- | `compaction-prompt.eval.ts` | Fixtures only | Identity (fixture prompts) | Epic ID, actionability, identity, tools, discipline |
336
- | `compaction-resumption.eval.ts` | Compaction events | Compaction logic | Confidence, injection, patterns, quality |
337
-
338
- **Data Source Switching:**
339
- ```typescript
340
- const useRealData = await hasRealEvalData(PROJECT_KEY, 5, PROJECT_PATH);
341
- const evalCases = useRealData
342
- ? await loadEvalCases(PROJECT_KEY, { limit: 20, projectPath: PROJECT_PATH })
343
- : decompositionCases.map((testCase) => ({ input: testCase.input, expected: testCase.expected }));
344
- ```
345
-
346
- **Strengths:**
347
- - Progressive data source (fixtures → real data as it accumulates)
348
- - Transparency (logs which source is used)
349
- - Multiple test suites per eval file (edge cases, perfect vs bad, etc.)
350
-
351
- **Issues:**
352
- 1. **Data source logic duplicated** - Every eval file has same `hasRealEvalData` check
353
- - Should be abstracted into data loader
354
- 2. **Hard limit of 20 cases** - `limit: 20` hardcoded
355
- - No way to run full dataset locally
356
- 3. **No eval parameterization** - Can't run same eval with different configs
357
- - E.g., "test with max_subtasks=4" vs "max_subtasks=8"
358
- 4. **Identity task for fixtures** - `task: async (input) => JSON.stringify(input)` is wasteful
359
- - Fixtures already have output, no need to "generate" it
360
- - Should have `FixtureEval` vs `GenerativeEval` types
361
-
362
- **Recommendation:**
363
- ```typescript
364
- // Data source abstraction
365
- const dataSource = await selectDataSource(PROJECT_KEY, {
366
- preferReal: true,
367
- fallbackToFixtures: true,
368
- limit: process.env.CI ? 5 : undefined, // Full dataset locally, sample in CI
369
- });
370
-
371
- // Eval parameterization
372
- evalite.parameterize("Decomposition Quality", {
373
- params: [
374
- { maxSubtasks: 4, strategy: "file-based" },
375
- { maxSubtasks: 8, strategy: "feature-based" },
376
- ],
377
- data: async ({ maxSubtasks, strategy }) => /* ... */,
378
- });
379
- ```
380
-
381
- ---
382
-
383
- ### 5. Progressive Gates (`src/eval-gates.ts`)
384
-
385
- **Purpose:** Enforce quality gates based on eval maturity phase.
386
-
387
- **Phases:**
388
- - **Bootstrap (<10 runs):** Always pass, collect baseline data
389
- - **Stabilization (10-50 runs):** Warn on >10% regression (default), but pass
390
- - **Production (>50 runs + variance <0.1):** Fail on >5% regression (default)
391
-
392
- **Gate Logic:**
393
- ```typescript
394
- export function checkGate(
395
- projectPath: string,
396
- evalName: string,
397
- currentScore: number,
398
- config?: GateConfig
399
- ): GateResult {
400
- const phase = getPhase(projectPath, evalName);
401
- const history = getScoreHistory(projectPath, evalName);
402
- const baseline = calculateBaseline(history, currentScore);
403
- const regressionPercent = (baseline - currentScore) / baseline;
404
-
405
- // Phase-specific thresholds
406
- if (phase === "bootstrap") return { passed: true, ... };
407
- if (phase === "stabilization") return { passed: true, warn: regressionPercent > 0.10, ... };
408
- if (phase === "production") return { passed: regressionPercent <= 0.05, ... };
409
- }
410
- ```
411
-
412
- **Variance Threshold:**
413
- - High variance (≥0.1) keeps eval in stabilization even with >50 runs
414
- - Prevents premature production gates when scores unstable
415
- - Current issue: coordinator-session has high variance (only 3/100 sessions pass filters)
416
-
417
- **Strengths:**
418
- - Adaptive thresholds prevent premature failures
419
- - Variance check prevents false confidence
420
- - Configurable thresholds per eval
421
-
422
- **Issues:**
423
- 1. **Baseline calculation naive** - Simple mean of all scores
424
- - Doesn't handle outliers or trends
425
- - Early bad runs drag down baseline forever
426
- 2. **No time-based decay** - Old scores weighted equally with new
427
- - Eval improvements don't raise baseline fast enough
428
- 3. **No CI/PR integration hooks** - Gates check but don't post results
429
- - Documented in README but not implemented
430
- 4. **Variance threshold magic number** - 0.1 chosen arbitrarily
431
- - Should be configurable or derived from data
432
-
433
- **Recommendation:**
434
- ```typescript
435
- // Weighted baseline (recent scores matter more)
436
- function calculateWeightedBaseline(
437
- history: EvalRunRecord[],
438
- decayFactor: number = 0.9 // Recent = 1.0, older = 0.9^n
439
- ): number;
440
-
441
- // Outlier-resistant baseline (median or trimmed mean)
442
- function calculateRobustBaseline(
443
- history: EvalRunRecord[],
444
- trimPercent: number = 0.1 // Trim top/bottom 10%
445
- ): number;
446
-
447
- // CI posting
448
- export function postGateResultToGitHub(
449
- result: GateResult,
450
- prNumber: number,
451
- repo: string
452
- ): Promise<void>;
453
- ```
454
-
455
- ---
456
-
457
- ### 6. Learning Feedback Loop (`src/eval-learning.ts`)
458
-
459
- **Purpose:** Automatically store eval failures to semantic memory for learning.
460
-
461
- **Trigger:** Score drops >15% (configurable) from rolling average baseline.
462
-
463
- **Flow:**
464
- ```typescript
465
- const result = await learnFromEvalFailure(
466
- evalName,
467
- currentScore,
468
- history,
469
- memoryAdapter,
470
- { config: { dropThreshold: 0.15, windowSize: 5 } }
471
- );
472
-
473
- if (result.triggered) {
474
- // Stored to semantic-memory with tags:
475
- // - "eval-failure"
476
- // - "{eval-name}"
477
- // - "regression"
478
- }
479
- ```
480
-
481
- **Stored Context:**
482
- - Eval name
483
- - Baseline score (rolling average)
484
- - Current score
485
- - Drop percentage
486
- - Timestamp
487
- - Optional scorer details (which scorer failed)
488
-
489
- **Strengths:**
490
- - Automatic detection (no manual annotation)
491
- - Rolling average baseline (more stable than last-run comparison)
492
- - Configurable sensitivity (threshold + window size)
493
- - Structured metadata for querying
494
-
495
- **Issues:**
496
- 1. **No retrieval integration** - Memories stored but not queried before eval runs
497
- - Should inject past failures into LLM prompts for context
498
- 2. **No failure analysis** - Stores "score dropped" but not "why"
499
- - Should include which test cases failed, what changed
500
- 3. **No auto-remediation** - Human must read memory and act
501
- - Could auto-generate hypotheses or suggested fixes
502
- 4. **Memory pollution risk** - Noisy evals create spam memories
503
- - Should require multiple consecutive drops before storing
504
-
505
- **Recommendation:**
506
- ```typescript
507
- // Retrieval hook
508
- export async function queryEvalFailures(
509
- evalName: string,
510
- memoryAdapter: MemoryAdapter
511
- ): Promise<Memory[]> {
512
- return memoryAdapter.find({
513
- query: evalName,
514
- tags: ["eval-failure", "regression"],
515
- limit: 5,
516
- });
517
- }
518
-
519
- // Failure analysis
520
- export function analyzeFailure(
521
- evalName: string,
522
- currentRun: EvalResult,
523
- previousRun: EvalResult
524
- ): FailureAnalysis {
525
- // Diff test cases, scorer outputs, etc.
526
- }
527
-
528
- // Spam prevention
529
- if (recentDrops.length >= 3) {
530
- // Only store if consistent regression
531
- storeMemory();
532
- }
533
- ```
534
-
535
- ---
536
-
537
- ## Data Flow Architecture
538
-
539
- ### Capture Flow
540
-
541
- ```
542
- ┌─────────────────────────────────────────────────────────────┐
543
- │ REAL-TIME CAPTURE │
544
- ├─────────────────────────────────────────────────────────────┤
545
- │ │
546
- │ 1. Coordinator calls swarm tool │
547
- │ ├─ swarm_decompose(task="Add auth") │
548
- │ ├─ swarm_spawn_subtask(bead_id="bd-123.1") │
549
- │ └─ swarm_review(task_id="bd-123.1") │
550
- │ │
551
- │ 2. Tool execution │
552
- │ ├─ planning-guardrails.ts detects violations │
553
- │ │ (pattern matching on tool name + args) │
554
- │ └─ eval-capture.ts emits events │
555
- │ │
556
- │ 3. Event storage │
557
- │ ├─ Session JSONL: ~/.config/swarm-tools/sessions/... │
558
- │ ├─ PGlite: eval_records table │
559
- │ └─ History: .opencode/eval-history.jsonl │
560
- │ │
561
- └─────────────────────────────────────────────────────────────┘
562
- ```
563
-
564
- **Key characteristic:** Capture is **passive** - no manual instrumentation needed. Tool calls are inspected in real-time.
565
-
566
- ### Load → Eval Flow
567
-
568
- ```
569
- ┌─────────────────────────────────────────────────────────────┐
570
- │ EVAL EXECUTION │
571
- ├─────────────────────────────────────────────────────────────┤
572
- │ │
573
- │ 1. Data Loading │
574
- │ ├─ Check: hasRealEvalData(projectKey, minRecords=5) │
575
- │ ├─ If true: loadEvalCases(projectKey, limit=20) │
576
- │ └─ If false: Use fixtures (decomposition-cases.ts) │
577
- │ │
578
- │ 2. Task Execution │
579
- │ ├─ Generative: LLM generates decomposition │
580
- │ └─ Identity: Fixture data as-is (JSON.stringify) │
581
- │ │
582
- │ 3. Scoring │
583
- │ ├─ Parse output (JSON, Zod validation) │
584
- │ ├─ Run scorers in parallel (async composition) │
585
- │ └─ Composite scorer: weighted average │
586
- │ │
587
- │ 4. Gate Check │
588
- │ ├─ getPhase(projectPath, evalName) │
589
- │ ├─ calculateBaseline(history) │
590
- │ ├─ calculateRegression(baseline, currentScore) │
591
- │ └─ Return GateResult { passed, phase, message } │
592
- │ │
593
- │ 5. Learning │
594
- │ ├─ isSignificantDrop(current, baseline, threshold) │
595
- │ ├─ If true: storeMemory(evalName, context, tags) │
596
- │ └─ Return LearningResult { triggered, memory_id } │
597
- │ │
598
- └─────────────────────────────────────────────────────────────┘
599
- ```
600
-
601
- **Key characteristic:** Load flow has **implicit fallback** (real data → fixtures). This is scattered across eval files, not centralized.
602
-
603
- ---
604
-
605
- ## Structural Issues & Recommendations
606
-
607
- ### Issue 1: Data Loader Abstraction Leak
608
-
609
- **Problem:** `data-loader.ts` knows about PGlite internals AND JSONL format. Violates single-responsibility.
610
-
611
- **Impact:**
612
- - Hard to test (mocking requires PGlite + file I/O)
613
- - Hard to extend (adding CSV source requires modifying data-loader.ts)
614
- - Tight coupling to storage format
615
-
616
- **Solution:**
617
- ```typescript
618
- // Define source interface
619
- interface EvalSource<T> {
620
- load(filters: FilterSpec): Promise<T[]>;
621
- stats(): Promise<SourceStats>;
622
- }
623
-
624
- // Implement sources
625
- class PGliteDecompositionSource implements EvalSource<EvalCase> { /* ... */ }
626
- class JsonlSessionSource implements EvalSource<CoordinatorSession> { /* ... */ }
627
- class FixtureSource<T> implements EvalSource<T> { /* ... */ }
628
-
629
- // Compose in eval files
630
- const source = await selectSource<EvalCase>({
631
- preferReal: new PGliteDecompositionSource(projectKey),
632
- fallback: new FixtureSource(decompositionCases),
633
- minRecords: 5,
634
- });
635
-
636
- const data = await source.load({ limit: 20 });
637
- ```
638
-
639
- **Benefits:**
640
- - Sources testable in isolation
641
- - Easy to add new sources (S3, API, etc.)
642
- - Explicit fallback strategy (not hardcoded)
643
-
644
- ### Issue 2: Session Quality Filters Hardcoded
645
-
646
- **Problem:** Quality criteria baked into `loadCapturedSessions()` - can't test coordinators who skip reviews.
647
-
648
- **Impact:**
649
- - Only 3/100 sessions passed filters in coordinator-session eval
650
- - Can't experiment with different filter profiles
651
- - Hidden filtering (caller doesn't control criteria)
652
-
653
- **Solution:**
654
- ```typescript
655
- // Make filters first-class, composable
656
- type SessionFilter = (session: CoordinatorSession) => boolean;
657
-
658
- const filters = {
659
- minEvents: (n: number): SessionFilter => (s) => s.events.length >= n,
660
- requireWorkerSpawn: (s) => s.events.some(e => e.decision_type === "worker_spawned"),
661
- requireReview: (s) => s.events.some(e => e.decision_type === "review_completed"),
662
- compose: (...fns: SessionFilter[]): SessionFilter => (s) => fns.every(f => f(s)),
663
- };
664
-
665
- // Explicit filtering at call site
666
- const sessions = await loadCapturedSessions({
667
- filter: filters.compose(
668
- filters.minEvents(3),
669
- filters.requireWorkerSpawn
670
- // Note: NOT requiring review for this test
671
- ),
672
- limit: 20,
673
- });
674
- ```
675
-
676
- **Benefits:**
677
- - Caller controls filtering (explicit, testable)
678
- - Easy to add new filters (no loader modification)
679
- - Can test partial compliance (e.g., "spawn but no review")
680
-
681
- ### Issue 3: No Scorer Versioning
682
-
683
- **Problem:** Scorer logic changes invalidate historical comparisons. Can't tell if score dropped due to regression or scorer change.
684
-
685
- **Impact:**
686
- - "Score dropped 15%" - was it code regression or stricter scoring?
687
- - Can't experiment with scorer improvements (breaks history)
688
- - No rollback if new scorer is too strict
689
-
690
- **Solution:**
691
- ```typescript
692
- // Version scorers with metadata
693
- export const subtaskIndependence_v1 = createScorer({
694
- name: "Subtask Independence",
695
- version: "1.0.0",
696
- description: "...",
697
- scorer: ({ output }) => { /* original logic */ },
698
- });
699
-
700
- export const subtaskIndependence_v2 = createScorer({
701
- name: "Subtask Independence",
702
- version: "2.0.0",
703
- description: "...",
704
- changes: "Added semantic file conflict detection",
705
- scorer: ({ output }) => { /* improved logic */ },
706
- });
707
-
708
- // Track scorer version in history
709
- interface EvalRunRecord {
710
- timestamp: string;
711
- eval_name: string;
712
- score: number;
713
- scorer_versions: Record<string, string>; // { "subtaskIndependence": "2.0.0" }
714
- }
715
-
716
- // Baseline calculation only uses compatible runs
717
- function calculateBaseline(history: EvalRunRecord[], scorerVersions: Record<string, string>): number {
718
- const compatible = history.filter(run =>
719
- Object.entries(scorerVersions).every(([name, version]) =>
720
- run.scorer_versions[name] === version
721
- )
722
- );
723
- return mean(compatible.map(r => r.score));
724
- }
725
- ```
726
-
727
- **Benefits:**
728
- - Can improve scorers without breaking history
729
- - Clear attribution of score changes
730
- - Can A/B test new scorers against old
731
-
732
- ### Issue 4: LLM-as-Judge Has No Budget
733
-
734
- **Problem:** `decompositionCoherence` calls Claude for every test case. No cost controls.
735
-
736
- **Impact:**
737
- - Eval run cost unbounded (20 cases × $0.01/call = $0.20+)
738
- - Network failures fail entire eval
739
- - Slow eval runs (LLM latency)
740
-
741
- **Solution:**
742
- ```typescript
743
- // Budget enforcement
744
- const JUDGE_BUDGET = {
745
- maxCalls: 100,
746
- maxCost: 1.00, // USD
747
- maxLatency: 5000, // ms per call
748
- };
749
-
750
- let usedCalls = 0;
751
- let usedCost = 0;
752
-
753
- export const decompositionCoherence = createScorer({
754
- scorer: async ({ output, input }) => {
755
- // Check budget
756
- if (usedCalls >= JUDGE_BUDGET.maxCalls) {
757
- return { score: null, message: "Budget exhausted (max calls)" };
758
- }
759
- if (usedCost >= JUDGE_BUDGET.maxCost) {
760
- return { score: null, message: "Budget exhausted (max cost)" };
761
- }
762
-
763
- try {
764
- const { text, usage } = await generateText({ /* ... */ });
765
-
766
- // Track usage
767
- usedCalls++;
768
- usedCost += estimateCost(usage);
769
-
770
- // ... scoring logic
771
- } catch (error) {
772
- // Fallback to heuristic score
773
- return { score: 0.5, message: "LLM judge failed, using fallback" };
774
- }
775
- },
776
- });
777
- ```
778
-
779
- **Benefits:**
780
- - Predictable costs (budget enforced)
781
- - Graceful degradation (fallback on failure)
782
- - Fast feedback (skip LLM in CI, use for deep analysis locally)
783
-
784
- ### Issue 5: Baseline Calculation Too Naive
785
-
786
- **Problem:** Simple mean of all scores. Early bad runs drag down baseline forever. No time-based decay.
787
-
788
- **Impact:**
789
- - Baseline stagnates (old scores weighted equally with new)
790
- - Improvements don't raise baseline fast enough
791
- - Outliers distort baseline
792
-
793
- **Solution:**
794
- ```typescript
795
- // Exponential moving average (recent scores matter more)
796
- function calculateEMA(
797
- history: EvalRunRecord[],
798
- alpha: number = 0.2 // Smoothing factor (0.2 = 20% weight to new value)
799
- ): number {
800
- if (history.length === 0) return 0;
801
-
802
- let ema = history[0].score;
803
- for (let i = 1; i < history.length; i++) {
804
- ema = alpha * history[i].score + (1 - alpha) * ema;
805
- }
806
- return ema;
807
- }
808
-
809
- // Trimmed mean (remove outliers)
810
- function calculateTrimmedMean(
811
- history: EvalRunRecord[],
812
- trimPercent: number = 0.1 // Trim top/bottom 10%
813
- ): number {
814
- const sorted = history.map(r => r.score).sort((a, b) => a - b);
815
- const trimCount = Math.floor(sorted.length * trimPercent);
816
- const trimmed = sorted.slice(trimCount, sorted.length - trimCount);
817
- return mean(trimmed);
818
- }
819
-
820
- // Let caller choose baseline strategy
821
- type BaselineStrategy = "mean" | "ema" | "trimmed-mean" | "median";
822
-
823
- function calculateBaseline(
824
- history: EvalRunRecord[],
825
- strategy: BaselineStrategy = "ema"
826
- ): number {
827
- switch (strategy) {
828
- case "mean": return mean(history.map(r => r.score));
829
- case "ema": return calculateEMA(history);
830
- case "trimmed-mean": return calculateTrimmedMean(history);
831
- case "median": return median(history.map(r => r.score));
832
- }
833
- }
834
- ```
835
-
836
- **Benefits:**
837
- - Baseline adapts to improvements (EMA)
838
- - Robust to outliers (trimmed mean, median)
839
- - Configurable per eval (some need stability, others need responsiveness)
840
-
841
- ### Issue 6: No Eval Parameterization
842
-
843
- **Problem:** Can't run same eval with different configs (e.g., max_subtasks=4 vs max_subtasks=8). Must copy-paste eval file.
844
-
845
- **Impact:**
846
- - Duplication (multiple eval files for slight variations)
847
- - Can't grid search optimal params
848
- - Hard to compare strategies side-by-side
849
-
850
- **Solution:**
851
- ```typescript
852
- // Parameterized evals
853
- evalite.parameterize("Decomposition Quality", {
854
- params: [
855
- { maxSubtasks: 4, strategy: "file-based" },
856
- { maxSubtasks: 4, strategy: "feature-based" },
857
- { maxSubtasks: 8, strategy: "file-based" },
858
- { maxSubtasks: 8, strategy: "feature-based" },
859
- ],
860
- data: async ({ maxSubtasks, strategy }) =>
861
- loadEvalCases(PROJECT_KEY, { strategy, limit: 20 }),
862
- task: async (input, { maxSubtasks }) => {
863
- const prompt = formatDecompositionPrompt(input.task, input.context, maxSubtasks);
864
- return await generateDecomposition(prompt);
865
- },
866
- scorers: [subtaskIndependence, coverageCompleteness],
867
- });
868
- ```
869
-
870
- **Benefits:**
871
- - Single source of truth (DRY)
872
- - Easy to add new params (no file duplication)
873
- - Results grouped for comparison
874
-
875
- ---
876
-
877
- ## Performance Characteristics
878
-
879
- ### Eval Execution Times (Estimated)
880
-
881
- | Eval | Data Source | Task | Scorers | Time/Case | Total (20 cases) |
882
- |------|-------------|------|---------|-----------|------------------|
883
- | `swarm-decomposition` | PGlite | LLM call | 4 (1 LLM judge) | ~3-5s | ~60-100s |
884
- | `coordinator-session` | JSONL | Identity | 5 | ~10ms | ~200ms |
885
- | `compaction-prompt` | Fixtures | Identity | 5 | ~5ms | ~100ms |
886
- | `compaction-resumption` | JSONL | Logic | 4 | ~20ms | ~400ms |
887
-
888
- **Bottlenecks:**
889
- 1. **LLM calls** - `decompositionCoherence` dominates swarm-decomposition time
890
- 2. **PGlite queries** - Network RTT if using remote DB
891
- 3. **JSONL parsing** - Linear scan of all session files (could be indexed)
892
-
893
- **Optimization opportunities:**
894
- 1. **Parallel LLM calls** - Run test cases concurrently (10 parallel = 10x faster)
895
- 2. **Response caching** - Cache LLM responses by prompt hash
896
- 3. **Session indexing** - SQLite index on session_id, epic_id for fast lookup
897
- 4. **Incremental evals** - Only test changed cases (git diff → affected evals)
898
-
899
- ---
900
-
901
- ## Integration Points
902
-
903
- ### 1. Swarm Tools → Capture
904
-
905
- **File:** `src/eval-capture.ts`
906
-
907
- **Hook points:**
908
- - `swarm_decompose()` → `captureDecompositionEvent()`
909
- - `swarm_complete()` → `captureOutcomeEvent()`
910
- - Tool call inspection → `detectCoordinatorViolation()` → `captureViolationEvent()`
911
- - Compaction hook → `captureCompactionEvent()`
912
-
913
- **Data validation:** Zod schemas ensure type safety at capture time.
914
-
915
- ### 2. Evalite → Loaders
916
-
917
- **File:** `evals/lib/data-loader.ts`, `evals/lib/compaction-loader.ts`
918
-
919
- **Pattern:**
920
- ```typescript
921
- evalite("Test Name", {
922
- data: async () => {
923
- const realData = await hasRealEvalData(PROJECT_KEY, 5);
924
- return realData
925
- ? await loadEvalCases(PROJECT_KEY, { limit: 20 })
926
- : fixtures;
927
- },
928
- // ...
929
- });
930
- ```
931
-
932
- **Issue:** Fallback logic duplicated across eval files. Should be abstracted.
933
-
934
- ### 3. Evalite → Gates
935
-
936
- **File:** `src/eval-gates.ts`
937
-
938
- **Pattern:**
939
- ```typescript
940
- import { checkGate } from "../src/eval-gates.js";
941
-
942
- evalite("Test", {
943
- // ... data, task, scorers
944
- onComplete: ({ score }) => {
945
- const gate = checkGate(PROJECT_PATH, "test-name", score);
946
- if (!gate.passed) {
947
- console.error(`❌ Gate failed: ${gate.message}`);
948
- process.exit(1); // Fail CI
949
- }
950
- },
951
- });
952
- ```
953
-
954
- **Issue:** No built-in integration. Must manually wire `onComplete` hook in each eval file.
955
-
956
- ### 4. Gates → Learning
957
-
958
- **File:** `src/eval-learning.ts`
959
-
960
- **Pattern:**
961
- ```typescript
962
- import { learnFromEvalFailure } from "../src/eval-learning.js";
963
-
964
- const result = await learnFromEvalFailure(
965
- evalName,
966
- currentScore,
967
- history,
968
- memoryAdapter
969
- );
970
-
971
- if (result.triggered) {
972
- console.log(`📉 Stored failure to memory: ${result.memory_id}`);
973
- }
974
- ```
975
-
976
- **Issue:** No automatic execution. Must manually call after gate check.
977
-
978
- ### 5. Learning → Prompts (Missing)
979
-
980
- **Expected flow:**
981
- ```typescript
982
- // Query failures before generating prompts
983
- const failures = await queryEvalFailures(evalName, memoryAdapter);
984
-
985
- // Inject into LLM prompt
986
- const prompt = `
987
- ${basePrompt}
988
-
989
- PAST FAILURES:
990
- ${failures.map(f => `- ${f.information}`).join("\n")}
991
-
992
- Avoid these patterns.
993
- `;
994
- ```
995
-
996
- **Status:** Not implemented. Learning loop stores but doesn't retrieve.
997
-
998
- ---
999
-
1000
- ## Testing Strategy
1001
-
1002
- ### Current Coverage
1003
-
1004
- | Component | Unit Tests | Integration Tests | E2E Tests |
1005
- |-----------|------------|-------------------|-----------|
1006
- | Data loaders | ✅ `data-loader.test.ts` | ✅ `data-loader.evalite-test.ts` | ❌ |
1007
- | Scorers | ✅ `scorers/*.evalite-test.ts` | ❌ | ❌ |
1008
- | Gates | ✅ `eval-gates.test.ts` | ❌ | ❌ |
1009
- | Learning | ✅ `eval-learning.test.ts` | ❌ | ❌ |
1010
- | Capture | ❌ | ✅ `eval-capture.integration.test.ts` | ❌ |
1011
-
1012
- **Gaps:**
1013
- - No E2E tests (full CAPTURE → EVAL → GATE → LEARN flow)
1014
- - No scorer integration tests (composition logic)
1015
- - No error path tests (what if LLM fails? PGlite down? JSONL corrupt?)
1016
-
1017
- **Recommendation:**
1018
- ```typescript
1019
- // E2E test skeleton
1020
- describe("Eval Pipeline E2E", () => {
1021
- it("should capture → load → eval → gate → learn", async () => {
1022
- // 1. Trigger capture
1023
- await swarm_decompose(task, context);
1024
-
1025
- // 2. Load data
1026
- const cases = await loadEvalCases(PROJECT_KEY);
1027
- expect(cases.length).toBeGreaterThan(0);
1028
-
1029
- // 3. Run eval
1030
- const score = await runEval(cases);
1031
-
1032
- // 4. Check gate
1033
- const gate = checkGate(PROJECT_PATH, "test", score);
1034
- expect(gate.passed).toBe(true);
1035
-
1036
- // 5. Learn from failure (if any)
1037
- const learned = await learnFromEvalFailure("test", score, history, memory);
1038
- // ... assertions
1039
- });
1040
- });
1041
- ```
1042
-
1043
- ---
1044
-
1045
- ## Improvement Roadmap
1046
-
1047
- ### Phase 1: Foundation (1-2 weeks)
1048
-
1049
- 1. **Extract data source interface** (`EvalSource<T>`)
1050
- - Refactor `data-loader.ts` into `PGliteSource`, `JsonlSource`, `FixtureSource`
1051
- - Add source selection logic to shared utility
1052
- - Update all eval files to use new interface
1053
-
1054
- 2. **Make filters first-class**
1055
- - Extract `SessionFilter` type and filter library
1056
- - Move quality criteria out of loader, into eval files
1057
- - Add filter composition utilities
1058
-
1059
- 3. **Add scorer versioning**
1060
- - Add `version` field to scorer metadata
1061
- - Track scorer versions in eval history
1062
- - Update baseline calculation to only use compatible runs
1063
-
1064
- ### Phase 2: Robustness (2-3 weeks)
1065
-
1066
- 4. **LLM judge improvements**
1067
- - Add budget enforcement (max calls, max cost)
1068
- - Add response caching (hash prompt → cache result)
1069
- - Add fallback scoring (heuristic if LLM fails)
1070
-
1071
- 5. **Baseline improvements**
1072
- - Implement EMA, trimmed mean, median strategies
1073
- - Add `BaselineStrategy` config to eval-gates
1074
- - A/B test strategies against real data
1075
-
1076
- 6. **Error handling**
1077
- - Add retry logic to LLM calls
1078
- - Graceful degradation for missing data
1079
- - Corrupt JSONL line handling (currently silent skip)
1080
-
1081
- ### Phase 3: Intelligence (3-4 weeks)
1082
-
1083
- 7. **Learning loop completion**
1084
- - Query eval failures before generating prompts
1085
- - Inject past failures into LLM context
1086
- - Auto-generate hypotheses for regressions
1087
-
1088
- 8. **Failure analysis**
1089
- - Diff scorer outputs between runs
1090
- - Identify which test cases regressed
1091
- - Surface root cause signals (scorer, data, code change)
1092
-
1093
- 9. **CI/PR integration**
1094
- - Post gate results to GitHub PR comments
1095
- - Block merge on production gate failures
1096
- - Add `swarm eval status` badge to PRs
1097
-
1098
- ### Phase 4: Scale (4-6 weeks)
1099
-
1100
- 10. **Performance optimization**
1101
- - Parallel LLM calls for test cases
1102
- - Session indexing (SQLite for fast lookup)
1103
- - Incremental evals (only run affected tests)
1104
-
1105
- 11. **Eval parameterization**
1106
- - Add `evalite.parameterize()` support
1107
- - Grid search optimal params (max_subtasks, strategy combos)
1108
- - Compare strategies side-by-side
1109
-
1110
- 12. **Observability**
1111
- - Real-time eval dashboards (Grafana + Prometheus)
1112
- - Eval run traces (OpenTelemetry)
1113
- - Cost tracking (LLM usage, storage growth)
1114
-
1115
- ---
1116
-
1117
- ## Conclusion
1118
-
1119
- The eval infrastructure is **well-designed at the macro level** (clear pipeline, progressive gates, learning loop), but has **tactical issues** that impact usability and maintainability:
1120
-
1121
- **Key strengths to preserve:**
1122
- - Progressive gates prevent premature failures
1123
- - Real data integration grounds evals in reality
1124
- - Learning loop closes the feedback cycle
1125
- - Type-safe schemas prevent garbage data
1126
-
1127
- **Critical improvements needed:**
1128
- - **Abstraction:** Extract data source interface (reduce coupling)
1129
- - **Configurability:** Make filters, baselines, budgets first-class (not hardcoded)
1130
- - **Versioning:** Track scorer versions (enable safe improvements)
1131
- - **Robustness:** Add retries, fallbacks, error handling (production-grade)
1132
-
1133
- **Impact of improvements:**
1134
- - **Developer experience:** Easier to add new evals (less boilerplate)
1135
- - **Reliability:** Evals don't fail due to transient issues (network, LLM)
1136
- - **Trust:** Score changes attributable to code (not scorer drift)
1137
- - **Cost control:** LLM budgets prevent runaway spend
1138
-
1139
- **Next steps:** Start with Phase 1 (foundation) to unblock future improvements. The architecture is sound - just needs tactical refactoring.
1140
-
1141
- ---
1142
-
1143
- ## Appendix: File Inventory
1144
-
1145
- ```
1146
- evals/
1147
- ├── README.md # User-facing docs (comprehensive)
1148
- ├── ARCHITECTURE.md # This document
1149
- ├── evalite.config.ts.bak # Minimal config (mostly defaults)
1150
-
1151
- ├── fixtures/ # Synthetic test data
1152
- │ ├── decomposition-cases.ts # Decomposition test cases
1153
- │ ├── coordinator-sessions.ts # Perfect/bad coordinator examples
1154
- │ ├── compaction-cases.ts # Compaction logic test cases
1155
- │ └── compaction-prompt-cases.ts # Continuation prompt examples
1156
-
1157
- ├── lib/ # Data loading utilities
1158
- │ ├── data-loader.ts # PGlite + JSONL session loader
1159
- │ ├── data-loader.test.ts # Unit tests
1160
- │ ├── data-loader.evalite-test.ts # Integration tests
1161
- │ ├── compaction-loader.ts # COMPACTION event loader
1162
- │ ├── compaction-loader.test.ts # Unit tests
1163
- │ └── llm.ts # LLM client (AI SDK + Gateway)
1164
-
1165
- ├── scorers/ # Quality metric implementations
1166
- │ ├── index.ts # Decomposition scorers + exports
1167
- │ ├── index.test.ts # Unit tests
1168
- │ ├── coordinator-discipline.ts # Protocol adherence scorers
1169
- │ ├── coordinator-discipline.evalite-test.ts
1170
- │ ├── compaction-scorers.ts # Compaction correctness
1171
- │ ├── compaction-prompt-scorers.ts # Prompt quality
1172
- │ ├── outcome-scorers.ts # Real execution outcomes
1173
- │ └── outcome-scorers.evalite-test.ts
1174
-
1175
- ├── swarm-decomposition.eval.ts # Decomposition quality eval
1176
- ├── coordinator-session.eval.ts # Coordinator discipline eval
1177
- ├── compaction-prompt.eval.ts # Continuation prompt quality
1178
- ├── compaction-resumption.eval.ts # Compaction correctness eval
1179
- └── example.eval.ts # Sanity check / template
1180
-
1181
- Total: 23 TypeScript files (5 evals, 4 fixtures, 6 loaders/utils, 8 scorers)
1182
- ```
1183
-
1184
- ---
1185
-
1186
- **Generated by:** BlueForest (swarm worker)
1187
- **Cell:** opencode-swarm-plugin--ys7z8-mjlk7jsilk9
1188
- **Epic:** opencode-swarm-plugin--ys7z8-mjlk7js9bt1
1189
- **Date:** 2025-12-25