opencode-swarm-plugin 0.44.0 → 0.44.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/swarm.serve.test.ts +6 -4
- package/bin/swarm.ts +16 -10
- package/dist/compaction-prompt-scoring.js +139 -0
- package/dist/eval-capture.js +12811 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.js +7644 -62599
- package/dist/plugin.js +23766 -78721
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm-review.d.ts.map +1 -1
- package/package.json +17 -5
- package/.changeset/swarm-insights-data-layer.md +0 -63
- package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
- package/.hive/analysis/session-data-quality-audit.md +0 -320
- package/.hive/eval-results.json +0 -483
- package/.hive/issues.jsonl +0 -138
- package/.hive/memories.jsonl +0 -729
- package/.opencode/eval-history.jsonl +0 -327
- package/.turbo/turbo-build.log +0 -9
- package/CHANGELOG.md +0 -2286
- package/SCORER-ANALYSIS.md +0 -598
- package/docs/analysis/subagent-coordination-patterns.md +0 -902
- package/docs/analysis-socratic-planner-pattern.md +0 -504
- package/docs/planning/ADR-001-monorepo-structure.md +0 -171
- package/docs/planning/ADR-002-package-extraction.md +0 -393
- package/docs/planning/ADR-003-performance-improvements.md +0 -451
- package/docs/planning/ADR-004-message-queue-features.md +0 -187
- package/docs/planning/ADR-005-devtools-observability.md +0 -202
- package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
- package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
- package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
- package/docs/planning/ADR-010-cass-inhousing.md +0 -1215
- package/docs/planning/ROADMAP.md +0 -368
- package/docs/semantic-memory-cli-syntax.md +0 -123
- package/docs/swarm-mail-architecture.md +0 -1147
- package/docs/testing/context-recovery-test.md +0 -470
- package/evals/ARCHITECTURE.md +0 -1189
- package/evals/README.md +0 -768
- package/evals/compaction-prompt.eval.ts +0 -149
- package/evals/compaction-resumption.eval.ts +0 -289
- package/evals/coordinator-behavior.eval.ts +0 -307
- package/evals/coordinator-session.eval.ts +0 -154
- package/evals/evalite.config.ts.bak +0 -15
- package/evals/example.eval.ts +0 -31
- package/evals/fixtures/cass-baseline.ts +0 -217
- package/evals/fixtures/compaction-cases.ts +0 -350
- package/evals/fixtures/compaction-prompt-cases.ts +0 -311
- package/evals/fixtures/coordinator-sessions.ts +0 -328
- package/evals/fixtures/decomposition-cases.ts +0 -105
- package/evals/lib/compaction-loader.test.ts +0 -248
- package/evals/lib/compaction-loader.ts +0 -320
- package/evals/lib/data-loader.evalite-test.ts +0 -289
- package/evals/lib/data-loader.test.ts +0 -345
- package/evals/lib/data-loader.ts +0 -281
- package/evals/lib/llm.ts +0 -115
- package/evals/scorers/compaction-prompt-scorers.ts +0 -145
- package/evals/scorers/compaction-scorers.ts +0 -305
- package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
- package/evals/scorers/coordinator-discipline.ts +0 -325
- package/evals/scorers/index.test.ts +0 -146
- package/evals/scorers/index.ts +0 -328
- package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
- package/evals/scorers/outcome-scorers.ts +0 -349
- package/evals/swarm-decomposition.eval.ts +0 -121
- package/examples/commands/swarm.md +0 -745
- package/examples/plugin-wrapper-template.ts +0 -2515
- package/examples/skills/hive-workflow/SKILL.md +0 -212
- package/examples/skills/skill-creator/SKILL.md +0 -223
- package/examples/skills/swarm-coordination/SKILL.md +0 -292
- package/global-skills/cli-builder/SKILL.md +0 -344
- package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
- package/global-skills/learning-systems/SKILL.md +0 -644
- package/global-skills/skill-creator/LICENSE.txt +0 -202
- package/global-skills/skill-creator/SKILL.md +0 -352
- package/global-skills/skill-creator/references/output-patterns.md +0 -82
- package/global-skills/skill-creator/references/workflows.md +0 -28
- package/global-skills/swarm-coordination/SKILL.md +0 -995
- package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
- package/global-skills/swarm-coordination/references/strategies.md +0 -138
- package/global-skills/system-design/SKILL.md +0 -213
- package/global-skills/testing-patterns/SKILL.md +0 -430
- package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
- package/opencode-swarm-plugin-0.30.7.tgz +0 -0
- package/opencode-swarm-plugin-0.31.0.tgz +0 -0
- package/scripts/cleanup-test-memories.ts +0 -346
- package/scripts/init-skill.ts +0 -222
- package/scripts/migrate-unknown-sessions.ts +0 -349
- package/scripts/validate-skill.ts +0 -204
- package/src/agent-mail.ts +0 -1724
- package/src/anti-patterns.test.ts +0 -1167
- package/src/anti-patterns.ts +0 -448
- package/src/compaction-capture.integration.test.ts +0 -257
- package/src/compaction-hook.test.ts +0 -838
- package/src/compaction-hook.ts +0 -1204
- package/src/compaction-observability.integration.test.ts +0 -139
- package/src/compaction-observability.test.ts +0 -187
- package/src/compaction-observability.ts +0 -324
- package/src/compaction-prompt-scorers.test.ts +0 -475
- package/src/compaction-prompt-scoring.ts +0 -300
- package/src/contributor-tools.test.ts +0 -133
- package/src/contributor-tools.ts +0 -201
- package/src/dashboard.test.ts +0 -611
- package/src/dashboard.ts +0 -462
- package/src/error-enrichment.test.ts +0 -403
- package/src/error-enrichment.ts +0 -219
- package/src/eval-capture.test.ts +0 -1015
- package/src/eval-capture.ts +0 -929
- package/src/eval-gates.test.ts +0 -306
- package/src/eval-gates.ts +0 -218
- package/src/eval-history.test.ts +0 -508
- package/src/eval-history.ts +0 -214
- package/src/eval-learning.test.ts +0 -378
- package/src/eval-learning.ts +0 -360
- package/src/eval-runner.test.ts +0 -223
- package/src/eval-runner.ts +0 -402
- package/src/export-tools.test.ts +0 -476
- package/src/export-tools.ts +0 -257
- package/src/hive.integration.test.ts +0 -2241
- package/src/hive.ts +0 -1628
- package/src/index.ts +0 -940
- package/src/learning.integration.test.ts +0 -1815
- package/src/learning.ts +0 -1079
- package/src/logger.test.ts +0 -189
- package/src/logger.ts +0 -135
- package/src/mandate-promotion.test.ts +0 -473
- package/src/mandate-promotion.ts +0 -239
- package/src/mandate-storage.integration.test.ts +0 -601
- package/src/mandate-storage.test.ts +0 -578
- package/src/mandate-storage.ts +0 -794
- package/src/mandates.ts +0 -540
- package/src/memory-tools.test.ts +0 -195
- package/src/memory-tools.ts +0 -344
- package/src/memory.integration.test.ts +0 -334
- package/src/memory.test.ts +0 -158
- package/src/memory.ts +0 -527
- package/src/model-selection.test.ts +0 -188
- package/src/model-selection.ts +0 -68
- package/src/observability-tools.test.ts +0 -359
- package/src/observability-tools.ts +0 -871
- package/src/output-guardrails.test.ts +0 -438
- package/src/output-guardrails.ts +0 -381
- package/src/pattern-maturity.test.ts +0 -1160
- package/src/pattern-maturity.ts +0 -525
- package/src/planning-guardrails.test.ts +0 -491
- package/src/planning-guardrails.ts +0 -438
- package/src/plugin.ts +0 -23
- package/src/post-compaction-tracker.test.ts +0 -251
- package/src/post-compaction-tracker.ts +0 -237
- package/src/query-tools.test.ts +0 -636
- package/src/query-tools.ts +0 -324
- package/src/rate-limiter.integration.test.ts +0 -466
- package/src/rate-limiter.ts +0 -774
- package/src/replay-tools.test.ts +0 -496
- package/src/replay-tools.ts +0 -240
- package/src/repo-crawl.integration.test.ts +0 -441
- package/src/repo-crawl.ts +0 -610
- package/src/schemas/cell-events.test.ts +0 -347
- package/src/schemas/cell-events.ts +0 -807
- package/src/schemas/cell.ts +0 -257
- package/src/schemas/evaluation.ts +0 -166
- package/src/schemas/index.test.ts +0 -199
- package/src/schemas/index.ts +0 -286
- package/src/schemas/mandate.ts +0 -232
- package/src/schemas/swarm-context.ts +0 -115
- package/src/schemas/task.ts +0 -161
- package/src/schemas/worker-handoff.test.ts +0 -302
- package/src/schemas/worker-handoff.ts +0 -131
- package/src/sessions/agent-discovery.test.ts +0 -137
- package/src/sessions/agent-discovery.ts +0 -112
- package/src/sessions/index.ts +0 -15
- package/src/skills.integration.test.ts +0 -1192
- package/src/skills.test.ts +0 -643
- package/src/skills.ts +0 -1549
- package/src/storage.integration.test.ts +0 -341
- package/src/storage.ts +0 -884
- package/src/structured.integration.test.ts +0 -817
- package/src/structured.test.ts +0 -1046
- package/src/structured.ts +0 -762
- package/src/swarm-decompose.test.ts +0 -188
- package/src/swarm-decompose.ts +0 -1302
- package/src/swarm-deferred.integration.test.ts +0 -157
- package/src/swarm-deferred.test.ts +0 -38
- package/src/swarm-insights.test.ts +0 -214
- package/src/swarm-insights.ts +0 -459
- package/src/swarm-mail.integration.test.ts +0 -970
- package/src/swarm-mail.ts +0 -739
- package/src/swarm-orchestrate.integration.test.ts +0 -282
- package/src/swarm-orchestrate.test.ts +0 -548
- package/src/swarm-orchestrate.ts +0 -3084
- package/src/swarm-prompts.test.ts +0 -1270
- package/src/swarm-prompts.ts +0 -2077
- package/src/swarm-research.integration.test.ts +0 -701
- package/src/swarm-research.test.ts +0 -698
- package/src/swarm-research.ts +0 -472
- package/src/swarm-review.integration.test.ts +0 -285
- package/src/swarm-review.test.ts +0 -879
- package/src/swarm-review.ts +0 -709
- package/src/swarm-strategies.ts +0 -407
- package/src/swarm-worktree.test.ts +0 -501
- package/src/swarm-worktree.ts +0 -575
- package/src/swarm.integration.test.ts +0 -2377
- package/src/swarm.ts +0 -38
- package/src/tool-adapter.integration.test.ts +0 -1221
- package/src/tool-availability.ts +0 -461
- package/tsconfig.json +0 -28
package/evals/README.md
DELETED
|
@@ -1,768 +0,0 @@
|
|
|
1
|
-
# Eval-Driven Development with Progressive Gates
|
|
2
|
-
|
|
3
|
-
```
|
|
4
|
-
┌──────────────────────────────────────────────────────────────┐
|
|
5
|
-
│ EVAL PIPELINE │
|
|
6
|
-
│ │
|
|
7
|
-
│ CAPTURE → SCORE → STORE → GATE → LEARN → IMPROVE │
|
|
8
|
-
│ │
|
|
9
|
-
│ Real execution data feeds back into prompt generation │
|
|
10
|
-
└──────────────────────────────────────────────────────────────┘
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
TypeScript-native evaluation framework for testing swarm task decomposition quality and coordinator discipline. Built on [Evalite](https://evalite.dev), powered by captured real-world execution data.
|
|
14
|
-
|
|
15
|
-
---
|
|
16
|
-
|
|
17
|
-
## Quick Start
|
|
18
|
-
|
|
19
|
-
```bash
|
|
20
|
-
# Run all evals once
|
|
21
|
-
bun run eval:run
|
|
22
|
-
|
|
23
|
-
# Run specific eval suite
|
|
24
|
-
bun run eval:decomposition # Task decomposition quality
|
|
25
|
-
bun run eval:coordinator # Coordinator protocol compliance
|
|
26
|
-
bun run eval:compaction # Compaction prompt quality
|
|
27
|
-
|
|
28
|
-
# Check eval status (progressive gates)
|
|
29
|
-
swarm eval status
|
|
30
|
-
|
|
31
|
-
# View eval history with trends
|
|
32
|
-
swarm eval history
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
---
|
|
36
|
-
|
|
37
|
-
## Architecture
|
|
38
|
-
|
|
39
|
-
### The Pipeline
|
|
40
|
-
|
|
41
|
-
```
|
|
42
|
-
┌─────────────────────────────────────────────────────────────────┐
|
|
43
|
-
│ │
|
|
44
|
-
│ 1. CAPTURE (Real Execution) │
|
|
45
|
-
│ ├─ Decomposition: task, strategy, subtasks │
|
|
46
|
-
│ ├─ Outcomes: duration, errors, retries, success │
|
|
47
|
-
│ ├─ Coordinator Events: decisions, violations, compaction │
|
|
48
|
-
│ └─ Store to: .opencode/eval-data.jsonl, sessions/*.jsonl │
|
|
49
|
-
│ │
|
|
50
|
-
│ 2. SCORE (Quality Metrics) │
|
|
51
|
-
│ ├─ Subtask Independence (file conflicts) │
|
|
52
|
-
│ ├─ Complexity Balance (fair work distribution) │
|
|
53
|
-
│ ├─ Coverage Completeness (files + scope) │
|
|
54
|
-
│ ├─ Instruction Clarity (actionable descriptions) │
|
|
55
|
-
│ └─ Coordinator Discipline (protocol adherence) │
|
|
56
|
-
│ │
|
|
57
|
-
│ 3. STORE (History Tracking) │
|
|
58
|
-
│ ├─ Record to: .opencode/eval-history.jsonl │
|
|
59
|
-
│ ├─ Track: score, timestamp, run_count │
|
|
60
|
-
│ └─ Calculate: phase, variance, baseline │
|
|
61
|
-
│ │
|
|
62
|
-
│ 4. GATE (Progressive Quality Control) │
|
|
63
|
-
│ ├─ Bootstrap (<10 runs): Always pass, collect data │
|
|
64
|
-
│ ├─ Stabilization (10-50 runs): Warn on >10% regression │
|
|
65
|
-
│ └─ Production (>50 runs, variance <0.1): Fail on >5% drop │
|
|
66
|
-
│ │
|
|
67
|
-
│ 5. LEARN (Failure Feedback) │
|
|
68
|
-
│ ├─ Detect: Significant score drops (>15% from baseline) │
|
|
69
|
-
│ ├─ Store to: Semantic memory with tags │
|
|
70
|
-
│ └─ Query: Before generating future prompts │
|
|
71
|
-
│ │
|
|
72
|
-
│ 6. IMPROVE (Continuous Refinement) │
|
|
73
|
-
│ └─ Future prompts query past failures for context │
|
|
74
|
-
│ │
|
|
75
|
-
└─────────────────────────────────────────────────────────────────┘
|
|
76
|
-
```
|
|
77
|
-
|
|
78
|
-
### Progressive Gates (Phase-Based Quality Control)
|
|
79
|
-
|
|
80
|
-
The eval system uses **progressive gates** that adapt based on data maturity:
|
|
81
|
-
|
|
82
|
-
```
|
|
83
|
-
Phase Runs Variance Gate Behavior
|
|
84
|
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
85
|
-
Bootstrap <10 N/A ✅ Always pass (collect data)
|
|
86
|
-
Stabilization 10-50 N/A ⚠️ Warn on >10% regression (pass)
|
|
87
|
-
Production >50 <0.1 ❌ Fail on >5% regression
|
|
88
|
-
(High Variance) >50 ≥0.1 ⚠️ Stay in stabilization
|
|
89
|
-
```
|
|
90
|
-
|
|
91
|
-
**Why progressive?**
|
|
92
|
-
|
|
93
|
-
- **Bootstrap**: No baseline yet, focus on data collection
|
|
94
|
-
- **Stabilization**: Baseline forming, tolerate noise while learning
|
|
95
|
-
- **Production**: Stable baseline, strict quality enforcement
|
|
96
|
-
|
|
97
|
-
**Variance threshold (0.1)**: Measures score consistency. High variance = unstable eval, stays in stabilization until it settles.
|
|
98
|
-
|
|
99
|
-
**Regression calculation**:
|
|
100
|
-
```
|
|
101
|
-
baseline = mean(historical_scores)
|
|
102
|
-
regression = (baseline - current_score) / baseline
|
|
103
|
-
```
|
|
104
|
-
|
|
105
|
-
---
|
|
106
|
-
|
|
107
|
-
## Eval Suites
|
|
108
|
-
|
|
109
|
-
### Swarm Decomposition (`swarm-decomposition.eval.ts`)
|
|
110
|
-
|
|
111
|
-
**What it measures:** Quality of task decomposition into parallel subtasks
|
|
112
|
-
|
|
113
|
-
**Data sources:**
|
|
114
|
-
- Fixtures: `fixtures/decomposition-cases.ts`
|
|
115
|
-
- Real captures: `.opencode/eval-data.jsonl`
|
|
116
|
-
|
|
117
|
-
**Scorers:**
|
|
118
|
-
|
|
119
|
-
| Scorer | Weight | What It Checks | Perfect Score |
|
|
120
|
-
| ------------------------ | ------ | ------------------------------------------------------- | ---------------------------------- |
|
|
121
|
-
| **Subtask Independence** | 0.25 | No file overlaps between subtasks (prevents conflicts) | 0 files in multiple subtasks |
|
|
122
|
-
| **Complexity Balance** | 0.25 | Work distributed evenly (coefficient of variation <0.3) | CV <0.3 (std dev / mean of subtask complexity) |
|
|
123
|
-
| **Coverage** | 0.25 | Required files covered, subtask count in range | All required files + 3-6 subtasks |
|
|
124
|
-
| **Instruction Clarity** | 0.25 | Descriptions actionable, files specified, titles clear | >20 chars, files listed, specific |
|
|
125
|
-
|
|
126
|
-
**Example output:**
|
|
127
|
-
```
|
|
128
|
-
swarm-decomposition
|
|
129
|
-
├─ subtaskIndependence: 1.0 (no conflicts)
|
|
130
|
-
├─ complexityBalance: 0.85 (CV: 0.22)
|
|
131
|
-
├─ coverageCompleteness: 1.0 (all files covered)
|
|
132
|
-
└─ instructionClarity: 0.90 (clear, actionable)
|
|
133
|
-
→ Overall: 0.94 ✅ PASS (stabilization phase)
|
|
134
|
-
```
|
|
135
|
-
|
|
136
|
-
### Coordinator Session (`coordinator-session.eval.ts`)
|
|
137
|
-
|
|
138
|
-
**What it measures:** Coordinator protocol adherence during swarm runs
|
|
139
|
-
|
|
140
|
-
**Data sources:**
|
|
141
|
-
- Real sessions: `~/.config/swarm-tools/sessions/*.jsonl`
|
|
142
|
-
- Fixtures: `fixtures/coordinator-sessions.ts`
|
|
143
|
-
|
|
144
|
-
**Scorers:**
|
|
145
|
-
|
|
146
|
-
| Scorer | Weight | What It Checks | Perfect Score |
|
|
147
|
-
| ---------------------------- | ------ | -------------------------------------------------- | -------------------- |
|
|
148
|
-
| **Violation Count** | 0.30 | Protocol violations (edit files, run tests, etc.) | 0 violations |
|
|
149
|
-
| **Spawn Efficiency** | 0.25 | Workers spawned / subtasks planned | 100% (all delegated) |
|
|
150
|
-
| **Review Thoroughness** | 0.25 | Reviews completed / workers finished | 100% (all reviewed) |
|
|
151
|
-
| **Time to First Spawn** | 0.20 | Speed from decomposition to first worker spawn | <60 seconds |
|
|
152
|
-
| **Overall Discipline** (composite) | 1.00 | Weighted composite of above | 1.0 (perfect) |
|
|
153
|
-
|
|
154
|
-
**Violations tracked:**
|
|
155
|
-
- `coordinator_edited_file` - Coordinator should NEVER edit directly
|
|
156
|
-
- `coordinator_ran_tests` - Workers run tests, not coordinator
|
|
157
|
-
- `coordinator_reserved_files` - Only workers reserve files
|
|
158
|
-
- `no_worker_spawned` - Coordinator must delegate, not do work itself
|
|
159
|
-
|
|
160
|
-
**Example output:**
|
|
161
|
-
```
|
|
162
|
-
coordinator-behavior
|
|
163
|
-
├─ violationCount: 1.0 (0 violations)
|
|
164
|
-
├─ spawnEfficiency: 1.0 (3/3 workers spawned)
|
|
165
|
-
├─ reviewThoroughness: 0.67 (2/3 reviewed)
|
|
166
|
-
└─ timeToFirstSpawn: 0.90 (45 seconds)
|
|
167
|
-
→ overallDiscipline: 0.89 ✅ PASS (bootstrap phase, collecting data)
|
|
168
|
-
```
|
|
169
|
-
|
|
170
|
-
#### Coordinator Session Capture (Deep Dive)
|
|
171
|
-
|
|
172
|
-
**How it works:** Session capture is fully automatic when coordinator tools are used. No manual instrumentation needed.
|
|
173
|
-
|
|
174
|
-
**Capture flow:**
|
|
175
|
-
|
|
176
|
-
```
|
|
177
|
-
┌─────────────────────────────────────────────────────────────┐
|
|
178
|
-
│ SESSION CAPTURE FLOW │
|
|
179
|
-
│ │
|
|
180
|
-
│ 1. Coordinator tool call detected │
|
|
181
|
-
│ ├─ swarm_decompose, hive_create_epic, etc. │
|
|
182
|
-
│ └─ Tool name + args inspected in real-time │
|
|
183
|
-
│ │
|
|
184
|
-
│ 2. Violation detection (planning-guardrails.ts) │
|
|
185
|
-
│ ├─ detectCoordinatorViolation() checks patterns │
|
|
186
|
-
│ ├─ Edit/Write tools → coordinator_edited_file │
|
|
187
|
-
│ ├─ bash with test patterns → coordinator_ran_tests │
|
|
188
|
-
│ └─ swarmmail_reserve → coordinator_reserved_files │
|
|
189
|
-
│ │
|
|
190
|
-
│ 3. Event emission (eval-capture.ts) │
|
|
191
|
-
│ ├─ captureCoordinatorEvent() validates via Zod │
|
|
192
|
-
│ ├─ Appends JSONL line to session file │
|
|
193
|
-
│ └─ ~/.config/swarm-tools/sessions/{session_id}.jsonl │
|
|
194
|
-
│ │
|
|
195
|
-
│ 4. Eval consumption (coordinator-session.eval.ts) │
|
|
196
|
-
│ ├─ loadCapturedSessions() reads all *.jsonl files │
|
|
197
|
-
│ ├─ Parses events, reconstructs sessions │
|
|
198
|
-
│ └─ Scorers analyze event sequences │
|
|
199
|
-
│ │
|
|
200
|
-
└─────────────────────────────────────────────────────────────┘
|
|
201
|
-
```
|
|
202
|
-
|
|
203
|
-
**Event types:**
|
|
204
|
-
|
|
205
|
-
| Event Type | Subtypes | When Captured |
|
|
206
|
-
| -------------- | --------------------------------------------------------------------- | ------------------------------------ |
|
|
207
|
-
| `DECISION` | strategy_selected, worker_spawned, review_completed, decomposition_complete | Coordinator makes decision |
|
|
208
|
-
| `VIOLATION` | coordinator_edited_file, coordinator_ran_tests, coordinator_reserved_files, no_worker_spawned | Protocol violation detected |
|
|
209
|
-
| `OUTCOME` | subtask_success, subtask_retry, subtask_failed, epic_complete | Worker completes or epic finishes |
|
|
210
|
-
| `COMPACTION` | detection_complete, prompt_generated, context_injected, resumption_started, tool_call_tracked | Compaction lifecycle events |
|
|
211
|
-
|
|
212
|
-
**Violation detection patterns** (from `planning-guardrails.ts`):
|
|
213
|
-
|
|
214
|
-
```typescript
|
|
215
|
-
// File modification detection
|
|
216
|
-
VIOLATION_PATTERNS.FILE_MODIFICATION_TOOLS = ["edit", "write"];
|
|
217
|
-
|
|
218
|
-
// Test execution detection (regex patterns in bash commands)
|
|
219
|
-
VIOLATION_PATTERNS.TEST_EXECUTION_PATTERNS = [
|
|
220
|
-
/\bbun\s+test\b/i,
|
|
221
|
-
/\bnpm\s+(run\s+)?test/i,
|
|
222
|
-
/\bjest\b/i,
|
|
223
|
-
/\bvitest\b/i,
|
|
224
|
-
// ... and 6 more patterns
|
|
225
|
-
];
|
|
226
|
-
|
|
227
|
-
// File reservation detection
|
|
228
|
-
VIOLATION_PATTERNS.RESERVATION_TOOLS = ["swarmmail_reserve", "agentmail_reserve"];
|
|
229
|
-
```
|
|
230
|
-
|
|
231
|
-
**Example session file** (`~/.config/swarm-tools/sessions/session-abc123.jsonl`):
|
|
232
|
-
|
|
233
|
-
```jsonl
|
|
234
|
-
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:00:00Z","event_type":"DECISION","decision_type":"strategy_selected","payload":{"strategy":"feature-based"}}
|
|
235
|
-
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:01:00Z","event_type":"DECISION","decision_type":"decomposition_complete","payload":{"subtask_count":3}}
|
|
236
|
-
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:02:00Z","event_type":"DECISION","decision_type":"worker_spawned","payload":{"worker_id":"SwiftFire","bead_id":"mjkw81rkq4c.1"}}
|
|
237
|
-
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:05:00Z","event_type":"VIOLATION","violation_type":"coordinator_edited_file","payload":{"tool":"edit","file":"src/auth.ts"}}
|
|
238
|
-
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:10:00Z","event_type":"OUTCOME","outcome_type":"subtask_success","payload":{"bead_id":"mjkw81rkq4c.1","duration_ms":480000}}
|
|
239
|
-
```
|
|
240
|
-
|
|
241
|
-
**Viewing sessions:**
|
|
242
|
-
|
|
243
|
-
```bash
|
|
244
|
-
# List all captured sessions (coming soon)
|
|
245
|
-
swarm log sessions
|
|
246
|
-
|
|
247
|
-
# View specific session events
|
|
248
|
-
cat ~/.config/swarm-tools/sessions/session-abc123.jsonl | jq .
|
|
249
|
-
|
|
250
|
-
# Filter to violations only
|
|
251
|
-
cat ~/.config/swarm-tools/sessions/*.jsonl | jq 'select(.event_type == "VIOLATION")'
|
|
252
|
-
|
|
253
|
-
# Count violations by type
|
|
254
|
-
cat ~/.config/swarm-tools/sessions/*.jsonl | jq -r 'select(.event_type == "VIOLATION") | .violation_type' | sort | uniq -c
|
|
255
|
-
```
|
|
256
|
-
|
|
257
|
-
**Why JSONL format?**
|
|
258
|
-
|
|
259
|
-
- **Append-only**: No file locking, safe for concurrent writes
|
|
260
|
-
- **Streamable**: Process events one-by-one without loading full file
|
|
261
|
-
- **Line-oriented**: Easy to `grep`, `jq`, `tail -f` for live monitoring
|
|
262
|
-
- **Fault-tolerant**: Corrupted line doesn't break entire file
|
|
263
|
-
|
|
264
|
-
**Integration points:**
|
|
265
|
-
|
|
266
|
-
| Where | What Gets Captured | File |
|
|
267
|
-
| -------------------------- | ----------------------------------------- | ----------------------- |
|
|
268
|
-
| `swarm_decompose` | DECISION: strategy_selected, decomposition_complete | sessions/*.jsonl |
|
|
269
|
-
| `swarm_spawn_subtask` | DECISION: worker_spawned | sessions/*.jsonl |
|
|
270
|
-
| `swarm_review` | DECISION: review_completed | sessions/*.jsonl |
|
|
271
|
-
| `swarm_complete` | OUTCOME: subtask_success/failed | sessions/*.jsonl |
|
|
272
|
-
| Tool call inspection | VIOLATION: (real-time pattern matching) | sessions/*.jsonl |
|
|
273
|
-
| Compaction hook | COMPACTION: (all lifecycle stages) | sessions/*.jsonl |
|
|
274
|
-
|
|
275
|
-
**Source files:**
|
|
276
|
-
|
|
277
|
-
- **Schema**: `src/eval-capture.ts` - CoordinatorEventSchema (Zod discriminated union)
|
|
278
|
-
- **Violation detection**: `src/planning-guardrails.ts` - detectCoordinatorViolation()
|
|
279
|
-
- **Capture**: `src/eval-capture.ts` - captureCoordinatorEvent()
|
|
280
|
-
- **Scorers**: `evals/scorers/coordinator-discipline.ts` - violationCount, spawnEfficiency, etc.
|
|
281
|
-
- **Eval**: `evals/coordinator-session.eval.ts` - Real sessions + fixtures
|
|
282
|
-
|
|
283
|
-
### Compaction Prompt (`compaction-prompt.eval.ts`)
|
|
284
|
-
|
|
285
|
-
**What it measures:** Quality of continuation prompts after context compaction
|
|
286
|
-
|
|
287
|
-
**Data sources:**
|
|
288
|
-
- Captured compaction events from session files
|
|
289
|
-
- Test fixtures with known-good/bad prompts
|
|
290
|
-
|
|
291
|
-
**Scorers:**
|
|
292
|
-
|
|
293
|
-
| Scorer | Weight | What It Checks | Perfect Score |
|
|
294
|
-
| -------------------------------- | ------ | --------------------------------------------------------- | -------------------------------- |
|
|
295
|
-
| **Epic ID Specificity** | 0.20 | Real IDs (mjkw...) not placeholders (`<epic-id>`, `bd-xxx`) | Real epic ID present |
|
|
296
|
-
| **Actionability** | 0.20 | Tool calls with real values (swarm_status with epic ID) | Actionable tool with real values |
|
|
297
|
-
| **Coordinator Identity** | 0.25 | ASCII header + strong mandates (NEVER/ALWAYS) | ASCII box + strong language |
|
|
298
|
-
| **Forbidden Tools Listed** | 0.15 | Lists Edit, Write, swarmmail_reserve, git commit by name | 4/4 forbidden tools listed |
|
|
299
|
-
| **Post-Compaction Discipline** | 0.20 | First suggested tool is swarm_status or inbox (not Edit) | First tool correct |
|
|
300
|
-
|
|
301
|
-
**Why these metrics?**
|
|
302
|
-
|
|
303
|
-
Post-compaction coordinators often "wake up" confused:
|
|
304
|
-
- Forget they're coordinators → start editing files
|
|
305
|
-
- Use placeholders → can't check actual status
|
|
306
|
-
- Weak language → ignore mandates
|
|
307
|
-
- Wrong first tool → dive into code instead of checking workers
|
|
308
|
-
|
|
309
|
-
**Example output:**
|
|
310
|
-
```
|
|
311
|
-
compaction-prompt
|
|
312
|
-
├─ epicIdSpecificity: 1.0 (real ID: mjkw81rkq4c)
|
|
313
|
-
├─ actionability: 1.0 (swarm_status with real epic ID)
|
|
314
|
-
├─ coordinatorIdentity: 1.0 (ASCII header + NEVER/ALWAYS)
|
|
315
|
-
├─ forbiddenToolsPresent: 1.0 (4/4 tools listed)
|
|
316
|
-
└─ postCompactionDiscipline: 1.0 (first tool: swarm_status)
|
|
317
|
-
→ Overall: 1.0 ✅ PASS (production phase)
|
|
318
|
-
```
|
|
319
|
-
|
|
320
|
-
---
|
|
321
|
-
|
|
322
|
-
## Data Capture
|
|
323
|
-
|
|
324
|
-
### What Gets Captured
|
|
325
|
-
|
|
326
|
-
**Decomposition Events** (`.opencode/eval-data.jsonl`):
|
|
327
|
-
```jsonl
|
|
328
|
-
{
|
|
329
|
-
"id": "mjkw81rkq4c",
|
|
330
|
-
"timestamp": "2025-01-01T12:00:00Z",
|
|
331
|
-
"task": "Add OAuth authentication",
|
|
332
|
-
"strategy": "feature-based",
|
|
333
|
-
"epic_title": "OAuth Implementation",
|
|
334
|
-
"subtasks": [...],
|
|
335
|
-
"outcomes": [...]
|
|
336
|
-
}
|
|
337
|
-
```
|
|
338
|
-
|
|
339
|
-
**Coordinator Sessions** (`~/.config/swarm-tools/sessions/<session-id>.jsonl`):
|
|
340
|
-
```jsonl
|
|
341
|
-
{"event_type": "DECISION", "decision_type": "strategy_selected", ...}
|
|
342
|
-
{"event_type": "DECISION", "decision_type": "worker_spawned", ...}
|
|
343
|
-
{"event_type": "VIOLATION", "violation_type": "coordinator_edited_file", ...}
|
|
344
|
-
{"event_type": "COMPACTION", "compaction_type": "prompt_generated", ...}
|
|
345
|
-
```
|
|
346
|
-
|
|
347
|
-
**Eval History** (`.opencode/eval-history.jsonl`):
|
|
348
|
-
```jsonl
|
|
349
|
-
{"timestamp": "...", "eval_name": "swarm-decomposition", "score": 0.92, "run_count": 15}
|
|
350
|
-
```
|
|
351
|
-
|
|
352
|
-
### Capture Points (Automatic)
|
|
353
|
-
|
|
354
|
-
| Integration Point | What Gets Captured | File |
|
|
355
|
-
| -------------------------- | ------------------------------------- | ----------------------- |
|
|
356
|
-
| `swarm_decompose` | Task, strategy, subtasks | eval-data.jsonl |
|
|
357
|
-
| `swarm_complete` | Outcome signals (duration, errors) | eval-data.jsonl |
|
|
358
|
-
| `swarm_record_outcome` | Learning signals | swarm-mail database |
|
|
359
|
-
| Coordinator spawn | Worker spawn event | sessions/*.jsonl |
|
|
360
|
-
| Coordinator review | Review decision | sessions/*.jsonl |
|
|
361
|
-
| Compaction hook | Prompt content, detection results | sessions/*.jsonl |
|
|
362
|
-
| Evalite runner | Score, baseline, phase | eval-history.jsonl |
|
|
363
|
-
|
|
364
|
-
---
|
|
365
|
-
|
|
366
|
-
## CLI Commands
|
|
367
|
-
|
|
368
|
-
### `swarm eval status [eval-name]`
|
|
369
|
-
|
|
370
|
-
Shows current phase, gate thresholds, and recent scores with sparklines.
|
|
371
|
-
|
|
372
|
-
```bash
|
|
373
|
-
$ swarm eval status swarm-decomposition
|
|
374
|
-
|
|
375
|
-
┌─────────────────────────────────────────────────────────────┐
|
|
376
|
-
│ Eval: swarm-decomposition │
|
|
377
|
-
│ Phase: 🚀 Production (53 runs, variance: 0.08) │
|
|
378
|
-
│ │
|
|
379
|
-
│ Gate Thresholds: │
|
|
380
|
-
│ ├─ Stabilization: >10% regression (warn) │
|
|
381
|
-
│ └─ Production: >5% regression (fail) │
|
|
382
|
-
│ │
|
|
383
|
-
│ Recent Scores (last 10 runs): │
|
|
384
|
-
│ 0.92 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 2025-01-01 12:00 │
|
|
385
|
-
│ 0.89 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 2025-01-01 11:30 │
|
|
386
|
-
│ 0.94 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 2025-01-01 11:00 │
|
|
387
|
-
│ Baseline: 0.91 | Variance: 0.08 | Trend: ↗ │
|
|
388
|
-
└─────────────────────────────────────────────────────────────┘
|
|
389
|
-
```
|
|
390
|
-
|
|
391
|
-
**Phase indicators:**
|
|
392
|
-
- 🌱 Bootstrap - Collecting data
|
|
393
|
-
- ⚙️ Stabilization - Learning baseline
|
|
394
|
-
- 🚀 Production - Enforcing quality
|
|
395
|
-
|
|
396
|
-
### `swarm eval history`
|
|
397
|
-
|
|
398
|
-
Shows eval run history grouped by eval name with trends and color-coded scores.
|
|
399
|
-
|
|
400
|
-
```bash
|
|
401
|
-
$ swarm eval history
|
|
402
|
-
|
|
403
|
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
404
|
-
swarm-decomposition (53 runs)
|
|
405
|
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
406
|
-
Run #53 0.92 2025-01-01 12:00:00 ✅ PASS
|
|
407
|
-
Run #52 0.89 2025-01-01 11:30:00 ✅ PASS
|
|
408
|
-
Run #51 0.94 2025-01-01 11:00:00 ✅ PASS
|
|
409
|
-
...
|
|
410
|
-
Sparkline: ▁▂▃▄▅▆▇█▇▆▅▄▃▂▁▂▃▄▅▆▇█
|
|
411
|
-
Trend: ↗ (improving)
|
|
412
|
-
|
|
413
|
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
414
|
-
coordinator-behavior (8 runs)
|
|
415
|
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
416
|
-
Run #8 0.85 2025-01-01 10:00:00 ⚠️ WARN
|
|
417
|
-
Run #7 0.91 2025-01-01 09:30:00 ✅ PASS
|
|
418
|
-
...
|
|
419
|
-
```
|
|
420
|
-
|
|
421
|
-
**Color coding:**
|
|
422
|
-
- 🟢 Green: ≥0.8 (pass/high score)
|
|
423
|
-
- 🟡 Yellow: 0.6-0.8 (warning/medium score)
|
|
424
|
-
- 🔴 Red: <0.6 (fail/low score)
|
|
425
|
-
|
|
426
|
-
### `swarm eval run` (Stub)
|
|
427
|
-
|
|
428
|
-
Placeholder for future direct eval execution from CLI.
|
|
429
|
-
|
|
430
|
-
---
|
|
431
|
-
|
|
432
|
-
## CI Integration
|
|
433
|
-
|
|
434
|
-
### GitHub Actions Workflow
|
|
435
|
-
|
|
436
|
-
Progressive gates integrate with CI for PR checks:
|
|
437
|
-
|
|
438
|
-
```yaml
|
|
439
|
-
# .github/workflows/eval-check.yml
|
|
440
|
-
name: Eval Quality Gate
|
|
441
|
-
|
|
442
|
-
on:
|
|
443
|
-
pull_request:
|
|
444
|
-
branches: [main]
|
|
445
|
-
|
|
446
|
-
jobs:
|
|
447
|
-
eval-gate:
|
|
448
|
-
runs-on: ubuntu-latest
|
|
449
|
-
steps:
|
|
450
|
-
- uses: actions/checkout@v4
|
|
451
|
-
|
|
452
|
-
- name: Setup Bun
|
|
453
|
-
uses: oven-sh/setup-bun@v1
|
|
454
|
-
|
|
455
|
-
- name: Install deps
|
|
456
|
-
run: bun install
|
|
457
|
-
|
|
458
|
-
- name: Run evals
|
|
459
|
-
run: bun run eval:run
|
|
460
|
-
|
|
461
|
-
- name: Check gates
|
|
462
|
-
run: |
|
|
463
|
-
# Check if any eval failed production gate
|
|
464
|
-
swarm eval status | grep "FAIL" && exit 1 || exit 0
|
|
465
|
-
|
|
466
|
-
- name: Post PR comment
|
|
467
|
-
if: failure()
|
|
468
|
-
uses: actions/github-script@v7
|
|
469
|
-
with:
|
|
470
|
-
script: |
|
|
471
|
-
// Post detailed gate failure to PR
|
|
472
|
-
const status = await exec.getExecOutput('swarm eval status');
|
|
473
|
-
github.rest.issues.createComment({
|
|
474
|
-
owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number,
|
|
475
|
-
body: `## ❌ Eval Gate Failed\n\n\`\`\`\n${status.stdout}\n\`\`\``
|
|
476
|
-
});
|
|
477
|
-
```
|
|
478
|
-
|
|
479
|
-
**Gate behavior in CI:**
|
|
480
|
-
- Bootstrap: Always pass (collecting data)
|
|
481
|
-
- Stabilization: Pass but warn on >10% regression
|
|
482
|
-
- Production: **FAIL PR** on >5% regression
|
|
483
|
-
|
|
484
|
-
---
|
|
485
|
-
|
|
486
|
-
## Writing New Evals
|
|
487
|
-
|
|
488
|
-
### 1. Create Eval File
|
|
489
|
-
|
|
490
|
-
```typescript
|
|
491
|
-
// evals/my-feature.eval.ts
|
|
492
|
-
import { evalite } from "evalite";
|
|
493
|
-
import { createScorer } from "evalite";
|
|
494
|
-
|
|
495
|
-
// Define your scorer
|
|
496
|
-
const myScorer = createScorer({
|
|
497
|
-
name: "My Quality Metric",
|
|
498
|
-
description: "Checks if feature meets quality bar",
|
|
499
|
-
scorer: async ({ output, expected, input }) => {
|
|
500
|
-
// Implement scoring logic
|
|
501
|
-
const score = /* calculate 0-1 score */;
|
|
502
|
-
return {
|
|
503
|
-
score,
|
|
504
|
-
message: "Details about score"
|
|
505
|
-
};
|
|
506
|
-
},
|
|
507
|
-
});
|
|
508
|
-
|
|
509
|
-
// Define the eval
|
|
510
|
-
evalite("My Feature Quality", {
|
|
511
|
-
data: async () => {
|
|
512
|
-
// Load test cases
|
|
513
|
-
return [
|
|
514
|
-
{
|
|
515
|
-
input: "test input",
|
|
516
|
-
expected: { /* expected structure */ },
|
|
517
|
-
},
|
|
518
|
-
];
|
|
519
|
-
},
|
|
520
|
-
task: async (input) => {
|
|
521
|
-
// Call your system under test
|
|
522
|
-
const output = await myFeature(input);
|
|
523
|
-
return output;
|
|
524
|
-
},
|
|
525
|
-
scorers: [myScorer],
|
|
526
|
-
});
|
|
527
|
-
```
|
|
528
|
-
|
|
529
|
-
### 2. Add to Package Scripts
|
|
530
|
-
|
|
531
|
-
```json
|
|
532
|
-
{
|
|
533
|
-
"scripts": {
|
|
534
|
-
"eval:my-feature": "bunx evalite run evals/my-feature.eval.ts"
|
|
535
|
-
}
|
|
536
|
-
}
|
|
537
|
-
```
|
|
538
|
-
|
|
539
|
-
### 3. Add Capture Points
|
|
540
|
-
|
|
541
|
-
Wire your feature to capture real execution data:
|
|
542
|
-
|
|
543
|
-
```typescript
|
|
544
|
-
import { captureMyFeatureEvent } from "./eval-capture.js";
|
|
545
|
-
|
|
546
|
-
async function myFeature(input) {
|
|
547
|
-
const startTime = Date.now();
|
|
548
|
-
|
|
549
|
-
try {
|
|
550
|
-
const result = await doWork(input);
|
|
551
|
-
|
|
552
|
-
// Capture success
|
|
553
|
-
captureMyFeatureEvent({
|
|
554
|
-
input,
|
|
555
|
-
output: result,
|
|
556
|
-
duration_ms: Date.now() - startTime,
|
|
557
|
-
success: true,
|
|
558
|
-
});
|
|
559
|
-
|
|
560
|
-
return result;
|
|
561
|
-
} catch (error) {
|
|
562
|
-
// Capture failure
|
|
563
|
-
captureMyFeatureEvent({
|
|
564
|
-
input,
|
|
565
|
-
error: error.message,
|
|
566
|
-
duration_ms: Date.now() - startTime,
|
|
567
|
-
success: false,
|
|
568
|
-
});
|
|
569
|
-
throw error;
|
|
570
|
-
}
|
|
571
|
-
}
|
|
572
|
-
```
|
|
573
|
-
|
|
574
|
-
### 4. Test Locally
|
|
575
|
-
|
|
576
|
-
```bash
|
|
577
|
-
# Run your eval
|
|
578
|
-
bun run eval:my-feature
|
|
579
|
-
|
|
580
|
-
# Check status
|
|
581
|
-
swarm eval status my-feature
|
|
582
|
-
|
|
583
|
-
# View history
|
|
584
|
-
swarm eval history
|
|
585
|
-
```
|
|
586
|
-
|
|
587
|
-
---
|
|
588
|
-
|
|
589
|
-
## Scorer Reference
|
|
590
|
-
|
|
591
|
-
### Scorer Pattern (Evalite v1.0)
|
|
592
|
-
|
|
593
|
-
**IMPORTANT**: Evalite scorers are **async functions**, not objects with a `.scorer` property.
|
|
594
|
-
|
|
595
|
-
```typescript
|
|
596
|
-
import { createScorer } from "evalite";
|
|
597
|
-
|
|
598
|
-
// CORRECT ✅
|
|
599
|
-
const myScorer = createScorer({
|
|
600
|
-
name: "My Scorer",
|
|
601
|
-
description: "What it measures",
|
|
602
|
-
scorer: async ({ output, expected, input }) => {
|
|
603
|
-
return { score: 0.8, message: "Details" };
|
|
604
|
-
},
|
|
605
|
-
});
|
|
606
|
-
|
|
607
|
-
// Use in eval
|
|
608
|
-
evalite("test", {
|
|
609
|
-
scorers: [myScorer], // Pass the scorer directly
|
|
610
|
-
});
|
|
611
|
-
|
|
612
|
-
// In composite scorers
|
|
613
|
-
const result = await childScorer({ output, expected, input });
|
|
614
|
-
const score = result.score ?? 0;
|
|
615
|
-
```
|
|
616
|
-
|
|
617
|
-
**WRONG ❌**:
|
|
618
|
-
```typescript
|
|
619
|
-
// Don't do this - .scorer property doesn't exist
|
|
620
|
-
const result = childScorer.scorer({ output, expected }); // ❌
|
|
621
|
-
```
|
|
622
|
-
|
|
623
|
-
### Custom Scorer Template
|
|
624
|
-
|
|
625
|
-
```typescript
|
|
626
|
-
export const myCustomScorer = createScorer({
|
|
627
|
-
name: "My Custom Metric",
|
|
628
|
-
description: "Detailed description of what this measures and why it matters",
|
|
629
|
-
scorer: async ({ output, expected, input }) => {
|
|
630
|
-
// 1. Parse output
|
|
631
|
-
let data;
|
|
632
|
-
try {
|
|
633
|
-
data = typeof output === "string" ? JSON.parse(output) : output;
|
|
634
|
-
} catch {
|
|
635
|
-
return { score: 0, message: "Invalid output format" };
|
|
636
|
-
}
|
|
637
|
-
|
|
638
|
-
// 2. Calculate score (0-1 range)
|
|
639
|
-
const score = calculateYourMetric(data, expected);
|
|
640
|
-
|
|
641
|
-
// 3. Return with detailed message
|
|
642
|
-
return {
|
|
643
|
-
score,
|
|
644
|
-
message: `Score: ${score.toFixed(2)} - ${getExplanation(score)}`,
|
|
645
|
-
};
|
|
646
|
-
},
|
|
647
|
-
});
|
|
648
|
-
```
|
|
649
|
-
|
|
650
|
-
---
|
|
651
|
-
|
|
652
|
-
## Troubleshooting
|
|
653
|
-
|
|
654
|
-
### "No eval history found"
|
|
655
|
-
|
|
656
|
-
**Cause:** No evals have been run yet, or `.opencode/eval-history.jsonl` is missing.
|
|
657
|
-
|
|
658
|
-
**Fix:**
|
|
659
|
-
```bash
|
|
660
|
-
# Run an eval to create history
|
|
661
|
-
bun run eval:decomposition
|
|
662
|
-
swarm eval status # Should show bootstrap phase
|
|
663
|
-
```
|
|
664
|
-
|
|
665
|
-
### "Phase stuck in stabilization despite >50 runs"
|
|
666
|
-
|
|
667
|
-
**Cause:** High variance (≥0.1). Scores are not consistent enough for the production phase.
|
|
668
|
-
|
|
669
|
-
**Fix:** Investigate why scores fluctuate:
|
|
670
|
-
```bash
|
|
671
|
-
# Check variance
|
|
672
|
-
swarm eval status my-eval # Shows variance value
|
|
673
|
-
|
|
674
|
-
# View score history to spot outliers
|
|
675
|
-
swarm eval history
|
|
676
|
-
|
|
677
|
-
# Common causes:
|
|
678
|
-
# - Eval depends on external state (network, filesystem)
|
|
679
|
-
# - Non-deterministic scoring logic
|
|
680
|
-
# - Input data changing between runs
|
|
681
|
-
```
|
|
682
|
-
|
|
683
|
-
### "Gate failing on minor changes"
|
|
684
|
-
|
|
685
|
-
**Cause:** The production phase threshold (5%) is too strict for your use case.
|
|
686
|
-
|
|
687
|
-
**Fix:** Adjust threshold in eval code:
|
|
688
|
-
```typescript
|
|
689
|
-
import { checkGate } from "../src/eval-gates.js";
|
|
690
|
-
|
|
691
|
-
const result = checkGate(projectPath, evalName, score, {
|
|
692
|
-
productionThreshold: 0.10, // 10% instead of 5%
|
|
693
|
-
});
|
|
694
|
-
```
|
|
695
|
-
|
|
696
|
-
### "Evalite not finding my eval file"
|
|
697
|
-
|
|
698
|
-
**Cause:** The file does not match the `*.eval.ts` pattern or is not in the `evals/` directory.
|
|
699
|
-
|
|
700
|
-
**Fix:**
|
|
701
|
-
```bash
|
|
702
|
-
# Ensure file is named correctly
|
|
703
|
-
mv evals/my-test.ts evals/my-test.eval.ts
|
|
704
|
-
|
|
705
|
-
# Verify discovery
|
|
706
|
-
bunx evalite run evals/ # Should list your eval
|
|
707
|
-
```
|
|
708
|
-
|
|
709
|
-
### "Scorers returning undefined"
|
|
710
|
-
|
|
711
|
-
**Cause:** Forgetting to `await` async scorers, or accessing a `.scorer` property (which doesn't exist).
|
|
712
|
-
|
|
713
|
-
**Fix:**
|
|
714
|
-
```typescript
|
|
715
|
-
// CORRECT ✅
|
|
716
|
-
const result = await myScorer({ output, expected, input });
|
|
717
|
-
const score = result.score ?? 0;
|
|
718
|
-
|
|
719
|
-
// WRONG ❌
|
|
720
|
-
const result = myScorer.scorer({ output, expected }); // .scorer doesn't exist
|
|
721
|
-
```
|
|
722
|
-
|
|
723
|
-
---
|
|
724
|
-
|
|
725
|
-
## File Structure
|
|
726
|
-
|
|
727
|
-
```
|
|
728
|
-
evals/
|
|
729
|
-
├── README.md # This file
|
|
730
|
-
├── evalite.config.ts # Evalite configuration
|
|
731
|
-
│
|
|
732
|
-
├── fixtures/
|
|
733
|
-
│ ├── decomposition-cases.ts # Test cases for decomposition
|
|
734
|
-
│ ├── coordinator-sessions.ts # Known good/bad coordinator sessions
|
|
735
|
-
│ └── compaction-prompts.ts # Sample compaction prompts
|
|
736
|
-
│
|
|
737
|
-
├── lib/
|
|
738
|
-
│ ├── data-loader.ts # Load eval data from JSONL files
|
|
739
|
-
│ └── test-helpers.ts # Shared test utilities
|
|
740
|
-
│
|
|
741
|
-
├── scorers/
|
|
742
|
-
│ ├── decomposition-scorers.ts # Subtask quality scorers
|
|
743
|
-
│ ├── coordinator-scorers.ts # Protocol adherence scorers
|
|
744
|
-
│ └── compaction-prompt-scorers.ts # Prompt quality scorers
|
|
745
|
-
│
|
|
746
|
-
├── swarm-decomposition.eval.ts # Decomposition quality eval
|
|
747
|
-
├── coordinator-session.eval.ts # Coordinator discipline eval
|
|
748
|
-
├── compaction-prompt.eval.ts # Compaction prompt quality eval
|
|
749
|
-
└── example.eval.ts # Sanity check / template
|
|
750
|
-
```
|
|
751
|
-
|
|
752
|
-
**Data locations:**
|
|
753
|
-
- `.opencode/eval-data.jsonl` - Decomposition captures
|
|
754
|
-
- `.opencode/eval-history.jsonl` - Score history
|
|
755
|
-
- `~/.config/swarm-tools/sessions/*.jsonl` - Coordinator sessions
|
|
756
|
-
|
|
757
|
-
---
|
|
758
|
-
|
|
759
|
-
## Further Reading
|
|
760
|
-
|
|
761
|
-
- **[Evalite Docs](https://evalite.dev)** - Evaluation framework
|
|
762
|
-
- **[Progressive Gates Implementation](../src/eval-gates.ts)** - Phase-based quality control
|
|
763
|
-
- **[Learning Feedback Loop](../src/eval-learning.ts)** - Auto-store failures to memory
|
|
764
|
-
- **[Data Capture](../src/eval-capture.ts)** - Real execution tracking
|
|
765
|
-
- **[Compaction Scorers](../src/compaction-prompt-scoring.ts)** - Pure scoring functions
|
|
766
|
-
|
|
767
|
-
> _"Measure outcomes, not outputs. The system that learns from failure beats the system that avoids it."_
|
|
768
|
-
> — Inspired by Site Reliability Engineering principles
|