opencode-swarm-plugin 0.39.1 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
- package/.hive/analysis/session-data-quality-audit.md +320 -0
- package/.hive/eval-results.json +481 -24
- package/.hive/issues.jsonl +76 -11
- package/.hive/memories.jsonl +159 -1
- package/.opencode/eval-history.jsonl +315 -0
- package/.turbo/turbo-build.log +5 -5
- package/CHANGELOG.md +207 -0
- package/README.md +2 -0
- package/SCORER-ANALYSIS.md +598 -0
- package/bin/eval-gate.test.ts +158 -0
- package/bin/eval-gate.ts +74 -0
- package/bin/swarm.test.ts +1054 -719
- package/bin/swarm.ts +577 -0
- package/dist/compaction-hook.d.ts +10 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-observability.d.ts +173 -0
- package/dist/compaction-observability.d.ts.map +1 -0
- package/dist/compaction-prompt-scoring.d.ts +1 -0
- package/dist/compaction-prompt-scoring.d.ts.map +1 -1
- package/dist/eval-capture.d.ts +93 -0
- package/dist/eval-capture.d.ts.map +1 -1
- package/dist/eval-runner.d.ts +134 -0
- package/dist/eval-runner.d.ts.map +1 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +65 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +84043 -28070
- package/dist/memory-tools.d.ts +70 -2
- package/dist/memory-tools.d.ts.map +1 -1
- package/dist/memory.d.ts +37 -0
- package/dist/memory.d.ts.map +1 -1
- package/dist/observability-tools.d.ts +64 -0
- package/dist/observability-tools.d.ts.map +1 -1
- package/dist/plugin.js +83570 -27466
- package/dist/schemas/task.d.ts +3 -3
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +32 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
- package/evals/ARCHITECTURE.md +1189 -0
- package/evals/README.md +113 -0
- package/evals/example.eval.ts +3 -4
- package/evals/fixtures/compaction-prompt-cases.ts +6 -0
- package/evals/scorers/coordinator-discipline.evalite-test.ts +163 -0
- package/evals/scorers/coordinator-discipline.ts +82 -2
- package/evals/scorers/index.test.ts +146 -0
- package/evals/scorers/index.ts +104 -0
- package/evals/swarm-decomposition.eval.ts +13 -4
- package/examples/commands/swarm.md +291 -21
- package/package.json +4 -3
- package/src/compaction-hook.ts +258 -110
- package/src/compaction-observability.integration.test.ts +139 -0
- package/src/compaction-observability.test.ts +187 -0
- package/src/compaction-observability.ts +324 -0
- package/src/compaction-prompt-scorers.test.ts +10 -9
- package/src/compaction-prompt-scoring.ts +7 -5
- package/src/eval-capture.test.ts +204 -1
- package/src/eval-capture.ts +194 -2
- package/src/eval-runner.test.ts +223 -0
- package/src/eval-runner.ts +402 -0
- package/src/hive.ts +57 -22
- package/src/index.ts +54 -1
- package/src/memory-tools.test.ts +84 -0
- package/src/memory-tools.ts +68 -3
- package/src/memory.test.ts +2 -2
- package/src/memory.ts +122 -49
- package/src/observability-tools.test.ts +13 -0
- package/src/observability-tools.ts +277 -0
- package/src/swarm-orchestrate.test.ts +162 -0
- package/src/swarm-orchestrate.ts +7 -5
- package/src/swarm-prompts.test.ts +168 -4
- package/src/swarm-prompts.ts +228 -7
- package/.env +0 -2
- package/.turbo/turbo-test.log +0 -481
- package/.turbo/turbo-typecheck.log +0 -1
- package/dist/beads.d.ts +0 -386
- package/dist/beads.d.ts.map +0 -1
- package/dist/schemas/bead-events.d.ts +0 -698
- package/dist/schemas/bead-events.d.ts.map +0 -1
- package/dist/schemas/bead.d.ts +0 -255
- package/dist/schemas/bead.d.ts.map +0 -1
package/evals/README.md
CHANGED
|
@@ -167,6 +167,119 @@ coordinator-behavior
|
|
|
167
167
|
→ overallDiscipline: 0.89 ✅ PASS (bootstrap phase, collecting data)
|
|
168
168
|
```
|
|
169
169
|
|
|
170
|
+
#### Coordinator Session Capture (Deep Dive)
|
|
171
|
+
|
|
172
|
+
**How it works:** Session capture is fully automatic when coordinator tools are used. No manual instrumentation needed.
|
|
173
|
+
|
|
174
|
+
**Capture flow:**
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
178
|
+
│ SESSION CAPTURE FLOW │
|
|
179
|
+
│ │
|
|
180
|
+
│ 1. Coordinator tool call detected │
|
|
181
|
+
│ ├─ swarm_decompose, hive_create_epic, etc. │
|
|
182
|
+
│ └─ Tool name + args inspected in real-time │
|
|
183
|
+
│ │
|
|
184
|
+
│ 2. Violation detection (planning-guardrails.ts) │
|
|
185
|
+
│ ├─ detectCoordinatorViolation() checks patterns │
|
|
186
|
+
│ ├─ Edit/Write tools → coordinator_edited_file │
|
|
187
|
+
│ ├─ bash with test patterns → coordinator_ran_tests │
|
|
188
|
+
│ └─ swarmmail_reserve → coordinator_reserved_files │
|
|
189
|
+
│ │
|
|
190
|
+
│ 3. Event emission (eval-capture.ts) │
|
|
191
|
+
│ ├─ captureCoordinatorEvent() validates via Zod │
|
|
192
|
+
│ ├─ Appends JSONL line to session file │
|
|
193
|
+
│ └─ ~/.config/swarm-tools/sessions/{session_id}.jsonl │
|
|
194
|
+
│ │
|
|
195
|
+
│ 4. Eval consumption (coordinator-session.eval.ts) │
|
|
196
|
+
│ ├─ loadCapturedSessions() reads all *.jsonl files │
|
|
197
|
+
│ ├─ Parses events, reconstructs sessions │
|
|
198
|
+
│ └─ Scorers analyze event sequences │
|
|
199
|
+
│ │
|
|
200
|
+
└─────────────────────────────────────────────────────────────┘
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
**Event types:**
|
|
204
|
+
|
|
205
|
+
| Event Type | Subtypes | When Captured |
|
|
206
|
+
| -------------- | --------------------------------------------------------------------- | ------------------------------------ |
|
|
207
|
+
| `DECISION` | strategy_selected, worker_spawned, review_completed, decomposition_complete | Coordinator makes decision |
|
|
208
|
+
| `VIOLATION` | coordinator_edited_file, coordinator_ran_tests, coordinator_reserved_files, no_worker_spawned | Protocol violation detected |
|
|
209
|
+
| `OUTCOME` | subtask_success, subtask_retry, subtask_failed, epic_complete | Worker completes or epic finishes |
|
|
210
|
+
| `COMPACTION` | detection_complete, prompt_generated, context_injected, resumption_started, tool_call_tracked | Compaction lifecycle events |
|
|
211
|
+
|
|
212
|
+
**Violation detection patterns** (from `planning-guardrails.ts`):
|
|
213
|
+
|
|
214
|
+
```typescript
|
|
215
|
+
// File modification detection
|
|
216
|
+
VIOLATION_PATTERNS.FILE_MODIFICATION_TOOLS = ["edit", "write"];
|
|
217
|
+
|
|
218
|
+
// Test execution detection (regex patterns in bash commands)
|
|
219
|
+
VIOLATION_PATTERNS.TEST_EXECUTION_PATTERNS = [
|
|
220
|
+
/\bbun\s+test\b/i,
|
|
221
|
+
/\bnpm\s+(run\s+)?test/i,
|
|
222
|
+
/\bjest\b/i,
|
|
223
|
+
/\bvitest\b/i,
|
|
224
|
+
// ... and 6 more patterns
|
|
225
|
+
];
|
|
226
|
+
|
|
227
|
+
// File reservation detection
|
|
228
|
+
VIOLATION_PATTERNS.RESERVATION_TOOLS = ["swarmmail_reserve", "agentmail_reserve"];
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
**Example session file** (`~/.config/swarm-tools/sessions/session-abc123.jsonl`):
|
|
232
|
+
|
|
233
|
+
```jsonl
|
|
234
|
+
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:00:00Z","event_type":"DECISION","decision_type":"strategy_selected","payload":{"strategy":"feature-based"}}
|
|
235
|
+
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:01:00Z","event_type":"DECISION","decision_type":"decomposition_complete","payload":{"subtask_count":3}}
|
|
236
|
+
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:02:00Z","event_type":"DECISION","decision_type":"worker_spawned","payload":{"worker_id":"SwiftFire","bead_id":"mjkw81rkq4c.1"}}
|
|
237
|
+
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:05:00Z","event_type":"VIOLATION","violation_type":"coordinator_edited_file","payload":{"tool":"edit","file":"src/auth.ts"}}
|
|
238
|
+
{"session_id":"session-abc123","epic_id":"mjkw81rkq4c","timestamp":"2025-01-01T12:10:00Z","event_type":"OUTCOME","outcome_type":"subtask_success","payload":{"bead_id":"mjkw81rkq4c.1","duration_ms":480000}}
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
**Viewing sessions:**
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
# List all captured sessions (coming soon)
|
|
245
|
+
swarm log sessions
|
|
246
|
+
|
|
247
|
+
# View specific session events
|
|
248
|
+
cat ~/.config/swarm-tools/sessions/session-abc123.jsonl | jq .
|
|
249
|
+
|
|
250
|
+
# Filter to violations only
|
|
251
|
+
cat ~/.config/swarm-tools/sessions/*.jsonl | jq 'select(.event_type == "VIOLATION")'
|
|
252
|
+
|
|
253
|
+
# Count violations by type
|
|
254
|
+
cat ~/.config/swarm-tools/sessions/*.jsonl | jq -r 'select(.event_type == "VIOLATION") | .violation_type' | sort | uniq -c
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
**Why JSONL format?**
|
|
258
|
+
|
|
259
|
+
- **Append-only**: No file locking, safe for concurrent writes
|
|
260
|
+
- **Streamable**: Process events one-by-one without loading full file
|
|
261
|
+
- **Line-oriented**: Easy to `grep`, `jq`, `tail -f` for live monitoring
|
|
262
|
+
- **Fault-tolerant**: A corrupted line doesn't break the entire file
|
|
263
|
+
|
|
264
|
+
**Integration points:**
|
|
265
|
+
|
|
266
|
+
| Where | What Gets Captured | File |
|
|
267
|
+
| -------------------------- | ----------------------------------------- | ----------------------- |
|
|
268
|
+
| `swarm_decompose` | DECISION: strategy_selected, decomposition_complete | sessions/*.jsonl |
|
|
269
|
+
| `swarm_spawn_subtask` | DECISION: worker_spawned | sessions/*.jsonl |
|
|
270
|
+
| `swarm_review` | DECISION: review_completed | sessions/*.jsonl |
|
|
271
|
+
| `swarm_complete` | OUTCOME: subtask_success, subtask_failed | sessions/*.jsonl |
|
|
272
|
+
| Tool call inspection | VIOLATION: (real-time pattern matching) | sessions/*.jsonl |
|
|
273
|
+
| Compaction hook | COMPACTION: (all lifecycle stages) | sessions/*.jsonl |
|
|
274
|
+
|
|
275
|
+
**Source files:**
|
|
276
|
+
|
|
277
|
+
- **Schema**: `src/eval-capture.ts` - CoordinatorEventSchema (Zod discriminated union)
|
|
278
|
+
- **Violation detection**: `src/planning-guardrails.ts` - detectCoordinatorViolation()
|
|
279
|
+
- **Capture**: `src/eval-capture.ts` - captureCoordinatorEvent()
|
|
280
|
+
- **Scorers**: `evals/scorers/coordinator-discipline.ts` - violationCount, spawnEfficiency, etc.
|
|
281
|
+
- **Eval**: `evals/coordinator-session.eval.ts` - Real sessions + fixtures
|
|
282
|
+
|
|
170
283
|
### Compaction Prompt (`compaction-prompt.eval.ts`)
|
|
171
284
|
|
|
172
285
|
**What it measures:** Quality of continuation prompts after context compaction
|
package/evals/example.eval.ts
CHANGED
|
@@ -14,19 +14,18 @@ evalite("Example: Basic scorer test", {
|
|
|
14
14
|
data: async () => {
|
|
15
15
|
return [
|
|
16
16
|
{
|
|
17
|
-
input:
|
|
18
|
-
output: JSON.stringify({
|
|
17
|
+
input: {
|
|
19
18
|
epic: { title: "Test Epic", description: "Test" },
|
|
20
19
|
subtasks: [
|
|
21
20
|
{ title: "Subtask 1", files: ["a.ts"], estimated_complexity: 1 },
|
|
22
21
|
{ title: "Subtask 2", files: ["b.ts"], estimated_complexity: 1 },
|
|
23
22
|
],
|
|
24
|
-
}
|
|
23
|
+
},
|
|
25
24
|
},
|
|
26
25
|
];
|
|
27
26
|
},
|
|
28
27
|
task: async (input) => {
|
|
29
|
-
return input;
|
|
28
|
+
return JSON.stringify(input);
|
|
30
29
|
},
|
|
31
30
|
scorers: [subtaskIndependence],
|
|
32
31
|
});
|
|
@@ -78,6 +78,8 @@ Coordinators do NOT edit code directly. These tools are FORBIDDEN:
|
|
|
78
78
|
- edit
|
|
79
79
|
- write
|
|
80
80
|
- bash (for file modifications)
|
|
81
|
+
- swarmmail_reserve (only workers reserve)
|
|
82
|
+
- git commit (workers commit)
|
|
81
83
|
|
|
82
84
|
Use swarm_spawn_subtask to delegate work to workers.
|
|
83
85
|
|
|
@@ -249,6 +251,8 @@ You are the COORDINATOR of epic mjkweh7q9n4.
|
|
|
249
251
|
- edit
|
|
250
252
|
- write
|
|
251
253
|
- bash (for file mods)
|
|
254
|
+
- swarmmail_reserve (only workers)
|
|
255
|
+
- git commit (workers only)
|
|
252
256
|
|
|
253
257
|
NEVER edit files yourself.
|
|
254
258
|
ALWAYS delegate to workers.
|
|
@@ -289,6 +293,8 @@ You are coordinating epics:
|
|
|
289
293
|
- edit
|
|
290
294
|
- write
|
|
291
295
|
- bash
|
|
296
|
+
- swarmmail_reserve
|
|
297
|
+
- git commit
|
|
292
298
|
|
|
293
299
|
ALWAYS check status first.
|
|
294
300
|
NEVER edit files directly.
|
|
@@ -5,6 +5,7 @@ import { describe, expect, it } from "bun:test";
|
|
|
5
5
|
import type { CoordinatorSession } from "../../src/eval-capture.js";
|
|
6
6
|
import {
|
|
7
7
|
overallDiscipline,
|
|
8
|
+
reviewEfficiency,
|
|
8
9
|
reviewThoroughness,
|
|
9
10
|
spawnEfficiency,
|
|
10
11
|
timeToFirstSpawn,
|
|
@@ -535,3 +536,165 @@ describe("overallDiscipline", () => {
|
|
|
535
536
|
expect(result.message).toContain("Speed:");
|
|
536
537
|
});
|
|
537
538
|
});
|
|
539
|
+
|
|
540
|
+
describe("reviewEfficiency", () => {
|
|
541
|
+
it("scores 1.0 for ideal 1:1 ratio (one review per spawn)", async () => {
|
|
542
|
+
const session: CoordinatorSession = {
|
|
543
|
+
session_id: "test-session",
|
|
544
|
+
epic_id: "test-epic",
|
|
545
|
+
start_time: "2025-01-01T00:00:00Z",
|
|
546
|
+
events: [
|
|
547
|
+
{
|
|
548
|
+
session_id: "test-session",
|
|
549
|
+
epic_id: "test-epic",
|
|
550
|
+
timestamp: "2025-01-01T00:00:10Z",
|
|
551
|
+
event_type: "DECISION",
|
|
552
|
+
decision_type: "worker_spawned",
|
|
553
|
+
payload: { bead_id: "bd-1" },
|
|
554
|
+
},
|
|
555
|
+
{
|
|
556
|
+
session_id: "test-session",
|
|
557
|
+
epic_id: "test-epic",
|
|
558
|
+
timestamp: "2025-01-01T00:00:20Z",
|
|
559
|
+
event_type: "DECISION",
|
|
560
|
+
decision_type: "worker_spawned",
|
|
561
|
+
payload: { bead_id: "bd-2" },
|
|
562
|
+
},
|
|
563
|
+
{
|
|
564
|
+
session_id: "test-session",
|
|
565
|
+
epic_id: "test-epic",
|
|
566
|
+
timestamp: "2025-01-01T00:10:00Z",
|
|
567
|
+
event_type: "DECISION",
|
|
568
|
+
decision_type: "review_completed",
|
|
569
|
+
payload: { bead_id: "bd-1" },
|
|
570
|
+
},
|
|
571
|
+
{
|
|
572
|
+
session_id: "test-session",
|
|
573
|
+
epic_id: "test-epic",
|
|
574
|
+
timestamp: "2025-01-01T00:10:10Z",
|
|
575
|
+
event_type: "DECISION",
|
|
576
|
+
decision_type: "review_completed",
|
|
577
|
+
payload: { bead_id: "bd-2" },
|
|
578
|
+
},
|
|
579
|
+
],
|
|
580
|
+
};
|
|
581
|
+
|
|
582
|
+
const result = await reviewEfficiency({
|
|
583
|
+
output: JSON.stringify(session),
|
|
584
|
+
expected: {},
|
|
585
|
+
input: undefined,
|
|
586
|
+
});
|
|
587
|
+
|
|
588
|
+
expect(result.score).toBe(1.0);
|
|
589
|
+
expect(result.message).toContain("2 reviews / 2 spawns");
|
|
590
|
+
});
|
|
591
|
+
|
|
592
|
+
it("penalizes over-reviewing (>2:1 ratio)", async () => {
|
|
593
|
+
// 6 reviews for 2 spawns = 3:1 ratio (over-reviewing)
|
|
594
|
+
const session: CoordinatorSession = {
|
|
595
|
+
session_id: "test-session",
|
|
596
|
+
epic_id: "test-epic",
|
|
597
|
+
start_time: "2025-01-01T00:00:00Z",
|
|
598
|
+
events: [
|
|
599
|
+
{
|
|
600
|
+
session_id: "test-session",
|
|
601
|
+
epic_id: "test-epic",
|
|
602
|
+
timestamp: "2025-01-01T00:00:10Z",
|
|
603
|
+
event_type: "DECISION",
|
|
604
|
+
decision_type: "worker_spawned",
|
|
605
|
+
payload: { bead_id: "bd-1" },
|
|
606
|
+
},
|
|
607
|
+
{
|
|
608
|
+
session_id: "test-session",
|
|
609
|
+
epic_id: "test-epic",
|
|
610
|
+
timestamp: "2025-01-01T00:00:20Z",
|
|
611
|
+
event_type: "DECISION",
|
|
612
|
+
decision_type: "worker_spawned",
|
|
613
|
+
payload: { bead_id: "bd-2" },
|
|
614
|
+
},
|
|
615
|
+
...Array.from({ length: 6 }, (_, i) => ({
|
|
616
|
+
session_id: "test-session",
|
|
617
|
+
epic_id: "test-epic",
|
|
618
|
+
timestamp: `2025-01-01T00:10:${String(i * 10).padStart(2, "0")}Z`,
|
|
619
|
+
event_type: "DECISION" as const,
|
|
620
|
+
decision_type: "review_completed" as const,
|
|
621
|
+
payload: { bead_id: `bd-${(i % 2) + 1}` },
|
|
622
|
+
})),
|
|
623
|
+
],
|
|
624
|
+
};
|
|
625
|
+
|
|
626
|
+
const result = await reviewEfficiency({
|
|
627
|
+
output: JSON.stringify(session),
|
|
628
|
+
expected: {},
|
|
629
|
+
input: undefined,
|
|
630
|
+
});
|
|
631
|
+
|
|
632
|
+
// 3:1 ratio should be penalized (score < 0.5)
|
|
633
|
+
expect(result.score).toBeLessThan(0.5);
|
|
634
|
+
expect(result.message).toContain("6 reviews / 2 spawns");
|
|
635
|
+
});
|
|
636
|
+
|
|
637
|
+
it("handles no spawns gracefully", async () => {
|
|
638
|
+
const session: CoordinatorSession = {
|
|
639
|
+
session_id: "test-session",
|
|
640
|
+
epic_id: "test-epic",
|
|
641
|
+
start_time: "2025-01-01T00:00:00Z",
|
|
642
|
+
events: [
|
|
643
|
+
{
|
|
644
|
+
session_id: "test-session",
|
|
645
|
+
epic_id: "test-epic",
|
|
646
|
+
timestamp: "2025-01-01T00:00:00Z",
|
|
647
|
+
event_type: "DECISION",
|
|
648
|
+
decision_type: "strategy_selected",
|
|
649
|
+
payload: { strategy: "file-based" },
|
|
650
|
+
},
|
|
651
|
+
],
|
|
652
|
+
};
|
|
653
|
+
|
|
654
|
+
const result = await reviewEfficiency({
|
|
655
|
+
output: JSON.stringify(session),
|
|
656
|
+
expected: {},
|
|
657
|
+
input: undefined,
|
|
658
|
+
});
|
|
659
|
+
|
|
660
|
+
expect(result.score).toBe(1.0);
|
|
661
|
+
expect(result.message).toContain("No workers spawned");
|
|
662
|
+
});
|
|
663
|
+
|
|
664
|
+
it("handles no reviews gracefully (0:N ratio)", async () => {
|
|
665
|
+
const session: CoordinatorSession = {
|
|
666
|
+
session_id: "test-session",
|
|
667
|
+
epic_id: "test-epic",
|
|
668
|
+
start_time: "2025-01-01T00:00:00Z",
|
|
669
|
+
events: [
|
|
670
|
+
{
|
|
671
|
+
session_id: "test-session",
|
|
672
|
+
epic_id: "test-epic",
|
|
673
|
+
timestamp: "2025-01-01T00:00:10Z",
|
|
674
|
+
event_type: "DECISION",
|
|
675
|
+
decision_type: "worker_spawned",
|
|
676
|
+
payload: { bead_id: "bd-1" },
|
|
677
|
+
},
|
|
678
|
+
{
|
|
679
|
+
session_id: "test-session",
|
|
680
|
+
epic_id: "test-epic",
|
|
681
|
+
timestamp: "2025-01-01T00:00:20Z",
|
|
682
|
+
event_type: "DECISION",
|
|
683
|
+
decision_type: "worker_spawned",
|
|
684
|
+
payload: { bead_id: "bd-2" },
|
|
685
|
+
},
|
|
686
|
+
],
|
|
687
|
+
};
|
|
688
|
+
|
|
689
|
+
const result = await reviewEfficiency({
|
|
690
|
+
output: JSON.stringify(session),
|
|
691
|
+
expected: {},
|
|
692
|
+
input: undefined,
|
|
693
|
+
});
|
|
694
|
+
|
|
695
|
+
// No reviews is bad (should use reviewThoroughness for this)
|
|
696
|
+
// But this scorer focuses on over-reviewing, so no reviews = 1.0 (not over-reviewing)
|
|
697
|
+
expect(result.score).toBe(1.0);
|
|
698
|
+
expect(result.message).toContain("0 reviews / 2 spawns");
|
|
699
|
+
});
|
|
700
|
+
});
|
|
@@ -70,6 +70,9 @@ export const violationCount = createScorer({
|
|
|
70
70
|
* Coordinators should delegate work, not do it themselves.
|
|
71
71
|
*
|
|
72
72
|
* Score: workers_spawned / subtasks_planned
|
|
73
|
+
*
|
|
74
|
+
* If no decomposition_complete event exists, falls back to counting spawns
|
|
75
|
+
* and returns 1.0 if any workers were spawned (better than nothing).
|
|
73
76
|
*/
|
|
74
77
|
export const spawnEfficiency = createScorer({
|
|
75
78
|
name: "Spawn Efficiency",
|
|
@@ -85,7 +88,20 @@ export const spawnEfficiency = createScorer({
|
|
|
85
88
|
e.decision_type === "decomposition_complete"
|
|
86
89
|
);
|
|
87
90
|
|
|
91
|
+
// Count worker_spawned events
|
|
92
|
+
const spawned = session.events.filter(
|
|
93
|
+
(e) =>
|
|
94
|
+
e.event_type === "DECISION" && e.decision_type === "worker_spawned"
|
|
95
|
+
).length;
|
|
96
|
+
|
|
88
97
|
if (!decomp) {
|
|
98
|
+
// Fallback: if workers were spawned but no decomp event, assume they're doing work
|
|
99
|
+
if (spawned > 0) {
|
|
100
|
+
return {
|
|
101
|
+
score: 1.0,
|
|
102
|
+
message: `${spawned} workers spawned (no decomposition event)`,
|
|
103
|
+
};
|
|
104
|
+
}
|
|
89
105
|
return {
|
|
90
106
|
score: 0,
|
|
91
107
|
message: "No decomposition event found",
|
|
@@ -101,17 +117,81 @@ export const spawnEfficiency = createScorer({
|
|
|
101
117
|
};
|
|
102
118
|
}
|
|
103
119
|
|
|
120
|
+
const score = spawned / subtaskCount;
|
|
121
|
+
|
|
122
|
+
return {
|
|
123
|
+
score,
|
|
124
|
+
message: `${spawned}/${subtaskCount} workers spawned (${(score * 100).toFixed(0)}%)`,
|
|
125
|
+
};
|
|
126
|
+
} catch (error) {
|
|
127
|
+
return {
|
|
128
|
+
score: 0,
|
|
129
|
+
message: `Failed to parse CoordinatorSession: ${error}`,
|
|
130
|
+
};
|
|
131
|
+
}
|
|
132
|
+
},
|
|
133
|
+
});
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Review Efficiency Scorer
|
|
137
|
+
*
|
|
138
|
+
* Measures review-to-spawn ratio to detect over-reviewing.
|
|
139
|
+
* Ideal ratio is 1:1 (one review per spawned worker).
|
|
140
|
+
* Penalizes >2:1 ratio (over-reviewing wastes context).
|
|
141
|
+
*
|
|
142
|
+
* Scoring:
|
|
143
|
+
* - ratio <= 1:1 (including 0:N, i.e. no reviews) = 1.0 (perfect)
|
|
144
|
+
* - 1:1 to 2:1 ratio = linear decay from 1.0 to 0.5 (0.5 at the 2:1 threshold)
|
|
145
|
+
* - >2:1 ratio = linear penalty from 0.5 toward 0.0 (reaches 0.0 at 4:1)
|
|
146
|
+
*
|
|
147
|
+
* Score: normalized to 0-1 (ratios above 1:1 score progressively lower)
|
|
148
|
+
*/
|
|
149
|
+
export const reviewEfficiency = createScorer({
|
|
150
|
+
name: "Review Efficiency",
|
|
151
|
+
description: "Review-to-spawn ratio (penalize over-reviewing >2:1)",
|
|
152
|
+
scorer: ({ output }) => {
|
|
153
|
+
try {
|
|
154
|
+
const session = JSON.parse(String(output)) as CoordinatorSession;
|
|
155
|
+
|
|
104
156
|
// Count worker_spawned events
|
|
105
157
|
const spawned = session.events.filter(
|
|
106
158
|
(e) =>
|
|
107
159
|
e.event_type === "DECISION" && e.decision_type === "worker_spawned"
|
|
108
160
|
).length;
|
|
109
161
|
|
|
110
|
-
|
|
162
|
+
if (spawned === 0) {
|
|
163
|
+
return {
|
|
164
|
+
score: 1.0,
|
|
165
|
+
message: "No workers spawned",
|
|
166
|
+
};
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
// Count review_completed events
|
|
170
|
+
const reviewed = session.events.filter(
|
|
171
|
+
(e) =>
|
|
172
|
+
e.event_type === "DECISION" && e.decision_type === "review_completed"
|
|
173
|
+
).length;
|
|
174
|
+
|
|
175
|
+
const ratio = reviewed / spawned;
|
|
176
|
+
|
|
177
|
+
// Scoring:
|
|
178
|
+
// - ratio <= 1.0: perfect (1.0)
|
|
179
|
+
// - ratio <= 2.0: linear decay from 1.0 to 0.5
|
|
180
|
+
// - ratio > 2.0: linear penalty from 0.5 toward 0.0
|
|
181
|
+
let score: number;
|
|
182
|
+
if (ratio <= 1.0) {
|
|
183
|
+
score = 1.0;
|
|
184
|
+
} else if (ratio <= 2.0) {
|
|
185
|
+
// Linear decay: 1.0 at ratio=1.0, 0.5 at ratio=2.0
|
|
186
|
+
score = 1.0 - (ratio - 1.0) * 0.5;
|
|
187
|
+
} else {
|
|
188
|
+
// Penalty for extreme over-reviewing: 0.5 at ratio=2.0, 0.0 at ratio=4.0
|
|
189
|
+
score = Math.max(0, 0.5 - (ratio - 2.0) * 0.25);
|
|
190
|
+
}
|
|
111
191
|
|
|
112
192
|
return {
|
|
113
193
|
score,
|
|
114
|
-
message: `${
|
|
194
|
+
message: `${reviewed} reviews / ${spawned} spawns (${ratio.toFixed(1)}:1 ratio)`,
|
|
115
195
|
};
|
|
116
196
|
} catch (error) {
|
|
117
197
|
return {
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for decomposition scorers
|
|
3
|
+
*
|
|
4
|
+
* Uses Vitest (evalite's test runner), not Bun's test runner.
|
|
5
|
+
*
|
|
6
|
+
* Note: evalite's Score type only exposes `score`, not `message`.
|
|
7
|
+
* We test scores only - message testing requires accessing internal scorer.
|
|
8
|
+
*/
|
|
9
|
+
import { describe, expect, test } from "vitest";
|
|
10
|
+
import {
|
|
11
|
+
coverageCompleteness,
|
|
12
|
+
decompositionCoherence,
|
|
13
|
+
instructionClarity,
|
|
14
|
+
subtaskIndependence,
|
|
15
|
+
} from "./index.js";
|
|
16
|
+
|
|
17
|
+
describe("Heuristic Scorers", () => {
|
|
18
|
+
const goodDecomposition = JSON.stringify({
|
|
19
|
+
epic: { title: "Add auth", description: "Add authentication" },
|
|
20
|
+
subtasks: [
|
|
21
|
+
{
|
|
22
|
+
title: "Add login form component",
|
|
23
|
+
description: "Create React component for login with email/password",
|
|
24
|
+
files: ["src/components/LoginForm.tsx"],
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
title: "Add auth API routes",
|
|
28
|
+
description: "Create API endpoints for login/logout/session",
|
|
29
|
+
files: ["src/api/auth.ts"],
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
title: "Add auth middleware",
|
|
33
|
+
description: "Create middleware to protect routes",
|
|
34
|
+
files: ["src/middleware/auth.ts"],
|
|
35
|
+
},
|
|
36
|
+
],
|
|
37
|
+
});
|
|
38
|
+
|
|
39
|
+
const conflictingDecomposition = JSON.stringify({
|
|
40
|
+
epic: { title: "Add auth", description: "Add authentication" },
|
|
41
|
+
subtasks: [
|
|
42
|
+
{
|
|
43
|
+
title: "Add login",
|
|
44
|
+
files: ["src/auth.ts"],
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
title: "Add logout",
|
|
48
|
+
files: ["src/auth.ts"], // Same file - conflict!
|
|
49
|
+
},
|
|
50
|
+
],
|
|
51
|
+
});
|
|
52
|
+
|
|
53
|
+
test("subtaskIndependence scores 1.0 for no conflicts", async () => {
|
|
54
|
+
const result = await subtaskIndependence({
|
|
55
|
+
output: goodDecomposition,
|
|
56
|
+
expected: undefined,
|
|
57
|
+
input: {},
|
|
58
|
+
});
|
|
59
|
+
expect(result.score).toBe(1);
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
test("subtaskIndependence scores 0 for file conflicts", async () => {
|
|
63
|
+
const result = await subtaskIndependence({
|
|
64
|
+
output: conflictingDecomposition,
|
|
65
|
+
expected: undefined,
|
|
66
|
+
input: {},
|
|
67
|
+
});
|
|
68
|
+
expect(result.score).toBe(0);
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
test("instructionClarity scores higher for detailed subtasks", async () => {
|
|
72
|
+
const result = await instructionClarity({
|
|
73
|
+
output: goodDecomposition,
|
|
74
|
+
expected: undefined,
|
|
75
|
+
input: {},
|
|
76
|
+
});
|
|
77
|
+
expect(result.score).toBeGreaterThan(0.7);
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
test("coverageCompleteness checks subtask count", async () => {
|
|
81
|
+
const result = await coverageCompleteness({
|
|
82
|
+
output: goodDecomposition,
|
|
83
|
+
expected: { minSubtasks: 2, maxSubtasks: 5 },
|
|
84
|
+
input: {},
|
|
85
|
+
});
|
|
86
|
+
expect(result.score).toBe(1);
|
|
87
|
+
});
|
|
88
|
+
});
|
|
89
|
+
|
|
90
|
+
describe("LLM-as-Judge Scorer", () => {
|
|
91
|
+
// Skip LLM test in CI - requires API key
|
|
92
|
+
const hasApiKey = !!process.env.AI_GATEWAY_API_KEY;
|
|
93
|
+
|
|
94
|
+
test(
|
|
95
|
+
"decompositionCoherence returns valid score",
|
|
96
|
+
async () => {
|
|
97
|
+
if (!hasApiKey) {
|
|
98
|
+
console.log("Skipping LLM test - no AI_GATEWAY_API_KEY");
|
|
99
|
+
return;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
const decomposition = JSON.stringify({
|
|
103
|
+
epic: { title: "Add auth", description: "Add authentication" },
|
|
104
|
+
subtasks: [
|
|
105
|
+
{
|
|
106
|
+
title: "Add login form",
|
|
107
|
+
description: "Create login UI",
|
|
108
|
+
files: ["src/LoginForm.tsx"],
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
title: "Add auth API",
|
|
112
|
+
description: "Create auth endpoints",
|
|
113
|
+
files: ["src/api/auth.ts"],
|
|
114
|
+
},
|
|
115
|
+
],
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
const result = await decompositionCoherence({
|
|
119
|
+
output: decomposition,
|
|
120
|
+
expected: undefined,
|
|
121
|
+
input: { task: "Add user authentication with login/logout" },
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
expect(result.score).toBeGreaterThanOrEqual(0);
|
|
125
|
+
expect(result.score).toBeLessThanOrEqual(1);
|
|
126
|
+
},
|
|
127
|
+
30000,
|
|
128
|
+
);
|
|
129
|
+
|
|
130
|
+
test("decompositionCoherence scores invalid decomposition low", async () => {
|
|
131
|
+
if (!process.env.AI_GATEWAY_API_KEY) {
|
|
132
|
+
console.log("Skipping LLM test - no AI_GATEWAY_API_KEY");
|
|
133
|
+
return;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
const result = await decompositionCoherence({
|
|
137
|
+
output: "not valid json at all {{{",
|
|
138
|
+
expected: undefined,
|
|
139
|
+
input: {},
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
// LLM should recognize garbage input and score it very low
|
|
143
|
+
// (0 or close to 0, not 0.5 fallback)
|
|
144
|
+
expect(result.score).toBeLessThanOrEqual(0.2);
|
|
145
|
+
}, 30000);
|
|
146
|
+
});
|