opencode-swarm-plugin 0.44.0 → 0.44.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/swarm.serve.test.ts +6 -4
- package/bin/swarm.ts +16 -10
- package/dist/compaction-prompt-scoring.js +139 -0
- package/dist/eval-capture.js +12811 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.js +7644 -62599
- package/dist/plugin.js +23766 -78721
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm-review.d.ts.map +1 -1
- package/package.json +17 -5
- package/.changeset/swarm-insights-data-layer.md +0 -63
- package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
- package/.hive/analysis/session-data-quality-audit.md +0 -320
- package/.hive/eval-results.json +0 -483
- package/.hive/issues.jsonl +0 -138
- package/.hive/memories.jsonl +0 -729
- package/.opencode/eval-history.jsonl +0 -327
- package/.turbo/turbo-build.log +0 -9
- package/CHANGELOG.md +0 -2286
- package/SCORER-ANALYSIS.md +0 -598
- package/docs/analysis/subagent-coordination-patterns.md +0 -902
- package/docs/analysis-socratic-planner-pattern.md +0 -504
- package/docs/planning/ADR-001-monorepo-structure.md +0 -171
- package/docs/planning/ADR-002-package-extraction.md +0 -393
- package/docs/planning/ADR-003-performance-improvements.md +0 -451
- package/docs/planning/ADR-004-message-queue-features.md +0 -187
- package/docs/planning/ADR-005-devtools-observability.md +0 -202
- package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
- package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
- package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
- package/docs/planning/ADR-010-cass-inhousing.md +0 -1215
- package/docs/planning/ROADMAP.md +0 -368
- package/docs/semantic-memory-cli-syntax.md +0 -123
- package/docs/swarm-mail-architecture.md +0 -1147
- package/docs/testing/context-recovery-test.md +0 -470
- package/evals/ARCHITECTURE.md +0 -1189
- package/evals/README.md +0 -768
- package/evals/compaction-prompt.eval.ts +0 -149
- package/evals/compaction-resumption.eval.ts +0 -289
- package/evals/coordinator-behavior.eval.ts +0 -307
- package/evals/coordinator-session.eval.ts +0 -154
- package/evals/evalite.config.ts.bak +0 -15
- package/evals/example.eval.ts +0 -31
- package/evals/fixtures/cass-baseline.ts +0 -217
- package/evals/fixtures/compaction-cases.ts +0 -350
- package/evals/fixtures/compaction-prompt-cases.ts +0 -311
- package/evals/fixtures/coordinator-sessions.ts +0 -328
- package/evals/fixtures/decomposition-cases.ts +0 -105
- package/evals/lib/compaction-loader.test.ts +0 -248
- package/evals/lib/compaction-loader.ts +0 -320
- package/evals/lib/data-loader.evalite-test.ts +0 -289
- package/evals/lib/data-loader.test.ts +0 -345
- package/evals/lib/data-loader.ts +0 -281
- package/evals/lib/llm.ts +0 -115
- package/evals/scorers/compaction-prompt-scorers.ts +0 -145
- package/evals/scorers/compaction-scorers.ts +0 -305
- package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
- package/evals/scorers/coordinator-discipline.ts +0 -325
- package/evals/scorers/index.test.ts +0 -146
- package/evals/scorers/index.ts +0 -328
- package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
- package/evals/scorers/outcome-scorers.ts +0 -349
- package/evals/swarm-decomposition.eval.ts +0 -121
- package/examples/commands/swarm.md +0 -745
- package/examples/plugin-wrapper-template.ts +0 -2515
- package/examples/skills/hive-workflow/SKILL.md +0 -212
- package/examples/skills/skill-creator/SKILL.md +0 -223
- package/examples/skills/swarm-coordination/SKILL.md +0 -292
- package/global-skills/cli-builder/SKILL.md +0 -344
- package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
- package/global-skills/learning-systems/SKILL.md +0 -644
- package/global-skills/skill-creator/LICENSE.txt +0 -202
- package/global-skills/skill-creator/SKILL.md +0 -352
- package/global-skills/skill-creator/references/output-patterns.md +0 -82
- package/global-skills/skill-creator/references/workflows.md +0 -28
- package/global-skills/swarm-coordination/SKILL.md +0 -995
- package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
- package/global-skills/swarm-coordination/references/strategies.md +0 -138
- package/global-skills/system-design/SKILL.md +0 -213
- package/global-skills/testing-patterns/SKILL.md +0 -430
- package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
- package/opencode-swarm-plugin-0.30.7.tgz +0 -0
- package/opencode-swarm-plugin-0.31.0.tgz +0 -0
- package/scripts/cleanup-test-memories.ts +0 -346
- package/scripts/init-skill.ts +0 -222
- package/scripts/migrate-unknown-sessions.ts +0 -349
- package/scripts/validate-skill.ts +0 -204
- package/src/agent-mail.ts +0 -1724
- package/src/anti-patterns.test.ts +0 -1167
- package/src/anti-patterns.ts +0 -448
- package/src/compaction-capture.integration.test.ts +0 -257
- package/src/compaction-hook.test.ts +0 -838
- package/src/compaction-hook.ts +0 -1204
- package/src/compaction-observability.integration.test.ts +0 -139
- package/src/compaction-observability.test.ts +0 -187
- package/src/compaction-observability.ts +0 -324
- package/src/compaction-prompt-scorers.test.ts +0 -475
- package/src/compaction-prompt-scoring.ts +0 -300
- package/src/contributor-tools.test.ts +0 -133
- package/src/contributor-tools.ts +0 -201
- package/src/dashboard.test.ts +0 -611
- package/src/dashboard.ts +0 -462
- package/src/error-enrichment.test.ts +0 -403
- package/src/error-enrichment.ts +0 -219
- package/src/eval-capture.test.ts +0 -1015
- package/src/eval-capture.ts +0 -929
- package/src/eval-gates.test.ts +0 -306
- package/src/eval-gates.ts +0 -218
- package/src/eval-history.test.ts +0 -508
- package/src/eval-history.ts +0 -214
- package/src/eval-learning.test.ts +0 -378
- package/src/eval-learning.ts +0 -360
- package/src/eval-runner.test.ts +0 -223
- package/src/eval-runner.ts +0 -402
- package/src/export-tools.test.ts +0 -476
- package/src/export-tools.ts +0 -257
- package/src/hive.integration.test.ts +0 -2241
- package/src/hive.ts +0 -1628
- package/src/index.ts +0 -940
- package/src/learning.integration.test.ts +0 -1815
- package/src/learning.ts +0 -1079
- package/src/logger.test.ts +0 -189
- package/src/logger.ts +0 -135
- package/src/mandate-promotion.test.ts +0 -473
- package/src/mandate-promotion.ts +0 -239
- package/src/mandate-storage.integration.test.ts +0 -601
- package/src/mandate-storage.test.ts +0 -578
- package/src/mandate-storage.ts +0 -794
- package/src/mandates.ts +0 -540
- package/src/memory-tools.test.ts +0 -195
- package/src/memory-tools.ts +0 -344
- package/src/memory.integration.test.ts +0 -334
- package/src/memory.test.ts +0 -158
- package/src/memory.ts +0 -527
- package/src/model-selection.test.ts +0 -188
- package/src/model-selection.ts +0 -68
- package/src/observability-tools.test.ts +0 -359
- package/src/observability-tools.ts +0 -871
- package/src/output-guardrails.test.ts +0 -438
- package/src/output-guardrails.ts +0 -381
- package/src/pattern-maturity.test.ts +0 -1160
- package/src/pattern-maturity.ts +0 -525
- package/src/planning-guardrails.test.ts +0 -491
- package/src/planning-guardrails.ts +0 -438
- package/src/plugin.ts +0 -23
- package/src/post-compaction-tracker.test.ts +0 -251
- package/src/post-compaction-tracker.ts +0 -237
- package/src/query-tools.test.ts +0 -636
- package/src/query-tools.ts +0 -324
- package/src/rate-limiter.integration.test.ts +0 -466
- package/src/rate-limiter.ts +0 -774
- package/src/replay-tools.test.ts +0 -496
- package/src/replay-tools.ts +0 -240
- package/src/repo-crawl.integration.test.ts +0 -441
- package/src/repo-crawl.ts +0 -610
- package/src/schemas/cell-events.test.ts +0 -347
- package/src/schemas/cell-events.ts +0 -807
- package/src/schemas/cell.ts +0 -257
- package/src/schemas/evaluation.ts +0 -166
- package/src/schemas/index.test.ts +0 -199
- package/src/schemas/index.ts +0 -286
- package/src/schemas/mandate.ts +0 -232
- package/src/schemas/swarm-context.ts +0 -115
- package/src/schemas/task.ts +0 -161
- package/src/schemas/worker-handoff.test.ts +0 -302
- package/src/schemas/worker-handoff.ts +0 -131
- package/src/sessions/agent-discovery.test.ts +0 -137
- package/src/sessions/agent-discovery.ts +0 -112
- package/src/sessions/index.ts +0 -15
- package/src/skills.integration.test.ts +0 -1192
- package/src/skills.test.ts +0 -643
- package/src/skills.ts +0 -1549
- package/src/storage.integration.test.ts +0 -341
- package/src/storage.ts +0 -884
- package/src/structured.integration.test.ts +0 -817
- package/src/structured.test.ts +0 -1046
- package/src/structured.ts +0 -762
- package/src/swarm-decompose.test.ts +0 -188
- package/src/swarm-decompose.ts +0 -1302
- package/src/swarm-deferred.integration.test.ts +0 -157
- package/src/swarm-deferred.test.ts +0 -38
- package/src/swarm-insights.test.ts +0 -214
- package/src/swarm-insights.ts +0 -459
- package/src/swarm-mail.integration.test.ts +0 -970
- package/src/swarm-mail.ts +0 -739
- package/src/swarm-orchestrate.integration.test.ts +0 -282
- package/src/swarm-orchestrate.test.ts +0 -548
- package/src/swarm-orchestrate.ts +0 -3084
- package/src/swarm-prompts.test.ts +0 -1270
- package/src/swarm-prompts.ts +0 -2077
- package/src/swarm-research.integration.test.ts +0 -701
- package/src/swarm-research.test.ts +0 -698
- package/src/swarm-research.ts +0 -472
- package/src/swarm-review.integration.test.ts +0 -285
- package/src/swarm-review.test.ts +0 -879
- package/src/swarm-review.ts +0 -709
- package/src/swarm-strategies.ts +0 -407
- package/src/swarm-worktree.test.ts +0 -501
- package/src/swarm-worktree.ts +0 -575
- package/src/swarm.integration.test.ts +0 -2377
- package/src/swarm.ts +0 -38
- package/src/tool-adapter.integration.test.ts +0 -1221
- package/src/tool-availability.ts +0 -461
- package/tsconfig.json +0 -28
|
@@ -1,307 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Coordinator Behavior After Compaction Eval
|
|
3
|
-
*
|
|
4
|
-
* LLM-as-judge eval that tests whether the compaction context actually
|
|
5
|
-
* causes Claude to behave like a coordinator (spawn workers, check status)
|
|
6
|
-
* rather than a worker (run tests, edit files directly).
|
|
7
|
-
*
|
|
8
|
-
* This is the missing piece - we test the CONTEXT CONTENT in unit tests,
|
|
9
|
-
* but we need to test whether the LLM BEHAVES CORRECTLY given that context.
|
|
10
|
-
*
|
|
11
|
-
* Run with: bunx evalite run evals/coordinator-behavior.eval.ts
|
|
12
|
-
*/
|
|
13
|
-
|
|
14
|
-
import { evalite } from "evalite";
|
|
15
|
-
import { createScorer } from "evalite";
|
|
16
|
-
import { generateText, gateway } from "ai";
|
|
17
|
-
import type { GatewayModelId } from "ai";
|
|
18
|
-
|
|
19
|
-
const MODEL: GatewayModelId = "anthropic/claude-sonnet-4-5";
|
|
20
|
-
|
|
21
|
-
// ============================================================================
|
|
22
|
-
// Test Context: Simulated compaction context injection
|
|
23
|
-
// ============================================================================
|
|
24
|
-
|
|
25
|
-
/**
 * Assemble the coordinator context that gets injected after compaction.
 *
 * Intended to mirror buildDynamicSwarmState() from compaction-hook.ts, with a
 * fixed epic title and fixed subtask counts so eval cases stay deterministic.
 *
 * @param epicId - Epic identifier interpolated into the state header and the
 *   suggested `swarm_status` calls.
 * @param projectPath - Project key/path interpolated into the state header and
 *   the suggested `swarm_status` calls.
 * @returns The full markdown context, lines separated by "\n".
 */
function buildTestContext(epicId: string, projectPath: string): string {
  // Both resume checklists quote the same two tool calls, so build them once.
  const statusCall = `swarm_status(epic_id="${epicId}", project_key="${projectPath}")`;
  const inboxCall = "swarmmail_inbox(limit=5)";

  const lines = [
    "## 🐝 Current Swarm State",
    "",
    `**Epic:** ${epicId} - Add user authentication`,
    "**Subtasks:**",
    "- 1 closed",
    "- 1 in_progress",
    "- 2 open",
    `**Project:** ${projectPath}`,
    "",
    "## 🎯 YOU ARE THE COORDINATOR",
    "",
    "**Primary role:** Orchestrate workers, review their output, unblock dependencies.",
    "**Spawn workers** for implementation tasks - don't do them yourself.",
    "",
    "**RESUME STEPS:**",
    `1. Check swarm status: \`${statusCall}\``,
    `2. Check inbox for worker messages: \`${inboxCall}\``,
    "3. For in_progress subtasks: Review worker results with `swarm_review`",
    "4. For open subtasks: Spawn workers with `swarm_spawn_subtask`",
    "5. For blocked subtasks: Investigate and unblock",
    "",
    "## 🐝 SWARM ACTIVE - Keep Cooking",
    "",
    "You are the **COORDINATOR** of an active swarm. Context was compacted but the swarm is still running.",
    "",
    "**YOUR JOB:** Keep orchestrating. Spawn agents. Monitor progress. Unblock work. Ship it.",
    "",
    "### On Resume - IMMEDIATELY",
    "",
    `1. \`${statusCall}\` - Get current state`,
    `2. \`${inboxCall}\` - Check for agent messages`,
    "3. `swarm_review(project_key, epic_id, task_id, files_touched)` - Review any completed work",
    "4. **Spawn ready subtasks** - Don't wait, fire them off",
    "",
    "**You are not waiting for instructions. You are the coordinator. Coordinate.**",
  ];

  return lines.join("\n");
}
|
|
66
|
-
|
|
67
|
-
// ============================================================================
|
|
68
|
-
// Scorers
|
|
69
|
-
// ============================================================================
|
|
70
|
-
|
|
71
|
-
/**
 * Scores how many coordinator signals appear in the response.
 *
 * Signals are the four swarm tool names plus the generic words "spawn" and
 * "worker". Three distinct signals earn the full score; fewer scale linearly
 * (found / 3).
 *
 * Tool names are blanked out of the text before the generic words are
 * searched. Otherwise a lone `swarm_spawn_subtask` would also match the
 * substring "spawn" and be counted as two independent signals.
 */
export const mentionsCoordinatorTools = createScorer({
  name: "Mentions Coordinator Tools",
  description: "Response mentions swarm_status, swarmmail_inbox, swarm_spawn_subtask, or swarm_review",
  scorer: ({ output }) => {
    const text = String(output).toLowerCase();

    const toolNames = [
      "swarm_status",
      "swarmmail_inbox",
      "swarm_spawn_subtask",
      "swarm_review",
    ];
    const genericTerms = ["spawn", "worker"];

    const foundTools = toolNames.filter((tool) => text.includes(tool));

    // Replace every tool name with a space so generic terms only match prose.
    const textWithoutTools = toolNames.reduce(
      (remaining, tool) => remaining.split(tool).join(" "),
      text,
    );
    const foundTerms = genericTerms.filter((term) => textWithoutTools.includes(term));

    // Same reporting order as before: tool names first, then generic terms.
    const found = [...foundTools, ...foundTerms];
    const score = Math.min(found.length / 3, 1); // Need at least 3 for full score

    return {
      score,
      message: found.length > 0
        ? `Found coordinator patterns: ${found.join(", ")}`
        : "No coordinator patterns found",
    };
  },
});
|
|
99
|
-
|
|
100
|
-
/**
 * Penalizes responses in which the model starts acting like a worker.
 *
 * Each distinct worker phrase found in the lower-cased response costs 0.25,
 * floored at 0. A response containing none of them scores 1.
 */
export const avoidsWorkerBehaviors = createScorer({
  name: "Avoids Worker Behaviors",
  description: "Response does NOT suggest running tests, editing files, or doing implementation directly",
  scorer: ({ output }) => {
    const response = String(output).toLowerCase();

    const hits = [
      "bun test",
      "npm test",
      "pnpm test",
      "let me run",
      "i'll run the tests",
      "let me edit",
      "i'll fix",
      "let me implement",
      "i'll write the code",
      "```typescript", // Code blocks suggest implementation
      "```javascript",
    ].filter((phrase) => response.includes(phrase));

    return hits.length > 0
      ? {
          score: Math.max(0, 1 - (hits.length * 0.25)),
          message: `Worker behaviors detected: ${hits.join(", ")}`,
        }
      : {
          score: 1,
          message: "No worker behaviors detected",
        };
  },
});
|
|
137
|
-
|
|
138
|
-
/**
 * Scores orchestration language against implementation language.
 *
 * Every orchestration phrase present adds 1 and every implementation phrase
 * present subtracts 2. The net value is divided by 4 and clamped to [0, 1].
 */
export const coordinatorMindset = createScorer({
  name: "Coordinator Mindset",
  description: "Response demonstrates orchestration thinking, not implementation thinking",
  scorer: ({ output }) => {
    const response = String(output).toLowerCase();

    // Number of phrases from the list that occur at least once in the response.
    const countHits = (phrases: string[]): number =>
      phrases.reduce(
        (hits, phrase) => (response.includes(phrase) ? hits + 1 : hits),
        0,
      );

    // Positive signals: orchestration language
    const positiveCount = countHits([
      "check status",
      "check inbox",
      "spawn",
      "delegate",
      "assign",
      "review",
      "coordinate",
      "orchestrat",
      "worker",
      "subtask",
      "unblock",
    ]);

    // Negative signals: implementation language
    const negativeCount = countHits([
      "let me code",
      "i'll implement",
      "here's the fix",
      "the solution is",
      "i'll write",
      "let me add",
    ]);

    const net = positiveCount - negativeCount * 2;

    return {
      score: Math.min(1, Math.max(0, net / 4)),
      message: `Orchestration signals: ${positiveCount}, Implementation signals: ${negativeCount}`,
    };
  },
});
|
|
183
|
-
|
|
184
|
-
/**
 * Composite scorer that blends the three coordinator scorers in this file.
 *
 * Weights: tool mentions 0.3, avoiding worker behavior 0.4, mindset 0.3.
 * A missing sub-score is treated as 0.
 */
export const overallCoordinatorBehavior = createScorer({
  name: "Overall Coordinator Behavior",
  description: "Composite score: does the LLM behave like a coordinator?",
  scorer: async ({ output, expected, input }) => {
    // Run the sub-scorers one after another, in the same order as the weights below.
    const tools = (await mentionsCoordinatorTools({ output, expected, input })).score ?? 0;
    const avoids = (await avoidsWorkerBehaviors({ output, expected, input })).score ?? 0;
    const mindset = (await coordinatorMindset({ output, expected, input })).score ?? 0;

    const asPercent = (value: number): string => (value * 100).toFixed(0);

    return {
      // Weighted average: avoiding worker behavior is most important
      score: tools * 0.3 + avoids * 0.4 + mindset * 0.3,
      message: `Tools: ${asPercent(tools)}%, Avoids Worker: ${asPercent(avoids)}%, Mindset: ${asPercent(mindset)}%`,
    };
  },
});
|
|
207
|
-
|
|
208
|
-
// ============================================================================
|
|
209
|
-
// Eval Cases
|
|
210
|
-
// ============================================================================
|
|
211
|
-
|
|
212
|
-
/** One prompt scenario fed to the model under test. */
interface CoordinatorTestCase {
  /** Human-readable label for the scenario. */
  name: string;
  /** System prompt: the simulated post-compaction context. */
  systemContext: string;
  /** User message sent along with the system context. */
  userPrompt: string;
}

/** Build a test case whose system context is generated for the given epic and project. */
const makeTestCase = (
  name: string,
  epicId: string,
  projectPath: string,
  userPrompt: string,
): CoordinatorTestCase => ({
  name,
  systemContext: buildTestContext(epicId, projectPath),
  userPrompt,
});

const testCases: CoordinatorTestCase[] = [
  makeTestCase(
    "Basic resumption after compaction",
    "bd-epic-123",
    "/my/project",
    "Context was compacted. What should I do next?",
  ),
  makeTestCase(
    "Resumption with specific epic",
    "my-app-lf2p4u-auth-epic",
    "/Users/dev/my-app",
    "I just resumed. The swarm is active. What's my next step?",
  ),
  makeTestCase(
    "Temptation to do work directly",
    "bd-epic-456",
    "/project",
    "There are 2 open subtasks. Should I just implement them myself to save time?",
  ),
  makeTestCase(
    "Worker completed - what now?",
    "bd-epic-789",
    "/app",
    "A worker just finished a subtask. What should I do?",
  ),
];
|
|
240
|
-
|
|
241
|
-
// ============================================================================
|
|
242
|
-
// Main Eval
|
|
243
|
-
// ============================================================================
|
|
244
|
-
|
|
245
|
-
/**
 * Send one scenario to the model and return its raw text reply.
 *
 * Shared by both evals below so the model, token limit, and prompt wiring
 * cannot drift apart between them.
 *
 * @param testCase - Scenario providing the system context and user prompt.
 * @returns The text generated by the model.
 */
async function runCoordinatorPrompt(testCase: CoordinatorTestCase): Promise<string> {
  const { text } = await generateText({
    model: gateway(MODEL),
    system: testCase.systemContext,
    prompt: testCase.userPrompt,
    maxOutputTokens: 1024,
  });

  return text;
}

/**
 * Main eval: runs every entry of `testCases` through the model and applies
 * all four coordinator scorers to the reply.
 */
evalite("Coordinator Behavior After Compaction", {
  data: async () => testCases.map(tc => ({
    input: tc,
    expected: undefined, // Scorers don't need expected values
  })),

  task: runCoordinatorPrompt,

  scorers: [
    mentionsCoordinatorTools,
    avoidsWorkerBehaviors,
    coordinatorMindset,
    overallCoordinatorBehavior,
  ],
});

/**
 * Edge Case: Direct implementation temptation
 *
 * Tests that even when prompted to "just do it", the coordinator resists
 */
evalite("Coordinator Resists Direct Implementation", {
  data: async () => [
    {
      input: {
        name: "User asks to skip workers",
        systemContext: buildTestContext("bd-epic-skip", "/project"),
        userPrompt: "Can you just run the tests and fix any errors yourself? It would be faster.",
      },
      expected: undefined,
    },
    {
      input: {
        name: "User asks for direct code",
        systemContext: buildTestContext("bd-epic-code", "/project"),
        userPrompt: "Write the authentication middleware code for me.",
      },
      expected: undefined,
    },
  ],

  task: runCoordinatorPrompt,

  scorers: [avoidsWorkerBehaviors, coordinatorMindset],
});
|
|
@@ -1,154 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Coordinator Session Eval - Scores Real Captured Sessions
|
|
3
|
-
*
|
|
4
|
-
* Tests that coordinators follow protocol:
|
|
5
|
-
* 1. Don't edit files directly (spawn workers)
|
|
6
|
-
* 2. Don't run tests directly (workers do verification)
|
|
7
|
-
* 3. Spawn workers for all subtasks
|
|
8
|
-
* 4. Review worker output before accepting
|
|
9
|
-
* 5. Minimize time to first spawn (don't overthink)
|
|
10
|
-
*
|
|
11
|
-
* ## Data Sources
|
|
12
|
-
*
|
|
13
|
-
* - **Real sessions**: Captured from ~/.config/swarm-tools/sessions/*.jsonl
|
|
14
|
-
* - **Synthetic fixtures**: Test cases in fixtures/coordinator-sessions.ts
|
|
15
|
-
*
|
|
16
|
-
* ## Test Flow
|
|
17
|
-
*
|
|
18
|
-
* 1. Load captured sessions from disk (via loadCapturedSessions)
|
|
19
|
-
* 2. Load synthetic fixtures for baseline validation
|
|
20
|
-
* 3. Run coordinator-discipline scorers on all sessions
|
|
21
|
-
* 4. Output scores and violation details
|
|
22
|
-
*
|
|
23
|
-
* Run with: pnpm eval:dev (watch mode) or pnpm eval:run (once)
|
|
24
|
-
*/
|
|
25
|
-
|
|
26
|
-
import { evalite } from "evalite";
|
|
27
|
-
import { coordinatorSessionFixtures } from "./fixtures/coordinator-sessions.js";
|
|
28
|
-
import { loadCapturedSessions } from "./lib/data-loader.js";
|
|
29
|
-
import {
|
|
30
|
-
overallDiscipline,
|
|
31
|
-
reviewThoroughness,
|
|
32
|
-
spawnEfficiency,
|
|
33
|
-
timeToFirstSpawn,
|
|
34
|
-
violationCount,
|
|
35
|
-
} from "./scorers/index.js";
|
|
36
|
-
|
|
37
|
-
/**
 * Test 1: Synthetic Fixtures (Baseline)
 *
 * Feeds every synthetic coordinator session fixture to the discipline
 * scorers. Each fixture is serialized to JSON as the task output, and its
 * session/epic ids are passed along as the expected value.
 */
evalite("Coordinator Discipline - Synthetic Fixtures", {
  data: async () =>
    coordinatorSessionFixtures.map((fixture) => {
      const { session_id, epic_id } = fixture;
      return {
        input: fixture,
        expected: { session_id, epic_id },
      };
    }),

  // Return session as JSON string for scorers
  task: async (input) => JSON.stringify(input),

  scorers: [
    violationCount,
    spawnEfficiency,
    reviewThoroughness,
    timeToFirstSpawn,
    overallDiscipline,
  ],
});
|
|
66
|
-
|
|
67
|
-
/**
 * Test 2: Real Captured Sessions
 *
 * Requests up to 20 captured sessions from loadCapturedSessions and scores
 * them. When none are available, a hint is logged and an empty data set is
 * returned.
 */
evalite("Coordinator Discipline - Real Sessions", {
  data: async () => {
    const captured = await loadCapturedSessions({ limit: 20 });

    if (captured.length > 0) {
      console.log(
        `\n✓ Loaded ${captured.length} real coordinator sessions for evaluation\n`,
      );

      return captured.map(({ session }) => {
        const { session_id, epic_id } = session;
        return {
          input: session,
          expected: { session_id, epic_id },
        };
      });
    }

    // Nothing captured yet: tell the user how to populate data and return an empty set.
    console.log(
      "\n⚠️ No real coordinator sessions found in ~/.config/swarm-tools/sessions/",
    );
    console.log(
      " Run a coordinator session with eval capture enabled to populate data.\n",
    );
    return [];
  },

  // Scorers consume the session as a JSON string.
  task: async (input) => JSON.stringify(input),

  scorers: [
    violationCount,
    spawnEfficiency,
    reviewThoroughness,
    timeToFirstSpawn,
    overallDiscipline,
  ],
});
|
|
114
|
-
|
|
115
|
-
/**
 * Test 3: Perfect vs Bad Comparison
 *
 * Scores the first two session fixtures side by side to validate scorer
 * ranges and weighting. The fixture list is assumed to start with
 * perfectCoordinator followed by badCoordinator — confirm against
 * fixtures/coordinator-sessions.ts if that ordering ever changes.
 */
evalite("Coordinator Discipline - Perfect vs Bad", {
  data: async () => {
    const [perfectCoordinator, badCoordinator] = coordinatorSessionFixtures;

    return [
      {
        input: perfectCoordinator,
        expected: {
          name: "perfect",
          expectedViolations: 0,
          expectedSpawnEfficiency: 1.0,
          expectedReviewThoroughness: 1.0,
        },
      },
      {
        input: badCoordinator,
        expected: {
          name: "bad",
          expectedViolations: 5, // 3 direct violations + 2 no_worker_spawned
          expectedSpawnEfficiency: 0.33, // 1/3 workers spawned
          expectedReviewThoroughness: 0.0, // 0 reviews
        },
      },
    ];
  },

  // Scorers consume the session as a JSON string.
  task: async (input) => JSON.stringify(input),

  scorers: [
    violationCount,
    spawnEfficiency,
    reviewThoroughness,
    timeToFirstSpawn,
    overallDiscipline,
  ],
});
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
import { defineConfig } from "evalite/config";
|
|
2
|
-
|
|
3
|
-
/**
 * Evalite configuration for swarm decomposition testing.
 *
 * The decomposition evals check that:
 * - subtasks are independent (no file conflicts)
 * - complexity is balanced across subtasks
 * - the full task is covered
 *
 * No options are set here: Evalite auto-discovers evals in this directory,
 * custom scorers are defined in scorers/index.ts, and test fixtures are in
 * fixtures/decomposition-cases.ts.
 */
const config = defineConfig({});

export default config;
|
package/evals/example.eval.ts
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Example eval file to test Evalite setup
|
|
3
|
-
*
|
|
4
|
-
* This is a minimal test to verify:
|
|
5
|
-
* 1. Evalite CLI can discover .eval.ts files
|
|
6
|
-
* 2. createScorer works
|
|
7
|
-
* 3. evalite() function works
|
|
8
|
-
*/
|
|
9
|
-
|
|
10
|
-
import { evalite } from "evalite";
|
|
11
|
-
import { subtaskIndependence } from "./scorers/index.js";
|
|
12
|
-
|
|
13
|
-
// Smoke test: one hand-written decomposition with two subtasks that touch
// different files, scored by subtaskIndependence.
evalite("Example: Basic scorer test", {
  data: async () => [
    {
      input: {
        epic: { title: "Test Epic", description: "Test" },
        subtasks: [
          { title: "Subtask 1", files: ["a.ts"], estimated_complexity: 1 },
          { title: "Subtask 2", files: ["b.ts"], estimated_complexity: 1 },
        ],
      },
    },
  ],
  // The scorer receives the decomposition as a JSON string.
  task: async (input) => JSON.stringify(input),
  scorers: [subtaskIndependence],
});
|