opencode-swarm-plugin 0.43.0 → 0.44.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cass.characterization.test.ts +422 -0
- package/bin/swarm.serve.test.ts +6 -4
- package/bin/swarm.test.ts +68 -0
- package/bin/swarm.ts +81 -8
- package/dist/compaction-prompt-scoring.js +139 -0
- package/dist/contributor-tools.d.ts +42 -0
- package/dist/contributor-tools.d.ts.map +1 -0
- package/dist/eval-capture.js +12811 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +12 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7728 -62590
- package/dist/plugin.js +23833 -78695
- package/dist/sessions/agent-discovery.d.ts +59 -0
- package/dist/sessions/agent-discovery.d.ts.map +1 -0
- package/dist/sessions/index.d.ts +10 -0
- package/dist/sessions/index.d.ts.map +1 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm-review.d.ts.map +1 -1
- package/package.json +17 -5
- package/.changeset/swarm-insights-data-layer.md +0 -63
- package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
- package/.hive/analysis/session-data-quality-audit.md +0 -320
- package/.hive/eval-results.json +0 -483
- package/.hive/issues.jsonl +0 -138
- package/.hive/memories.jsonl +0 -729
- package/.opencode/eval-history.jsonl +0 -327
- package/.turbo/turbo-build.log +0 -9
- package/CHANGELOG.md +0 -2255
- package/SCORER-ANALYSIS.md +0 -598
- package/docs/analysis/subagent-coordination-patterns.md +0 -902
- package/docs/analysis-socratic-planner-pattern.md +0 -504
- package/docs/planning/ADR-001-monorepo-structure.md +0 -171
- package/docs/planning/ADR-002-package-extraction.md +0 -393
- package/docs/planning/ADR-003-performance-improvements.md +0 -451
- package/docs/planning/ADR-004-message-queue-features.md +0 -187
- package/docs/planning/ADR-005-devtools-observability.md +0 -202
- package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
- package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
- package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
- package/docs/planning/ROADMAP.md +0 -368
- package/docs/semantic-memory-cli-syntax.md +0 -123
- package/docs/swarm-mail-architecture.md +0 -1147
- package/docs/testing/context-recovery-test.md +0 -470
- package/evals/ARCHITECTURE.md +0 -1189
- package/evals/README.md +0 -768
- package/evals/compaction-prompt.eval.ts +0 -149
- package/evals/compaction-resumption.eval.ts +0 -289
- package/evals/coordinator-behavior.eval.ts +0 -307
- package/evals/coordinator-session.eval.ts +0 -154
- package/evals/evalite.config.ts.bak +0 -15
- package/evals/example.eval.ts +0 -31
- package/evals/fixtures/compaction-cases.ts +0 -350
- package/evals/fixtures/compaction-prompt-cases.ts +0 -311
- package/evals/fixtures/coordinator-sessions.ts +0 -328
- package/evals/fixtures/decomposition-cases.ts +0 -105
- package/evals/lib/compaction-loader.test.ts +0 -248
- package/evals/lib/compaction-loader.ts +0 -320
- package/evals/lib/data-loader.evalite-test.ts +0 -289
- package/evals/lib/data-loader.test.ts +0 -345
- package/evals/lib/data-loader.ts +0 -281
- package/evals/lib/llm.ts +0 -115
- package/evals/scorers/compaction-prompt-scorers.ts +0 -145
- package/evals/scorers/compaction-scorers.ts +0 -305
- package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
- package/evals/scorers/coordinator-discipline.ts +0 -325
- package/evals/scorers/index.test.ts +0 -146
- package/evals/scorers/index.ts +0 -328
- package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
- package/evals/scorers/outcome-scorers.ts +0 -349
- package/evals/swarm-decomposition.eval.ts +0 -121
- package/examples/commands/swarm.md +0 -745
- package/examples/plugin-wrapper-template.ts +0 -2426
- package/examples/skills/hive-workflow/SKILL.md +0 -212
- package/examples/skills/skill-creator/SKILL.md +0 -223
- package/examples/skills/swarm-coordination/SKILL.md +0 -292
- package/global-skills/cli-builder/SKILL.md +0 -344
- package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
- package/global-skills/learning-systems/SKILL.md +0 -644
- package/global-skills/skill-creator/LICENSE.txt +0 -202
- package/global-skills/skill-creator/SKILL.md +0 -352
- package/global-skills/skill-creator/references/output-patterns.md +0 -82
- package/global-skills/skill-creator/references/workflows.md +0 -28
- package/global-skills/swarm-coordination/SKILL.md +0 -995
- package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
- package/global-skills/swarm-coordination/references/strategies.md +0 -138
- package/global-skills/system-design/SKILL.md +0 -213
- package/global-skills/testing-patterns/SKILL.md +0 -430
- package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
- package/opencode-swarm-plugin-0.30.7.tgz +0 -0
- package/opencode-swarm-plugin-0.31.0.tgz +0 -0
- package/scripts/cleanup-test-memories.ts +0 -346
- package/scripts/init-skill.ts +0 -222
- package/scripts/migrate-unknown-sessions.ts +0 -349
- package/scripts/validate-skill.ts +0 -204
- package/src/agent-mail.ts +0 -1724
- package/src/anti-patterns.test.ts +0 -1167
- package/src/anti-patterns.ts +0 -448
- package/src/compaction-capture.integration.test.ts +0 -257
- package/src/compaction-hook.test.ts +0 -838
- package/src/compaction-hook.ts +0 -1204
- package/src/compaction-observability.integration.test.ts +0 -139
- package/src/compaction-observability.test.ts +0 -187
- package/src/compaction-observability.ts +0 -324
- package/src/compaction-prompt-scorers.test.ts +0 -475
- package/src/compaction-prompt-scoring.ts +0 -300
- package/src/dashboard.test.ts +0 -611
- package/src/dashboard.ts +0 -462
- package/src/error-enrichment.test.ts +0 -403
- package/src/error-enrichment.ts +0 -219
- package/src/eval-capture.test.ts +0 -1015
- package/src/eval-capture.ts +0 -929
- package/src/eval-gates.test.ts +0 -306
- package/src/eval-gates.ts +0 -218
- package/src/eval-history.test.ts +0 -508
- package/src/eval-history.ts +0 -214
- package/src/eval-learning.test.ts +0 -378
- package/src/eval-learning.ts +0 -360
- package/src/eval-runner.test.ts +0 -223
- package/src/eval-runner.ts +0 -402
- package/src/export-tools.test.ts +0 -476
- package/src/export-tools.ts +0 -257
- package/src/hive.integration.test.ts +0 -2241
- package/src/hive.ts +0 -1628
- package/src/index.ts +0 -935
- package/src/learning.integration.test.ts +0 -1815
- package/src/learning.ts +0 -1079
- package/src/logger.test.ts +0 -189
- package/src/logger.ts +0 -135
- package/src/mandate-promotion.test.ts +0 -473
- package/src/mandate-promotion.ts +0 -239
- package/src/mandate-storage.integration.test.ts +0 -601
- package/src/mandate-storage.test.ts +0 -578
- package/src/mandate-storage.ts +0 -794
- package/src/mandates.ts +0 -540
- package/src/memory-tools.test.ts +0 -195
- package/src/memory-tools.ts +0 -344
- package/src/memory.integration.test.ts +0 -334
- package/src/memory.test.ts +0 -158
- package/src/memory.ts +0 -527
- package/src/model-selection.test.ts +0 -188
- package/src/model-selection.ts +0 -68
- package/src/observability-tools.test.ts +0 -359
- package/src/observability-tools.ts +0 -871
- package/src/output-guardrails.test.ts +0 -438
- package/src/output-guardrails.ts +0 -381
- package/src/pattern-maturity.test.ts +0 -1160
- package/src/pattern-maturity.ts +0 -525
- package/src/planning-guardrails.test.ts +0 -491
- package/src/planning-guardrails.ts +0 -438
- package/src/plugin.ts +0 -23
- package/src/post-compaction-tracker.test.ts +0 -251
- package/src/post-compaction-tracker.ts +0 -237
- package/src/query-tools.test.ts +0 -636
- package/src/query-tools.ts +0 -324
- package/src/rate-limiter.integration.test.ts +0 -466
- package/src/rate-limiter.ts +0 -774
- package/src/replay-tools.test.ts +0 -496
- package/src/replay-tools.ts +0 -240
- package/src/repo-crawl.integration.test.ts +0 -441
- package/src/repo-crawl.ts +0 -610
- package/src/schemas/cell-events.test.ts +0 -347
- package/src/schemas/cell-events.ts +0 -807
- package/src/schemas/cell.ts +0 -257
- package/src/schemas/evaluation.ts +0 -166
- package/src/schemas/index.test.ts +0 -199
- package/src/schemas/index.ts +0 -286
- package/src/schemas/mandate.ts +0 -232
- package/src/schemas/swarm-context.ts +0 -115
- package/src/schemas/task.ts +0 -161
- package/src/schemas/worker-handoff.test.ts +0 -302
- package/src/schemas/worker-handoff.ts +0 -131
- package/src/skills.integration.test.ts +0 -1192
- package/src/skills.test.ts +0 -643
- package/src/skills.ts +0 -1549
- package/src/storage.integration.test.ts +0 -341
- package/src/storage.ts +0 -884
- package/src/structured.integration.test.ts +0 -817
- package/src/structured.test.ts +0 -1046
- package/src/structured.ts +0 -762
- package/src/swarm-decompose.test.ts +0 -188
- package/src/swarm-decompose.ts +0 -1302
- package/src/swarm-deferred.integration.test.ts +0 -157
- package/src/swarm-deferred.test.ts +0 -38
- package/src/swarm-insights.test.ts +0 -214
- package/src/swarm-insights.ts +0 -459
- package/src/swarm-mail.integration.test.ts +0 -970
- package/src/swarm-mail.ts +0 -739
- package/src/swarm-orchestrate.integration.test.ts +0 -282
- package/src/swarm-orchestrate.test.ts +0 -548
- package/src/swarm-orchestrate.ts +0 -3084
- package/src/swarm-prompts.test.ts +0 -1270
- package/src/swarm-prompts.ts +0 -2077
- package/src/swarm-research.integration.test.ts +0 -701
- package/src/swarm-research.test.ts +0 -698
- package/src/swarm-research.ts +0 -472
- package/src/swarm-review.integration.test.ts +0 -285
- package/src/swarm-review.test.ts +0 -879
- package/src/swarm-review.ts +0 -709
- package/src/swarm-strategies.ts +0 -407
- package/src/swarm-worktree.test.ts +0 -501
- package/src/swarm-worktree.ts +0 -575
- package/src/swarm.integration.test.ts +0 -2377
- package/src/swarm.ts +0 -38
- package/src/tool-adapter.integration.test.ts +0 -1221
- package/src/tool-availability.ts +0 -461
- package/tsconfig.json +0 -28
|
@@ -1,325 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Coordinator Discipline Scorers - Evaluate coordinator behavior
|
|
3
|
-
*
|
|
4
|
-
* These scorers measure whether a coordinator follows the protocol:
|
|
5
|
-
* 1. Don't edit files directly (spawn workers)
|
|
6
|
-
* 2. Don't run tests directly (workers do verification)
|
|
7
|
-
* 3. Spawn workers for all subtasks
|
|
8
|
-
* 4. Review worker output before accepting
|
|
9
|
-
* 5. Minimize time to first spawn (don't overthink)
|
|
10
|
-
*
|
|
11
|
-
* Inputs: CoordinatorSession from eval-capture
|
|
12
|
-
*/
|
|
13
|
-
|
|
14
|
-
import { createScorer } from "evalite";
|
|
15
|
-
import type { CoordinatorSession } from "../../src/eval-capture.js";
|
|
16
|
-
|
|
17
|
-
/**
 * Violation Count Scorer
 *
 * Tallies the events in a serialized CoordinatorSession whose `event_type`
 * is "VIOLATION" and converts the tally into a penalty-based score.
 *
 * Violation kinds named by the protocol documentation:
 * - coordinator_edited_file (should spawn worker instead)
 * - coordinator_ran_tests (workers do verification)
 * - coordinator_reserved_files (only workers reserve)
 * - no_worker_spawned (subtask exists but no worker)
 *
 * Score: 1.0 - (0.2 * violation_count), never below 0.0.
 * If the output cannot be parsed (or has no usable `events`), the score is 0
 * and the message carries the underlying error.
 */
export const violationCount = createScorer({
  name: "Violation Count",
  description: "Coordinator followed protocol (no direct edits, tests, or reservations)",
  scorer: ({ output }) => {
    let violationTotal: number;

    try {
      const parsedSession = JSON.parse(String(output)) as CoordinatorSession;
      violationTotal = parsedSession.events.filter(
        (evt) => evt.event_type === "VIOLATION"
      ).length;
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse CoordinatorSession: ${error}`,
      };
    }

    // A clean session gets its own message; the formula would yield 1.0 anyway.
    if (violationTotal === 0) {
      return {
        score: 1.0,
        message: "Perfect - 0 violations",
      };
    }

    // Each violation costs 0.2; five or more violations bottom out at 0.
    const penalized = 1.0 - violationTotal * 0.2;
    return {
      score: Math.max(0, penalized),
      message: `${violationTotal} violations detected`,
    };
  },
});
|
|
65
|
-
|
|
66
|
-
/**
 * Spawn Efficiency Scorer
 *
 * Measures whether workers were spawned for all planned subtasks.
 * Coordinators should delegate work, not do it themselves.
 *
 * Score: min(1.0, workers_spawned / subtasks_planned)
 *
 * The ratio is capped at 1.0 because more `worker_spawned` events than
 * planned subtasks would otherwise produce a score above the 0-1 range.
 *
 * Special cases:
 * - No `decomposition_complete` event: 1.0 if any worker was spawned
 *   (better than nothing), otherwise 0.
 * - `subtask_count` missing, zero, negative, or not a finite number: 0
 *   ("No subtasks planned").
 * - Output that cannot be parsed: 0, with the error in the message.
 */
export const spawnEfficiency = createScorer({
  name: "Spawn Efficiency",
  description: "Workers spawned for all subtasks (delegation ratio)",
  scorer: ({ output }) => {
    try {
      const session = JSON.parse(String(output)) as CoordinatorSession;

      // The decomposition_complete decision carries the planned subtask count.
      const decomp = session.events.find(
        (e) =>
          e.event_type === "DECISION" &&
          e.decision_type === "decomposition_complete"
      );

      // Every worker_spawned decision counts as one delegated subtask.
      const spawned = session.events.filter(
        (e) =>
          e.event_type === "DECISION" && e.decision_type === "worker_spawned"
      ).length;

      if (!decomp) {
        // Fallback: workers were spawned without a recorded plan; assume they're doing work.
        if (spawned > 0) {
          return {
            score: 1.0,
            message: `${spawned} workers spawned (no decomposition event)`,
          };
        }
        return {
          score: 0,
          message: "No decomposition event found",
        };
      }

      const subtaskCount = Number(
        (decomp.payload as { subtask_count?: number })?.subtask_count ?? 0
      );

      // Guard the division: a non-positive or non-finite plan size cannot be scored.
      if (!Number.isFinite(subtaskCount) || subtaskCount <= 0) {
        return {
          score: 0,
          message: "No subtasks planned",
        };
      }

      // Cap at 1.0 so extra spawns (retries, re-spawns) cannot inflate the score.
      const score = Math.min(1, spawned / subtaskCount);

      return {
        score,
        message: `${spawned}/${subtaskCount} workers spawned (${(score * 100).toFixed(0)}%)`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse CoordinatorSession: ${error}`,
      };
    }
  },
});
|
|
134
|
-
|
|
135
|
-
/**
 * Review Thoroughness Scorer
 *
 * Measures whether the coordinator reviewed worker output.
 * There should be a `review_completed` decision for every finished subtask.
 *
 * Score: min(1.0, reviews_completed / workers_finished)
 *
 * The ratio is capped at 1.0 because more `review_completed` events than
 * finished workers would otherwise produce a score above the 0-1 range.
 *
 * Special cases:
 * - No finished workers (no subtask_success / subtask_failed outcomes): 1.0,
 *   since there was nothing to review.
 * - Output that cannot be parsed: 0, with the error in the message.
 */
export const reviewThoroughness = createScorer({
  name: "Review Thoroughness",
  description: "Coordinator reviewed all worker output",
  scorer: ({ output }) => {
    try {
      const session = JSON.parse(String(output)) as CoordinatorSession;

      // A worker is "finished" once it reports success or failure.
      const finished = session.events.filter(
        (e) =>
          e.event_type === "OUTCOME" &&
          (e.outcome_type === "subtask_success" ||
            e.outcome_type === "subtask_failed")
      ).length;

      if (finished === 0) {
        return {
          score: 1.0,
          message: "No finished workers to review",
        };
      }

      const reviewed = session.events.filter(
        (e) =>
          e.event_type === "DECISION" && e.decision_type === "review_completed"
      ).length;

      // Cap at 1.0 so repeated reviews of the same worker cannot inflate the score.
      const score = Math.min(1, reviewed / finished);

      return {
        score,
        message: `${reviewed}/${finished} workers reviewed (${(score * 100).toFixed(0)}%)`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse CoordinatorSession: ${error}`,
      };
    }
  },
});
|
|
185
|
-
|
|
186
|
-
/**
 * Time to First Spawn Scorer
 *
 * Measures how quickly the coordinator spawned its first worker after the
 * `decomposition_complete` decision. Overthinking delays workers and blocks
 * progress.
 *
 * Normalization (delta = first worker_spawned timestamp - decomposition timestamp):
 * - < 60s: 1.0 (excellent)
 * - 60-300s: linear decay from 1.0 down to 0.0
 * - > 300s: 0.0 (way too slow)
 *
 * Special cases:
 * - No decomposition event, or no worker spawned: 0.
 * - Either timestamp cannot be parsed into a date: 0 (instead of a NaN score).
 * - Output that cannot be parsed: 0, with the error in the message.
 *
 * Score: normalized to 0-1 (faster is better)
 */
export const timeToFirstSpawn = createScorer({
  name: "Time to First Spawn",
  description: "Coordinator spawned workers quickly (no overthinking)",
  scorer: ({ output }) => {
    try {
      const session = JSON.parse(String(output)) as CoordinatorSession;

      const decomp = session.events.find(
        (e) =>
          e.event_type === "DECISION" &&
          e.decision_type === "decomposition_complete"
      );

      if (!decomp) {
        return {
          score: 0,
          message: "No decomposition event found",
        };
      }

      // First worker_spawned decision in event order.
      const firstSpawn = session.events.find(
        (e) =>
          e.event_type === "DECISION" && e.decision_type === "worker_spawned"
      );

      if (!firstSpawn) {
        return {
          score: 0,
          message: "No worker spawned",
        };
      }

      const decompTime = new Date(decomp.timestamp).getTime();
      const spawnTime = new Date(firstSpawn.timestamp).getTime();
      const deltaMs = spawnTime - decompTime;

      // An unparseable timestamp yields NaN, which would slip past both
      // threshold comparisons below and surface as a NaN score.
      if (Number.isNaN(deltaMs)) {
        return {
          score: 0,
          message: "Invalid event timestamp; cannot compute time to first spawn",
        };
      }

      // Normalize: < 60s = 1.0, > 300s = 0.0, linear in between
      const EXCELLENT_MS = 60_000;
      const POOR_MS = 300_000;

      let score: number;
      if (deltaMs < EXCELLENT_MS) {
        score = 1.0;
      } else if (deltaMs > POOR_MS) {
        score = 0.0;
      } else {
        // Linear decay from 1.0 (at 60s) to 0.0 (at 300s)
        score = 1.0 - (deltaMs - EXCELLENT_MS) / (POOR_MS - EXCELLENT_MS);
      }

      const seconds = Math.round(deltaMs / 1000);

      return {
        score,
        message: `First spawn after ${deltaMs}ms (${seconds}s)`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse CoordinatorSession: ${error}`,
      };
    }
  },
});
|
|
266
|
-
|
|
267
|
-
/**
 * Overall Discipline Scorer
 *
 * Weighted composite of all coordinator discipline metrics.
 *
 * Weights:
 * - Violations: 30% (most critical - breaking protocol)
 * - Spawn efficiency: 25% (delegation is key)
 * - Review thoroughness: 25% (quality gate)
 * - Time to first spawn: 20% (bias toward action)
 *
 * Each component score is sanitized before weighting: a missing or
 * non-finite value counts as 0, and values are clamped into [0, 1], so a
 * single misbehaving component cannot push the composite out of range or
 * turn it into NaN.
 *
 * Score: 0.0 to 1.0. If any component scorer throws, the score is 0 and the
 * message carries the error.
 */
export const overallDiscipline = createScorer({
  name: "Overall Coordinator Discipline",
  description: "Composite score for coordinator protocol adherence",
  scorer: async ({ output, expected, input }) => {
    try {
      // The component scorers are independent, so run them concurrently.
      const [violations, spawn, review, speed] = await Promise.all([
        violationCount({ output, expected, input }),
        spawnEfficiency({ output, expected, input }),
        reviewThoroughness({ output, expected, input }),
        timeToFirstSpawn({ output, expected, input }),
      ]);

      // `??` alone would let NaN and out-of-range values through.
      const toUnitInterval = (value: number | null | undefined): number =>
        typeof value === "number" && Number.isFinite(value)
          ? Math.min(1, Math.max(0, value))
          : 0;

      const components = [
        { label: "Violations", weight: 0.3, score: toUnitInterval(violations.score) },
        { label: "Spawn", weight: 0.25, score: toUnitInterval(spawn.score) },
        { label: "Review", weight: 0.25, score: toUnitInterval(review.score) },
        { label: "Speed", weight: 0.2, score: toUnitInterval(speed.score) },
      ];

      // Weighted average; the final clamp absorbs floating-point drift above 1.
      const totalScore = Math.min(
        1,
        components.reduce((sum, part) => sum + part.score * part.weight, 0)
      );

      const details = components
        .map((part) => `${part.label}: ${(part.score * 100).toFixed(0)}%`)
        .join(", ");

      return {
        score: totalScore,
        message: `Overall: ${(totalScore * 100).toFixed(0)}% (${details})`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to compute composite score: ${error}`,
      };
    }
  },
});
|
|
@@ -1,146 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Tests for decomposition scorers
|
|
3
|
-
*
|
|
4
|
-
* Uses Vitest (evalite's test runner), not Bun's test runner.
|
|
5
|
-
*
|
|
6
|
-
* Note: evalite's Score type only exposes `score`, not `message`.
|
|
7
|
-
* We test scores only - message testing requires accessing internal scorer.
|
|
8
|
-
*/
|
|
9
|
-
import { describe, expect, test } from "vitest";
|
|
10
|
-
import {
|
|
11
|
-
coverageCompleteness,
|
|
12
|
-
decompositionCoherence,
|
|
13
|
-
instructionClarity,
|
|
14
|
-
subtaskIndependence,
|
|
15
|
-
} from "./index.js";
|
|
16
|
-
|
|
17
|
-
// Heuristic (non-LLM) scorers: each test feeds a serialized decomposition to
// one scorer and checks only the numeric score.
describe("Heuristic Scorers", () => {
  // Epic shared by both fixtures.
  const authEpic = { title: "Add auth", description: "Add authentication" };

  // Three subtasks, each with a description and its own file - no overlap.
  const goodDecomposition = JSON.stringify({
    epic: authEpic,
    subtasks: [
      {
        title: "Add login form component",
        description: "Create React component for login with email/password",
        files: ["src/components/LoginForm.tsx"],
      },
      {
        title: "Add auth API routes",
        description: "Create API endpoints for login/logout/session",
        files: ["src/api/auth.ts"],
      },
      {
        title: "Add auth middleware",
        description: "Create middleware to protect routes",
        files: ["src/middleware/auth.ts"],
      },
    ],
  });

  // Two subtasks that both claim src/auth.ts - a file conflict.
  const sharedFile = "src/auth.ts";
  const conflictingDecomposition = JSON.stringify({
    epic: authEpic,
    subtasks: [
      { title: "Add login", files: [sharedFile] },
      { title: "Add logout", files: [sharedFile] },
    ],
  });

  test("subtaskIndependence scores 1.0 for no conflicts", async () => {
    const { score } = await subtaskIndependence({
      output: goodDecomposition,
      expected: undefined,
      input: {},
    });

    expect(score).toBe(1);
  });

  test("subtaskIndependence scores 0 for file conflicts", async () => {
    const { score } = await subtaskIndependence({
      output: conflictingDecomposition,
      expected: undefined,
      input: {},
    });

    expect(score).toBe(0);
  });

  test("instructionClarity scores higher for detailed subtasks", async () => {
    const { score } = await instructionClarity({
      output: goodDecomposition,
      expected: undefined,
      input: {},
    });

    expect(score).toBeGreaterThan(0.7);
  });

  test("coverageCompleteness checks subtask count", async () => {
    // The good fixture has three subtasks, inside the 2-5 range given here.
    const { score } = await coverageCompleteness({
      output: goodDecomposition,
      expected: { minSubtasks: 2, maxSubtasks: 5 },
      input: {},
    });

    expect(score).toBe(1);
  });
});
|
|
89
|
-
|
|
90
|
-
// LLM-backed scorer tests. These need AI_GATEWAY_API_KEY; without it they are
// reported as skipped rather than silently passing with zero assertions.
describe("LLM-as-Judge Scorer", () => {
  const hasApiKey = !!process.env.AI_GATEWAY_API_KEY;

  // Timeout passed to each LLM test, in milliseconds.
  const LLM_TEST_TIMEOUT_MS = 30_000;

  // Single skip condition shared by every test in this suite.
  const llmTest = test.skipIf(!hasApiKey);

  llmTest(
    "decompositionCoherence returns valid score",
    async () => {
      const decomposition = JSON.stringify({
        epic: { title: "Add auth", description: "Add authentication" },
        subtasks: [
          {
            title: "Add login form",
            description: "Create login UI",
            files: ["src/LoginForm.tsx"],
          },
          {
            title: "Add auth API",
            description: "Create auth endpoints",
            files: ["src/api/auth.ts"],
          },
        ],
      });

      const result = await decompositionCoherence({
        output: decomposition,
        expected: undefined,
        input: { task: "Add user authentication with login/logout" },
      });

      // Only the range is asserted; the exact value depends on the model.
      expect(result.score).toBeGreaterThanOrEqual(0);
      expect(result.score).toBeLessThanOrEqual(1);
    },
    LLM_TEST_TIMEOUT_MS,
  );

  llmTest(
    "decompositionCoherence scores invalid decomposition low",
    async () => {
      const result = await decompositionCoherence({
        output: "not valid json at all {{{",
        expected: undefined,
        input: {},
      });

      // LLM should recognize garbage input and score it very low
      // (0 or close to 0, not 0.5 fallback)
      expect(result.score).toBeLessThanOrEqual(0.2);
    },
    LLM_TEST_TIMEOUT_MS,
  );
});
|