opencode-swarm-plugin 0.43.0 → 0.44.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cass.characterization.test.ts +422 -0
- package/bin/swarm.serve.test.ts +6 -4
- package/bin/swarm.test.ts +68 -0
- package/bin/swarm.ts +81 -8
- package/dist/compaction-prompt-scoring.js +139 -0
- package/dist/contributor-tools.d.ts +42 -0
- package/dist/contributor-tools.d.ts.map +1 -0
- package/dist/eval-capture.js +12811 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +12 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7728 -62590
- package/dist/plugin.js +23833 -78695
- package/dist/sessions/agent-discovery.d.ts +59 -0
- package/dist/sessions/agent-discovery.d.ts.map +1 -0
- package/dist/sessions/index.d.ts +10 -0
- package/dist/sessions/index.d.ts.map +1 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm-review.d.ts.map +1 -1
- package/package.json +17 -5
- package/.changeset/swarm-insights-data-layer.md +0 -63
- package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
- package/.hive/analysis/session-data-quality-audit.md +0 -320
- package/.hive/eval-results.json +0 -483
- package/.hive/issues.jsonl +0 -138
- package/.hive/memories.jsonl +0 -729
- package/.opencode/eval-history.jsonl +0 -327
- package/.turbo/turbo-build.log +0 -9
- package/CHANGELOG.md +0 -2255
- package/SCORER-ANALYSIS.md +0 -598
- package/docs/analysis/subagent-coordination-patterns.md +0 -902
- package/docs/analysis-socratic-planner-pattern.md +0 -504
- package/docs/planning/ADR-001-monorepo-structure.md +0 -171
- package/docs/planning/ADR-002-package-extraction.md +0 -393
- package/docs/planning/ADR-003-performance-improvements.md +0 -451
- package/docs/planning/ADR-004-message-queue-features.md +0 -187
- package/docs/planning/ADR-005-devtools-observability.md +0 -202
- package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
- package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
- package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
- package/docs/planning/ROADMAP.md +0 -368
- package/docs/semantic-memory-cli-syntax.md +0 -123
- package/docs/swarm-mail-architecture.md +0 -1147
- package/docs/testing/context-recovery-test.md +0 -470
- package/evals/ARCHITECTURE.md +0 -1189
- package/evals/README.md +0 -768
- package/evals/compaction-prompt.eval.ts +0 -149
- package/evals/compaction-resumption.eval.ts +0 -289
- package/evals/coordinator-behavior.eval.ts +0 -307
- package/evals/coordinator-session.eval.ts +0 -154
- package/evals/evalite.config.ts.bak +0 -15
- package/evals/example.eval.ts +0 -31
- package/evals/fixtures/compaction-cases.ts +0 -350
- package/evals/fixtures/compaction-prompt-cases.ts +0 -311
- package/evals/fixtures/coordinator-sessions.ts +0 -328
- package/evals/fixtures/decomposition-cases.ts +0 -105
- package/evals/lib/compaction-loader.test.ts +0 -248
- package/evals/lib/compaction-loader.ts +0 -320
- package/evals/lib/data-loader.evalite-test.ts +0 -289
- package/evals/lib/data-loader.test.ts +0 -345
- package/evals/lib/data-loader.ts +0 -281
- package/evals/lib/llm.ts +0 -115
- package/evals/scorers/compaction-prompt-scorers.ts +0 -145
- package/evals/scorers/compaction-scorers.ts +0 -305
- package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
- package/evals/scorers/coordinator-discipline.ts +0 -325
- package/evals/scorers/index.test.ts +0 -146
- package/evals/scorers/index.ts +0 -328
- package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
- package/evals/scorers/outcome-scorers.ts +0 -349
- package/evals/swarm-decomposition.eval.ts +0 -121
- package/examples/commands/swarm.md +0 -745
- package/examples/plugin-wrapper-template.ts +0 -2426
- package/examples/skills/hive-workflow/SKILL.md +0 -212
- package/examples/skills/skill-creator/SKILL.md +0 -223
- package/examples/skills/swarm-coordination/SKILL.md +0 -292
- package/global-skills/cli-builder/SKILL.md +0 -344
- package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
- package/global-skills/learning-systems/SKILL.md +0 -644
- package/global-skills/skill-creator/LICENSE.txt +0 -202
- package/global-skills/skill-creator/SKILL.md +0 -352
- package/global-skills/skill-creator/references/output-patterns.md +0 -82
- package/global-skills/skill-creator/references/workflows.md +0 -28
- package/global-skills/swarm-coordination/SKILL.md +0 -995
- package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
- package/global-skills/swarm-coordination/references/strategies.md +0 -138
- package/global-skills/system-design/SKILL.md +0 -213
- package/global-skills/testing-patterns/SKILL.md +0 -430
- package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
- package/opencode-swarm-plugin-0.30.7.tgz +0 -0
- package/opencode-swarm-plugin-0.31.0.tgz +0 -0
- package/scripts/cleanup-test-memories.ts +0 -346
- package/scripts/init-skill.ts +0 -222
- package/scripts/migrate-unknown-sessions.ts +0 -349
- package/scripts/validate-skill.ts +0 -204
- package/src/agent-mail.ts +0 -1724
- package/src/anti-patterns.test.ts +0 -1167
- package/src/anti-patterns.ts +0 -448
- package/src/compaction-capture.integration.test.ts +0 -257
- package/src/compaction-hook.test.ts +0 -838
- package/src/compaction-hook.ts +0 -1204
- package/src/compaction-observability.integration.test.ts +0 -139
- package/src/compaction-observability.test.ts +0 -187
- package/src/compaction-observability.ts +0 -324
- package/src/compaction-prompt-scorers.test.ts +0 -475
- package/src/compaction-prompt-scoring.ts +0 -300
- package/src/dashboard.test.ts +0 -611
- package/src/dashboard.ts +0 -462
- package/src/error-enrichment.test.ts +0 -403
- package/src/error-enrichment.ts +0 -219
- package/src/eval-capture.test.ts +0 -1015
- package/src/eval-capture.ts +0 -929
- package/src/eval-gates.test.ts +0 -306
- package/src/eval-gates.ts +0 -218
- package/src/eval-history.test.ts +0 -508
- package/src/eval-history.ts +0 -214
- package/src/eval-learning.test.ts +0 -378
- package/src/eval-learning.ts +0 -360
- package/src/eval-runner.test.ts +0 -223
- package/src/eval-runner.ts +0 -402
- package/src/export-tools.test.ts +0 -476
- package/src/export-tools.ts +0 -257
- package/src/hive.integration.test.ts +0 -2241
- package/src/hive.ts +0 -1628
- package/src/index.ts +0 -935
- package/src/learning.integration.test.ts +0 -1815
- package/src/learning.ts +0 -1079
- package/src/logger.test.ts +0 -189
- package/src/logger.ts +0 -135
- package/src/mandate-promotion.test.ts +0 -473
- package/src/mandate-promotion.ts +0 -239
- package/src/mandate-storage.integration.test.ts +0 -601
- package/src/mandate-storage.test.ts +0 -578
- package/src/mandate-storage.ts +0 -794
- package/src/mandates.ts +0 -540
- package/src/memory-tools.test.ts +0 -195
- package/src/memory-tools.ts +0 -344
- package/src/memory.integration.test.ts +0 -334
- package/src/memory.test.ts +0 -158
- package/src/memory.ts +0 -527
- package/src/model-selection.test.ts +0 -188
- package/src/model-selection.ts +0 -68
- package/src/observability-tools.test.ts +0 -359
- package/src/observability-tools.ts +0 -871
- package/src/output-guardrails.test.ts +0 -438
- package/src/output-guardrails.ts +0 -381
- package/src/pattern-maturity.test.ts +0 -1160
- package/src/pattern-maturity.ts +0 -525
- package/src/planning-guardrails.test.ts +0 -491
- package/src/planning-guardrails.ts +0 -438
- package/src/plugin.ts +0 -23
- package/src/post-compaction-tracker.test.ts +0 -251
- package/src/post-compaction-tracker.ts +0 -237
- package/src/query-tools.test.ts +0 -636
- package/src/query-tools.ts +0 -324
- package/src/rate-limiter.integration.test.ts +0 -466
- package/src/rate-limiter.ts +0 -774
- package/src/replay-tools.test.ts +0 -496
- package/src/replay-tools.ts +0 -240
- package/src/repo-crawl.integration.test.ts +0 -441
- package/src/repo-crawl.ts +0 -610
- package/src/schemas/cell-events.test.ts +0 -347
- package/src/schemas/cell-events.ts +0 -807
- package/src/schemas/cell.ts +0 -257
- package/src/schemas/evaluation.ts +0 -166
- package/src/schemas/index.test.ts +0 -199
- package/src/schemas/index.ts +0 -286
- package/src/schemas/mandate.ts +0 -232
- package/src/schemas/swarm-context.ts +0 -115
- package/src/schemas/task.ts +0 -161
- package/src/schemas/worker-handoff.test.ts +0 -302
- package/src/schemas/worker-handoff.ts +0 -131
- package/src/skills.integration.test.ts +0 -1192
- package/src/skills.test.ts +0 -643
- package/src/skills.ts +0 -1549
- package/src/storage.integration.test.ts +0 -341
- package/src/storage.ts +0 -884
- package/src/structured.integration.test.ts +0 -817
- package/src/structured.test.ts +0 -1046
- package/src/structured.ts +0 -762
- package/src/swarm-decompose.test.ts +0 -188
- package/src/swarm-decompose.ts +0 -1302
- package/src/swarm-deferred.integration.test.ts +0 -157
- package/src/swarm-deferred.test.ts +0 -38
- package/src/swarm-insights.test.ts +0 -214
- package/src/swarm-insights.ts +0 -459
- package/src/swarm-mail.integration.test.ts +0 -970
- package/src/swarm-mail.ts +0 -739
- package/src/swarm-orchestrate.integration.test.ts +0 -282
- package/src/swarm-orchestrate.test.ts +0 -548
- package/src/swarm-orchestrate.ts +0 -3084
- package/src/swarm-prompts.test.ts +0 -1270
- package/src/swarm-prompts.ts +0 -2077
- package/src/swarm-research.integration.test.ts +0 -701
- package/src/swarm-research.test.ts +0 -698
- package/src/swarm-research.ts +0 -472
- package/src/swarm-review.integration.test.ts +0 -285
- package/src/swarm-review.test.ts +0 -879
- package/src/swarm-review.ts +0 -709
- package/src/swarm-strategies.ts +0 -407
- package/src/swarm-worktree.test.ts +0 -501
- package/src/swarm-worktree.ts +0 -575
- package/src/swarm.integration.test.ts +0 -2377
- package/src/swarm.ts +0 -38
- package/src/tool-adapter.integration.test.ts +0 -1221
- package/src/tool-availability.ts +0 -461
- package/tsconfig.json +0 -28
package/evals/scorers/index.ts
DELETED
|
@@ -1,328 +0,0 @@
|
|
|
1
|
-
import { createScorer } from "evalite";
|
|
2
|
-
import { generateText, gateway } from "ai";
|
|
3
|
-
import type { GatewayModelId } from "ai";
|
|
4
|
-
import type { CellTree } from "../../src/schemas/index.js";
|
|
5
|
-
|
|
6
|
-
const JUDGE_MODEL: GatewayModelId = "anthropic/claude-haiku-4-5";
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* Custom scorers for evaluating swarm task decomposition quality
|
|
10
|
-
*/
|
|
11
|
-
|
|
12
|
-
/**
|
|
13
|
-
* Checks that no files appear in multiple subtasks
|
|
14
|
-
*
|
|
15
|
-
* Independent subtasks are critical for parallel execution.
|
|
16
|
-
* File conflicts cause merge conflicts and coordination overhead.
|
|
17
|
-
*
|
|
18
|
-
* Score: 1.0 if no conflicts, 0.0 if conflicts found
|
|
19
|
-
*/
|
|
20
|
-
export const subtaskIndependence = createScorer({
|
|
21
|
-
name: "Subtask Independence",
|
|
22
|
-
description: "Checks that no files appear in multiple subtasks",
|
|
23
|
-
scorer: ({ output }) => {
|
|
24
|
-
try {
|
|
25
|
-
const beadTree = JSON.parse(String(output)) as CellTree;
|
|
26
|
-
const fileMap = new Map<string, number>();
|
|
27
|
-
|
|
28
|
-
// Track which files appear in which subtasks
|
|
29
|
-
beadTree.subtasks.forEach((subtask) => {
|
|
30
|
-
subtask.files?.forEach((file) => {
|
|
31
|
-
const count = fileMap.get(file) || 0;
|
|
32
|
-
fileMap.set(file, count + 1);
|
|
33
|
-
});
|
|
34
|
-
});
|
|
35
|
-
|
|
36
|
-
// Check for conflicts
|
|
37
|
-
const conflicts = Array.from(fileMap.entries()).filter(
|
|
38
|
-
([_, count]) => count > 1,
|
|
39
|
-
);
|
|
40
|
-
|
|
41
|
-
if (conflicts.length > 0) {
|
|
42
|
-
return {
|
|
43
|
-
score: 0,
|
|
44
|
-
message: `File conflicts found: ${conflicts.map(([f]) => f).join(", ")}`,
|
|
45
|
-
};
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
return {
|
|
49
|
-
score: 1,
|
|
50
|
-
message: "No file conflicts - subtasks are independent",
|
|
51
|
-
};
|
|
52
|
-
} catch (error) {
|
|
53
|
-
return {
|
|
54
|
-
score: 0,
|
|
55
|
-
message: `Failed to parse CellTree: ${error}`,
|
|
56
|
-
};
|
|
57
|
-
}
|
|
58
|
-
},
|
|
59
|
-
});
|
|
60
|
-
|
|
61
|
-
// ============================================================================
|
|
62
|
-
// Outcome-based scorers
|
|
63
|
-
// ============================================================================
|
|
64
|
-
|
|
65
|
-
export {
|
|
66
|
-
executionSuccess,
|
|
67
|
-
timeBalance,
|
|
68
|
-
scopeAccuracy,
|
|
69
|
-
scopeDrift,
|
|
70
|
-
noRework,
|
|
71
|
-
} from "./outcome-scorers.js";
|
|
72
|
-
|
|
73
|
-
// ============================================================================
|
|
74
|
-
// Compaction-specific scorers
|
|
75
|
-
// ============================================================================
|
|
76
|
-
|
|
77
|
-
export {
|
|
78
|
-
confidenceAccuracy,
|
|
79
|
-
contextInjectionCorrectness,
|
|
80
|
-
requiredPatternsPresent,
|
|
81
|
-
forbiddenPatternsAbsent,
|
|
82
|
-
compactionQuality,
|
|
83
|
-
} from "./compaction-scorers.js";
|
|
84
|
-
|
|
85
|
-
// ============================================================================
|
|
86
|
-
// Coordinator discipline scorers
|
|
87
|
-
// ============================================================================
|
|
88
|
-
|
|
89
|
-
export {
|
|
90
|
-
violationCount,
|
|
91
|
-
spawnEfficiency,
|
|
92
|
-
reviewThoroughness,
|
|
93
|
-
timeToFirstSpawn,
|
|
94
|
-
overallDiscipline,
|
|
95
|
-
} from "./coordinator-discipline.js";
|
|
96
|
-
|
|
97
|
-
/**
|
|
98
|
-
* Checks that subtasks cover the full task scope
|
|
99
|
-
*
|
|
100
|
-
* Incomplete coverage means:
|
|
101
|
-
* - Missing functionality
|
|
102
|
-
* - Follow-up work required
|
|
103
|
-
* - Task not actually complete
|
|
104
|
-
*
|
|
105
|
-
* Score: ratio of expected files covered (0.0 to 1.0)
|
|
106
|
-
* If no expected files specified, checks that subtasks exist
|
|
107
|
-
*/
|
|
108
|
-
export const coverageCompleteness = createScorer({
|
|
109
|
-
name: "Coverage Completeness",
|
|
110
|
-
description: "Checks that subtasks cover the full task scope",
|
|
111
|
-
scorer: ({ output, expected }) => {
|
|
112
|
-
try {
|
|
113
|
-
const beadTree = JSON.parse(String(output)) as CellTree;
|
|
114
|
-
|
|
115
|
-
// If expected files specified, check coverage
|
|
116
|
-
const expectedData = expected as Record<string, unknown> | undefined;
|
|
117
|
-
if (expectedData && Array.isArray(expectedData.requiredFiles)) {
|
|
118
|
-
const allFiles = new Set(
|
|
119
|
-
beadTree.subtasks.flatMap((st) => st.files || []),
|
|
120
|
-
);
|
|
121
|
-
|
|
122
|
-
const requiredFiles = expectedData.requiredFiles as string[];
|
|
123
|
-
const coveredFiles = requiredFiles.filter((f) => allFiles.has(f));
|
|
124
|
-
const coverage = coveredFiles.length / requiredFiles.length;
|
|
125
|
-
|
|
126
|
-
return {
|
|
127
|
-
score: coverage,
|
|
128
|
-
message: `${coveredFiles.length}/${requiredFiles.length} required files covered`,
|
|
129
|
-
};
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
// Otherwise, check min/max subtask count
|
|
133
|
-
const minSubtasks = (expectedData?.minSubtasks as number) || 1;
|
|
134
|
-
const maxSubtasks = (expectedData?.maxSubtasks as number) || 10;
|
|
135
|
-
const count = beadTree.subtasks.length;
|
|
136
|
-
|
|
137
|
-
if (count < minSubtasks) {
|
|
138
|
-
return {
|
|
139
|
-
score: 0,
|
|
140
|
-
message: `Too few subtasks: ${count} < ${minSubtasks}`,
|
|
141
|
-
};
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
if (count > maxSubtasks) {
|
|
145
|
-
return {
|
|
146
|
-
score: 0.5,
|
|
147
|
-
message: `Too many subtasks: ${count} > ${maxSubtasks} (over-decomposed)`,
|
|
148
|
-
};
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
return {
|
|
152
|
-
score: 1,
|
|
153
|
-
message: `Good subtask count: ${count} (${minSubtasks}-${maxSubtasks})`,
|
|
154
|
-
};
|
|
155
|
-
} catch (error) {
|
|
156
|
-
return {
|
|
157
|
-
score: 0,
|
|
158
|
-
message: `Failed to parse CellTree: ${error}`,
|
|
159
|
-
};
|
|
160
|
-
}
|
|
161
|
-
},
|
|
162
|
-
});
|
|
163
|
-
|
|
164
|
-
/**
|
|
165
|
-
* Checks that each subtask has clear, actionable instructions
|
|
166
|
-
*
|
|
167
|
-
* Vague instructions lead to:
|
|
168
|
-
* - Agent confusion and blocking
|
|
169
|
-
* - Incorrect implementations
|
|
170
|
-
* - Need for coordinator intervention
|
|
171
|
-
*
|
|
172
|
-
* Score: Average of per-subtask instruction quality
|
|
173
|
-
*/
|
|
174
|
-
export const instructionClarity = createScorer({
|
|
175
|
-
name: "Instruction Clarity",
|
|
176
|
-
description: "Checks that subtasks have clear, actionable instructions",
|
|
177
|
-
scorer: ({ output }) => {
|
|
178
|
-
try {
|
|
179
|
-
const beadTree = JSON.parse(String(output)) as CellTree;
|
|
180
|
-
|
|
181
|
-
if (beadTree.subtasks.length === 0) {
|
|
182
|
-
return {
|
|
183
|
-
score: 0,
|
|
184
|
-
message: "No subtasks found",
|
|
185
|
-
};
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
// Check each subtask for clarity signals
|
|
189
|
-
const scores = beadTree.subtasks.map((subtask) => {
|
|
190
|
-
let score = 0.5; // baseline
|
|
191
|
-
|
|
192
|
-
// Has description?
|
|
193
|
-
if (subtask.description && subtask.description.length > 20) {
|
|
194
|
-
score += 0.2;
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
// Has files specified?
|
|
198
|
-
if (subtask.files && subtask.files.length > 0) {
|
|
199
|
-
score += 0.2;
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
// Title is specific (not generic)?
|
|
203
|
-
const genericWords = ["update", "fix", "add", "change", "modify"];
|
|
204
|
-
const titleLower = subtask.title.toLowerCase();
|
|
205
|
-
const isGeneric = genericWords.some(
|
|
206
|
-
(word) => titleLower === word || titleLower.startsWith(`${word} `),
|
|
207
|
-
);
|
|
208
|
-
if (!isGeneric) {
|
|
209
|
-
score += 0.1;
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
return Math.min(1.0, score);
|
|
213
|
-
});
|
|
214
|
-
|
|
215
|
-
const avgScore = scores.reduce((sum, s) => sum + s, 0) / scores.length;
|
|
216
|
-
|
|
217
|
-
return {
|
|
218
|
-
score: avgScore,
|
|
219
|
-
message: `Average instruction clarity: ${(avgScore * 100).toFixed(0)}%`,
|
|
220
|
-
};
|
|
221
|
-
} catch (error) {
|
|
222
|
-
return {
|
|
223
|
-
score: 0,
|
|
224
|
-
message: `Failed to parse CellTree: ${error}`,
|
|
225
|
-
};
|
|
226
|
-
}
|
|
227
|
-
},
|
|
228
|
-
});
|
|
229
|
-
|
|
230
|
-
// ============================================================================
|
|
231
|
-
// LLM-as-Judge Scorers
|
|
232
|
-
// ============================================================================
|
|
233
|
-
|
|
234
|
-
/**
|
|
235
|
-
* LLM-as-judge scorer for decomposition coherence
|
|
236
|
-
*
|
|
237
|
-
* Uses Claude Haiku to evaluate whether subtasks are truly independent,
|
|
238
|
-
* well-scoped, and complete. This catches nuances that heuristics miss:
|
|
239
|
-
* - Semantic dependencies between subtasks
|
|
240
|
-
* - Scope that's too big or too trivial
|
|
241
|
-
* - Missing pieces that would block completion
|
|
242
|
-
*
|
|
243
|
-
* Only use for decomposition evals - this is where it matters.
|
|
244
|
-
*/
|
|
245
|
-
export const decompositionCoherence = createScorer({
|
|
246
|
-
name: "Decomposition Coherence (LLM Judge)",
|
|
247
|
-
description:
|
|
248
|
-
"LLM evaluates whether subtasks are truly independent and well-scoped",
|
|
249
|
-
scorer: async ({ output, input }) => {
|
|
250
|
-
try {
|
|
251
|
-
const decomposition =
|
|
252
|
-
typeof output === "string" ? output : JSON.stringify(output, null, 2);
|
|
253
|
-
|
|
254
|
-
// Get original task from input if available
|
|
255
|
-
const originalTask =
|
|
256
|
-
typeof input === "object" && input !== null && "task" in input
|
|
257
|
-
? String((input as { task: string }).task)
|
|
258
|
-
: "Unknown task";
|
|
259
|
-
|
|
260
|
-
const { text } = await generateText({
|
|
261
|
-
model: gateway(JUDGE_MODEL),
|
|
262
|
-
prompt: `You are evaluating a task decomposition for parallel agent execution.
|
|
263
|
-
|
|
264
|
-
ORIGINAL TASK:
|
|
265
|
-
${originalTask}
|
|
266
|
-
|
|
267
|
-
DECOMPOSITION:
|
|
268
|
-
${decomposition}
|
|
269
|
-
|
|
270
|
-
Evaluate on these criteria (be harsh - bad decompositions waste expensive parallel work):
|
|
271
|
-
|
|
272
|
-
1. INDEPENDENCE (25%): Can subtasks truly run in parallel? Look for:
|
|
273
|
-
- Shared state dependencies (one writes, another reads)
|
|
274
|
-
- Ordering requirements hidden in the task descriptions
|
|
275
|
-
- Shared files that will cause merge conflicts
|
|
276
|
-
|
|
277
|
-
2. SCOPE (25%): Is each subtask right-sized?
|
|
278
|
-
- Too big: Should be split further (>2 hours of work)
|
|
279
|
-
- Too small: Trivial tasks that waste agent spawn overhead
|
|
280
|
-
- Goldilocks: 30min-2hr of focused work
|
|
281
|
-
|
|
282
|
-
3. COMPLETENESS (25%): Does the sum equal the whole?
|
|
283
|
-
- Missing pieces that would leave the task incomplete
|
|
284
|
-
- Gaps between subtasks (who handles X?)
|
|
285
|
-
- Implicit work not captured in any subtask
|
|
286
|
-
|
|
287
|
-
4. CLARITY (25%): Would an agent know what to do?
|
|
288
|
-
- Vague descriptions that invite interpretation
|
|
289
|
-
- Missing context needed to start work
|
|
290
|
-
- Ambiguous boundaries between subtasks
|
|
291
|
-
|
|
292
|
-
Return ONLY valid JSON (no markdown, no explanation):
|
|
293
|
-
{"score": <0-100>, "issues": ["issue1", "issue2"], "strengths": ["strength1"]}`,
|
|
294
|
-
maxOutputTokens: 512,
|
|
295
|
-
});
|
|
296
|
-
|
|
297
|
-
// Parse JSON response - handle potential markdown wrapping
|
|
298
|
-
let jsonText = text.trim();
|
|
299
|
-
if (jsonText.startsWith("```")) {
|
|
300
|
-
jsonText = jsonText.replace(/```json?\n?/g, "").replace(/```$/g, "");
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
const result = JSON.parse(jsonText) as {
|
|
304
|
-
score: number;
|
|
305
|
-
issues: string[];
|
|
306
|
-
strengths?: string[];
|
|
307
|
-
};
|
|
308
|
-
|
|
309
|
-
const issueText =
|
|
310
|
-
result.issues.length > 0 ? result.issues.join("; ") : "No issues";
|
|
311
|
-
const strengthText =
|
|
312
|
-
result.strengths && result.strengths.length > 0
|
|
313
|
-
? ` | Strengths: ${result.strengths.join("; ")}`
|
|
314
|
-
: "";
|
|
315
|
-
|
|
316
|
-
return {
|
|
317
|
-
score: result.score / 100,
|
|
318
|
-
message: `${issueText}${strengthText}`,
|
|
319
|
-
};
|
|
320
|
-
} catch (error) {
|
|
321
|
-
// Don't fail the eval if judge fails - return neutral score
|
|
322
|
-
return {
|
|
323
|
-
score: 0.5,
|
|
324
|
-
message: `LLM judge error: ${error instanceof Error ? error.message : String(error)}`,
|
|
325
|
-
};
|
|
326
|
-
}
|
|
327
|
-
},
|
|
328
|
-
});
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Outcome-based Scorers Tests
|
|
3
|
-
*
|
|
4
|
-
* Tests the 5 new outcome-based scorers by verifying their exports.
|
|
5
|
-
* Full functional testing happens via Evalite integration.
|
|
6
|
-
*/
|
|
7
|
-
import { describe, it, expect } from "bun:test";
|
|
8
|
-
|
|
9
|
-
describe("Outcome Scorers", () => {
|
|
10
|
-
it("exports all 5 outcome scorers from outcome-scorers.ts", async () => {
|
|
11
|
-
const module = await import("./outcome-scorers.js");
|
|
12
|
-
expect(module.executionSuccess).toBeDefined();
|
|
13
|
-
expect(module.timeBalance).toBeDefined();
|
|
14
|
-
expect(module.scopeAccuracy).toBeDefined();
|
|
15
|
-
expect(module.scopeDrift).toBeDefined();
|
|
16
|
-
expect(module.noRework).toBeDefined();
|
|
17
|
-
});
|
|
18
|
-
|
|
19
|
-
it("re-exports all 5 outcome scorers from index.ts", async () => {
|
|
20
|
-
const indexModule = await import("./index.js");
|
|
21
|
-
expect(indexModule.executionSuccess).toBeDefined();
|
|
22
|
-
expect(indexModule.timeBalance).toBeDefined();
|
|
23
|
-
expect(indexModule.scopeAccuracy).toBeDefined();
|
|
24
|
-
expect(indexModule.scopeDrift).toBeDefined();
|
|
25
|
-
expect(indexModule.noRework).toBeDefined();
|
|
26
|
-
});
|
|
27
|
-
});
|
|
@@ -1,349 +0,0 @@
|
|
|
1
|
-
import { createScorer } from "evalite";
|
|
2
|
-
import type { EvalRecord } from "../../src/eval-capture.js";
|
|
3
|
-
|
|
4
|
-
/**
|
|
5
|
-
* Outcome-based scorers for evaluating decomposition quality
|
|
6
|
-
*
|
|
7
|
-
* These scorers evaluate based on ACTUAL execution outcomes,
|
|
8
|
-
* not just the structure of the decomposition.
|
|
9
|
-
*
|
|
10
|
-
* Requires EvalRecord with outcomes populated.
|
|
11
|
-
*/
|
|
12
|
-
|
|
13
|
-
/**
|
|
14
|
-
* Execution Success Scorer
|
|
15
|
-
*
|
|
16
|
-
* Measures whether all subtasks succeeded without errors.
|
|
17
|
-
* This is the ultimate measure - did the decomposition actually work?
|
|
18
|
-
*
|
|
19
|
-
* Score: 1.0 if all outcomes.success === true, 0.0 otherwise
|
|
20
|
-
*/
|
|
21
|
-
export const executionSuccess = createScorer({
|
|
22
|
-
name: "Execution Success",
|
|
23
|
-
description: "All subtasks completed successfully without errors",
|
|
24
|
-
scorer: ({ output }) => {
|
|
25
|
-
try {
|
|
26
|
-
const record = JSON.parse(String(output)) as EvalRecord;
|
|
27
|
-
|
|
28
|
-
// Check if outcomes exist
|
|
29
|
-
if (!record.outcomes || record.outcomes.length === 0) {
|
|
30
|
-
return {
|
|
31
|
-
score: 0,
|
|
32
|
-
message: "No outcome data available",
|
|
33
|
-
};
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
// Check if all subtasks succeeded
|
|
37
|
-
const allSucceeded = record.outcomes.every((outcome) => outcome.success);
|
|
38
|
-
|
|
39
|
-
if (allSucceeded) {
|
|
40
|
-
return {
|
|
41
|
-
score: 1,
|
|
42
|
-
message: `All ${record.outcomes.length} subtasks succeeded`,
|
|
43
|
-
};
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
// Report failures
|
|
47
|
-
const failures = record.outcomes.filter((o) => !o.success);
|
|
48
|
-
const failureList = failures.map((f) => f.title || f.bead_id).join(", ");
|
|
49
|
-
|
|
50
|
-
return {
|
|
51
|
-
score: 0,
|
|
52
|
-
message: `${failures.length}/${record.outcomes.length} subtasks failed: ${failureList}`,
|
|
53
|
-
};
|
|
54
|
-
} catch (error) {
|
|
55
|
-
return {
|
|
56
|
-
score: 0,
|
|
57
|
-
message: `Failed to parse EvalRecord: ${error}`,
|
|
58
|
-
};
|
|
59
|
-
}
|
|
60
|
-
},
|
|
61
|
-
});
|
|
62
|
-
|
|
63
|
-
/**
|
|
64
|
-
* Time Balance Scorer
|
|
65
|
-
*
|
|
66
|
-
* Measures how evenly balanced the work was across subtasks.
|
|
67
|
-
* Unbalanced work means some agents finish early while others are bottlenecked.
|
|
68
|
-
*
|
|
69
|
-
* Score: 1.0 if max/min ratio < 2.0 (well balanced)
|
|
70
|
-
* 0.5 if ratio < 4.0 (moderately balanced)
|
|
71
|
-
* 0.0 if ratio >= 4.0 (poorly balanced)
|
|
72
|
-
*/
|
|
73
|
-
export const timeBalance = createScorer({
|
|
74
|
-
name: "Time Balance",
|
|
75
|
-
description: "Work is evenly distributed across subtasks (max/min duration)",
|
|
76
|
-
scorer: ({ output }) => {
|
|
77
|
-
try {
|
|
78
|
-
const record = JSON.parse(String(output)) as EvalRecord;
|
|
79
|
-
|
|
80
|
-
// Check if outcomes exist
|
|
81
|
-
if (!record.outcomes || record.outcomes.length === 0) {
|
|
82
|
-
return {
|
|
83
|
-
score: 0,
|
|
84
|
-
message: "No outcome data available",
|
|
85
|
-
};
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
// Need at least 2 subtasks to measure balance
|
|
89
|
-
if (record.outcomes.length < 2) {
|
|
90
|
-
return {
|
|
91
|
-
score: 1,
|
|
92
|
-
message: "Only one subtask - perfect balance",
|
|
93
|
-
};
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
// Get durations (filter out zeros)
|
|
97
|
-
const durations = record.outcomes
|
|
98
|
-
.map((o) => o.duration_ms)
|
|
99
|
-
.filter((d) => d > 0);
|
|
100
|
-
|
|
101
|
-
if (durations.length === 0) {
|
|
102
|
-
return {
|
|
103
|
-
score: 0,
|
|
104
|
-
message: "No duration data available",
|
|
105
|
-
};
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
const maxDuration = Math.max(...durations);
|
|
109
|
-
const minDuration = Math.min(...durations);
|
|
110
|
-
const ratio = maxDuration / minDuration;
|
|
111
|
-
|
|
112
|
-
// Score based on ratio
|
|
113
|
-
let score: number;
|
|
114
|
-
let assessment: string;
|
|
115
|
-
|
|
116
|
-
if (ratio < 2.0) {
|
|
117
|
-
score = 1.0;
|
|
118
|
-
assessment = "well balanced";
|
|
119
|
-
} else if (ratio < 4.0) {
|
|
120
|
-
score = 0.5;
|
|
121
|
-
assessment = "moderately balanced";
|
|
122
|
-
} else {
|
|
123
|
-
score = 0.0;
|
|
124
|
-
assessment = "poorly balanced";
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
const maxSeconds = Math.round(maxDuration / 1000);
|
|
128
|
-
const minSeconds = Math.round(minDuration / 1000);
|
|
129
|
-
|
|
130
|
-
return {
|
|
131
|
-
score,
|
|
132
|
-
message: `Ratio ${ratio.toFixed(1)}x (${maxSeconds}s / ${minSeconds}s) - ${assessment}`,
|
|
133
|
-
};
|
|
134
|
-
} catch (error) {
|
|
135
|
-
return {
|
|
136
|
-
score: 0,
|
|
137
|
-
message: `Failed to parse EvalRecord: ${error}`,
|
|
138
|
-
};
|
|
139
|
-
}
|
|
140
|
-
},
|
|
141
|
-
});
|
|
142
|
-
|
|
143
|
-
/**
|
|
144
|
-
* Scope Accuracy Scorer
|
|
145
|
-
*
|
|
146
|
-
* Measures how accurately the decomposition predicted which files would be touched.
|
|
147
|
-
* High accuracy means the planner understood the work scope correctly.
|
|
148
|
-
*
|
|
149
|
-
* Score: intersection(actual, planned) / planned.length
|
|
150
|
-
* 1.0 = all planned files were touched, no extras
|
|
151
|
-
* 0.5 = half the planned files were touched
|
|
152
|
-
* 0.0 = none of the planned files were touched
|
|
153
|
-
*/
|
|
154
|
-
export const scopeAccuracy = createScorer({
|
|
155
|
-
name: "Scope Accuracy",
|
|
156
|
-
description:
|
|
157
|
-
"Planned files match actual files touched (accuracy of scope prediction)",
|
|
158
|
-
scorer: ({ output }) => {
|
|
159
|
-
try {
|
|
160
|
-
const record = JSON.parse(String(output)) as EvalRecord;
|
|
161
|
-
|
|
162
|
-
// Check if outcomes exist
|
|
163
|
-
if (!record.outcomes || record.outcomes.length === 0) {
|
|
164
|
-
return {
|
|
165
|
-
score: 0,
|
|
166
|
-
message: "No outcome data available",
|
|
167
|
-
};
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
// Calculate accuracy per subtask
|
|
171
|
-
let totalPlanned = 0;
|
|
172
|
-
let totalCorrect = 0;
|
|
173
|
-
|
|
174
|
-
for (const outcome of record.outcomes) {
|
|
175
|
-
const planned = new Set(outcome.planned_files);
|
|
176
|
-
const actual = new Set(outcome.actual_files);
|
|
177
|
-
|
|
178
|
-
// Count intersection (files in both planned and actual)
|
|
179
|
-
const intersection = Array.from(planned).filter((f) => actual.has(f));
|
|
180
|
-
|
|
181
|
-
totalPlanned += planned.size;
|
|
182
|
-
totalCorrect += intersection.length;
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
if (totalPlanned === 0) {
|
|
186
|
-
return {
|
|
187
|
-
score: 0,
|
|
188
|
-
message: "No planned files to measure against",
|
|
189
|
-
};
|
|
190
|
-
}
|
|
191
|
-
|
|
192
|
-
const accuracy = totalCorrect / totalPlanned;
|
|
193
|
-
|
|
194
|
-
return {
|
|
195
|
-
score: accuracy,
|
|
196
|
-
message: `${totalCorrect}/${totalPlanned} planned files touched (${(accuracy * 100).toFixed(0)}% accuracy)`,
|
|
197
|
-
};
|
|
198
|
-
} catch (error) {
|
|
199
|
-
return {
|
|
200
|
-
score: 0,
|
|
201
|
-
message: `Failed to parse EvalRecord: ${error}`,
|
|
202
|
-
};
|
|
203
|
-
}
|
|
204
|
-
},
|
|
205
|
-
});
|
|
206
|
-
|
|
207
|
-
/**
 * Scope Drift Scorer
 *
 * Measures how many of the files that subtasks actually touched were NOT part
 * of the files planned for that same subtask. The counts are summed over all
 * outcomes in the record before the ratio is taken, so the result is a single
 * record-wide drift ratio rather than an average of per-subtask ratios.
 *
 * Scoring:
 * - 1.0 when no touched file was unplanned, or when no files were touched at all
 * - decreases linearly as the unplanned share grows (score = 1 - 2 * drift)
 * - 0.0 once the unplanned share reaches 50% or more
 * - 0 with an explanatory message when the record has no outcomes or the
 *   output cannot be parsed/processed
 *
 * The scorer expects `output` to stringify to the JSON form of an EvalRecord.
 */
export const scopeDrift = createScorer({
  name: "Scope Drift",
  description:
    "Agents stayed within their planned file scope (no unexpected files)",
  scorer: ({ output }) => {
    try {
      const parsed = JSON.parse(String(output)) as EvalRecord;
      const subtasks = parsed.outcomes;

      // Nothing to measure without recorded outcomes.
      if (!subtasks || subtasks.length === 0) {
        return { score: 0, message: "No outcome data available" };
      }

      let touchedCount = 0;
      let unplannedCount = 0;

      for (const subtask of subtasks) {
        // Sets de-duplicate repeated paths so each file is counted once per subtask.
        const plannedSet = new Set(subtask.planned_files);
        const touchedSet = new Set(subtask.actual_files);

        touchedCount += touchedSet.size;

        // A touched file that is absent from this subtask's plan counts as drift.
        for (const file of touchedSet) {
          if (!plannedSet.has(file)) {
            unplannedCount += 1;
          }
        }
      }

      // No touched files means there is nothing that could have drifted.
      if (touchedCount === 0) {
        return { score: 1, message: "No files touched" };
      }

      const ratio = unplannedCount / touchedCount;
      const percent = (ratio * 100).toFixed(0);

      return {
        // Linear falloff: full score at 0% drift, clamped to 0 from 50% drift upward.
        score: Math.max(0, 1.0 - ratio * 2),
        message: `${unplannedCount}/${touchedCount} files were unplanned (${percent}% drift)`,
      };
    } catch (error) {
      // Covers invalid JSON as well as any runtime error while walking the record.
      return { score: 0, message: `Failed to parse EvalRecord: ${error}` };
    }
  },
});
|
|
274
|
-
|
|
275
|
-
/**
 * No Rework Scorer
 *
 * Verifies that no subtask touched a file that appears in the planned file
 * list of a different subtask (subtasks are told apart by `bead_id`).
 *
 * Scoring:
 * - 1.0 when no subtask touched a file planned for another subtask
 * - 0.0 when at least one such overlap exists; the message lists one entry
 *   per (touching subtask, owning subtask) pair with the number of files
 * - 0 with an explanatory message when the record has no outcomes or the
 *   output cannot be parsed/processed
 *
 * NOTE(review): only the owning subtask's own entry is skipped, so a file
 * planned by two subtasks is reported when either of them touches it. If
 * several outcomes share a `bead_id`, the last one's planned files are used.
 *
 * The scorer expects `output` to stringify to the JSON form of an EvalRecord.
 */
export const noRework = createScorer({
  name: "No Rework",
  description: "No subtask touched files assigned to another subtask",
  scorer: ({ output }) => {
    try {
      const parsed = JSON.parse(String(output)) as EvalRecord;
      const subtasks = parsed.outcomes;

      // Nothing to compare without recorded outcomes.
      if (!subtasks || subtasks.length === 0) {
        return { score: 0, message: "No outcome data available" };
      }

      // Index every subtask's planned files by its bead id.
      const plannedByOwner = new Map<string, Set<string>>();
      for (const subtask of subtasks) {
        plannedByOwner.set(subtask.bead_id, new Set(subtask.planned_files));
      }

      const violations: string[] = [];

      for (const subtask of subtasks) {
        const touched = new Set(subtask.actual_files);

        for (const [ownerId, ownerFiles] of plannedByOwner) {
          // A subtask touching its own planned files is expected, not rework.
          if (ownerId === subtask.bead_id) {
            continue;
          }

          // Count touched files that belong to the other subtask's plan.
          let sharedCount = 0;
          for (const file of touched) {
            if (ownerFiles.has(file)) {
              sharedCount += 1;
            }
          }

          if (sharedCount > 0) {
            violations.push(
              `${subtask.title || subtask.bead_id} touched ${sharedCount} file(s) from ${ownerId}`,
            );
          }
        }
      }

      // Any overlap at all fails the check; there is no partial credit.
      return violations.length > 0
        ? { score: 0, message: `Rework detected: ${violations.join("; ")}` }
        : {
            score: 1,
            message: "No rework - all subtasks stayed in their lanes",
          };
    } catch (error) {
      // Covers invalid JSON as well as any runtime error while walking the record.
      return { score: 0, message: `Failed to parse EvalRecord: ${error}` };
    }
  },
});
|