opencode-swarm-plugin 0.20.0 → 0.22.0
This diff compares the published contents of two versions of a package as they appear in a supported public registry. It is provided for informational purposes only.
- package/.beads/issues.jsonl +213 -0
- package/INTEGRATION_EXAMPLE.md +66 -0
- package/README.md +352 -522
- package/dist/index.js +2046 -984
- package/dist/plugin.js +2051 -1017
- package/docs/analysis/subagent-coordination-patterns.md +2 -0
- package/docs/semantic-memory-cli-syntax.md +123 -0
- package/docs/swarm-mail-architecture.md +1147 -0
- package/evals/README.md +116 -0
- package/evals/evalite.config.ts +15 -0
- package/evals/example.eval.ts +32 -0
- package/evals/fixtures/decomposition-cases.ts +105 -0
- package/evals/lib/data-loader.test.ts +288 -0
- package/evals/lib/data-loader.ts +111 -0
- package/evals/lib/llm.ts +115 -0
- package/evals/scorers/index.ts +200 -0
- package/evals/scorers/outcome-scorers.test.ts +27 -0
- package/evals/scorers/outcome-scorers.ts +349 -0
- package/evals/swarm-decomposition.eval.ts +112 -0
- package/package.json +8 -1
- package/scripts/cleanup-test-memories.ts +346 -0
- package/src/beads.ts +49 -0
- package/src/eval-capture.ts +487 -0
- package/src/index.ts +45 -3
- package/src/learning.integration.test.ts +19 -4
- package/src/output-guardrails.test.ts +438 -0
- package/src/output-guardrails.ts +381 -0
- package/src/schemas/index.ts +18 -0
- package/src/schemas/swarm-context.ts +115 -0
- package/src/storage.ts +117 -5
- package/src/streams/events.test.ts +296 -0
- package/src/streams/events.ts +93 -0
- package/src/streams/migrations.test.ts +24 -20
- package/src/streams/migrations.ts +51 -0
- package/src/streams/projections.ts +187 -0
- package/src/streams/store.ts +275 -0
- package/src/swarm-orchestrate.ts +771 -189
- package/src/swarm-prompts.ts +84 -12
- package/src/swarm.integration.test.ts +124 -0
- package/vitest.integration.config.ts +6 -0
- package/vitest.integration.setup.ts +48 -0
package/evals/lib/llm.ts
ADDED
@@ -0,0 +1,115 @@

````ts
/**
 * LLM Client for Evalite Evals
 *
 * Uses AI SDK v6 with Vercel AI Gateway.
 * The gateway handles provider routing - just pass a "provider/model" string.
 *
 * @module evals/lib/llm
 */
import { generateText, gateway } from "ai";
import type { GatewayModelId } from "ai";

/**
 * Default model for decomposition evals.
 * Claude Sonnet offers a good balance of quality and cost.
 */
export const DEFAULT_MODEL: GatewayModelId = "anthropic/claude-sonnet-4-5";

/**
 * Generate a decomposition from a task description
 *
 * @param prompt - The full decomposition prompt
 * @param model - Gateway model ID (e.g., "anthropic/claude-sonnet-4-5")
 * @returns The raw text response from the LLM
 */
export async function generateDecomposition(
  prompt: string,
  model: GatewayModelId = DEFAULT_MODEL,
): Promise<string> {
  const { text } = await generateText({
    model: gateway(model),
    prompt,
    maxOutputTokens: 4096,
  });

  return text;
}

/**
 * Format a decomposition prompt from a task and optional context
 *
 * Uses the same prompt template as swarm_plan_prompt.
 */
export function formatDecompositionPrompt(
  task: string,
  context?: string,
  maxSubtasks: number = 6,
): string {
  const contextSection = context ? `## Context\n${context}` : "";

  return `You are decomposing a task into parallelizable subtasks for a swarm of agents.

## Task
${task}

${contextSection}

## Requirements

1. **Break into 2-${maxSubtasks} independent subtasks** that can run in parallel
2. **Assign files** - each subtask must specify which files it will modify
3. **No file overlap** - files cannot appear in multiple subtasks (they get exclusive locks)
4. **Order by dependency** - if subtask B needs subtask A's output, A must come first in the array
5. **Estimate complexity** - 1 (trivial) to 5 (complex)

## Response Format

Respond with ONLY a JSON object matching this schema (no markdown, no explanation):

{
  "epic": {
    "title": "string",
    "description": "string"
  },
  "subtasks": [
    {
      "title": "string",
      "description": "string",
      "files": ["string"],
      "dependencies": [0],
      "estimated_complexity": 1
    }
  ]
}

## Guidelines

- **Plan aggressively** - when in doubt, split further
- **Prefer smaller, focused subtasks** over large complex ones
- **Include test files** in the same subtask as the code they test
- **Be specific about files** - use actual file paths, not placeholders

Now decompose the task. Respond with JSON only:`;
}

/**
 * Extract JSON from an LLM response
 *
 * Handles responses that may include markdown code blocks or extra text.
 */
export function extractJson(text: string): string {
  // Try to find JSON in code blocks first
  const codeBlockMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (codeBlockMatch) {
    return codeBlockMatch[1].trim();
  }

  // Fall back to the first raw JSON object
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (jsonMatch) {
    return jsonMatch[0];
  }

  // Return as-is if no JSON found
  return text;
}
````
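Taken together, these three exports form the generation half of the eval loop: format the prompt, call the gateway, strip any markdown wrapper from the reply. A minimal sketch of how they chain, assuming a BeadTree-shaped JSON reply; the task text and the trailing JSON.parse step are illustrative, not part of the diff:

```ts
import {
  formatDecompositionPrompt,
  generateDecomposition,
  extractJson,
} from "./llm.js";

// Hypothetical task description - any free-form string works.
const prompt = formatDecompositionPrompt(
  "Add rate limiting to the public API",
  "Express app; middleware lives under src/middleware/",
  4, // cap the decomposition at 4 subtasks
);

// Calls the gateway with DEFAULT_MODEL unless a model ID is passed.
const raw = await generateDecomposition(prompt);

// Strip any markdown fencing, then parse; this can still throw if the
// model ignored the "JSON only" instruction.
const beadTree = JSON.parse(extractJson(raw));
```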
package/evals/scorers/index.ts
ADDED

@@ -0,0 +1,200 @@

```ts
import { createScorer } from "evalite";
import type { BeadTree } from "../../src/schemas/index.js";

/**
 * Custom scorers for evaluating swarm task decomposition quality
 */

/**
 * Checks that no files appear in multiple subtasks
 *
 * Independent subtasks are critical for parallel execution.
 * File conflicts cause merge conflicts and coordination overhead.
 *
 * Score: 1.0 if no conflicts, 0.0 if conflicts found
 */
export const subtaskIndependence = createScorer({
  name: "Subtask Independence",
  description: "Checks that no files appear in multiple subtasks",
  scorer: ({ output }) => {
    try {
      const beadTree = JSON.parse(String(output)) as BeadTree;
      const fileMap = new Map<string, number>();

      // Count how many subtasks reference each file
      beadTree.subtasks.forEach((subtask) => {
        subtask.files?.forEach((file) => {
          const count = fileMap.get(file) || 0;
          fileMap.set(file, count + 1);
        });
      });

      // Check for conflicts
      const conflicts = Array.from(fileMap.entries()).filter(
        ([_, count]) => count > 1,
      );

      if (conflicts.length > 0) {
        return {
          score: 0,
          message: `File conflicts found: ${conflicts.map(([f]) => f).join(", ")}`,
        };
      }

      return {
        score: 1,
        message: "No file conflicts - subtasks are independent",
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse BeadTree: ${error}`,
      };
    }
  },
});

// ============================================================================
// Outcome-based scorers
// ============================================================================

export {
  executionSuccess,
  timeBalance,
  scopeAccuracy,
  scopeDrift,
  noRework,
} from "./outcome-scorers.js";

/**
 * Checks that subtasks cover the full task scope
 *
 * Incomplete coverage means:
 * - Missing functionality
 * - Follow-up work required
 * - Task not actually complete
 *
 * Score: ratio of expected files covered (0.0 to 1.0)
 * If no expected files are specified, falls back to checking the
 * subtask count against min/max bounds
 */
export const coverageCompleteness = createScorer({
  name: "Coverage Completeness",
  description: "Checks that subtasks cover the full task scope",
  scorer: ({ output, expected }) => {
    try {
      const beadTree = JSON.parse(String(output)) as BeadTree;

      // If expected files are specified, check coverage
      const expectedData = expected as Record<string, unknown> | undefined;
      if (expectedData && Array.isArray(expectedData.requiredFiles)) {
        const allFiles = new Set(
          beadTree.subtasks.flatMap((st) => st.files || []),
        );

        const requiredFiles = expectedData.requiredFiles as string[];
        const coveredFiles = requiredFiles.filter((f) => allFiles.has(f));
        const coverage = coveredFiles.length / requiredFiles.length;

        return {
          score: coverage,
          message: `${coveredFiles.length}/${requiredFiles.length} required files covered`,
        };
      }

      // Otherwise, check min/max subtask count
      const minSubtasks = (expectedData?.minSubtasks as number) || 1;
      const maxSubtasks = (expectedData?.maxSubtasks as number) || 10;
      const count = beadTree.subtasks.length;

      if (count < minSubtasks) {
        return {
          score: 0,
          message: `Too few subtasks: ${count} < ${minSubtasks}`,
        };
      }

      if (count > maxSubtasks) {
        return {
          score: 0.5,
          message: `Too many subtasks: ${count} > ${maxSubtasks} (over-decomposed)`,
        };
      }

      return {
        score: 1,
        message: `Good subtask count: ${count} (${minSubtasks}-${maxSubtasks})`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse BeadTree: ${error}`,
      };
    }
  },
});

/**
 * Checks that each subtask has clear, actionable instructions
 *
 * Vague instructions lead to:
 * - Agent confusion and blocking
 * - Incorrect implementations
 * - Need for coordinator intervention
 *
 * Score: average of per-subtask instruction quality
 */
export const instructionClarity = createScorer({
  name: "Instruction Clarity",
  description: "Checks that subtasks have clear, actionable instructions",
  scorer: ({ output }) => {
    try {
      const beadTree = JSON.parse(String(output)) as BeadTree;

      if (beadTree.subtasks.length === 0) {
        return {
          score: 0,
          message: "No subtasks found",
        };
      }

      // Check each subtask for clarity signals
      const scores = beadTree.subtasks.map((subtask) => {
        let score = 0.5; // baseline

        // Has a meaningful description?
        if (subtask.description && subtask.description.length > 20) {
          score += 0.2;
        }

        // Has files specified?
        if (subtask.files && subtask.files.length > 0) {
          score += 0.2;
        }

        // Title is specific (not generic)?
        const genericWords = ["update", "fix", "add", "change", "modify"];
        const titleLower = subtask.title.toLowerCase();
        const isGeneric = genericWords.some(
          (word) => titleLower === word || titleLower.startsWith(`${word} `),
        );
        if (!isGeneric) {
          score += 0.1;
        }

        return Math.min(1.0, score);
      });

      const avgScore = scores.reduce((sum, s) => sum + s, 0) / scores.length;

      return {
        score: avgScore,
        message: `Average instruction clarity: ${(avgScore * 100).toFixed(0)}%`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse BeadTree: ${error}`,
      };
    }
  },
});
```
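These structural scorers plug directly into an Evalite eval definition. A sketch of the wiring, following Evalite's documented `evalite(name, { data, task, scorers })` shape; the eval name, the single data case, and the stubbed task body are illustrative (real fixtures live in evals/fixtures/):

```ts
import { evalite } from "evalite";
import {
  subtaskIndependence,
  coverageCompleteness,
  instructionClarity,
} from "./scorers/index.js";

evalite("Swarm Decomposition", {
  // One hypothetical case; `expected` feeds coverageCompleteness.
  data: async () => [
    {
      input: "Add rate limiting to the public API",
      expected: { minSubtasks: 2, maxSubtasks: 6 },
    },
  ],
  // A real task would call the llm.ts helpers; stubbed here.
  task: async (input) => {
    return JSON.stringify({
      epic: { title: String(input), description: "" },
      subtasks: [],
    });
  },
  scorers: [subtaskIndependence, coverageCompleteness, instructionClarity],
});
```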
package/evals/scorers/outcome-scorers.test.ts
ADDED

@@ -0,0 +1,27 @@

```ts
/**
 * Outcome-based Scorers Tests
 *
 * Tests the 5 new outcome-based scorers by verifying their exports.
 * Full functional testing happens via Evalite integration.
 */
import { describe, it, expect } from "vitest";

describe("Outcome Scorers", () => {
  it("exports all 5 outcome scorers from outcome-scorers.ts", async () => {
    const module = await import("./outcome-scorers.js");
    expect(module.executionSuccess).toBeDefined();
    expect(module.timeBalance).toBeDefined();
    expect(module.scopeAccuracy).toBeDefined();
    expect(module.scopeDrift).toBeDefined();
    expect(module.noRework).toBeDefined();
  });

  it("re-exports all 5 outcome scorers from index.ts", async () => {
    const indexModule = await import("./index.js");
    expect(indexModule.executionSuccess).toBeDefined();
    expect(indexModule.timeBalance).toBeDefined();
    expect(indexModule.scopeAccuracy).toBeDefined();
    expect(indexModule.scopeDrift).toBeDefined();
    expect(indexModule.noRework).toBeDefined();
  });
});
```
package/evals/scorers/outcome-scorers.ts
ADDED

@@ -0,0 +1,349 @@

```ts
import { createScorer } from "evalite";
import type { EvalRecord } from "../../src/eval-capture.js";

/**
 * Outcome-based scorers for evaluating decomposition quality
 *
 * These scorers evaluate based on ACTUAL execution outcomes,
 * not just the structure of the decomposition.
 *
 * Requires an EvalRecord with outcomes populated.
 */

/**
 * Execution Success Scorer
 *
 * Measures whether all subtasks succeeded without errors.
 * This is the ultimate measure - did the decomposition actually work?
 *
 * Score: 1.0 if all outcomes.success === true, 0.0 otherwise
 */
export const executionSuccess = createScorer({
  name: "Execution Success",
  description: "All subtasks completed successfully without errors",
  scorer: ({ output }) => {
    try {
      const record = JSON.parse(String(output)) as EvalRecord;

      // Check if outcomes exist
      if (!record.outcomes || record.outcomes.length === 0) {
        return {
          score: 0,
          message: "No outcome data available",
        };
      }

      // Check if all subtasks succeeded
      const allSucceeded = record.outcomes.every((outcome) => outcome.success);

      if (allSucceeded) {
        return {
          score: 1,
          message: `All ${record.outcomes.length} subtasks succeeded`,
        };
      }

      // Report failures
      const failures = record.outcomes.filter((o) => !o.success);
      const failureList = failures.map((f) => f.title || f.bead_id).join(", ");

      return {
        score: 0,
        message: `${failures.length}/${record.outcomes.length} subtasks failed: ${failureList}`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse EvalRecord: ${error}`,
      };
    }
  },
});

/**
 * Time Balance Scorer
 *
 * Measures how evenly balanced the work was across subtasks.
 * Unbalanced work means some agents finish early while others are bottlenecked.
 *
 * Score: 1.0 if max/min ratio < 2.0 (well balanced)
 *        0.5 if ratio < 4.0 (moderately balanced)
 *        0.0 if ratio >= 4.0 (poorly balanced)
 */
export const timeBalance = createScorer({
  name: "Time Balance",
  description: "Work is evenly distributed across subtasks (max/min duration)",
  scorer: ({ output }) => {
    try {
      const record = JSON.parse(String(output)) as EvalRecord;

      // Check if outcomes exist
      if (!record.outcomes || record.outcomes.length === 0) {
        return {
          score: 0,
          message: "No outcome data available",
        };
      }

      // Need at least 2 subtasks to measure balance
      if (record.outcomes.length < 2) {
        return {
          score: 1,
          message: "Only one subtask - perfect balance",
        };
      }

      // Get durations (filter out zeros)
      const durations = record.outcomes
        .map((o) => o.duration_ms)
        .filter((d) => d > 0);

      if (durations.length === 0) {
        return {
          score: 0,
          message: "No duration data available",
        };
      }

      const maxDuration = Math.max(...durations);
      const minDuration = Math.min(...durations);
      const ratio = maxDuration / minDuration;

      // Score based on ratio
      let score: number;
      let assessment: string;

      if (ratio < 2.0) {
        score = 1.0;
        assessment = "well balanced";
      } else if (ratio < 4.0) {
        score = 0.5;
        assessment = "moderately balanced";
      } else {
        score = 0.0;
        assessment = "poorly balanced";
      }

      const maxSeconds = Math.round(maxDuration / 1000);
      const minSeconds = Math.round(minDuration / 1000);

      return {
        score,
        message: `Ratio ${ratio.toFixed(1)}x (${maxSeconds}s / ${minSeconds}s) - ${assessment}`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse EvalRecord: ${error}`,
      };
    }
  },
});

/**
 * Scope Accuracy Scorer
 *
 * Measures how accurately the decomposition predicted which files would be touched.
 * High accuracy means the planner understood the work scope correctly.
 *
 * Score: intersection(actual, planned) / planned.length
 *        1.0 = all planned files were touched (unplanned extras are
 *              penalized separately by Scope Drift)
 *        0.5 = half the planned files were touched
 *        0.0 = none of the planned files were touched
 */
export const scopeAccuracy = createScorer({
  name: "Scope Accuracy",
  description:
    "Planned files match actual files touched (accuracy of scope prediction)",
  scorer: ({ output }) => {
    try {
      const record = JSON.parse(String(output)) as EvalRecord;

      // Check if outcomes exist
      if (!record.outcomes || record.outcomes.length === 0) {
        return {
          score: 0,
          message: "No outcome data available",
        };
      }

      // Calculate accuracy per subtask
      let totalPlanned = 0;
      let totalCorrect = 0;

      for (const outcome of record.outcomes) {
        const planned = new Set(outcome.planned_files);
        const actual = new Set(outcome.actual_files);

        // Count the intersection (files in both planned and actual)
        const intersection = Array.from(planned).filter((f) => actual.has(f));

        totalPlanned += planned.size;
        totalCorrect += intersection.length;
      }

      if (totalPlanned === 0) {
        return {
          score: 0,
          message: "No planned files to measure against",
        };
      }

      const accuracy = totalCorrect / totalPlanned;

      return {
        score: accuracy,
        message: `${totalCorrect}/${totalPlanned} planned files touched (${(accuracy * 100).toFixed(0)}% accuracy)`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse EvalRecord: ${error}`,
      };
    }
  },
});

/**
 * Scope Drift Scorer
 *
 * Penalizes when agents touch files NOT in their planned scope.
 * Scope drift indicates poor planning or unexpected dependencies.
 *
 * Score: 1.0 if no drift (all actual files were planned)
 *        Decreases linearly with drift percentage
 *        0.0 if drift >= 50%
 */
export const scopeDrift = createScorer({
  name: "Scope Drift",
  description:
    "Agents stayed within their planned file scope (no unexpected files)",
  scorer: ({ output }) => {
    try {
      const record = JSON.parse(String(output)) as EvalRecord;

      // Check if outcomes exist
      if (!record.outcomes || record.outcomes.length === 0) {
        return {
          score: 0,
          message: "No outcome data available",
        };
      }

      // Calculate drift per subtask
      let totalActual = 0;
      let totalDrift = 0;

      for (const outcome of record.outcomes) {
        const planned = new Set(outcome.planned_files);
        const actual = new Set(outcome.actual_files);

        // Count files in actual but NOT in planned
        const drift = Array.from(actual).filter((f) => !planned.has(f));

        totalActual += actual.size;
        totalDrift += drift.length;
      }

      if (totalActual === 0) {
        return {
          score: 1,
          message: "No files touched",
        };
      }

      const driftRatio = totalDrift / totalActual;

      // Score: 1.0 if no drift, linearly decreasing to 0 at 50% drift
      const score = Math.max(0, 1.0 - driftRatio * 2);

      const driftPct = (driftRatio * 100).toFixed(0);

      return {
        score,
        message: `${totalDrift}/${totalActual} files were unplanned (${driftPct}% drift)`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse EvalRecord: ${error}`,
      };
    }
  },
});

/**
 * No Rework Scorer
 *
 * Checks that no subtask touched files assigned to another subtask.
 * Rework indicates poor decomposition or missing dependencies.
 *
 * Score: 1.0 if no rework (no subtask touched another's planned files)
 *        0.0 if rework detected
 */
export const noRework = createScorer({
  name: "No Rework",
  description: "No subtask touched files assigned to another subtask",
  scorer: ({ output }) => {
    try {
      const record = JSON.parse(String(output)) as EvalRecord;

      // Check if outcomes exist
      if (!record.outcomes || record.outcomes.length === 0) {
        return {
          score: 0,
          message: "No outcome data available",
        };
      }

      // Build a map of planned files per subtask
      const plannedBySubtask = new Map<string, Set<string>>();

      for (const outcome of record.outcomes) {
        plannedBySubtask.set(outcome.bead_id, new Set(outcome.planned_files));
      }

      // Check each subtask for rework
      const reworkCases: string[] = [];

      for (const outcome of record.outcomes) {
        const actualFiles = new Set(outcome.actual_files);

        // Check if this subtask touched files planned for another subtask
        for (const [otherBeadId, otherPlanned] of plannedBySubtask.entries()) {
          if (otherBeadId === outcome.bead_id) {
            continue; // Skip self
          }

          // Find overlap with the other subtask's planned files
          const overlap = Array.from(actualFiles).filter((f) =>
            otherPlanned.has(f),
          );

          if (overlap.length > 0) {
            reworkCases.push(
              `${outcome.title || outcome.bead_id} touched ${overlap.length} file(s) from ${otherBeadId}`,
            );
          }
        }
      }

      if (reworkCases.length > 0) {
        return {
          score: 0,
          message: `Rework detected: ${reworkCases.join("; ")}`,
        };
      }

      return {
        score: 1,
        message: "No rework - all subtasks stayed in their lanes",
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse EvalRecord: ${error}`,
      };
    }
  },
});
```
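For intuition on how these outcome scorers grade a run, here is a small outcomes payload and the scores it implies. The object shape mirrors only the fields the scorers read (`bead_id`, `title`, `success`, `duration_ms`, `planned_files`, `actual_files`); the full EvalRecord type lives in src/eval-capture.ts, so treat this shape as an assumption:

```ts
// Hypothetical two-subtask run; shape inferred from the scorers above.
const record = {
  outcomes: [
    {
      bead_id: "bead-1",
      title: "API route",
      success: true,
      duration_ms: 60_000, // 60s
      planned_files: ["src/routes.ts"],
      actual_files: ["src/routes.ts"],
    },
    {
      bead_id: "bead-2",
      title: "Docs",
      success: true,
      duration_ms: 150_000, // 150s
      planned_files: ["README.md"],
      actual_files: ["README.md", "docs/api.md"], // one unplanned file
    },
  ],
};

// executionSuccess: every outcome.success is true             -> 1.0
// timeBalance:      150s / 60s = 2.5x, under the 4.0 cutoff    -> 0.5
// scopeAccuracy:    2/2 planned files were actually touched    -> 1.0
// scopeDrift:       1/3 actual files unplanned (~33% drift)    -> max(0, 1 - 0.33*2) ≈ 0.33
// noRework:         docs/api.md is in no other subtask's plan  -> 1.0
```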