opencode-swarm-plugin 0.20.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/.beads/issues.jsonl +213 -0
  2. package/INTEGRATION_EXAMPLE.md +66 -0
  3. package/README.md +352 -522
  4. package/dist/index.js +2046 -984
  5. package/dist/plugin.js +2051 -1017
  6. package/docs/analysis/subagent-coordination-patterns.md +2 -0
  7. package/docs/semantic-memory-cli-syntax.md +123 -0
  8. package/docs/swarm-mail-architecture.md +1147 -0
  9. package/evals/README.md +116 -0
  10. package/evals/evalite.config.ts +15 -0
  11. package/evals/example.eval.ts +32 -0
  12. package/evals/fixtures/decomposition-cases.ts +105 -0
  13. package/evals/lib/data-loader.test.ts +288 -0
  14. package/evals/lib/data-loader.ts +111 -0
  15. package/evals/lib/llm.ts +115 -0
  16. package/evals/scorers/index.ts +200 -0
  17. package/evals/scorers/outcome-scorers.test.ts +27 -0
  18. package/evals/scorers/outcome-scorers.ts +349 -0
  19. package/evals/swarm-decomposition.eval.ts +112 -0
  20. package/package.json +8 -1
  21. package/scripts/cleanup-test-memories.ts +346 -0
  22. package/src/beads.ts +49 -0
  23. package/src/eval-capture.ts +487 -0
  24. package/src/index.ts +45 -3
  25. package/src/learning.integration.test.ts +19 -4
  26. package/src/output-guardrails.test.ts +438 -0
  27. package/src/output-guardrails.ts +381 -0
  28. package/src/schemas/index.ts +18 -0
  29. package/src/schemas/swarm-context.ts +115 -0
  30. package/src/storage.ts +117 -5
  31. package/src/streams/events.test.ts +296 -0
  32. package/src/streams/events.ts +93 -0
  33. package/src/streams/migrations.test.ts +24 -20
  34. package/src/streams/migrations.ts +51 -0
  35. package/src/streams/projections.ts +187 -0
  36. package/src/streams/store.ts +275 -0
  37. package/src/swarm-orchestrate.ts +771 -189
  38. package/src/swarm-prompts.ts +84 -12
  39. package/src/swarm.integration.test.ts +124 -0
  40. package/vitest.integration.config.ts +6 -0
  41. package/vitest.integration.setup.ts +48 -0
@@ -0,0 +1,115 @@
1
+ /**
2
+ * LLM Client for Evalite Evals
3
+ *
4
+ * Uses AI SDK v6 with Vercel AI Gateway.
5
+ * Gateway handles provider routing - just pass "provider/model" string.
6
+ *
7
+ * @module evals/lib/llm
8
+ */
9
+ import { generateText, gateway } from "ai";
10
+ import type { GatewayModelId } from "ai";
11
+
12
+ /**
13
+ * Default model for decomposition evals
14
+ * Using Claude Sonnet for good balance of quality and cost
15
+ */
16
+ export const DEFAULT_MODEL: GatewayModelId = "anthropic/claude-sonnet-4-5";
17
+
18
+ /**
19
+ * Generate a decomposition from a task description
20
+ *
21
+ * @param prompt - The full decomposition prompt
22
+ * @param model - Gateway model ID (e.g., "anthropic/claude-sonnet-4-5")
23
+ * @returns The raw text response from the LLM
24
+ */
25
+ export async function generateDecomposition(
26
+ prompt: string,
27
+ model: GatewayModelId = DEFAULT_MODEL,
28
+ ): Promise<string> {
29
+ const { text } = await generateText({
30
+ model: gateway(model),
31
+ prompt,
32
+ maxOutputTokens: 4096,
33
+ });
34
+
35
+ return text;
36
+ }
37
+
38
+ /**
39
+ * Format a decomposition prompt from task and context
40
+ *
41
+ * Uses the same prompt template as swarm_plan_prompt
42
+ */
43
+ export function formatDecompositionPrompt(
44
+ task: string,
45
+ context?: string,
46
+ maxSubtasks: number = 6,
47
+ ): string {
48
+ const contextSection = context ? `## Context\n${context}` : "";
49
+
50
+ return `You are decomposing a task into parallelizable subtasks for a swarm of agents.
51
+
52
+ ## Task
53
+ ${task}
54
+
55
+ ${contextSection}
56
+
57
+ ## Requirements
58
+
59
+ 1. **Break into 2-${maxSubtasks} independent subtasks** that can run in parallel
60
+ 2. **Assign files** - each subtask must specify which files it will modify
61
+ 3. **No file overlap** - files cannot appear in multiple subtasks (they get exclusive locks)
62
+ 4. **Order by dependency** - if subtask B needs subtask A's output, A must come first in the array
63
+ 5. **Estimate complexity** - 1 (trivial) to 5 (complex)
64
+
65
+ ## Response Format
66
+
67
+ Respond with ONLY a JSON object matching this schema (no markdown, no explanation):
68
+
69
+ {
70
+ "epic": {
71
+ "title": "string",
72
+ "description": "string"
73
+ },
74
+ "subtasks": [
75
+ {
76
+ "title": "string",
77
+ "description": "string",
78
+ "files": ["string"],
79
+ "dependencies": [0],
80
+ "estimated_complexity": 1
81
+ }
82
+ ]
83
+ }
84
+
85
+ ## Guidelines
86
+
87
+ - **Plan aggressively** - when in doubt, split further
88
+ - **Prefer smaller, focused subtasks** over large complex ones
89
+ - **Include test files** in the same subtask as the code they test
90
+ - **Be specific about files** - use actual file paths, not placeholders
91
+
92
+ Now decompose the task. Respond with JSON only:`;
93
+ }
94
+
95
+ /**
96
+ * Extract JSON from LLM response
97
+ *
98
+ * Handles responses that may have markdown code blocks or extra text
99
+ */
100
+ export function extractJson(text: string): string {
101
+ // Try to find JSON in code blocks first
102
+ const codeBlockMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/);
103
+ if (codeBlockMatch) {
104
+ return codeBlockMatch[1].trim();
105
+ }
106
+
107
+ // Try to find raw JSON object
108
+ const jsonMatch = text.match(/\{[\s\S]*\}/);
109
+ if (jsonMatch) {
110
+ return jsonMatch[0];
111
+ }
112
+
113
+ // Return as-is if no JSON found
114
+ return text;
115
+ }
@@ -0,0 +1,200 @@
1
+ import { createScorer } from "evalite";
2
+ import type { BeadTree } from "../../src/schemas/index.js";
3
+
4
+ /**
5
+ * Custom scorers for evaluating swarm task decomposition quality
6
+ */
7
+
8
+ /**
9
+ * Checks that no files appear in multiple subtasks
10
+ *
11
+ * Independent subtasks are critical for parallel execution.
12
+ * File conflicts cause merge conflicts and coordination overhead.
13
+ *
14
+ * Score: 1.0 if no conflicts, 0.0 if conflicts found
15
+ */
16
+ export const subtaskIndependence = createScorer({
17
+ name: "Subtask Independence",
18
+ description: "Checks that no files appear in multiple subtasks",
19
+ scorer: ({ output }) => {
20
+ try {
21
+ const beadTree = JSON.parse(String(output)) as BeadTree;
22
+ const fileMap = new Map<string, number>();
23
+
24
+ // Track which files appear in which subtasks
25
+ beadTree.subtasks.forEach((subtask) => {
26
+ subtask.files?.forEach((file) => {
27
+ const count = fileMap.get(file) || 0;
28
+ fileMap.set(file, count + 1);
29
+ });
30
+ });
31
+
32
+ // Check for conflicts
33
+ const conflicts = Array.from(fileMap.entries()).filter(
34
+ ([_, count]) => count > 1,
35
+ );
36
+
37
+ if (conflicts.length > 0) {
38
+ return {
39
+ score: 0,
40
+ message: `File conflicts found: ${conflicts.map(([f]) => f).join(", ")}`,
41
+ };
42
+ }
43
+
44
+ return {
45
+ score: 1,
46
+ message: "No file conflicts - subtasks are independent",
47
+ };
48
+ } catch (error) {
49
+ return {
50
+ score: 0,
51
+ message: `Failed to parse BeadTree: ${error}`,
52
+ };
53
+ }
54
+ },
55
+ });
56
+
57
+ // ============================================================================
58
+ // Outcome-based scorers
59
+ // ============================================================================
60
+
61
+ export {
62
+ executionSuccess,
63
+ timeBalance,
64
+ scopeAccuracy,
65
+ scopeDrift,
66
+ noRework,
67
+ } from "./outcome-scorers.js";
68
+
69
+ /**
70
+ * Checks that subtasks cover the full task scope
71
+ *
72
+ * Incomplete coverage means:
73
+ * - Missing functionality
74
+ * - Follow-up work required
75
+ * - Task not actually complete
76
+ *
77
+ * Score: ratio of expected files covered (0.0 to 1.0)
78
+ * If no expected files specified, checks that subtasks exist
79
+ */
80
+ export const coverageCompleteness = createScorer({
81
+ name: "Coverage Completeness",
82
+ description: "Checks that subtasks cover the full task scope",
83
+ scorer: ({ output, expected }) => {
84
+ try {
85
+ const beadTree = JSON.parse(String(output)) as BeadTree;
86
+
87
+ // If expected files specified, check coverage
88
+ const expectedData = expected as Record<string, unknown> | undefined;
89
+ if (expectedData && Array.isArray(expectedData.requiredFiles)) {
90
+ const allFiles = new Set(
91
+ beadTree.subtasks.flatMap((st) => st.files || []),
92
+ );
93
+
94
+ const requiredFiles = expectedData.requiredFiles as string[];
95
+ const coveredFiles = requiredFiles.filter((f) => allFiles.has(f));
96
+ const coverage = coveredFiles.length / requiredFiles.length;
97
+
98
+ return {
99
+ score: coverage,
100
+ message: `${coveredFiles.length}/${requiredFiles.length} required files covered`,
101
+ };
102
+ }
103
+
104
+ // Otherwise, check min/max subtask count
105
+ const minSubtasks = (expectedData?.minSubtasks as number) || 1;
106
+ const maxSubtasks = (expectedData?.maxSubtasks as number) || 10;
107
+ const count = beadTree.subtasks.length;
108
+
109
+ if (count < minSubtasks) {
110
+ return {
111
+ score: 0,
112
+ message: `Too few subtasks: ${count} < ${minSubtasks}`,
113
+ };
114
+ }
115
+
116
+ if (count > maxSubtasks) {
117
+ return {
118
+ score: 0.5,
119
+ message: `Too many subtasks: ${count} > ${maxSubtasks} (over-decomposed)`,
120
+ };
121
+ }
122
+
123
+ return {
124
+ score: 1,
125
+ message: `Good subtask count: ${count} (${minSubtasks}-${maxSubtasks})`,
126
+ };
127
+ } catch (error) {
128
+ return {
129
+ score: 0,
130
+ message: `Failed to parse BeadTree: ${error}`,
131
+ };
132
+ }
133
+ },
134
+ });
135
+
136
+ /**
137
+ * Checks that each subtask has clear, actionable instructions
138
+ *
139
+ * Vague instructions lead to:
140
+ * - Agent confusion and blocking
141
+ * - Incorrect implementations
142
+ * - Need for coordinator intervention
143
+ *
144
+ * Score: Average of per-subtask instruction quality
145
+ */
146
+ export const instructionClarity = createScorer({
147
+ name: "Instruction Clarity",
148
+ description: "Checks that subtasks have clear, actionable instructions",
149
+ scorer: ({ output }) => {
150
+ try {
151
+ const beadTree = JSON.parse(String(output)) as BeadTree;
152
+
153
+ if (beadTree.subtasks.length === 0) {
154
+ return {
155
+ score: 0,
156
+ message: "No subtasks found",
157
+ };
158
+ }
159
+
160
+ // Check each subtask for clarity signals
161
+ const scores = beadTree.subtasks.map((subtask) => {
162
+ let score = 0.5; // baseline
163
+
164
+ // Has description?
165
+ if (subtask.description && subtask.description.length > 20) {
166
+ score += 0.2;
167
+ }
168
+
169
+ // Has files specified?
170
+ if (subtask.files && subtask.files.length > 0) {
171
+ score += 0.2;
172
+ }
173
+
174
+ // Title is specific (not generic)?
175
+ const genericWords = ["update", "fix", "add", "change", "modify"];
176
+ const titleLower = subtask.title.toLowerCase();
177
+ const isGeneric = genericWords.some(
178
+ (word) => titleLower === word || titleLower.startsWith(`${word} `),
179
+ );
180
+ if (!isGeneric) {
181
+ score += 0.1;
182
+ }
183
+
184
+ return Math.min(1.0, score);
185
+ });
186
+
187
+ const avgScore = scores.reduce((sum, s) => sum + s, 0) / scores.length;
188
+
189
+ return {
190
+ score: avgScore,
191
+ message: `Average instruction clarity: ${(avgScore * 100).toFixed(0)}%`,
192
+ };
193
+ } catch (error) {
194
+ return {
195
+ score: 0,
196
+ message: `Failed to parse BeadTree: ${error}`,
197
+ };
198
+ }
199
+ },
200
+ });
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Outcome-based Scorers Tests
3
+ *
4
+ * Tests the 5 new outcome-based scorers by verifying their exports.
5
+ * Full functional testing happens via Evalite integration.
6
+ */
7
+ import { describe, it, expect } from "vitest";
8
+
9
+ describe("Outcome Scorers", () => {
10
+ it("exports all 5 outcome scorers from outcome-scorers.ts", async () => {
11
+ const module = await import("./outcome-scorers.js");
12
+ expect(module.executionSuccess).toBeDefined();
13
+ expect(module.timeBalance).toBeDefined();
14
+ expect(module.scopeAccuracy).toBeDefined();
15
+ expect(module.scopeDrift).toBeDefined();
16
+ expect(module.noRework).toBeDefined();
17
+ });
18
+
19
+ it("re-exports all 5 outcome scorers from index.ts", async () => {
20
+ const indexModule = await import("./index.js");
21
+ expect(indexModule.executionSuccess).toBeDefined();
22
+ expect(indexModule.timeBalance).toBeDefined();
23
+ expect(indexModule.scopeAccuracy).toBeDefined();
24
+ expect(indexModule.scopeDrift).toBeDefined();
25
+ expect(indexModule.noRework).toBeDefined();
26
+ });
27
+ });
@@ -0,0 +1,349 @@
1
+ import { createScorer } from "evalite";
2
+ import type { EvalRecord } from "../../src/eval-capture.js";
3
+
4
+ /**
5
+ * Outcome-based scorers for evaluating decomposition quality
6
+ *
7
+ * These scorers evaluate based on ACTUAL execution outcomes,
8
+ * not just the structure of the decomposition.
9
+ *
10
+ * Requires EvalRecord with outcomes populated.
11
+ */
12
+
13
+ /**
14
+ * Execution Success Scorer
15
+ *
16
+ * Measures whether all subtasks succeeded without errors.
17
+ * This is the ultimate measure - did the decomposition actually work?
18
+ *
19
+ * Score: 1.0 if all outcomes.success === true, 0.0 otherwise
20
+ */
21
+ export const executionSuccess = createScorer({
22
+ name: "Execution Success",
23
+ description: "All subtasks completed successfully without errors",
24
+ scorer: ({ output }) => {
25
+ try {
26
+ const record = JSON.parse(String(output)) as EvalRecord;
27
+
28
+ // Check if outcomes exist
29
+ if (!record.outcomes || record.outcomes.length === 0) {
30
+ return {
31
+ score: 0,
32
+ message: "No outcome data available",
33
+ };
34
+ }
35
+
36
+ // Check if all subtasks succeeded
37
+ const allSucceeded = record.outcomes.every((outcome) => outcome.success);
38
+
39
+ if (allSucceeded) {
40
+ return {
41
+ score: 1,
42
+ message: `All ${record.outcomes.length} subtasks succeeded`,
43
+ };
44
+ }
45
+
46
+ // Report failures
47
+ const failures = record.outcomes.filter((o) => !o.success);
48
+ const failureList = failures.map((f) => f.title || f.bead_id).join(", ");
49
+
50
+ return {
51
+ score: 0,
52
+ message: `${failures.length}/${record.outcomes.length} subtasks failed: ${failureList}`,
53
+ };
54
+ } catch (error) {
55
+ return {
56
+ score: 0,
57
+ message: `Failed to parse EvalRecord: ${error}`,
58
+ };
59
+ }
60
+ },
61
+ });
62
+
63
+ /**
64
+ * Time Balance Scorer
65
+ *
66
+ * Measures how evenly balanced the work was across subtasks.
67
+ * Unbalanced work means some agents finish early while others are bottlenecked.
68
+ *
69
+ * Score: 1.0 if max/min ratio < 2.0 (well balanced)
70
+ * 0.5 if ratio < 4.0 (moderately balanced)
71
+ * 0.0 if ratio >= 4.0 (poorly balanced)
72
+ */
73
+ export const timeBalance = createScorer({
74
+ name: "Time Balance",
75
+ description: "Work is evenly distributed across subtasks (max/min duration)",
76
+ scorer: ({ output }) => {
77
+ try {
78
+ const record = JSON.parse(String(output)) as EvalRecord;
79
+
80
+ // Check if outcomes exist
81
+ if (!record.outcomes || record.outcomes.length === 0) {
82
+ return {
83
+ score: 0,
84
+ message: "No outcome data available",
85
+ };
86
+ }
87
+
88
+ // Need at least 2 subtasks to measure balance
89
+ if (record.outcomes.length < 2) {
90
+ return {
91
+ score: 1,
92
+ message: "Only one subtask - perfect balance",
93
+ };
94
+ }
95
+
96
+ // Get durations (filter out zeros)
97
+ const durations = record.outcomes
98
+ .map((o) => o.duration_ms)
99
+ .filter((d) => d > 0);
100
+
101
+ if (durations.length === 0) {
102
+ return {
103
+ score: 0,
104
+ message: "No duration data available",
105
+ };
106
+ }
107
+
108
+ const maxDuration = Math.max(...durations);
109
+ const minDuration = Math.min(...durations);
110
+ const ratio = maxDuration / minDuration;
111
+
112
+ // Score based on ratio
113
+ let score: number;
114
+ let assessment: string;
115
+
116
+ if (ratio < 2.0) {
117
+ score = 1.0;
118
+ assessment = "well balanced";
119
+ } else if (ratio < 4.0) {
120
+ score = 0.5;
121
+ assessment = "moderately balanced";
122
+ } else {
123
+ score = 0.0;
124
+ assessment = "poorly balanced";
125
+ }
126
+
127
+ const maxSeconds = Math.round(maxDuration / 1000);
128
+ const minSeconds = Math.round(minDuration / 1000);
129
+
130
+ return {
131
+ score,
132
+ message: `Ratio ${ratio.toFixed(1)}x (${maxSeconds}s / ${minSeconds}s) - ${assessment}`,
133
+ };
134
+ } catch (error) {
135
+ return {
136
+ score: 0,
137
+ message: `Failed to parse EvalRecord: ${error}`,
138
+ };
139
+ }
140
+ },
141
+ });
142
+
143
+ /**
144
+ * Scope Accuracy Scorer
145
+ *
146
+ * Measures how accurately the decomposition predicted which files would be touched.
147
+ * High accuracy means the planner understood the work scope correctly.
148
+ *
149
+ * Score: intersection(actual, planned) / planned.length
150
+ * 1.0 = all planned files were touched, no extras
151
+ * 0.5 = half the planned files were touched
152
+ * 0.0 = none of the planned files were touched
153
+ */
154
+ export const scopeAccuracy = createScorer({
155
+ name: "Scope Accuracy",
156
+ description:
157
+ "Planned files match actual files touched (accuracy of scope prediction)",
158
+ scorer: ({ output }) => {
159
+ try {
160
+ const record = JSON.parse(String(output)) as EvalRecord;
161
+
162
+ // Check if outcomes exist
163
+ if (!record.outcomes || record.outcomes.length === 0) {
164
+ return {
165
+ score: 0,
166
+ message: "No outcome data available",
167
+ };
168
+ }
169
+
170
+ // Calculate accuracy per subtask
171
+ let totalPlanned = 0;
172
+ let totalCorrect = 0;
173
+
174
+ for (const outcome of record.outcomes) {
175
+ const planned = new Set(outcome.planned_files);
176
+ const actual = new Set(outcome.actual_files);
177
+
178
+ // Count intersection (files in both planned and actual)
179
+ const intersection = Array.from(planned).filter((f) => actual.has(f));
180
+
181
+ totalPlanned += planned.size;
182
+ totalCorrect += intersection.length;
183
+ }
184
+
185
+ if (totalPlanned === 0) {
186
+ return {
187
+ score: 0,
188
+ message: "No planned files to measure against",
189
+ };
190
+ }
191
+
192
+ const accuracy = totalCorrect / totalPlanned;
193
+
194
+ return {
195
+ score: accuracy,
196
+ message: `${totalCorrect}/${totalPlanned} planned files touched (${(accuracy * 100).toFixed(0)}% accuracy)`,
197
+ };
198
+ } catch (error) {
199
+ return {
200
+ score: 0,
201
+ message: `Failed to parse EvalRecord: ${error}`,
202
+ };
203
+ }
204
+ },
205
+ });
206
+
207
+ /**
208
+ * Scope Drift Scorer
209
+ *
210
+ * Penalizes when agents touch files NOT in their planned scope.
211
+ * Scope drift indicates poor planning or unexpected dependencies.
212
+ *
213
+ * Score: 1.0 if no drift (all actual files were planned)
214
+ * Decreases linearly with drift percentage
215
+ * 0.0 if drift > 50%
216
+ */
217
+ export const scopeDrift = createScorer({
218
+ name: "Scope Drift",
219
+ description:
220
+ "Agents stayed within their planned file scope (no unexpected files)",
221
+ scorer: ({ output }) => {
222
+ try {
223
+ const record = JSON.parse(String(output)) as EvalRecord;
224
+
225
+ // Check if outcomes exist
226
+ if (!record.outcomes || record.outcomes.length === 0) {
227
+ return {
228
+ score: 0,
229
+ message: "No outcome data available",
230
+ };
231
+ }
232
+
233
+ // Calculate drift per subtask
234
+ let totalActual = 0;
235
+ let totalDrift = 0;
236
+
237
+ for (const outcome of record.outcomes) {
238
+ const planned = new Set(outcome.planned_files);
239
+ const actual = new Set(outcome.actual_files);
240
+
241
+ // Count files in actual but NOT in planned
242
+ const drift = Array.from(actual).filter((f) => !planned.has(f));
243
+
244
+ totalActual += actual.size;
245
+ totalDrift += drift.length;
246
+ }
247
+
248
+ if (totalActual === 0) {
249
+ return {
250
+ score: 1,
251
+ message: "No files touched",
252
+ };
253
+ }
254
+
255
+ const driftRatio = totalDrift / totalActual;
256
+
257
+ // Score: 1.0 if no drift, linearly decrease to 0 at 50% drift
258
+ const score = Math.max(0, 1.0 - driftRatio * 2);
259
+
260
+ const driftPct = (driftRatio * 100).toFixed(0);
261
+
262
+ return {
263
+ score,
264
+ message: `${totalDrift}/${totalActual} files were unplanned (${driftPct}% drift)`,
265
+ };
266
+ } catch (error) {
267
+ return {
268
+ score: 0,
269
+ message: `Failed to parse EvalRecord: ${error}`,
270
+ };
271
+ }
272
+ },
273
+ });
274
+
275
+ /**
276
+ * No Rework Scorer
277
+ *
278
+ * Checks that no subtask touched files assigned to another subtask.
279
+ * Rework indicates poor decomposition or missing dependencies.
280
+ *
281
+ * Score: 1.0 if no rework (no subtask touched another's planned files)
282
+ * 0.0 if rework detected
283
+ */
284
+ export const noRework = createScorer({
285
+ name: "No Rework",
286
+ description: "No subtask touched files assigned to another subtask",
287
+ scorer: ({ output }) => {
288
+ try {
289
+ const record = JSON.parse(String(output)) as EvalRecord;
290
+
291
+ // Check if outcomes exist
292
+ if (!record.outcomes || record.outcomes.length === 0) {
293
+ return {
294
+ score: 0,
295
+ message: "No outcome data available",
296
+ };
297
+ }
298
+
299
+ // Build map of planned files per subtask
300
+ const plannedBySubtask = new Map<string, Set<string>>();
301
+
302
+ for (const outcome of record.outcomes) {
303
+ plannedBySubtask.set(outcome.bead_id, new Set(outcome.planned_files));
304
+ }
305
+
306
+ // Check each subtask for rework
307
+ const reworkCases: string[] = [];
308
+
309
+ for (const outcome of record.outcomes) {
310
+ const actualFiles = new Set(outcome.actual_files);
311
+
312
+ // Check if this subtask touched files planned for another subtask
313
+ for (const [otherBeadId, otherPlanned] of plannedBySubtask.entries()) {
314
+ if (otherBeadId === outcome.bead_id) {
315
+ continue; // Skip self
316
+ }
317
+
318
+ // Find intersection
319
+ const overlap = Array.from(actualFiles).filter((f) =>
320
+ otherPlanned.has(f),
321
+ );
322
+
323
+ if (overlap.length > 0) {
324
+ reworkCases.push(
325
+ `${outcome.title || outcome.bead_id} touched ${overlap.length} file(s) from ${otherBeadId}`,
326
+ );
327
+ }
328
+ }
329
+ }
330
+
331
+ if (reworkCases.length > 0) {
332
+ return {
333
+ score: 0,
334
+ message: `Rework detected: ${reworkCases.join("; ")}`,
335
+ };
336
+ }
337
+
338
+ return {
339
+ score: 1,
340
+ message: "No rework - all subtasks stayed in their lanes",
341
+ };
342
+ } catch (error) {
343
+ return {
344
+ score: 0,
345
+ message: `Failed to parse EvalRecord: ${error}`,
346
+ };
347
+ }
348
+ },
349
+ });