opencode-swarm-plugin 0.20.0 → 0.21.0

package/evals/swarm-decomposition.eval.ts ADDED
@@ -0,0 +1,112 @@
+/**
+ * Swarm Decomposition Quality Eval
+ *
+ * Tests the quality of task decomposition for swarm coordination.
+ * Uses real LLM calls via AI SDK + Vercel AI Gateway.
+ *
+ * Scorers evaluate:
+ * - Subtask independence (no file conflicts)
+ * - Complexity balance (even distribution)
+ * - Coverage completeness (all required files)
+ * - Instruction clarity (actionable descriptions)
+ *
+ * Run with: pnpm evalite evals/swarm-decomposition.eval.ts
+ *
+ * Requires: ANTHROPIC_API_KEY environment variable
+ */
+import { evalite } from "evalite";
+import {
+  subtaskIndependence,
+  coverageCompleteness,
+  instructionClarity,
+} from "./scorers/index.js";
+import { decompositionCases } from "./fixtures/decomposition-cases.js";
+import {
+  generateDecomposition,
+  formatDecompositionPrompt,
+  extractJson,
+} from "./lib/llm.js";
+import {
+  loadEvalCases,
+  hasRealEvalData,
+  getEvalDataSummary,
+} from "./lib/data-loader.js";
+
+// Determine project key from current directory
+const PROJECT_KEY = "opencode-swarm-plugin";
+const PROJECT_PATH = process.cwd();
+
+// Check if we have enough real data to use instead of fixtures
+const useRealData = await hasRealEvalData(PROJECT_KEY, 5, PROJECT_PATH);
+
+// Load data based on availability
+const evalCases = useRealData
+  ? await loadEvalCases(PROJECT_KEY, { limit: 20, projectPath: PROJECT_PATH })
+  : decompositionCases.map((testCase) => ({
+      input: testCase.input,
+      expected: testCase.expected,
+    }));
+
+// Log data source for transparency
+if (useRealData) {
+  const summary = await getEvalDataSummary(PROJECT_KEY, PROJECT_PATH);
+  console.log(`[eval] Using real data from PGlite:`);
+  console.log(`  - Total records: ${summary.totalRecords}`);
+  console.log(`  - Success rate: ${(summary.successRate * 100).toFixed(1)}%`);
+  console.log(
+    `  - Strategies: ${Object.entries(summary.byStrategy)
+      .map(([s, c]) => `${s}(${c})`)
+      .join(", ")}`,
+  );
+  console.log(`  - Eval cases: ${evalCases.length}`);
+} else {
+  console.log(
+    `[eval] Using fixture data (${evalCases.length} cases) - not enough real data yet`,
+  );
+}
+
+/**
+ * Swarm Decomposition Quality Eval
+ *
+ * Tests decomposition quality with real LLM calls.
+ */
+evalite("Swarm Decomposition Quality", {
+  // Test data from PGlite or fixtures
+  data: async () => evalCases,
+
+  // Task: generate real decomposition via Claude
+  task: async (input) => {
+    const prompt = formatDecompositionPrompt(input.task, input.context);
+    const response = await generateDecomposition(prompt);
+    return extractJson(response);
+  },
+
+  // Scorers evaluate decomposition quality
+  scorers: [subtaskIndependence, coverageCompleteness, instructionClarity],
+});
+
+/**
+ * Edge Case Eval: Minimal and Complex Tasks
+ *
+ * Tests handling of edge cases in decomposition.
+ */
+evalite("Decomposition Edge Cases", {
+  data: async () => [
+    {
+      input: { task: "Fix typo in README.md" },
+      expected: { minSubtasks: 1, maxSubtasks: 2 },
+    },
+    {
+      input: { task: "Refactor entire codebase from JavaScript to TypeScript" },
+      expected: { minSubtasks: 4, maxSubtasks: 8 },
+    },
+  ],
+
+  task: async (input) => {
+    const prompt = formatDecompositionPrompt(input.task, undefined, 8);
+    const response = await generateDecomposition(prompt);
+    return extractJson(response);
+  },
+
+  scorers: [subtaskIndependence, coverageCompleteness],
+});
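The `./lib/llm.js` helpers imported above (`formatDecompositionPrompt`, `generateDecomposition`, `extractJson`) are not shown in this diff. A minimal sketch of what they might look like, assuming the AI SDK's `generateText` with a Gateway-style model string; the prompt wording, model ID, and `maxSubtasks` default are assumptions, not the package's actual implementation:

```ts
// Hypothetical sketch of evals/lib/llm.ts; not part of this diff.
import { generateText } from "ai";

// Build a decomposition prompt. The signature is inferred from the call
// sites above; the wording is illustrative only.
export function formatDecompositionPrompt(
  task: string,
  context?: string,
  maxSubtasks = 5,
): string {
  return [
    `Decompose this task into at most ${maxSubtasks} independent subtasks.`,
    `Respond with JSON: {"epic_title": string, "subtasks": [{"title": string, "files": string[]}]}`,
    context ? `Context:\n${context}` : "",
    `Task: ${task}`,
  ]
    .filter(Boolean)
    .join("\n\n");
}

// Call Claude through the AI SDK. The model ID is an assumed
// Gateway-routed identifier.
export async function generateDecomposition(prompt: string): Promise<string> {
  const { text } = await generateText({
    model: "anthropic/claude-sonnet-4",
    prompt,
  });
  return text;
}

// Extract the first JSON object from a response that may wrap it in
// prose or code fences.
export function extractJson(response: string): unknown {
  const match = response.match(/\{[\s\S]*\}/);
  if (!match) throw new Error("No JSON object found in LLM response");
  return JSON.parse(match[0]);
}
```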
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "opencode-swarm-plugin",
-  "version": "0.20.0",
+  "version": "0.21.0",
   "description": "Multi-agent swarm coordination for OpenCode with learning capabilities, beads integration, and Agent Mail",
   "type": "module",
   "main": "dist/index.js",
@@ -27,6 +27,9 @@
     "test:all": "bun run test && bun run test:swarm",
     "typecheck": "tsc --noEmit",
     "clean": "rm -rf dist",
+    "eval:dev": "evalite watch evals/",
+    "eval:run": "evalite run evals/",
+    "eval:ci": "evalite run evals/ --threshold 80",
     "release": "npm run build && npm version patch && git push && npm run publish:otp",
     "release:minor": "npm run build && npm version minor && git push && npm run publish:otp",
     "release:major": "npm run build && npm version major && git push && npm run publish:otp",
@@ -41,11 +44,15 @@
     "gray-matter": "^4.0.3",
     "ioredis": "^5.4.1",
     "minimatch": "^10.1.1",
+    "nanoid": "^5.1.6",
     "zod": "4.1.8"
   },
   "devDependencies": {
     "@types/bun": "latest",
     "@types/minimatch": "^6.0.0",
+    "ai": "6.0.0-beta.150",
+    "bun-types": "^1.3.4",
+    "evalite": "^1.0.0-beta.10",
     "typescript": "^5.7.0",
     "vitest": "^4.0.15"
   },
package/src/beads.ts CHANGED
@@ -104,6 +104,8 @@ import {
   type BeadCreateArgs,
   type EpicCreateResult,
 } from "./schemas";
+import { createEvent } from "./streams/events";
+import { appendEvent } from "./streams/store";
 
 /**
  * Custom error for bead operations
@@ -321,6 +323,26 @@ export const beads_create_epic = tool({
         }),
       )
       .describe("Subtasks to create under the epic"),
+    strategy: tool.schema
+      .enum(["file-based", "feature-based", "risk-based"])
+      .optional()
+      .describe("Decomposition strategy used (default: feature-based)"),
+    task: tool.schema
+      .string()
+      .optional()
+      .describe("Original task description that was decomposed"),
+    project_key: tool.schema
+      .string()
+      .optional()
+      .describe("Project path for event emission"),
+    recovery_context: tool.schema
+      .object({
+        shared_context: tool.schema.string().optional(),
+        skills_to_load: tool.schema.array(tool.schema.string()).optional(),
+        coordinator_notes: tool.schema.string().optional(),
+      })
+      .optional()
+      .describe("Recovery context from checkpoint compaction"),
   },
   async execute(args, ctx) {
     const validated = EpicCreateArgsSchema.parse(args);
@@ -386,6 +408,33 @@
         subtasks: created.slice(1),
       };
 
+      // Emit DecompositionGeneratedEvent for learning system
+      if (args.project_key) {
+        try {
+          const event = createEvent("decomposition_generated", {
+            project_key: args.project_key,
+            epic_id: epic.id,
+            task: args.task || validated.epic_title,
+            context: validated.epic_description,
+            strategy: args.strategy || "feature-based",
+            epic_title: validated.epic_title,
+            subtasks: validated.subtasks.map((st) => ({
+              title: st.title,
+              files: st.files || [],
+              priority: st.priority,
+            })),
+            recovery_context: args.recovery_context,
+          });
+          await appendEvent(event, args.project_key);
+        } catch (error) {
+          // Non-fatal - log and continue
+          console.warn(
+            "[beads_create_epic] Failed to emit DecompositionGeneratedEvent:",
+            error,
+          );
+        }
+      }
+
       return JSON.stringify(result, null, 2);
     } catch (error) {
       // Partial failure - execute rollback automatically
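For orientation, here is a hypothetical shape for the arguments `beads_create_epic` now accepts; the new optional field names come from the schema above, while the values and the pre-existing required fields are invented for the example:

```ts
// Illustrative args only; field names match the schema in this diff,
// values are made up.
const args = {
  epic_title: "Add eval data capture",
  subtasks: [
    { title: "Create eval-capture module", files: ["src/eval-capture.ts"], priority: 1 },
    { title: "Emit decomposition events", files: ["src/beads.ts"], priority: 2 },
  ],
  strategy: "feature-based", // new: decomposition strategy used
  task: "Capture swarm execution data for evals", // new: original task text
  project_key: "/path/to/project", // new: enables DecompositionGeneratedEvent emission
  recovery_context: {
    // new: carried over from checkpoint compaction
    shared_context: "Epic resumed after context compaction",
    skills_to_load: ["typescript"],
    coordinator_notes: "Prefer small, file-scoped subtasks",
  },
};
```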
package/src/eval-capture.ts ADDED
@@ -0,0 +1,487 @@
+/**
+ * Eval Data Capture - Captures real swarm execution data for evals
+ *
+ * Records decomposition inputs, outputs, and outcomes to JSONL files
+ * that can be used as ground truth for Evalite evals.
+ *
+ * Data flow:
+ * 1. swarm_decompose captures: task, context, generated decomposition
+ * 2. swarm_complete captures: outcome signals per subtask
+ * 3. swarm_record_outcome captures: learning signals
+ * 4. Human feedback (optional): accept/reject/modify
+ *
+ * @module eval-capture
+ */
+import { z } from "zod";
+import * as fs from "fs";
+import * as path from "path";
+
+// ============================================================================
+// Schemas
+// ============================================================================
+
+/**
+ * Subtask outcome - what actually happened
+ */
+export const SubtaskOutcomeSchema = z.object({
+  /** Subtask bead ID */
+  bead_id: z.string(),
+  /** Subtask title */
+  title: z.string(),
+  /** Planned files */
+  planned_files: z.array(z.string()),
+  /** Actual files touched */
+  actual_files: z.array(z.string()),
+  /** Duration in ms */
+  duration_ms: z.number().int().min(0),
+  /** Error count */
+  error_count: z.number().int().min(0),
+  /** Retry count */
+  retry_count: z.number().int().min(0),
+  /** Success */
+  success: z.boolean(),
+  /** Failure mode if failed */
+  failure_mode: z.string().optional(),
+});
+export type SubtaskOutcome = z.infer<typeof SubtaskOutcomeSchema>;
+
+/**
+ * Complete eval record - input, output, and outcome
+ */
+export const EvalRecordSchema = z.object({
+  /** Unique ID for this eval record */
+  id: z.string(),
+  /** Timestamp when decomposition was generated */
+  timestamp: z.string(), // ISO-8601
+  /** Project path */
+  project_path: z.string(),
+
+  // INPUT
+  /** Original task description */
+  task: z.string(),
+  /** Context provided (codebase info, CASS results, etc.) */
+  context: z.string().optional(),
+  /** Strategy used for decomposition */
+  strategy: z.enum(["file-based", "feature-based", "risk-based", "auto"]),
+  /** Max subtasks requested */
+  max_subtasks: z.number().int().min(1).max(10),
+
+  // OUTPUT (the decomposition)
+  /** Epic title */
+  epic_title: z.string(),
+  /** Epic description */
+  epic_description: z.string().optional(),
+  /** Generated subtasks */
+  subtasks: z.array(
+    z.object({
+      title: z.string(),
+      description: z.string().optional(),
+      files: z.array(z.string()),
+      dependencies: z.array(z.number()).optional(),
+      estimated_complexity: z.number().int().min(1).max(5).optional(),
+    }),
+  ),
+
+  // OUTCOME (what actually happened)
+  /** Subtask outcomes */
+  outcomes: z.array(SubtaskOutcomeSchema).optional(),
+  /** Overall success (all subtasks succeeded) */
+  overall_success: z.boolean().optional(),
+  /** Total duration (sum of all subtasks) */
+  total_duration_ms: z.number().int().min(0).optional(),
+  /** Total errors across all subtasks */
+  total_errors: z.number().int().min(0).optional(),
+
+  // HUMAN FEEDBACK (optional)
+  /** Human accepted the decomposition as-is */
+  human_accepted: z.boolean().optional(),
+  /** Human modified the decomposition */
+  human_modified: z.boolean().optional(),
+  /** Human feedback notes */
+  human_notes: z.string().optional(),
+
+  // COMPUTED METRICS
+  /** File overlap between subtasks (should be 0) */
+  file_overlap_count: z.number().int().min(0).optional(),
+  /** Scope accuracy: actual files / planned files */
+  scope_accuracy: z.number().min(0).max(2).optional(),
+  /** Time balance: max duration / min duration (lower is better) */
+  time_balance_ratio: z.number().min(1).optional(),
+});
+export type EvalRecord = z.infer<typeof EvalRecordSchema>;
+
+/**
+ * Partial record for in-progress capture
+ */
+export type PartialEvalRecord = Partial<EvalRecord> & {
+  id: string;
+  timestamp: string;
+  task: string;
+};
+
+// ============================================================================
+// Storage
+// ============================================================================
+
+/**
+ * Default path for eval data
+ */
+export const DEFAULT_EVAL_DATA_PATH = ".opencode/eval-data.jsonl";
+
+/**
+ * Get the eval data file path for a project
+ */
+export function getEvalDataPath(projectPath: string): string {
+  return path.join(projectPath, DEFAULT_EVAL_DATA_PATH);
+}
+
+/**
+ * Ensure the eval data directory exists
+ */
+export function ensureEvalDataDir(projectPath: string): void {
+  const evalPath = getEvalDataPath(projectPath);
+  const dir = path.dirname(evalPath);
+  if (!fs.existsSync(dir)) {
+    fs.mkdirSync(dir, { recursive: true });
+  }
+}
+
+/**
+ * Append an eval record to the JSONL file
+ */
+export function appendEvalRecord(
+  projectPath: string,
+  record: EvalRecord | PartialEvalRecord,
+): void {
+  ensureEvalDataDir(projectPath);
+  const evalPath = getEvalDataPath(projectPath);
+  const line = JSON.stringify(record) + "\n";
+  fs.appendFileSync(evalPath, line, "utf-8");
+}
+
+/**
+ * Read all eval records from a project
+ */
+export function readEvalRecords(projectPath: string): EvalRecord[] {
+  const evalPath = getEvalDataPath(projectPath);
+  if (!fs.existsSync(evalPath)) {
+    return [];
+  }
+
+  const content = fs.readFileSync(evalPath, "utf-8");
+  const lines = content.trim().split("\n").filter(Boolean);
+
+  return lines.map((line) => {
+    const parsed = JSON.parse(line);
+    return EvalRecordSchema.parse(parsed);
+  });
+}
+
+/**
+ * Read partial records (for updating in-progress records)
+ */
+export function readPartialRecords(projectPath: string): PartialEvalRecord[] {
+  const evalPath = getEvalDataPath(projectPath);
+  if (!fs.existsSync(evalPath)) {
+    return [];
+  }
+
+  const content = fs.readFileSync(evalPath, "utf-8");
+  const lines = content.trim().split("\n").filter(Boolean);
+
+  return lines.map((line) => JSON.parse(line) as PartialEvalRecord);
+}
+
+/**
+ * Update an existing record by ID
+ */
+export function updateEvalRecord(
+  projectPath: string,
+  id: string,
+  updates: Partial<EvalRecord>,
+): boolean {
+  const records = readPartialRecords(projectPath);
+  const index = records.findIndex((r) => r.id === id);
+
+  if (index === -1) {
+    return false;
+  }
+
+  records[index] = { ...records[index], ...updates };
+
+  // Rewrite the file
+  const evalPath = getEvalDataPath(projectPath);
+  const content = records.map((r) => JSON.stringify(r)).join("\n") + "\n";
+  fs.writeFileSync(evalPath, content, "utf-8");
+
+  return true;
+}
+
+// ============================================================================
+// Capture Functions
+// ============================================================================
+
+/**
+ * In-memory store for in-progress records (keyed by epic ID)
+ */
+const inProgressRecords = new Map<string, PartialEvalRecord>();
+
+/**
+ * Start capturing a decomposition
+ *
+ * Called when swarm_decompose generates a decomposition.
+ * Creates a partial record that will be completed when outcomes arrive.
+ */
+export function captureDecomposition(params: {
+  epicId: string;
+  projectPath: string;
+  task: string;
+  context?: string;
+  strategy: "file-based" | "feature-based" | "risk-based" | "auto";
+  maxSubtasks: number;
+  epicTitle: string;
+  epicDescription?: string;
+  subtasks: Array<{
+    title: string;
+    description?: string;
+    files: string[];
+    dependencies?: number[];
+    estimated_complexity?: number;
+  }>;
+}): PartialEvalRecord {
+  const record: PartialEvalRecord = {
+    id: params.epicId,
+    timestamp: new Date().toISOString(),
+    project_path: params.projectPath,
+    task: params.task,
+    context: params.context,
+    strategy: params.strategy,
+    max_subtasks: params.maxSubtasks,
+    epic_title: params.epicTitle,
+    epic_description: params.epicDescription,
+    subtasks: params.subtasks,
+    outcomes: [],
+  };
+
+  // Store in memory for later updates
+  inProgressRecords.set(params.epicId, record);
+
+  // Also persist to disk (partial)
+  appendEvalRecord(params.projectPath, record);
+
+  return record;
+}
+
+/**
+ * Capture a subtask outcome
+ *
+ * Called when swarm_complete finishes a subtask.
+ * Updates the in-progress record with outcome data.
+ */
+export function captureSubtaskOutcome(params: {
+  epicId: string;
+  projectPath: string;
+  beadId: string;
+  title: string;
+  plannedFiles: string[];
+  actualFiles: string[];
+  durationMs: number;
+  errorCount: number;
+  retryCount: number;
+  success: boolean;
+  failureMode?: string;
+}): void {
+  const outcome: SubtaskOutcome = {
+    bead_id: params.beadId,
+    title: params.title,
+    planned_files: params.plannedFiles,
+    actual_files: params.actualFiles,
+    duration_ms: params.durationMs,
+    error_count: params.errorCount,
+    retry_count: params.retryCount,
+    success: params.success,
+    failure_mode: params.failureMode,
+  };
+
+  // Update in-memory record
+  const record = inProgressRecords.get(params.epicId);
+  if (record) {
+    record.outcomes = record.outcomes || [];
+    record.outcomes.push(outcome);
+  }
+
+  // Update on disk
+  updateEvalRecord(params.projectPath, params.epicId, {
+    outcomes: record?.outcomes,
+  });
+}
+
+/**
+ * Finalize an eval record
+ *
+ * Called when all subtasks are complete.
+ * Computes aggregate metrics and marks record as complete.
+ */
+export function finalizeEvalRecord(params: {
+  epicId: string;
+  projectPath: string;
+}): EvalRecord | null {
+  const record = inProgressRecords.get(params.epicId);
+  if (!record || !record.outcomes || record.outcomes.length === 0) {
+    return null;
+  }
+
+  // Compute aggregate metrics
+  const outcomes = record.outcomes;
+
+  const overallSuccess = outcomes.every((o) => o.success);
+  const totalDurationMs = outcomes.reduce((sum, o) => sum + o.duration_ms, 0);
+  const totalErrors = outcomes.reduce((sum, o) => sum + o.error_count, 0);
+
+  // File overlap: count files that appear in multiple subtasks
+  const allPlannedFiles = record.subtasks?.flatMap((s) => s.files) || [];
+  const fileOccurrences = new Map<string, number>();
+  for (const file of allPlannedFiles) {
+    fileOccurrences.set(file, (fileOccurrences.get(file) || 0) + 1);
+  }
+  const fileOverlapCount = Array.from(fileOccurrences.values()).filter(
+    (count) => count > 1,
+  ).length;
+
+  // Scope accuracy: actual files / planned files
+  const plannedFileSet = new Set(allPlannedFiles);
+  const actualFileSet = new Set(outcomes.flatMap((o) => o.actual_files));
+  const scopeAccuracy =
+    plannedFileSet.size > 0 ? actualFileSet.size / plannedFileSet.size : 1;
+
+  // Time balance: max duration / min duration
+  const durations = outcomes.map((o) => o.duration_ms).filter((d) => d > 0);
+  const timeBalanceRatio =
+    durations.length > 1 ? Math.max(...durations) / Math.min(...durations) : 1;
+
+  // Update record with computed metrics
+  const finalRecord: EvalRecord = {
+    ...(record as EvalRecord),
+    overall_success: overallSuccess,
+    total_duration_ms: totalDurationMs,
+    total_errors: totalErrors,
+    file_overlap_count: fileOverlapCount,
+    scope_accuracy: scopeAccuracy,
+    time_balance_ratio: timeBalanceRatio,
+  };
+
+  // Update on disk
+  updateEvalRecord(params.projectPath, params.epicId, finalRecord);
+
+  // Remove from in-progress
+  inProgressRecords.delete(params.epicId);
+
+  return finalRecord;
+}
+
+/**
+ * Capture human feedback on a decomposition
+ */
+export function captureHumanFeedback(params: {
+  epicId: string;
+  projectPath: string;
+  accepted: boolean;
+  modified: boolean;
+  notes?: string;
+}): void {
+  updateEvalRecord(params.projectPath, params.epicId, {
+    human_accepted: params.accepted,
+    human_modified: params.modified,
+    human_notes: params.notes,
+  });
+}
+
+// ============================================================================
+// Eval Data Export
+// ============================================================================
+
+/**
+ * Export eval records as Evalite-compatible test cases
+ *
+ * Filters to only complete records with outcomes.
+ */
+export function exportForEvalite(projectPath: string): Array<{
+  input: { task: string; context?: string };
+  expected: {
+    minSubtasks: number;
+    maxSubtasks: number;
+    requiredFiles?: string[];
+    overallSuccess?: boolean;
+  };
+  actual: EvalRecord;
+}> {
+  const records = readEvalRecords(projectPath);
+
+  return records
+    .filter((r) => r.outcomes && r.outcomes.length > 0)
+    .map((record) => ({
+      input: {
+        task: record.task,
+        context: record.context,
+      },
+      expected: {
+        minSubtasks: 2,
+        maxSubtasks: record.max_subtasks,
+        requiredFiles: record.subtasks.flatMap((s) => s.files),
+        overallSuccess: record.overall_success,
+      },
+      actual: record,
+    }));
+}
+
+/**
+ * Get statistics about captured eval data
+ */
+export function getEvalDataStats(projectPath: string): {
+  totalRecords: number;
+  completeRecords: number;
+  successRate: number;
+  avgSubtasks: number;
+  avgDurationMs: number;
+  avgScopeAccuracy: number;
+  avgTimeBalance: number;
+} {
+  const records = readEvalRecords(projectPath);
+  const complete = records.filter((r) => r.outcomes && r.outcomes.length > 0);
+
+  if (complete.length === 0) {
+    return {
+      totalRecords: records.length,
+      completeRecords: 0,
+      successRate: 0,
+      avgSubtasks: 0,
+      avgDurationMs: 0,
+      avgScopeAccuracy: 0,
+      avgTimeBalance: 0,
+    };
+  }
+
+  const successCount = complete.filter((r) => r.overall_success).length;
+  const avgSubtasks =
+    complete.reduce((sum, r) => sum + (r.outcomes?.length || 0), 0) /
+    complete.length;
+  const avgDurationMs =
+    complete.reduce((sum, r) => sum + (r.total_duration_ms || 0), 0) /
+    complete.length;
+  const avgScopeAccuracy =
+    complete.reduce((sum, r) => sum + (r.scope_accuracy || 1), 0) /
+    complete.length;
+  const avgTimeBalance =
+    complete.reduce((sum, r) => sum + (r.time_balance_ratio || 1), 0) /
+    complete.length;
+
+  return {
+    totalRecords: records.length,
+    completeRecords: complete.length,
+    successRate: successCount / complete.length,
+    avgSubtasks,
+    avgDurationMs,
+    avgScopeAccuracy,
+    avgTimeBalance,
+  };
+}
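Taken together, the exported functions implement the lifecycle the module's data-flow comment describes. A usage sketch, with invented IDs and values (the function signatures themselves are as defined above):

```ts
import {
  captureDecomposition,
  captureSubtaskOutcome,
  finalizeEvalRecord,
  captureHumanFeedback,
  getEvalDataStats,
} from "./eval-capture";

const projectPath = process.cwd();

// 1. When a decomposition is generated (e.g. from swarm_decompose):
captureDecomposition({
  epicId: "epic-123", // invented ID
  projectPath,
  task: "Add JSONL eval capture",
  strategy: "feature-based",
  maxSubtasks: 5,
  epicTitle: "Eval capture",
  subtasks: [{ title: "Write module", files: ["src/eval-capture.ts"] }],
});

// 2. As each subtask finishes (e.g. from swarm_complete):
captureSubtaskOutcome({
  epicId: "epic-123",
  projectPath,
  beadId: "bead-1",
  title: "Write module",
  plannedFiles: ["src/eval-capture.ts"],
  actualFiles: ["src/eval-capture.ts"],
  durationMs: 90_000,
  errorCount: 0,
  retryCount: 0,
  success: true,
});

// 3. When the epic completes: computes overlap, scope accuracy, time balance.
const finalRecord = finalizeEvalRecord({ epicId: "epic-123", projectPath });
console.log(finalRecord?.overall_success);

// 4. Optional human feedback, then aggregate stats for reporting.
captureHumanFeedback({
  epicId: "epic-123",
  projectPath,
  accepted: true,
  modified: false,
});
console.log(getEvalDataStats(projectPath));
```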