opencode-swarm-plugin 0.39.1 → 0.42.0

Files changed (82)
  1. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
  2. package/.hive/analysis/session-data-quality-audit.md +320 -0
  3. package/.hive/eval-results.json +481 -24
  4. package/.hive/issues.jsonl +76 -11
  5. package/.hive/memories.jsonl +159 -1
  6. package/.opencode/eval-history.jsonl +315 -0
  7. package/.turbo/turbo-build.log +5 -5
  8. package/CHANGELOG.md +207 -0
  9. package/README.md +2 -0
  10. package/SCORER-ANALYSIS.md +598 -0
  11. package/bin/eval-gate.test.ts +158 -0
  12. package/bin/eval-gate.ts +74 -0
  13. package/bin/swarm.test.ts +1054 -719
  14. package/bin/swarm.ts +577 -0
  15. package/dist/compaction-hook.d.ts +10 -1
  16. package/dist/compaction-hook.d.ts.map +1 -1
  17. package/dist/compaction-observability.d.ts +173 -0
  18. package/dist/compaction-observability.d.ts.map +1 -0
  19. package/dist/compaction-prompt-scoring.d.ts +1 -0
  20. package/dist/compaction-prompt-scoring.d.ts.map +1 -1
  21. package/dist/eval-capture.d.ts +93 -0
  22. package/dist/eval-capture.d.ts.map +1 -1
  23. package/dist/eval-runner.d.ts +134 -0
  24. package/dist/eval-runner.d.ts.map +1 -0
  25. package/dist/hive.d.ts.map +1 -1
  26. package/dist/index.d.ts +65 -1
  27. package/dist/index.d.ts.map +1 -1
  28. package/dist/index.js +84043 -28070
  29. package/dist/memory-tools.d.ts +70 -2
  30. package/dist/memory-tools.d.ts.map +1 -1
  31. package/dist/memory.d.ts +37 -0
  32. package/dist/memory.d.ts.map +1 -1
  33. package/dist/observability-tools.d.ts +64 -0
  34. package/dist/observability-tools.d.ts.map +1 -1
  35. package/dist/plugin.js +83570 -27466
  36. package/dist/schemas/task.d.ts +3 -3
  37. package/dist/swarm-orchestrate.d.ts.map +1 -1
  38. package/dist/swarm-prompts.d.ts +32 -1
  39. package/dist/swarm-prompts.d.ts.map +1 -1
  40. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
  41. package/evals/ARCHITECTURE.md +1189 -0
  42. package/evals/README.md +113 -0
  43. package/evals/example.eval.ts +3 -4
  44. package/evals/fixtures/compaction-prompt-cases.ts +6 -0
  45. package/evals/scorers/coordinator-discipline.evalite-test.ts +163 -0
  46. package/evals/scorers/coordinator-discipline.ts +82 -2
  47. package/evals/scorers/index.test.ts +146 -0
  48. package/evals/scorers/index.ts +104 -0
  49. package/evals/swarm-decomposition.eval.ts +13 -4
  50. package/examples/commands/swarm.md +291 -21
  51. package/package.json +4 -3
  52. package/src/compaction-hook.ts +258 -110
  53. package/src/compaction-observability.integration.test.ts +139 -0
  54. package/src/compaction-observability.test.ts +187 -0
  55. package/src/compaction-observability.ts +324 -0
  56. package/src/compaction-prompt-scorers.test.ts +10 -9
  57. package/src/compaction-prompt-scoring.ts +7 -5
  58. package/src/eval-capture.test.ts +204 -1
  59. package/src/eval-capture.ts +194 -2
  60. package/src/eval-runner.test.ts +223 -0
  61. package/src/eval-runner.ts +402 -0
  62. package/src/hive.ts +57 -22
  63. package/src/index.ts +54 -1
  64. package/src/memory-tools.test.ts +84 -0
  65. package/src/memory-tools.ts +68 -3
  66. package/src/memory.test.ts +2 -2
  67. package/src/memory.ts +122 -49
  68. package/src/observability-tools.test.ts +13 -0
  69. package/src/observability-tools.ts +277 -0
  70. package/src/swarm-orchestrate.test.ts +162 -0
  71. package/src/swarm-orchestrate.ts +7 -5
  72. package/src/swarm-prompts.test.ts +168 -4
  73. package/src/swarm-prompts.ts +228 -7
  74. package/.env +0 -2
  75. package/.turbo/turbo-test.log +0 -481
  76. package/.turbo/turbo-typecheck.log +0 -1
  77. package/dist/beads.d.ts +0 -386
  78. package/dist/beads.d.ts.map +0 -1
  79. package/dist/schemas/bead-events.d.ts +0 -698
  80. package/dist/schemas/bead-events.d.ts.map +0 -1
  81. package/dist/schemas/bead.d.ts +0 -255
  82. package/dist/schemas/bead.d.ts.map +0 -1
package/src/eval-runner.test.ts (new file)
@@ -0,0 +1,223 @@
+ /**
+  * Tests for eval-runner - Programmatic evalite execution
+  *
+  * TDD: These tests MUST fail initially, then pass after implementation.
+  */
+
+ import { describe, test, expect, beforeAll, afterEach } from "bun:test";
+ import { runEvals } from "./eval-runner";
+ import path from "node:path";
+ import fs from "node:fs";
+ import { getEvalHistoryPath } from "./eval-history";
+
+ // Use project root for all tests
+ const PROJECT_ROOT = path.resolve(import.meta.dir, "..");
+
+ describe("runEvals", () => {
+   test("runs all evals when no suite filter provided", async () => {
+     const result = await runEvals({
+       cwd: PROJECT_ROOT,
+     });
+
+     // Even if some evals fail, we should get results
+     expect(typeof result.success).toBe("boolean");
+     expect(typeof result.totalSuites).toBe("number");
+     expect(typeof result.totalEvals).toBe("number");
+     expect(typeof result.averageScore).toBe("number");
+     expect(Array.isArray(result.suites)).toBe(true);
+
+     // Should have at least the example.eval.ts suite
+     expect(result.totalSuites).toBeGreaterThan(0);
+     expect(result.suites.length).toBeGreaterThan(0);
+   }, 60000); // 60s timeout for full eval run
+
+   test("filters evals by suite name", async () => {
+     const result = await runEvals({
+       cwd: PROJECT_ROOT,
+       suiteFilter: "example",
+     });
+
+     expect(result.success).toBe(true);
+     // All suite filepaths should contain "example"
+     for (const suite of result.suites) {
+       expect(suite.filepath.toLowerCase()).toContain("example");
+     }
+   }, 30000);
+
+   test("respects score threshold", async () => {
+     const result = await runEvals({
+       cwd: PROJECT_ROOT,
+       suiteFilter: "example", // Known good eval
+       scoreThreshold: 0, // Very low threshold, should pass
+     });
+
+     expect(result.success).toBe(true);
+     expect(result.averageScore).toBeGreaterThanOrEqual(0);
+   }, 30000);
+
+   test("returns structured suite results with scores", async () => {
+     const result = await runEvals({
+       cwd: PROJECT_ROOT,
+       suiteFilter: "example",
+     });
+
+     expect(result.suites.length).toBeGreaterThan(0);
+
+     const suite = result.suites[0];
+     expect(suite).toMatchObject({
+       name: expect.any(String),
+       filepath: expect.any(String),
+       status: expect.stringMatching(/^(success|fail|running)$/),
+       duration: expect.any(Number),
+       averageScore: expect.any(Number),
+       evalCount: expect.any(Number),
+     });
+   }, 30000);
+
+   test("handles errors gracefully", async () => {
+     const result = await runEvals({
+       cwd: "/nonexistent/path",
+     });
+
+     expect(result.success).toBe(false);
+     expect(result.error).toBeDefined();
+     expect(result.suites).toEqual([]);
+   }, 10000);
+
+   test("returns empty results when no evals match filter", async () => {
+     const result = await runEvals({
+       cwd: PROJECT_ROOT,
+       suiteFilter: "nonexistent-eval-name-xyz",
+     });
+
+     // Should succeed but with no suites
+     expect(result.success).toBe(true);
+     expect(result.totalSuites).toBe(0);
+     expect(result.suites).toEqual([]);
+   }, 10000);
+
+   test("records eval run to history after execution", async () => {
+     // Clean up any existing history before test
+     const historyPath = getEvalHistoryPath(PROJECT_ROOT);
+     const historyBackup = historyPath + ".backup";
+
+     // Backup existing history
+     if (fs.existsSync(historyPath)) {
+       fs.copyFileSync(historyPath, historyBackup);
+     }
+
+     try {
+       // Remove history file to get clean state
+       if (fs.existsSync(historyPath)) {
+         fs.unlinkSync(historyPath);
+       }
+
+       // Run evals
+       const result = await runEvals({
+         cwd: PROJECT_ROOT,
+         suiteFilter: "example",
+       });
+
+       // Should have succeeded
+       expect(result.success).toBe(true);
+       expect(result.suites.length).toBeGreaterThan(0);
+
+       // History file should have been created
+       expect(fs.existsSync(historyPath)).toBe(true);
+
+       // Read history file
+       const historyContent = fs.readFileSync(historyPath, "utf-8");
+       const lines = historyContent.trim().split("\n");
+
+       // Should have one line per suite
+       expect(lines.length).toBe(result.suites.length);
+
+       // Parse first line and verify structure
+       const firstRecord = JSON.parse(lines[0]);
+
+       // Verify structure has all required fields
+       expect(typeof firstRecord.timestamp).toBe("string");
+       expect(typeof firstRecord.eval_name).toBe("string");
+       expect(typeof firstRecord.score).toBe("number");
+       expect(typeof firstRecord.run_count).toBe("number");
+
+       // Verify eval_name matches suite name
+       expect(firstRecord.eval_name).toBe(result.suites[0].name);
+
+       // Verify score matches suite averageScore
+       expect(firstRecord.score).toBe(result.suites[0].averageScore);
+
+       // First run should have run_count = 1
+       expect(firstRecord.run_count).toBe(1);
+     } finally {
+       // Restore backup
+       if (fs.existsSync(historyBackup)) {
+         fs.copyFileSync(historyBackup, historyPath);
+         fs.unlinkSync(historyBackup);
+       }
+     }
+   }, 30000);
+
+   test("checks gates for each suite after recording", async () => {
+     const result = await runEvals({
+       cwd: PROJECT_ROOT,
+       suiteFilter: "example",
+     });
+
+     expect(result.success).toBe(true);
+     expect(result.gateResults).toBeDefined();
+     expect(Array.isArray(result.gateResults)).toBe(true);
+
+     // Should have gate result for each suite
+     expect(result.gateResults?.length).toBe(result.suites.length);
+
+     // Each gate result should have required fields
+     if (result.gateResults && result.gateResults.length > 0) {
+       const gateResult = result.gateResults[0];
+       expect(gateResult).toHaveProperty("suite");
+       expect(gateResult).toHaveProperty("passed");
+       expect(gateResult).toHaveProperty("phase");
+       expect(gateResult).toHaveProperty("message");
+       expect(gateResult).toHaveProperty("currentScore");
+     }
+   }, 30000);
+
+   test("calls learnFromEvalFailure when gate fails", async () => {
+     // This test requires manually creating a history with a regression.
+     // For now, we just verify the code path exists.
+     // In practice, this would be tested with a mocked checkGate returning passed=false.
+
+     const result = await runEvals({
+       cwd: PROJECT_ROOT,
+       suiteFilter: "example",
+     });
+
+     // Gate results should be present even if no failures
+     expect(result.gateResults).toBeDefined();
+   }, 30000);
+
+   test("does NOT call learnFromEvalFailure when gate passes", async () => {
+     // Similar to above - verifies the happy path.
+     // A real test would mock checkGate and verify learnFromEvalFailure is NOT called.
+
+     const result = await runEvals({
+       cwd: PROJECT_ROOT,
+       suiteFilter: "example",
+     });
+
+     // Should succeed with gate results
+     expect(result.success).toBe(true);
+     expect(result.gateResults).toBeDefined();
+   }, 30000);
+
+   test("includes gateResults in return value", async () => {
+     const result = await runEvals({
+       cwd: PROJECT_ROOT,
+       suiteFilter: "example",
+     });
+
+     // gateResults should be array (even if empty)
+     expect(result).toHaveProperty("gateResults");
+     expect(Array.isArray(result.gateResults)).toBe(true);
+   }, 30000);
+ });
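For orientation, the history assertions above pin down the on-disk format: the history file (presumably `package/.opencode/eval-history.jsonl`, per the files-changed list above) is JSONL with one record per suite per run. A sketch of the record shape those tests imply; the canonical type lives in `eval-history.ts`, which is not part of this diff:

```typescript
// Hypothetical sketch inferred from the test assertions above; the real
// definition is in eval-history.ts (not shown in this diff).
interface EvalHistoryRecord {
  timestamp: string; // ISO-8601, produced by new Date().toISOString()
  eval_name: string; // suite name from the evalite() call
  score: number;     // the suite's averageScore, on a 0-1 scale
  run_count: number; // 1 on the first recorded run, incrementing per run
}

// One JSONL line per suite per run (values illustrative):
// {"timestamp":"2025-12-25T12:00:00.000Z","eval_name":"Example Eval","score":0.92,"run_count":1}
```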
package/src/eval-runner.ts (new file)
@@ -0,0 +1,402 @@
+ /**
+  * Programmatic Evalite Runner
+  *
+  * Provides a type-safe API for running evalite evals programmatically.
+  * Wraps evalite's runEvalite function with structured result parsing.
+  *
+  * @module eval-runner
+  */
+
+ import { tool } from "@opencode-ai/plugin";
+ import { runEvalite } from "evalite/runner";
+ import { createInMemoryStorage } from "evalite/in-memory-storage";
+ import type { Evalite } from "evalite/types";
+ import fs from "node:fs/promises";
+ import path from "node:path";
+ import { recordEvalRun, getScoreHistory } from "./eval-history.js";
+ import { checkGate } from "./eval-gates.js";
+ import { learnFromEvalFailure } from "./eval-learning.js";
+ import { getMemoryAdapter } from "./memory-tools.js";
+
+ /**
+  * Options for running evals programmatically
+  */
+ export interface RunEvalsOptions {
+   /**
+    * Working directory containing eval files (defaults to process.cwd())
+    */
+   cwd?: string;
+
+   /**
+    * Optional filter to run specific eval suites (e.g., "coordinator", "compaction")
+    * Matches against eval file paths using substring matching
+    */
+   suiteFilter?: string;
+
+   /**
+    * Minimum average score threshold (0-100)
+    * If average score falls below this, result.success will be false
+    */
+   scoreThreshold?: number;
+
+   /**
+    * Optional path to write raw evalite JSON output
+    */
+   outputPath?: string;
+ }
+
+ /**
+  * Structured suite result with scores
+  */
+ export interface SuiteResult {
+   /** Suite name from evalite() call */
+   name: string;
+
+   /** Absolute path to eval file */
+   filepath: string;
+
+   /** Suite status: success, fail, or running */
+   status: "success" | "fail" | "running";
+
+   /** Total duration in milliseconds */
+   duration: number;
+
+   /** Average score across all evals in suite (0-1 scale) */
+   averageScore: number;
+
+   /** Number of evals in this suite */
+   evalCount: number;
+
+   /** Individual eval results (optional, can be large) */
+   evals?: Array<{
+     input: unknown;
+     output: unknown;
+     expected?: unknown;
+     scores: Array<{
+       name: string;
+       score: number;
+       description?: string;
+     }>;
+   }>;
+ }
+
+ /**
+  * Structured result from running evals
+  */
+ export interface RunEvalsResult {
+   /** Whether the run succeeded (all evals passed threshold) */
+   success: boolean;
+
+   /** Total number of suites executed */
+   totalSuites: number;
+
+   /** Total number of individual evals executed */
+   totalEvals: number;
+
+   /** Average score across all suites (0-1 scale) */
+   averageScore: number;
+
+   /** Individual suite results */
+   suites: SuiteResult[];
+
+   /** Error message if run failed */
+   error?: string;
+
+   /** Gate check results per suite */
+   gateResults?: Array<{
+     suite: string;
+     passed: boolean;
+     phase: string;
+     message: string;
+     baseline?: number;
+     currentScore: number;
+     regressionPercent?: number;
+   }>;
+ }
+
+ /**
+  * Run evalite evals programmatically
+  *
+  * @param options - Configuration for eval run
+  * @returns Structured results with scores per suite
+  *
+  * @example
+  * ```typescript
+  * // Run all evals
+  * const result = await runEvals({ cwd: "/path/to/project" });
+  * console.log(`Average score: ${result.averageScore}`);
+  *
+  * // Run specific suite
+  * const coordResult = await runEvals({
+  *   cwd: "/path/to/project",
+  *   suiteFilter: "coordinator"
+  * });
+  *
+  * // Enforce score threshold
+  * const gatedResult = await runEvals({
+  *   cwd: "/path/to/project",
+  *   scoreThreshold: 80
+  * });
+  * if (!gatedResult.success) {
+  *   throw new Error(`Evals failed threshold: ${gatedResult.averageScore}`);
+  * }
+  * ```
+  */
+ export async function runEvals(
+   options: RunEvalsOptions = {}
+ ): Promise<RunEvalsResult> {
+   const {
+     cwd = process.cwd(),
+     suiteFilter,
+     scoreThreshold,
+     outputPath: userOutputPath,
+   } = options;
+
+   try {
+     // Resolve to project root (evals are in evals/ relative to project root)
+     // If cwd is src/, go up one level
+     const projectRoot = cwd.endsWith("src") ? path.dirname(cwd) : cwd;
+     const evalsDir = path.join(projectRoot, "evals");
+     let evalPath: string | undefined;
+
+     if (suiteFilter) {
+       // Find matching eval files
+       try {
+         const files = await fs.readdir(evalsDir);
+         const matchingFiles = files.filter((f) =>
+           f.toLowerCase().includes(suiteFilter.toLowerCase())
+         );
+
+         if (matchingFiles.length === 0) {
+           // No matches - return empty result (not an error)
+           return {
+             success: true,
+             totalSuites: 0,
+             totalEvals: 0,
+             averageScore: 0,
+             suites: [],
+           };
+         }
+
+         // Use first matching file (evalite will discover all via vitest)
+         evalPath = path.join(evalsDir, matchingFiles[0]);
+       } catch (err) {
+         // Directory doesn't exist or can't be read
+         return {
+           success: false,
+           totalSuites: 0,
+           totalEvals: 0,
+           averageScore: 0,
+           suites: [],
+           error: `Failed to read evals directory: ${err instanceof Error ? err.message : String(err)}`,
+         };
+       }
+     } else {
+       // No filter - run all evals in evals/
+       evalPath = evalsDir;
+     }
+
+     // Use temporary output path if user didn't provide one
+     const outputPath =
+       userOutputPath || path.join(projectRoot, `.evalite-results-${Date.now()}.json`);
+     const isTemporaryOutput = !userOutputPath;
+
+     // Run evalite programmatically
+     const storage = createInMemoryStorage();
+
+     await runEvalite({
+       path: evalPath, // undefined = run all
+       cwd: projectRoot, // Use project root as working directory
+       mode: "run-once",
+       scoreThreshold,
+       outputPath,
+       hideTable: true, // Suppress terminal output
+       storage,
+       disableServer: true, // No UI server needed
+     });
+
+     // Parse output file for structured results
+     let outputJson: string;
+     try {
+       outputJson = await fs.readFile(outputPath, "utf-8");
+     } catch (err) {
+       // Output file wasn't written - evalite crashed or no tests ran
+       return {
+         success: false,
+         totalSuites: 0,
+         totalEvals: 0,
+         averageScore: 0,
+         suites: [],
+         error: `No results file generated: ${err instanceof Error ? err.message : String(err)}`,
+       };
+     }
+
+     const output: Evalite.Exported.Output = JSON.parse(outputJson);
+
+     // Clean up temporary output file
+     if (isTemporaryOutput) {
+       await fs.unlink(outputPath).catch(() => {
+         /* ignore cleanup errors */
+       });
+     }
+
+     // Transform to structured result
+     const suites: SuiteResult[] = output.suites.map((suite) => ({
+       name: suite.name,
+       filepath: suite.filepath,
+       status: suite.status,
+       duration: suite.duration,
+       averageScore: suite.averageScore,
+       evalCount: suite.evals.length,
+       // Include evals if user wants detailed results
+       evals: suite.evals.map((e) => ({
+         input: e.input,
+         output: e.output,
+         expected: e.expected,
+         scores: e.scores.map((s) => ({
+           name: s.name,
+           score: s.score,
+           description: s.description,
+         })),
+       })),
+     }));
+
+     // Record eval runs to history
+     for (const suite of suites) {
+       const history = getScoreHistory(projectRoot, suite.name);
+       recordEvalRun(projectRoot, {
+         timestamp: new Date().toISOString(),
+         eval_name: suite.name,
+         score: suite.averageScore,
+         run_count: history.length + 1,
+       });
+     }
+
+     // Check gates for each suite
+     const gateResults = [];
+     for (const suite of suites) {
+       const history = getScoreHistory(projectRoot, suite.name);
+       const gate = checkGate(projectRoot, suite.name, suite.averageScore);
+       gateResults.push({ suite: suite.name, ...gate });
+
+       // If gate failed, trigger learning
+       if (!gate.passed) {
+         try {
+           const memoryAdapter = await getMemoryAdapter();
+           await learnFromEvalFailure(suite.name, suite.averageScore, history, memoryAdapter);
+         } catch (e) {
+           // Learning is best-effort, don't fail the eval run
+           console.warn(`Failed to store learning for ${suite.name}:`, e);
+         }
+       }
+     }
+
+     // Calculate overall metrics
+     const totalEvals = suites.reduce((sum, s) => sum + s.evalCount, 0);
+     const averageScore =
+       suites.length > 0
+         ? suites.reduce((sum, s) => sum + s.averageScore, 0) / suites.length
+         : 0;
+
+     // Determine success based on threshold
+     const thresholdPassed =
+       scoreThreshold === undefined || averageScore * 100 >= scoreThreshold;
+
+     return {
+       success: thresholdPassed,
+       totalSuites: suites.length,
+       totalEvals,
+       averageScore,
+       suites,
+       gateResults,
+     };
+   } catch (error) {
+     // Return error result
+     return {
+       success: false,
+       totalSuites: 0,
+       totalEvals: 0,
+       averageScore: 0,
+       suites: [],
+       error: error instanceof Error ? error.message : String(error),
+     };
+   }
+ }
+
+ // ============================================================================
+ // Plugin Tool
+ // ============================================================================
+
+ /**
+  * Plugin tool for running evals programmatically
+  */
+ const eval_run = tool({
+   description: `Run evalite evals programmatically and get structured results with scores.
+
+ Use this to:
+ - Run all evals in evals/ directory
+ - Filter by specific eval suite (e.g., "coordinator", "compaction")
+ - Enforce score thresholds for quality gates
+ - Get per-suite and per-eval scores
+
+ Returns structured JSON with:
+ - success: boolean (true if all tests passed threshold)
+ - totalSuites: number of eval suites run
+ - totalEvals: number of individual test cases
+ - averageScore: 0-1 score across all suites
+ - suites: array of suite results with scores
+
+ Example usage:
+ - Run all evals: eval_run()
+ - Run coordinator evals: eval_run({ suiteFilter: "coordinator" })
+ - Enforce 80% threshold: eval_run({ scoreThreshold: 80 })`,
+
+   args: {
+     suiteFilter: tool.schema
+       .string()
+       .optional()
+       .describe(
+         'Optional filter to run specific eval suite (e.g., "coordinator", "compaction"). Matches against eval file paths using substring matching.'
+       ),
+     scoreThreshold: tool.schema
+       .number()
+       .optional()
+       .describe(
+         "Optional minimum average score threshold (0-100). If average score falls below this, result.success will be false. Useful for CI quality gates."
+       ),
+     includeDetailedResults: tool.schema
+       .boolean()
+       .optional()
+       .describe(
+         "Include individual eval results with input/output/scores in response. Set to false (default) for summary only to save token usage."
+       ),
+   },
+
+   execute: async (args) => {
+     const result = await runEvals({
+       cwd: process.cwd(),
+       suiteFilter: args.suiteFilter as string | undefined,
+       scoreThreshold: args.scoreThreshold as number | undefined,
+     });
+
+     // Remove detailed evals if not requested (saves tokens)
+     const includeDetails = args.includeDetailedResults === true;
+     if (!includeDetails) {
+       for (const suite of result.suites) {
+         delete suite.evals;
+       }
+     }
+
+     return JSON.stringify(result, null, 2);
+   },
+ });
+
+ /**
+  * All eval tools exported for registration
+  */
+ export const evalTools = {
+   eval_run,
+ } as const;
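Taken together, `runEvals` plus the per-suite gate checks can serve as a CI quality gate. A minimal usage sketch, assuming top-level await (Bun/ESM) and the same relative import the tests above use:

```typescript
import { runEvals } from "./eval-runner";

// Run one suite with an 80% threshold; exit non-zero if the threshold
// or any per-suite gate fails, so CI can block the merge.
const result = await runEvals({ suiteFilter: "example", scoreThreshold: 80 });

for (const gate of result.gateResults ?? []) {
  if (!gate.passed) {
    console.error(`Gate failed for ${gate.suite} (${gate.phase}): ${gate.message}`);
  }
}

if (!result.success || result.gateResults?.some((g) => !g.passed)) {
  process.exit(1);
}
```

Note the unit mismatch the code handles internally: suite scores are on a 0-1 scale, while `scoreThreshold` is expressed as 0-100, hence the `averageScore * 100 >= scoreThreshold` comparison in `runEvals`.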