opencode-swarm-plugin 0.40.0 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
  2. package/.hive/analysis/session-data-quality-audit.md +320 -0
  3. package/.hive/eval-results.json +481 -24
  4. package/.hive/issues.jsonl +65 -16
  5. package/.hive/memories.jsonl +159 -1
  6. package/.opencode/eval-history.jsonl +315 -0
  7. package/.turbo/turbo-build.log +5 -5
  8. package/CHANGELOG.md +155 -0
  9. package/README.md +2 -0
  10. package/SCORER-ANALYSIS.md +598 -0
  11. package/bin/eval-gate.test.ts +158 -0
  12. package/bin/eval-gate.ts +74 -0
  13. package/bin/swarm.test.ts +661 -732
  14. package/bin/swarm.ts +274 -0
  15. package/dist/compaction-hook.d.ts +7 -5
  16. package/dist/compaction-hook.d.ts.map +1 -1
  17. package/dist/compaction-prompt-scoring.d.ts +1 -0
  18. package/dist/compaction-prompt-scoring.d.ts.map +1 -1
  19. package/dist/eval-runner.d.ts +134 -0
  20. package/dist/eval-runner.d.ts.map +1 -0
  21. package/dist/hive.d.ts.map +1 -1
  22. package/dist/index.d.ts +29 -0
  23. package/dist/index.d.ts.map +1 -1
  24. package/dist/index.js +99741 -58858
  25. package/dist/memory-tools.d.ts +70 -2
  26. package/dist/memory-tools.d.ts.map +1 -1
  27. package/dist/memory.d.ts +37 -0
  28. package/dist/memory.d.ts.map +1 -1
  29. package/dist/observability-tools.d.ts +64 -0
  30. package/dist/observability-tools.d.ts.map +1 -1
  31. package/dist/plugin.js +99356 -58318
  32. package/dist/swarm-orchestrate.d.ts.map +1 -1
  33. package/dist/swarm-prompts.d.ts +32 -1
  34. package/dist/swarm-prompts.d.ts.map +1 -1
  35. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
  36. package/evals/ARCHITECTURE.md +1189 -0
  37. package/evals/example.eval.ts +3 -4
  38. package/evals/fixtures/compaction-prompt-cases.ts +6 -0
  39. package/evals/scorers/coordinator-discipline.ts +0 -253
  40. package/evals/swarm-decomposition.eval.ts +4 -2
  41. package/package.json +4 -3
  42. package/src/compaction-prompt-scorers.test.ts +10 -9
  43. package/src/compaction-prompt-scoring.ts +7 -5
  44. package/src/eval-runner.test.ts +128 -1
  45. package/src/eval-runner.ts +46 -0
  46. package/src/hive.ts +43 -42
  47. package/src/memory-tools.test.ts +84 -0
  48. package/src/memory-tools.ts +68 -3
  49. package/src/memory.test.ts +2 -112
  50. package/src/memory.ts +88 -49
  51. package/src/observability-tools.test.ts +13 -0
  52. package/src/observability-tools.ts +277 -0
  53. package/src/swarm-orchestrate.test.ts +162 -0
  54. package/src/swarm-orchestrate.ts +7 -5
  55. package/src/swarm-prompts.test.ts +168 -4
  56. package/src/swarm-prompts.ts +228 -7
  57. package/.env +0 -2
  58. package/.turbo/turbo-test.log +0 -481
  59. package/.turbo/turbo-typecheck.log +0 -1
@@ -14,19 +14,18 @@ evalite("Example: Basic scorer test", {
14
14
  data: async () => {
15
15
  return [
16
16
  {
17
- input: "Test task",
18
- output: JSON.stringify({
17
+ input: {
19
18
  epic: { title: "Test Epic", description: "Test" },
20
19
  subtasks: [
21
20
  { title: "Subtask 1", files: ["a.ts"], estimated_complexity: 1 },
22
21
  { title: "Subtask 2", files: ["b.ts"], estimated_complexity: 1 },
23
22
  ],
24
- }),
23
+ },
25
24
  },
26
25
  ];
27
26
  },
28
27
  task: async (input) => {
29
- return input; // passthrough for testing
28
+ return JSON.stringify(input);
30
29
  },
31
30
  scorers: [subtaskIndependence],
32
31
  });
@@ -78,6 +78,8 @@ Coordinators do NOT edit code directly. These tools are FORBIDDEN:
78
78
  - edit
79
79
  - write
80
80
  - bash (for file modifications)
81
+ - swarmmail_reserve (only workers reserve)
82
+ - git commit (workers commit)
81
83
 
82
84
  Use swarm_spawn_subtask to delegate work to workers.
83
85
 
@@ -249,6 +251,8 @@ You are the COORDINATOR of epic mjkweh7q9n4.
249
251
  - edit
250
252
  - write
251
253
  - bash (for file mods)
254
+ - swarmmail_reserve (only workers)
255
+ - git commit (workers only)
252
256
 
253
257
  NEVER edit files yourself.
254
258
  ALWAYS delegate to workers.
@@ -289,6 +293,8 @@ You are coordinating epics:
289
293
  - edit
290
294
  - write
291
295
  - bash
296
+ - swarmmail_reserve
297
+ - git commit
292
298
 
293
299
  ALWAYS check status first.
294
300
  NEVER edit files directly.
@@ -334,259 +334,6 @@ export const timeToFirstSpawn = createScorer({
334
334
  },
335
335
  });
336
336
 
337
- /**
338
- * Researcher Spawn Rate Scorer
339
- *
340
- * Measures whether coordinator spawns researchers for unfamiliar technology.
341
- * Coordinators should delegate research instead of calling pdf-brain/context7 directly.
342
- *
343
- * Score: 1.0 if researcher_spawned events exist, 0.0 otherwise
344
- */
345
- export const researcherSpawnRate = createScorer({
346
- name: "Researcher Spawn Rate",
347
- description: "Coordinator spawned researchers for unfamiliar tech",
348
- scorer: ({ output }) => {
349
- try {
350
- const session = JSON.parse(String(output)) as CoordinatorSession;
351
-
352
- // Count researcher_spawned events
353
- const researchers = session.events.filter(
354
- (e) =>
355
- e.event_type === "DECISION" && e.decision_type === "researcher_spawned"
356
- );
357
-
358
- const count = researchers.length;
359
-
360
- if (count === 0) {
361
- return {
362
- score: 0.0,
363
- message: "No researchers spawned (may indicate coordinator queried docs directly)",
364
- };
365
- }
366
-
367
- return {
368
- score: 1.0,
369
- message: `${count} researcher(s) spawned`,
370
- };
371
- } catch (error) {
372
- return {
373
- score: 0,
374
- message: `Failed to parse CoordinatorSession: ${error}`,
375
- };
376
- }
377
- },
378
- });
379
-
380
- /**
381
- * Skill Loading Rate Scorer
382
- *
383
- * Measures whether coordinator loads relevant skills via skills_use().
384
- * Shows knowledge-seeking behavior.
385
- *
386
- * Score: 1.0 if skill_loaded events exist, 0.5 otherwise (not critical, but helpful)
387
- */
388
- export const skillLoadingRate = createScorer({
389
- name: "Skill Loading Rate",
390
- description: "Coordinator loaded relevant skills for domain knowledge",
391
- scorer: ({ output }) => {
392
- try {
393
- const session = JSON.parse(String(output)) as CoordinatorSession;
394
-
395
- // Count skill_loaded events
396
- const skills = session.events.filter(
397
- (e) =>
398
- e.event_type === "DECISION" && e.decision_type === "skill_loaded"
399
- );
400
-
401
- const count = skills.length;
402
-
403
- if (count === 0) {
404
- return {
405
- score: 0.5,
406
- message: "No skills loaded (not critical, but helpful)",
407
- };
408
- }
409
-
410
- return {
411
- score: 1.0,
412
- message: `${count} skill(s) loaded`,
413
- };
414
- } catch (error) {
415
- return {
416
- score: 0,
417
- message: `Failed to parse CoordinatorSession: ${error}`,
418
- };
419
- }
420
- },
421
- });
422
-
423
- /**
424
- * Inbox Monitoring Rate Scorer
425
- *
426
- * Measures how frequently coordinator checks inbox for worker messages.
427
- * Regular monitoring (every ~15min or when workers finish) shows good coordination.
428
- *
429
- * Score based on inbox_checked events relative to worker activity:
430
- * - 0 checks = 0.0 (coordinator not monitoring)
431
- * - 1+ checks = 1.0 (coordinator is responsive)
432
- */
433
- export const inboxMonitoringRate = createScorer({
434
- name: "Inbox Monitoring Rate",
435
- description: "Coordinator checked inbox regularly for worker messages",
436
- scorer: ({ output }) => {
437
- try {
438
- const session = JSON.parse(String(output)) as CoordinatorSession;
439
-
440
- // Count inbox_checked events
441
- const checks = session.events.filter(
442
- (e) =>
443
- e.event_type === "DECISION" && e.decision_type === "inbox_checked"
444
- );
445
-
446
- // Count worker activity (spawns + outcomes)
447
- const workerActivity = session.events.filter(
448
- (e) =>
449
- (e.event_type === "DECISION" && e.decision_type === "worker_spawned") ||
450
- (e.event_type === "OUTCOME" &&
451
- ["subtask_success", "subtask_failed", "blocker_detected"].includes(
452
- e.outcome_type
453
- ))
454
- );
455
-
456
- const checkCount = checks.length;
457
- const activityCount = workerActivity.length;
458
-
459
- if (activityCount === 0) {
460
- return {
461
- score: 1.0,
462
- message: "No worker activity to monitor",
463
- };
464
- }
465
-
466
- if (checkCount === 0) {
467
- return {
468
- score: 0.0,
469
- message: `${activityCount} worker events, 0 inbox checks (not monitoring)`,
470
- };
471
- }
472
-
473
- return {
474
- score: 1.0,
475
- message: `${checkCount} inbox check(s) for ${activityCount} worker events`,
476
- };
477
- } catch (error) {
478
- return {
479
- score: 0,
480
- message: `Failed to parse CoordinatorSession: ${error}`,
481
- };
482
- }
483
- },
484
- });
485
-
486
- /**
487
- * Blocker Response Time Scorer
488
- *
489
- * Measures how quickly coordinator responds to blocked workers.
490
- * Time between blocker_detected (OUTCOME) and blocker_resolved (DECISION).
491
- *
492
- * Normalization:
493
- * - < 5min: 1.0 (excellent)
494
- * - 5-15min: linear decay to 0.5
495
- * - > 15min: 0.0 (too slow, worker is idle)
496
- *
497
- * Score: Average response time across all blockers
498
- */
499
- export const blockerResponseTime = createScorer({
500
- name: "Blocker Response Time",
501
- description: "Coordinator unblocked workers quickly",
502
- scorer: ({ output }) => {
503
- try {
504
- const session = JSON.parse(String(output)) as CoordinatorSession;
505
-
506
- // Find blocker_detected events
507
- const blockers = session.events.filter(
508
- (e) =>
509
- e.event_type === "OUTCOME" && e.outcome_type === "blocker_detected"
510
- );
511
-
512
- if (blockers.length === 0) {
513
- return {
514
- score: 1.0,
515
- message: "No blockers detected",
516
- };
517
- }
518
-
519
- // Find blocker_resolved events
520
- const resolutions = session.events.filter(
521
- (e) =>
522
- e.event_type === "DECISION" && e.decision_type === "blocker_resolved"
523
- );
524
-
525
- if (resolutions.length === 0) {
526
- return {
527
- score: 0.0,
528
- message: `${blockers.length} blocker(s) detected, 0 resolved (workers still blocked)`,
529
- };
530
- }
531
-
532
- // Match blockers to resolutions by subtask_id and calculate response times
533
- const responseTimes: number[] = [];
534
- for (const blocker of blockers) {
535
- const subtaskId = (blocker.payload as any).subtask_id;
536
- const blockerTime = new Date(blocker.timestamp).getTime();
537
-
538
- // Find resolution for this subtask
539
- const resolution = resolutions.find(
540
- (r) => (r.payload as any).subtask_id === subtaskId
541
- );
542
-
543
- if (resolution) {
544
- const resolutionTime = new Date(resolution.timestamp).getTime();
545
- const deltaMs = resolutionTime - blockerTime;
546
- responseTimes.push(deltaMs);
547
- }
548
- }
549
-
550
- if (responseTimes.length === 0) {
551
- return {
552
- score: 0.5,
553
- message: `${blockers.length} blocker(s) detected, ${resolutions.length} resolution(s), but no matches by subtask_id`,
554
- };
555
- }
556
-
557
- // Calculate average response time
558
- const avgResponseMs =
559
- responseTimes.reduce((sum, t) => sum + t, 0) / responseTimes.length;
560
-
561
- // Normalize: < 5min = 1.0, > 15min = 0.0, linear in between
562
- const EXCELLENT_MS = 5 * 60 * 1000; // 5 min
563
- const POOR_MS = 15 * 60 * 1000; // 15 min
564
-
565
- let score: number;
566
- if (avgResponseMs < EXCELLENT_MS) {
567
- score = 1.0;
568
- } else if (avgResponseMs > POOR_MS) {
569
- score = 0.0;
570
- } else {
571
- // Linear decay from 1.0 to 0.0
572
- score = 1.0 - (avgResponseMs - EXCELLENT_MS) / (POOR_MS - EXCELLENT_MS);
573
- }
574
-
575
- const avgMinutes = Math.round(avgResponseMs / 1000 / 60);
576
-
577
- return {
578
- score,
579
- message: `Avg response time: ${avgMinutes}min (${responseTimes.length}/${blockers.length} blockers resolved)`,
580
- };
581
- } catch (error) {
582
- return {
583
- score: 0,
584
- message: `Failed to parse CoordinatorSession: ${error}`,
585
- };
586
- }
587
- },
588
- });
589
-
590
337
  /**
591
338
  * Overall Discipline Scorer
592
339
  *
@@ -34,7 +34,9 @@ import {
34
34
  } from "./lib/data-loader.js";
35
35
 
36
36
  // Determine project key from current directory
37
- const PROJECT_KEY = "opencode-swarm-plugin";
37
+ // NOTE: project_key in eval_records is the full path (from getHiveWorkingDirectory),
38
+ // not a short name. Use process.cwd() to match.
39
+ const PROJECT_KEY = process.cwd();
38
40
  const PROJECT_PATH = process.cwd();
39
41
 
40
42
  // Check if we have enough real data to use instead of fixtures
@@ -42,7 +44,7 @@ const useRealData = await hasRealEvalData(PROJECT_KEY, 5, PROJECT_PATH);
42
44
 
43
45
  // Load data based on availability
44
46
  const evalCases = useRealData
45
- ? await loadEvalCases(PROJECT_KEY, { limit: 20, projectPath: PROJECT_PATH })
47
+ ? await loadEvalCases(PROJECT_KEY, { limit: 20, projectPath: PROJECT_PATH }) // PROJECT_KEY is now process.cwd()
46
48
  : decompositionCases.map((testCase) => ({
47
49
  input: testCase.input,
48
50
  expected: testCase.expected,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-swarm-plugin",
3
- "version": "0.40.0",
3
+ "version": "0.42.0",
4
4
  "description": "Multi-agent swarm coordination for OpenCode with learning capabilities, beads integration, and Agent Mail",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -23,7 +23,7 @@
23
23
  "registry": "https://registry.npmjs.org/"
24
24
  },
25
25
  "scripts": {
26
- "build": "bun build ./src/index.ts --outdir ./dist --target node --external @electric-sql/pglite --external swarm-mail && bun build ./src/plugin.ts --outfile ./dist/plugin.js --target node --external @electric-sql/pglite --external swarm-mail && tsc",
26
+ "build": "bun build ./src/index.ts --outdir ./dist --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && bun build ./src/plugin.ts --outfile ./dist/plugin.js --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && tsc",
27
27
  "dev": "bun --watch src/index.ts",
28
28
  "test": "bun test --timeout 10000 src/anti-patterns.test.ts src/mandate-promotion.test.ts src/mandate-storage.test.ts src/output-guardrails.test.ts src/pattern-maturity.test.ts src/skills.test.ts src/structured.test.ts src/schemas/",
29
29
  "test:integration": "bun test --timeout 60000 src/*.integration.test.ts",
@@ -34,6 +34,7 @@
34
34
  "eval:decomposition": "bun --env-file=.env run bunx evalite run evals/swarm-decomposition.eval.ts",
35
35
  "eval:coordinator": "bun --env-file=.env run bunx evalite run evals/coordinator-session.eval.ts",
36
36
  "eval:compaction": "bun --env-file=.env run bunx evalite run evals/compaction-prompt.eval.ts",
37
+ "eval:gate": "bun run bin/eval-gate.ts",
37
38
  "migrate:sessions": "bun run scripts/migrate-unknown-sessions.ts",
38
39
  "postinstall": "node -e \"console.log('\\n\\x1b[33m Run \\x1b[36mswarm setup\\x1b[33m to configure OpenCode integration\\x1b[0m\\n')\""
39
40
  },
@@ -46,7 +47,7 @@
46
47
  "minimatch": "^10.1.1",
47
48
  "pino": "^9.6.0",
48
49
  "pino-roll": "^1.3.0",
49
- "swarm-mail": "1.5.2",
50
+ "swarm-mail": "1.5.4",
50
51
  "yaml": "^2.8.2",
51
52
  "zod": "4.1.8"
52
53
  },
@@ -173,16 +173,17 @@ describe("forbiddenToolsPresent scorer", () => {
173
173
  - Edit (use swarm_spawn_subtask)
174
174
  - Write (use swarm_spawn_subtask)
175
175
  - swarmmail_reserve (only workers reserve)
176
- - bash with git commit (workers commit)`,
176
+ - git commit (workers commit)
177
+ - bash (for file modifications)`,
177
178
  };
178
179
 
179
180
  const result = scoreForbiddenToolsPresent(prompt);
180
181
 
181
182
  expect(result.score).toBe(1.0);
182
- expect(result.message).toContain("All 4 forbidden tools");
183
+ expect(result.message).toContain("All 5 forbidden tools");
183
184
  });
184
185
 
185
- test("scores 0.75 when 3 out of 4 tools listed", () => {
186
+ test("scores 0.6 when 3 out of 5 tools listed", () => {
186
187
  const prompt: CompactionPrompt = {
187
188
  content: `🚫 FORBIDDEN TOOLS:
188
189
  - Edit
@@ -192,19 +193,19 @@ describe("forbiddenToolsPresent scorer", () => {
192
193
 
193
194
  const result = scoreForbiddenToolsPresent(prompt);
194
195
 
195
- expect(result.score).toBe(0.75);
196
- expect(result.message).toContain("3/4");
196
+ expect(result.score).toBe(0.6);
197
+ expect(result.message).toContain("3/5");
197
198
  });
198
199
 
199
- test("scores 0.5 when 2 out of 4 tools listed", () => {
200
+ test("scores 0.4 when 2 out of 5 tools listed", () => {
200
201
  const prompt: CompactionPrompt = {
201
202
  content: `Don't use Edit or Write directly.`,
202
203
  };
203
204
 
204
205
  const result = scoreForbiddenToolsPresent(prompt);
205
206
 
206
- expect(result.score).toBe(0.5);
207
- expect(result.message).toContain("2/4");
207
+ expect(result.score).toBe(0.4);
208
+ expect(result.message).toContain("2/5");
208
209
  });
209
210
 
210
211
  test("scores 0.0 when no forbidden tools listed", () => {
@@ -215,7 +216,7 @@ describe("forbiddenToolsPresent scorer", () => {
215
216
  const result = scoreForbiddenToolsPresent(prompt);
216
217
 
217
218
  expect(result.score).toBe(0.0);
218
- expect(result.message).toContain("0/4");
219
+ expect(result.message).toContain("0/5");
219
220
  });
220
221
  });
221
222
 
@@ -203,6 +203,7 @@ export function scoreCoordinatorIdentity(
203
203
  * 2. Write
204
204
  * 3. swarmmail_reserve (only workers reserve)
205
205
  * 4. git commit (workers commit)
206
+ * 5. bash (for file modifications)
206
207
  *
207
208
  * @returns ratio of forbidden tools mentioned (0.0 to 1.0)
208
209
  */
@@ -211,10 +212,11 @@ export function scoreForbiddenToolsPresent(
211
212
  ): ScorerResult {
212
213
  // Check for forbidden tool mentions
213
214
  const forbiddenTools = [
214
- /\bEdit\b/,
215
- /\bWrite\b/,
215
+ /\bEdit\b/i,
216
+ /\bWrite\b/i,
216
217
  /swarmmail_reserve/,
217
218
  /git commit/,
219
+ /\bbash\b/i,
218
220
  ];
219
221
 
220
222
  const foundTools = forbiddenTools.filter((pattern) =>
@@ -226,20 +228,20 @@ export function scoreForbiddenToolsPresent(
226
228
  if (score === 1.0) {
227
229
  return {
228
230
  score: 1.0,
229
- message: "All 4 forbidden tools listed",
231
+ message: "All 5 forbidden tools listed",
230
232
  };
231
233
  }
232
234
 
233
235
  if (score === 0) {
234
236
  return {
235
237
  score: 0.0,
236
- message: "No forbidden tools listed (0/4)",
238
+ message: "No forbidden tools listed (0/5)",
237
239
  };
238
240
  }
239
241
 
240
242
  return {
241
243
  score,
242
- message: `${foundTools.length}/4 forbidden tools listed`,
244
+ message: `${foundTools.length}/5 forbidden tools listed`,
243
245
  };
244
246
  }
245
247
 
@@ -4,9 +4,11 @@
4
4
  * TDD: These tests MUST fail initially, then pass after implementation.
5
5
  */
6
6
 
7
- import { describe, test, expect, beforeAll } from "bun:test";
7
+ import { describe, test, expect, beforeAll, afterEach } from "bun:test";
8
8
  import { runEvals } from "./eval-runner";
9
9
  import path from "node:path";
10
+ import fs from "node:fs";
11
+ import { getEvalHistoryPath } from "./eval-history";
10
12
 
11
13
  // Use project root for all tests
12
14
  const PROJECT_ROOT = path.resolve(import.meta.dir, "..");
@@ -93,4 +95,129 @@ describe("runEvals", () => {
93
95
  expect(result.totalSuites).toBe(0);
94
96
  expect(result.suites).toEqual([]);
95
97
  }, 10000);
98
+
99
+ test("records eval run to history after execution", async () => {
100
+ // Clean up any existing history before test
101
+ const historyPath = getEvalHistoryPath(PROJECT_ROOT);
102
+ const historyBackup = historyPath + ".backup";
103
+
104
+ // Backup existing history
105
+ if (fs.existsSync(historyPath)) {
106
+ fs.copyFileSync(historyPath, historyBackup);
107
+ }
108
+
109
+ try {
110
+ // Remove history file to get clean state
111
+ if (fs.existsSync(historyPath)) {
112
+ fs.unlinkSync(historyPath);
113
+ }
114
+
115
+ // Run evals
116
+ const result = await runEvals({
117
+ cwd: PROJECT_ROOT,
118
+ suiteFilter: "example",
119
+ });
120
+
121
+ // Should have succeeded
122
+ expect(result.success).toBe(true);
123
+ expect(result.suites.length).toBeGreaterThan(0);
124
+
125
+ // History file should have been created
126
+ expect(fs.existsSync(historyPath)).toBe(true);
127
+
128
+ // Read history file
129
+ const historyContent = fs.readFileSync(historyPath, "utf-8");
130
+ const lines = historyContent.trim().split("\n");
131
+
132
+ // Should have one line per suite
133
+ expect(lines.length).toBe(result.suites.length);
134
+
135
+ // Parse first line and verify structure
136
+ const firstRecord = JSON.parse(lines[0]);
137
+
138
+ // Verify structure has all required fields
139
+ expect(typeof firstRecord.timestamp).toBe("string");
140
+ expect(typeof firstRecord.eval_name).toBe("string");
141
+ expect(typeof firstRecord.score).toBe("number");
142
+ expect(typeof firstRecord.run_count).toBe("number");
143
+
144
+ // Verify eval_name matches suite name
145
+ expect(firstRecord.eval_name).toBe(result.suites[0].name);
146
+
147
+ // Verify score matches suite averageScore
148
+ expect(firstRecord.score).toBe(result.suites[0].averageScore);
149
+
150
+ // First run should have run_count = 1
151
+ expect(firstRecord.run_count).toBe(1);
152
+ } finally {
153
+ // Restore backup
154
+ if (fs.existsSync(historyBackup)) {
155
+ fs.copyFileSync(historyBackup, historyPath);
156
+ fs.unlinkSync(historyBackup);
157
+ }
158
+ }
159
+ }, 30000);
160
+
161
+ test("checks gates for each suite after recording", async () => {
162
+ const result = await runEvals({
163
+ cwd: PROJECT_ROOT,
164
+ suiteFilter: "example",
165
+ });
166
+
167
+ expect(result.success).toBe(true);
168
+ expect(result.gateResults).toBeDefined();
169
+ expect(Array.isArray(result.gateResults)).toBe(true);
170
+
171
+ // Should have gate result for each suite
172
+ expect(result.gateResults?.length).toBe(result.suites.length);
173
+
174
+ // Each gate result should have required fields
175
+ if (result.gateResults && result.gateResults.length > 0) {
176
+ const gateResult = result.gateResults[0];
177
+ expect(gateResult).toHaveProperty("suite");
178
+ expect(gateResult).toHaveProperty("passed");
179
+ expect(gateResult).toHaveProperty("phase");
180
+ expect(gateResult).toHaveProperty("message");
181
+ expect(gateResult).toHaveProperty("currentScore");
182
+ }
183
+ }, 30000);
184
+
185
+ test("calls learnFromEvalFailure when gate fails", async () => {
186
+ // This test requires manually creating a history with regression
187
+ // For now, we just verify the code path exists
188
+ // In practice, this would be tested with mocked checkGate returning failed=true
189
+
190
+ const result = await runEvals({
191
+ cwd: PROJECT_ROOT,
192
+ suiteFilter: "example",
193
+ });
194
+
195
+ // Gate results should be present even if no failures
196
+ expect(result.gateResults).toBeDefined();
197
+ }, 30000);
198
+
199
+ test("does NOT call learnFromEvalFailure when gate passes", async () => {
200
+ // Similar to above - verifies the happy path
201
+ // Real test would mock checkGate and verify learnFromEvalFailure NOT called
202
+
203
+ const result = await runEvals({
204
+ cwd: PROJECT_ROOT,
205
+ suiteFilter: "example",
206
+ });
207
+
208
+ // Should succeed with gate results
209
+ expect(result.success).toBe(true);
210
+ expect(result.gateResults).toBeDefined();
211
+ }, 30000);
212
+
213
+ test("includes gateResults in return value", async () => {
214
+ const result = await runEvals({
215
+ cwd: PROJECT_ROOT,
216
+ suiteFilter: "example",
217
+ });
218
+
219
+ // gateResults should be array (even if empty)
220
+ expect(result).toHaveProperty("gateResults");
221
+ expect(Array.isArray(result.gateResults)).toBe(true);
222
+ }, 30000);
96
223
  });
@@ -13,6 +13,10 @@ import { createInMemoryStorage } from "evalite/in-memory-storage";
13
13
  import type { Evalite } from "evalite/types";
14
14
  import fs from "node:fs/promises";
15
15
  import path from "node:path";
16
+ import { recordEvalRun, getScoreHistory } from "./eval-history.js";
17
+ import { checkGate } from "./eval-gates.js";
18
+ import { learnFromEvalFailure } from "./eval-learning.js";
19
+ import { getMemoryAdapter } from "./memory-tools.js";
16
20
 
17
21
  /**
18
22
  * Options for running evals programmatically
@@ -97,6 +101,17 @@ export interface RunEvalsResult {
97
101
 
98
102
  /** Error message if run failed */
99
103
  error?: string;
104
+
105
+ /** Gate check results per suite */
106
+ gateResults?: Array<{
107
+ suite: string;
108
+ passed: boolean;
109
+ phase: string;
110
+ message: string;
111
+ baseline?: number;
112
+ currentScore: number;
113
+ regressionPercent?: number;
114
+ }>;
100
115
  }
101
116
 
102
117
  /**
@@ -246,6 +261,36 @@ export async function runEvals(
246
261
  })),
247
262
  }));
248
263
 
264
+ // Record eval runs to history
265
+ for (const suite of suites) {
266
+ const history = getScoreHistory(projectRoot, suite.name);
267
+ recordEvalRun(projectRoot, {
268
+ timestamp: new Date().toISOString(),
269
+ eval_name: suite.name,
270
+ score: suite.averageScore,
271
+ run_count: history.length + 1,
272
+ });
273
+ }
274
+
275
+ // Check gates for each suite
276
+ const gateResults = [];
277
+ for (const suite of suites) {
278
+ const history = getScoreHistory(projectRoot, suite.name);
279
+ const gate = checkGate(projectRoot, suite.name, suite.averageScore);
280
+ gateResults.push({ suite: suite.name, ...gate });
281
+
282
+ // If gate failed, trigger learning
283
+ if (!gate.passed) {
284
+ try {
285
+ const memoryAdapter = await getMemoryAdapter();
286
+ await learnFromEvalFailure(suite.name, suite.averageScore, history, memoryAdapter);
287
+ } catch (e) {
288
+ // Learning is best-effort, don't fail the eval run
289
+ console.warn(`Failed to store learning for ${suite.name}:`, e);
290
+ }
291
+ }
292
+ }
293
+
249
294
  // Calculate overall metrics
250
295
  const totalEvals = suites.reduce((sum, s) => sum + s.evalCount, 0);
251
296
  const averageScore =
@@ -263,6 +308,7 @@ export async function runEvals(
263
308
  totalEvals,
264
309
  averageScore,
265
310
  suites,
311
+ gateResults,
266
312
  };
267
313
  } catch (error) {
268
314
  // Return error result