opencode-swarm-plugin 0.40.0 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
- package/.hive/analysis/session-data-quality-audit.md +320 -0
- package/.hive/eval-results.json +481 -24
- package/.hive/issues.jsonl +65 -16
- package/.hive/memories.jsonl +159 -1
- package/.opencode/eval-history.jsonl +315 -0
- package/.turbo/turbo-build.log +5 -5
- package/CHANGELOG.md +155 -0
- package/README.md +2 -0
- package/SCORER-ANALYSIS.md +598 -0
- package/bin/eval-gate.test.ts +158 -0
- package/bin/eval-gate.ts +74 -0
- package/bin/swarm.test.ts +661 -732
- package/bin/swarm.ts +274 -0
- package/dist/compaction-hook.d.ts +7 -5
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-prompt-scoring.d.ts +1 -0
- package/dist/compaction-prompt-scoring.d.ts.map +1 -1
- package/dist/eval-runner.d.ts +134 -0
- package/dist/eval-runner.d.ts.map +1 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +29 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +99741 -58858
- package/dist/memory-tools.d.ts +70 -2
- package/dist/memory-tools.d.ts.map +1 -1
- package/dist/memory.d.ts +37 -0
- package/dist/memory.d.ts.map +1 -1
- package/dist/observability-tools.d.ts +64 -0
- package/dist/observability-tools.d.ts.map +1 -1
- package/dist/plugin.js +99356 -58318
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +32 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
- package/evals/ARCHITECTURE.md +1189 -0
- package/evals/example.eval.ts +3 -4
- package/evals/fixtures/compaction-prompt-cases.ts +6 -0
- package/evals/scorers/coordinator-discipline.ts +0 -253
- package/evals/swarm-decomposition.eval.ts +4 -2
- package/package.json +4 -3
- package/src/compaction-prompt-scorers.test.ts +10 -9
- package/src/compaction-prompt-scoring.ts +7 -5
- package/src/eval-runner.test.ts +128 -1
- package/src/eval-runner.ts +46 -0
- package/src/hive.ts +43 -42
- package/src/memory-tools.test.ts +84 -0
- package/src/memory-tools.ts +68 -3
- package/src/memory.test.ts +2 -112
- package/src/memory.ts +88 -49
- package/src/observability-tools.test.ts +13 -0
- package/src/observability-tools.ts +277 -0
- package/src/swarm-orchestrate.test.ts +162 -0
- package/src/swarm-orchestrate.ts +7 -5
- package/src/swarm-prompts.test.ts +168 -4
- package/src/swarm-prompts.ts +228 -7
- package/.env +0 -2
- package/.turbo/turbo-test.log +0 -481
- package/.turbo/turbo-typecheck.log +0 -1
package/evals/example.eval.ts
CHANGED
|
@@ -14,19 +14,18 @@ evalite("Example: Basic scorer test", {
|
|
|
14
14
|
data: async () => {
|
|
15
15
|
return [
|
|
16
16
|
{
|
|
17
|
-
input:
|
|
18
|
-
output: JSON.stringify({
|
|
17
|
+
input: {
|
|
19
18
|
epic: { title: "Test Epic", description: "Test" },
|
|
20
19
|
subtasks: [
|
|
21
20
|
{ title: "Subtask 1", files: ["a.ts"], estimated_complexity: 1 },
|
|
22
21
|
{ title: "Subtask 2", files: ["b.ts"], estimated_complexity: 1 },
|
|
23
22
|
],
|
|
24
|
-
}
|
|
23
|
+
},
|
|
25
24
|
},
|
|
26
25
|
];
|
|
27
26
|
},
|
|
28
27
|
task: async (input) => {
|
|
29
|
-
return input;
|
|
28
|
+
return JSON.stringify(input);
|
|
30
29
|
},
|
|
31
30
|
scorers: [subtaskIndependence],
|
|
32
31
|
});
|
|
@@ -78,6 +78,8 @@ Coordinators do NOT edit code directly. These tools are FORBIDDEN:
|
|
|
78
78
|
- edit
|
|
79
79
|
- write
|
|
80
80
|
- bash (for file modifications)
|
|
81
|
+
- swarmmail_reserve (only workers reserve)
|
|
82
|
+
- git commit (workers commit)
|
|
81
83
|
|
|
82
84
|
Use swarm_spawn_subtask to delegate work to workers.
|
|
83
85
|
|
|
@@ -249,6 +251,8 @@ You are the COORDINATOR of epic mjkweh7q9n4.
|
|
|
249
251
|
- edit
|
|
250
252
|
- write
|
|
251
253
|
- bash (for file mods)
|
|
254
|
+
- swarmmail_reserve (only workers)
|
|
255
|
+
- git commit (workers only)
|
|
252
256
|
|
|
253
257
|
NEVER edit files yourself.
|
|
254
258
|
ALWAYS delegate to workers.
|
|
@@ -289,6 +293,8 @@ You are coordinating epics:
|
|
|
289
293
|
- edit
|
|
290
294
|
- write
|
|
291
295
|
- bash
|
|
296
|
+
- swarmmail_reserve
|
|
297
|
+
- git commit
|
|
292
298
|
|
|
293
299
|
ALWAYS check status first.
|
|
294
300
|
NEVER edit files directly.
|
|
@@ -334,259 +334,6 @@ export const timeToFirstSpawn = createScorer({
|
|
|
334
334
|
},
|
|
335
335
|
});
|
|
336
336
|
|
|
337
|
-
/**
|
|
338
|
-
* Researcher Spawn Rate Scorer
|
|
339
|
-
*
|
|
340
|
-
* Measures whether coordinator spawns researchers for unfamiliar technology.
|
|
341
|
-
* Coordinators should delegate research instead of calling pdf-brain/context7 directly.
|
|
342
|
-
*
|
|
343
|
-
* Score: 1.0 if researcher_spawned events exist, 0.0 otherwise
|
|
344
|
-
*/
|
|
345
|
-
export const researcherSpawnRate = createScorer({
|
|
346
|
-
name: "Researcher Spawn Rate",
|
|
347
|
-
description: "Coordinator spawned researchers for unfamiliar tech",
|
|
348
|
-
scorer: ({ output }) => {
|
|
349
|
-
try {
|
|
350
|
-
const session = JSON.parse(String(output)) as CoordinatorSession;
|
|
351
|
-
|
|
352
|
-
// Count researcher_spawned events
|
|
353
|
-
const researchers = session.events.filter(
|
|
354
|
-
(e) =>
|
|
355
|
-
e.event_type === "DECISION" && e.decision_type === "researcher_spawned"
|
|
356
|
-
);
|
|
357
|
-
|
|
358
|
-
const count = researchers.length;
|
|
359
|
-
|
|
360
|
-
if (count === 0) {
|
|
361
|
-
return {
|
|
362
|
-
score: 0.0,
|
|
363
|
-
message: "No researchers spawned (may indicate coordinator queried docs directly)",
|
|
364
|
-
};
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
return {
|
|
368
|
-
score: 1.0,
|
|
369
|
-
message: `${count} researcher(s) spawned`,
|
|
370
|
-
};
|
|
371
|
-
} catch (error) {
|
|
372
|
-
return {
|
|
373
|
-
score: 0,
|
|
374
|
-
message: `Failed to parse CoordinatorSession: ${error}`,
|
|
375
|
-
};
|
|
376
|
-
}
|
|
377
|
-
},
|
|
378
|
-
});
|
|
379
|
-
|
|
380
|
-
/**
|
|
381
|
-
* Skill Loading Rate Scorer
|
|
382
|
-
*
|
|
383
|
-
* Measures whether coordinator loads relevant skills via skills_use().
|
|
384
|
-
* Shows knowledge-seeking behavior.
|
|
385
|
-
*
|
|
386
|
-
* Score: 1.0 if skill_loaded events exist, 0.5 otherwise (not critical, but helpful)
|
|
387
|
-
*/
|
|
388
|
-
export const skillLoadingRate = createScorer({
|
|
389
|
-
name: "Skill Loading Rate",
|
|
390
|
-
description: "Coordinator loaded relevant skills for domain knowledge",
|
|
391
|
-
scorer: ({ output }) => {
|
|
392
|
-
try {
|
|
393
|
-
const session = JSON.parse(String(output)) as CoordinatorSession;
|
|
394
|
-
|
|
395
|
-
// Count skill_loaded events
|
|
396
|
-
const skills = session.events.filter(
|
|
397
|
-
(e) =>
|
|
398
|
-
e.event_type === "DECISION" && e.decision_type === "skill_loaded"
|
|
399
|
-
);
|
|
400
|
-
|
|
401
|
-
const count = skills.length;
|
|
402
|
-
|
|
403
|
-
if (count === 0) {
|
|
404
|
-
return {
|
|
405
|
-
score: 0.5,
|
|
406
|
-
message: "No skills loaded (not critical, but helpful)",
|
|
407
|
-
};
|
|
408
|
-
}
|
|
409
|
-
|
|
410
|
-
return {
|
|
411
|
-
score: 1.0,
|
|
412
|
-
message: `${count} skill(s) loaded`,
|
|
413
|
-
};
|
|
414
|
-
} catch (error) {
|
|
415
|
-
return {
|
|
416
|
-
score: 0,
|
|
417
|
-
message: `Failed to parse CoordinatorSession: ${error}`,
|
|
418
|
-
};
|
|
419
|
-
}
|
|
420
|
-
},
|
|
421
|
-
});
|
|
422
|
-
|
|
423
|
-
/**
|
|
424
|
-
* Inbox Monitoring Rate Scorer
|
|
425
|
-
*
|
|
426
|
-
* Measures how frequently coordinator checks inbox for worker messages.
|
|
427
|
-
* Regular monitoring (every ~15min or when workers finish) shows good coordination.
|
|
428
|
-
*
|
|
429
|
-
* Score based on inbox_checked events relative to worker activity:
|
|
430
|
-
* - 0 checks = 0.0 (coordinator not monitoring)
|
|
431
|
-
* - 1+ checks = 1.0 (coordinator is responsive)
|
|
432
|
-
*/
|
|
433
|
-
export const inboxMonitoringRate = createScorer({
|
|
434
|
-
name: "Inbox Monitoring Rate",
|
|
435
|
-
description: "Coordinator checked inbox regularly for worker messages",
|
|
436
|
-
scorer: ({ output }) => {
|
|
437
|
-
try {
|
|
438
|
-
const session = JSON.parse(String(output)) as CoordinatorSession;
|
|
439
|
-
|
|
440
|
-
// Count inbox_checked events
|
|
441
|
-
const checks = session.events.filter(
|
|
442
|
-
(e) =>
|
|
443
|
-
e.event_type === "DECISION" && e.decision_type === "inbox_checked"
|
|
444
|
-
);
|
|
445
|
-
|
|
446
|
-
// Count worker activity (spawns + outcomes)
|
|
447
|
-
const workerActivity = session.events.filter(
|
|
448
|
-
(e) =>
|
|
449
|
-
(e.event_type === "DECISION" && e.decision_type === "worker_spawned") ||
|
|
450
|
-
(e.event_type === "OUTCOME" &&
|
|
451
|
-
["subtask_success", "subtask_failed", "blocker_detected"].includes(
|
|
452
|
-
e.outcome_type
|
|
453
|
-
))
|
|
454
|
-
);
|
|
455
|
-
|
|
456
|
-
const checkCount = checks.length;
|
|
457
|
-
const activityCount = workerActivity.length;
|
|
458
|
-
|
|
459
|
-
if (activityCount === 0) {
|
|
460
|
-
return {
|
|
461
|
-
score: 1.0,
|
|
462
|
-
message: "No worker activity to monitor",
|
|
463
|
-
};
|
|
464
|
-
}
|
|
465
|
-
|
|
466
|
-
if (checkCount === 0) {
|
|
467
|
-
return {
|
|
468
|
-
score: 0.0,
|
|
469
|
-
message: `${activityCount} worker events, 0 inbox checks (not monitoring)`,
|
|
470
|
-
};
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
return {
|
|
474
|
-
score: 1.0,
|
|
475
|
-
message: `${checkCount} inbox check(s) for ${activityCount} worker events`,
|
|
476
|
-
};
|
|
477
|
-
} catch (error) {
|
|
478
|
-
return {
|
|
479
|
-
score: 0,
|
|
480
|
-
message: `Failed to parse CoordinatorSession: ${error}`,
|
|
481
|
-
};
|
|
482
|
-
}
|
|
483
|
-
},
|
|
484
|
-
});
|
|
485
|
-
|
|
486
|
-
/**
|
|
487
|
-
* Blocker Response Time Scorer
|
|
488
|
-
*
|
|
489
|
-
* Measures how quickly coordinator responds to blocked workers.
|
|
490
|
-
* Time between blocker_detected (OUTCOME) and blocker_resolved (DECISION).
|
|
491
|
-
*
|
|
492
|
-
* Normalization:
|
|
493
|
-
* - < 5min: 1.0 (excellent)
|
|
494
|
-
* - 5-15min: linear decay to 0.5
|
|
495
|
-
* - > 15min: 0.0 (too slow, worker is idle)
|
|
496
|
-
*
|
|
497
|
-
* Score: Average response time across all blockers
|
|
498
|
-
*/
|
|
499
|
-
export const blockerResponseTime = createScorer({
|
|
500
|
-
name: "Blocker Response Time",
|
|
501
|
-
description: "Coordinator unblocked workers quickly",
|
|
502
|
-
scorer: ({ output }) => {
|
|
503
|
-
try {
|
|
504
|
-
const session = JSON.parse(String(output)) as CoordinatorSession;
|
|
505
|
-
|
|
506
|
-
// Find blocker_detected events
|
|
507
|
-
const blockers = session.events.filter(
|
|
508
|
-
(e) =>
|
|
509
|
-
e.event_type === "OUTCOME" && e.outcome_type === "blocker_detected"
|
|
510
|
-
);
|
|
511
|
-
|
|
512
|
-
if (blockers.length === 0) {
|
|
513
|
-
return {
|
|
514
|
-
score: 1.0,
|
|
515
|
-
message: "No blockers detected",
|
|
516
|
-
};
|
|
517
|
-
}
|
|
518
|
-
|
|
519
|
-
// Find blocker_resolved events
|
|
520
|
-
const resolutions = session.events.filter(
|
|
521
|
-
(e) =>
|
|
522
|
-
e.event_type === "DECISION" && e.decision_type === "blocker_resolved"
|
|
523
|
-
);
|
|
524
|
-
|
|
525
|
-
if (resolutions.length === 0) {
|
|
526
|
-
return {
|
|
527
|
-
score: 0.0,
|
|
528
|
-
message: `${blockers.length} blocker(s) detected, 0 resolved (workers still blocked)`,
|
|
529
|
-
};
|
|
530
|
-
}
|
|
531
|
-
|
|
532
|
-
// Match blockers to resolutions by subtask_id and calculate response times
|
|
533
|
-
const responseTimes: number[] = [];
|
|
534
|
-
for (const blocker of blockers) {
|
|
535
|
-
const subtaskId = (blocker.payload as any).subtask_id;
|
|
536
|
-
const blockerTime = new Date(blocker.timestamp).getTime();
|
|
537
|
-
|
|
538
|
-
// Find resolution for this subtask
|
|
539
|
-
const resolution = resolutions.find(
|
|
540
|
-
(r) => (r.payload as any).subtask_id === subtaskId
|
|
541
|
-
);
|
|
542
|
-
|
|
543
|
-
if (resolution) {
|
|
544
|
-
const resolutionTime = new Date(resolution.timestamp).getTime();
|
|
545
|
-
const deltaMs = resolutionTime - blockerTime;
|
|
546
|
-
responseTimes.push(deltaMs);
|
|
547
|
-
}
|
|
548
|
-
}
|
|
549
|
-
|
|
550
|
-
if (responseTimes.length === 0) {
|
|
551
|
-
return {
|
|
552
|
-
score: 0.5,
|
|
553
|
-
message: `${blockers.length} blocker(s) detected, ${resolutions.length} resolution(s), but no matches by subtask_id`,
|
|
554
|
-
};
|
|
555
|
-
}
|
|
556
|
-
|
|
557
|
-
// Calculate average response time
|
|
558
|
-
const avgResponseMs =
|
|
559
|
-
responseTimes.reduce((sum, t) => sum + t, 0) / responseTimes.length;
|
|
560
|
-
|
|
561
|
-
// Normalize: < 5min = 1.0, > 15min = 0.0, linear in between
|
|
562
|
-
const EXCELLENT_MS = 5 * 60 * 1000; // 5 min
|
|
563
|
-
const POOR_MS = 15 * 60 * 1000; // 15 min
|
|
564
|
-
|
|
565
|
-
let score: number;
|
|
566
|
-
if (avgResponseMs < EXCELLENT_MS) {
|
|
567
|
-
score = 1.0;
|
|
568
|
-
} else if (avgResponseMs > POOR_MS) {
|
|
569
|
-
score = 0.0;
|
|
570
|
-
} else {
|
|
571
|
-
// Linear decay from 1.0 to 0.0
|
|
572
|
-
score = 1.0 - (avgResponseMs - EXCELLENT_MS) / (POOR_MS - EXCELLENT_MS);
|
|
573
|
-
}
|
|
574
|
-
|
|
575
|
-
const avgMinutes = Math.round(avgResponseMs / 1000 / 60);
|
|
576
|
-
|
|
577
|
-
return {
|
|
578
|
-
score,
|
|
579
|
-
message: `Avg response time: ${avgMinutes}min (${responseTimes.length}/${blockers.length} blockers resolved)`,
|
|
580
|
-
};
|
|
581
|
-
} catch (error) {
|
|
582
|
-
return {
|
|
583
|
-
score: 0,
|
|
584
|
-
message: `Failed to parse CoordinatorSession: ${error}`,
|
|
585
|
-
};
|
|
586
|
-
}
|
|
587
|
-
},
|
|
588
|
-
});
|
|
589
|
-
|
|
590
337
|
/**
|
|
591
338
|
* Overall Discipline Scorer
|
|
592
339
|
*
|
|
@@ -34,7 +34,9 @@ import {
|
|
|
34
34
|
} from "./lib/data-loader.js";
|
|
35
35
|
|
|
36
36
|
// Determine project key from current directory
|
|
37
|
-
|
|
37
|
+
// NOTE: project_key in eval_records is the full path (from getHiveWorkingDirectory),
|
|
38
|
+
// not a short name. Use process.cwd() to match.
|
|
39
|
+
const PROJECT_KEY = process.cwd();
|
|
38
40
|
const PROJECT_PATH = process.cwd();
|
|
39
41
|
|
|
40
42
|
// Check if we have enough real data to use instead of fixtures
|
|
@@ -42,7 +44,7 @@ const useRealData = await hasRealEvalData(PROJECT_KEY, 5, PROJECT_PATH);
|
|
|
42
44
|
|
|
43
45
|
// Load data based on availability
|
|
44
46
|
const evalCases = useRealData
|
|
45
|
-
? await loadEvalCases(PROJECT_KEY, { limit: 20, projectPath: PROJECT_PATH })
|
|
47
|
+
? await loadEvalCases(PROJECT_KEY, { limit: 20, projectPath: PROJECT_PATH }) // PROJECT_KEY is now process.cwd()
|
|
46
48
|
: decompositionCases.map((testCase) => ({
|
|
47
49
|
input: testCase.input,
|
|
48
50
|
expected: testCase.expected,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "opencode-swarm-plugin",
|
|
3
|
-
"version": "0.40.0",
|
|
3
|
+
"version": "0.42.0",
|
|
4
4
|
"description": "Multi-agent swarm coordination for OpenCode with learning capabilities, beads integration, and Agent Mail",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
"registry": "https://registry.npmjs.org/"
|
|
24
24
|
},
|
|
25
25
|
"scripts": {
|
|
26
|
-
"build": "bun build ./src/index.ts --outdir ./dist --target node --external @electric-sql/pglite --external swarm-mail && bun build ./src/plugin.ts --outfile ./dist/plugin.js --target node --external @electric-sql/pglite --external swarm-mail && tsc",
|
|
26
|
+
"build": "bun build ./src/index.ts --outdir ./dist --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && bun build ./src/plugin.ts --outfile ./dist/plugin.js --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && tsc",
|
|
27
27
|
"dev": "bun --watch src/index.ts",
|
|
28
28
|
"test": "bun test --timeout 10000 src/anti-patterns.test.ts src/mandate-promotion.test.ts src/mandate-storage.test.ts src/output-guardrails.test.ts src/pattern-maturity.test.ts src/skills.test.ts src/structured.test.ts src/schemas/",
|
|
29
29
|
"test:integration": "bun test --timeout 60000 src/*.integration.test.ts",
|
|
@@ -34,6 +34,7 @@
|
|
|
34
34
|
"eval:decomposition": "bun --env-file=.env run bunx evalite run evals/swarm-decomposition.eval.ts",
|
|
35
35
|
"eval:coordinator": "bun --env-file=.env run bunx evalite run evals/coordinator-session.eval.ts",
|
|
36
36
|
"eval:compaction": "bun --env-file=.env run bunx evalite run evals/compaction-prompt.eval.ts",
|
|
37
|
+
"eval:gate": "bun run bin/eval-gate.ts",
|
|
37
38
|
"migrate:sessions": "bun run scripts/migrate-unknown-sessions.ts",
|
|
38
39
|
"postinstall": "node -e \"console.log('\\n\\x1b[33m Run \\x1b[36mswarm setup\\x1b[33m to configure OpenCode integration\\x1b[0m\\n')\""
|
|
39
40
|
},
|
|
@@ -46,7 +47,7 @@
|
|
|
46
47
|
"minimatch": "^10.1.1",
|
|
47
48
|
"pino": "^9.6.0",
|
|
48
49
|
"pino-roll": "^1.3.0",
|
|
49
|
-
"swarm-mail": "1.5.
|
|
50
|
+
"swarm-mail": "1.5.4",
|
|
50
51
|
"yaml": "^2.8.2",
|
|
51
52
|
"zod": "4.1.8"
|
|
52
53
|
},
|
|
@@ -173,16 +173,17 @@ describe("forbiddenToolsPresent scorer", () => {
|
|
|
173
173
|
- Edit (use swarm_spawn_subtask)
|
|
174
174
|
- Write (use swarm_spawn_subtask)
|
|
175
175
|
- swarmmail_reserve (only workers reserve)
|
|
176
|
-
-
|
|
176
|
+
- git commit (workers commit)
|
|
177
|
+
- bash (for file modifications)`,
|
|
177
178
|
};
|
|
178
179
|
|
|
179
180
|
const result = scoreForbiddenToolsPresent(prompt);
|
|
180
181
|
|
|
181
182
|
expect(result.score).toBe(1.0);
|
|
182
|
-
expect(result.message).toContain("All 4 forbidden tools");
|
|
183
|
+
expect(result.message).toContain("All 5 forbidden tools");
|
|
183
184
|
});
|
|
184
185
|
|
|
185
|
-
test("scores 0.75 when 3 out of 4 tools listed", () => {
|
|
186
|
+
test("scores 0.6 when 3 out of 5 tools listed", () => {
|
|
186
187
|
const prompt: CompactionPrompt = {
|
|
187
188
|
content: `🚫 FORBIDDEN TOOLS:
|
|
188
189
|
- Edit
|
|
@@ -192,19 +193,19 @@ describe("forbiddenToolsPresent scorer", () => {
|
|
|
192
193
|
|
|
193
194
|
const result = scoreForbiddenToolsPresent(prompt);
|
|
194
195
|
|
|
195
|
-
expect(result.score).toBe(0.75);
|
|
196
|
-
expect(result.message).toContain("3/4");
|
|
196
|
+
expect(result.score).toBe(0.6);
|
|
197
|
+
expect(result.message).toContain("3/5");
|
|
197
198
|
});
|
|
198
199
|
|
|
199
|
-
test("scores 0.5 when 2 out of 4 tools listed", () => {
|
|
200
|
+
test("scores 0.4 when 2 out of 5 tools listed", () => {
|
|
200
201
|
const prompt: CompactionPrompt = {
|
|
201
202
|
content: `Don't use Edit or Write directly.`,
|
|
202
203
|
};
|
|
203
204
|
|
|
204
205
|
const result = scoreForbiddenToolsPresent(prompt);
|
|
205
206
|
|
|
206
|
-
expect(result.score).toBe(0.5);
|
|
207
|
-
expect(result.message).toContain("2/4");
|
|
207
|
+
expect(result.score).toBe(0.4);
|
|
208
|
+
expect(result.message).toContain("2/5");
|
|
208
209
|
});
|
|
209
210
|
|
|
210
211
|
test("scores 0.0 when no forbidden tools listed", () => {
|
|
@@ -215,7 +216,7 @@ describe("forbiddenToolsPresent scorer", () => {
|
|
|
215
216
|
const result = scoreForbiddenToolsPresent(prompt);
|
|
216
217
|
|
|
217
218
|
expect(result.score).toBe(0.0);
|
|
218
|
-
expect(result.message).toContain("0/4");
|
|
219
|
+
expect(result.message).toContain("0/5");
|
|
219
220
|
});
|
|
220
221
|
});
|
|
221
222
|
|
|
@@ -203,6 +203,7 @@ export function scoreCoordinatorIdentity(
|
|
|
203
203
|
* 2. Write
|
|
204
204
|
* 3. swarmmail_reserve (only workers reserve)
|
|
205
205
|
* 4. git commit (workers commit)
|
|
206
|
+
* 5. bash (for file modifications)
|
|
206
207
|
*
|
|
207
208
|
* @returns ratio of forbidden tools mentioned (0.0 to 1.0)
|
|
208
209
|
*/
|
|
@@ -211,10 +212,11 @@ export function scoreForbiddenToolsPresent(
|
|
|
211
212
|
): ScorerResult {
|
|
212
213
|
// Check for forbidden tool mentions
|
|
213
214
|
const forbiddenTools = [
|
|
214
|
-
/\bEdit\b/,
|
|
215
|
-
/\bWrite\b/,
|
|
215
|
+
/\bEdit\b/i,
|
|
216
|
+
/\bWrite\b/i,
|
|
216
217
|
/swarmmail_reserve/,
|
|
217
218
|
/git commit/,
|
|
219
|
+
/\bbash\b/i,
|
|
218
220
|
];
|
|
219
221
|
|
|
220
222
|
const foundTools = forbiddenTools.filter((pattern) =>
|
|
@@ -226,20 +228,20 @@ export function scoreForbiddenToolsPresent(
|
|
|
226
228
|
if (score === 1.0) {
|
|
227
229
|
return {
|
|
228
230
|
score: 1.0,
|
|
229
|
-
message: "All 4 forbidden tools listed",
|
|
231
|
+
message: "All 5 forbidden tools listed",
|
|
230
232
|
};
|
|
231
233
|
}
|
|
232
234
|
|
|
233
235
|
if (score === 0) {
|
|
234
236
|
return {
|
|
235
237
|
score: 0.0,
|
|
236
|
-
message: "No forbidden tools listed (0/4)",
|
|
238
|
+
message: "No forbidden tools listed (0/5)",
|
|
237
239
|
};
|
|
238
240
|
}
|
|
239
241
|
|
|
240
242
|
return {
|
|
241
243
|
score,
|
|
242
|
-
message: `${foundTools.length}/4 forbidden tools listed`,
|
|
244
|
+
message: `${foundTools.length}/5 forbidden tools listed`,
|
|
243
245
|
};
|
|
244
246
|
}
|
|
245
247
|
|
package/src/eval-runner.test.ts
CHANGED
|
@@ -4,9 +4,11 @@
|
|
|
4
4
|
* TDD: These tests MUST fail initially, then pass after implementation.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
-
import { describe, test, expect, beforeAll } from "bun:test";
|
|
7
|
+
import { describe, test, expect, beforeAll, afterEach } from "bun:test";
|
|
8
8
|
import { runEvals } from "./eval-runner";
|
|
9
9
|
import path from "node:path";
|
|
10
|
+
import fs from "node:fs";
|
|
11
|
+
import { getEvalHistoryPath } from "./eval-history";
|
|
10
12
|
|
|
11
13
|
// Use project root for all tests
|
|
12
14
|
const PROJECT_ROOT = path.resolve(import.meta.dir, "..");
|
|
@@ -93,4 +95,129 @@ describe("runEvals", () => {
|
|
|
93
95
|
expect(result.totalSuites).toBe(0);
|
|
94
96
|
expect(result.suites).toEqual([]);
|
|
95
97
|
}, 10000);
|
|
98
|
+
|
|
99
|
+
test("records eval run to history after execution", async () => {
|
|
100
|
+
// Clean up any existing history before test
|
|
101
|
+
const historyPath = getEvalHistoryPath(PROJECT_ROOT);
|
|
102
|
+
const historyBackup = historyPath + ".backup";
|
|
103
|
+
|
|
104
|
+
// Backup existing history
|
|
105
|
+
if (fs.existsSync(historyPath)) {
|
|
106
|
+
fs.copyFileSync(historyPath, historyBackup);
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
try {
|
|
110
|
+
// Remove history file to get clean state
|
|
111
|
+
if (fs.existsSync(historyPath)) {
|
|
112
|
+
fs.unlinkSync(historyPath);
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
// Run evals
|
|
116
|
+
const result = await runEvals({
|
|
117
|
+
cwd: PROJECT_ROOT,
|
|
118
|
+
suiteFilter: "example",
|
|
119
|
+
});
|
|
120
|
+
|
|
121
|
+
// Should have succeeded
|
|
122
|
+
expect(result.success).toBe(true);
|
|
123
|
+
expect(result.suites.length).toBeGreaterThan(0);
|
|
124
|
+
|
|
125
|
+
// History file should have been created
|
|
126
|
+
expect(fs.existsSync(historyPath)).toBe(true);
|
|
127
|
+
|
|
128
|
+
// Read history file
|
|
129
|
+
const historyContent = fs.readFileSync(historyPath, "utf-8");
|
|
130
|
+
const lines = historyContent.trim().split("\n");
|
|
131
|
+
|
|
132
|
+
// Should have one line per suite
|
|
133
|
+
expect(lines.length).toBe(result.suites.length);
|
|
134
|
+
|
|
135
|
+
// Parse first line and verify structure
|
|
136
|
+
const firstRecord = JSON.parse(lines[0]);
|
|
137
|
+
|
|
138
|
+
// Verify structure has all required fields
|
|
139
|
+
expect(typeof firstRecord.timestamp).toBe("string");
|
|
140
|
+
expect(typeof firstRecord.eval_name).toBe("string");
|
|
141
|
+
expect(typeof firstRecord.score).toBe("number");
|
|
142
|
+
expect(typeof firstRecord.run_count).toBe("number");
|
|
143
|
+
|
|
144
|
+
// Verify eval_name matches suite name
|
|
145
|
+
expect(firstRecord.eval_name).toBe(result.suites[0].name);
|
|
146
|
+
|
|
147
|
+
// Verify score matches suite averageScore
|
|
148
|
+
expect(firstRecord.score).toBe(result.suites[0].averageScore);
|
|
149
|
+
|
|
150
|
+
// First run should have run_count = 1
|
|
151
|
+
expect(firstRecord.run_count).toBe(1);
|
|
152
|
+
} finally {
|
|
153
|
+
// Restore backup
|
|
154
|
+
if (fs.existsSync(historyBackup)) {
|
|
155
|
+
fs.copyFileSync(historyBackup, historyPath);
|
|
156
|
+
fs.unlinkSync(historyBackup);
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
}, 30000);
|
|
160
|
+
|
|
161
|
+
test("checks gates for each suite after recording", async () => {
|
|
162
|
+
const result = await runEvals({
|
|
163
|
+
cwd: PROJECT_ROOT,
|
|
164
|
+
suiteFilter: "example",
|
|
165
|
+
});
|
|
166
|
+
|
|
167
|
+
expect(result.success).toBe(true);
|
|
168
|
+
expect(result.gateResults).toBeDefined();
|
|
169
|
+
expect(Array.isArray(result.gateResults)).toBe(true);
|
|
170
|
+
|
|
171
|
+
// Should have gate result for each suite
|
|
172
|
+
expect(result.gateResults?.length).toBe(result.suites.length);
|
|
173
|
+
|
|
174
|
+
// Each gate result should have required fields
|
|
175
|
+
if (result.gateResults && result.gateResults.length > 0) {
|
|
176
|
+
const gateResult = result.gateResults[0];
|
|
177
|
+
expect(gateResult).toHaveProperty("suite");
|
|
178
|
+
expect(gateResult).toHaveProperty("passed");
|
|
179
|
+
expect(gateResult).toHaveProperty("phase");
|
|
180
|
+
expect(gateResult).toHaveProperty("message");
|
|
181
|
+
expect(gateResult).toHaveProperty("currentScore");
|
|
182
|
+
}
|
|
183
|
+
}, 30000);
|
|
184
|
+
|
|
185
|
+
test("calls learnFromEvalFailure when gate fails", async () => {
|
|
186
|
+
// This test requires manually creating a history with regression
|
|
187
|
+
// For now, we just verify the code path exists
|
|
188
|
+
// In practice, this would be tested with mocked checkGate returning failed=true
|
|
189
|
+
|
|
190
|
+
const result = await runEvals({
|
|
191
|
+
cwd: PROJECT_ROOT,
|
|
192
|
+
suiteFilter: "example",
|
|
193
|
+
});
|
|
194
|
+
|
|
195
|
+
// Gate results should be present even if no failures
|
|
196
|
+
expect(result.gateResults).toBeDefined();
|
|
197
|
+
}, 30000);
|
|
198
|
+
|
|
199
|
+
test("does NOT call learnFromEvalFailure when gate passes", async () => {
|
|
200
|
+
// Similar to above - verifies the happy path
|
|
201
|
+
// Real test would mock checkGate and verify learnFromEvalFailure NOT called
|
|
202
|
+
|
|
203
|
+
const result = await runEvals({
|
|
204
|
+
cwd: PROJECT_ROOT,
|
|
205
|
+
suiteFilter: "example",
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
// Should succeed with gate results
|
|
209
|
+
expect(result.success).toBe(true);
|
|
210
|
+
expect(result.gateResults).toBeDefined();
|
|
211
|
+
}, 30000);
|
|
212
|
+
|
|
213
|
+
test("includes gateResults in return value", async () => {
|
|
214
|
+
const result = await runEvals({
|
|
215
|
+
cwd: PROJECT_ROOT,
|
|
216
|
+
suiteFilter: "example",
|
|
217
|
+
});
|
|
218
|
+
|
|
219
|
+
// gateResults should be array (even if empty)
|
|
220
|
+
expect(result).toHaveProperty("gateResults");
|
|
221
|
+
expect(Array.isArray(result.gateResults)).toBe(true);
|
|
222
|
+
}, 30000);
|
|
96
223
|
});
|
package/src/eval-runner.ts
CHANGED
|
@@ -13,6 +13,10 @@ import { createInMemoryStorage } from "evalite/in-memory-storage";
|
|
|
13
13
|
import type { Evalite } from "evalite/types";
|
|
14
14
|
import fs from "node:fs/promises";
|
|
15
15
|
import path from "node:path";
|
|
16
|
+
import { recordEvalRun, getScoreHistory } from "./eval-history.js";
|
|
17
|
+
import { checkGate } from "./eval-gates.js";
|
|
18
|
+
import { learnFromEvalFailure } from "./eval-learning.js";
|
|
19
|
+
import { getMemoryAdapter } from "./memory-tools.js";
|
|
16
20
|
|
|
17
21
|
/**
|
|
18
22
|
* Options for running evals programmatically
|
|
@@ -97,6 +101,17 @@ export interface RunEvalsResult {
|
|
|
97
101
|
|
|
98
102
|
/** Error message if run failed */
|
|
99
103
|
error?: string;
|
|
104
|
+
|
|
105
|
+
/** Gate check results per suite */
|
|
106
|
+
gateResults?: Array<{
|
|
107
|
+
suite: string;
|
|
108
|
+
passed: boolean;
|
|
109
|
+
phase: string;
|
|
110
|
+
message: string;
|
|
111
|
+
baseline?: number;
|
|
112
|
+
currentScore: number;
|
|
113
|
+
regressionPercent?: number;
|
|
114
|
+
}>;
|
|
100
115
|
}
|
|
101
116
|
|
|
102
117
|
/**
|
|
@@ -246,6 +261,36 @@ export async function runEvals(
|
|
|
246
261
|
})),
|
|
247
262
|
}));
|
|
248
263
|
|
|
264
|
+
// Record eval runs to history
|
|
265
|
+
for (const suite of suites) {
|
|
266
|
+
const history = getScoreHistory(projectRoot, suite.name);
|
|
267
|
+
recordEvalRun(projectRoot, {
|
|
268
|
+
timestamp: new Date().toISOString(),
|
|
269
|
+
eval_name: suite.name,
|
|
270
|
+
score: suite.averageScore,
|
|
271
|
+
run_count: history.length + 1,
|
|
272
|
+
});
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
// Check gates for each suite
|
|
276
|
+
const gateResults = [];
|
|
277
|
+
for (const suite of suites) {
|
|
278
|
+
const history = getScoreHistory(projectRoot, suite.name);
|
|
279
|
+
const gate = checkGate(projectRoot, suite.name, suite.averageScore);
|
|
280
|
+
gateResults.push({ suite: suite.name, ...gate });
|
|
281
|
+
|
|
282
|
+
// If gate failed, trigger learning
|
|
283
|
+
if (!gate.passed) {
|
|
284
|
+
try {
|
|
285
|
+
const memoryAdapter = await getMemoryAdapter();
|
|
286
|
+
await learnFromEvalFailure(suite.name, suite.averageScore, history, memoryAdapter);
|
|
287
|
+
} catch (e) {
|
|
288
|
+
// Learning is best-effort, don't fail the eval run
|
|
289
|
+
console.warn(`Failed to store learning for ${suite.name}:`, e);
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
|
|
249
294
|
// Calculate overall metrics
|
|
250
295
|
const totalEvals = suites.reduce((sum, s) => sum + s.evalCount, 0);
|
|
251
296
|
const averageScore =
|
|
@@ -263,6 +308,7 @@ export async function runEvals(
|
|
|
263
308
|
totalEvals,
|
|
264
309
|
averageScore,
|
|
265
310
|
suites,
|
|
311
|
+
gateResults,
|
|
266
312
|
};
|
|
267
313
|
} catch (error) {
|
|
268
314
|
// Return error result
|