opencode-swarm-plugin 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +27 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +182 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +881 -0
  9. package/bin/swarm.ts +686 -0
  10. package/dist/compaction-hook.d.ts +8 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-observability.d.ts +173 -0
  13. package/dist/compaction-observability.d.ts.map +1 -0
  14. package/dist/compaction-prompt-scoring.d.ts +124 -0
  15. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  16. package/dist/eval-capture.d.ts +174 -1
  17. package/dist/eval-capture.d.ts.map +1 -1
  18. package/dist/eval-gates.d.ts +84 -0
  19. package/dist/eval-gates.d.ts.map +1 -0
  20. package/dist/eval-history.d.ts +117 -0
  21. package/dist/eval-history.d.ts.map +1 -0
  22. package/dist/eval-learning.d.ts +216 -0
  23. package/dist/eval-learning.d.ts.map +1 -0
  24. package/dist/hive.d.ts.map +1 -1
  25. package/dist/index.d.ts +80 -1
  26. package/dist/index.d.ts.map +1 -1
  27. package/dist/index.js +16098 -651
  28. package/dist/plugin.js +16012 -756
  29. package/dist/post-compaction-tracker.d.ts +133 -0
  30. package/dist/post-compaction-tracker.d.ts.map +1 -0
  31. package/dist/schemas/task.d.ts +3 -3
  32. package/dist/swarm-orchestrate.d.ts +23 -0
  33. package/dist/swarm-orchestrate.d.ts.map +1 -1
  34. package/dist/swarm-prompts.d.ts +25 -1
  35. package/dist/swarm-prompts.d.ts.map +1 -1
  36. package/dist/swarm.d.ts +4 -0
  37. package/dist/swarm.d.ts.map +1 -1
  38. package/evals/README.md +702 -105
  39. package/evals/compaction-prompt.eval.ts +149 -0
  40. package/evals/coordinator-behavior.eval.ts +8 -8
  41. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  42. package/evals/lib/compaction-loader.test.ts +248 -0
  43. package/evals/lib/compaction-loader.ts +320 -0
  44. package/evals/lib/data-loader.test.ts +345 -0
  45. package/evals/lib/data-loader.ts +107 -6
  46. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  47. package/evals/scorers/compaction-scorers.ts +13 -13
  48. package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
  49. package/evals/scorers/coordinator-discipline.ts +348 -15
  50. package/evals/scorers/index.test.ts +146 -0
  51. package/evals/scorers/index.ts +104 -0
  52. package/evals/swarm-decomposition.eval.ts +9 -2
  53. package/examples/commands/swarm.md +291 -21
  54. package/examples/plugin-wrapper-template.ts +117 -0
  55. package/package.json +7 -5
  56. package/scripts/migrate-unknown-sessions.ts +349 -0
  57. package/src/compaction-capture.integration.test.ts +257 -0
  58. package/src/compaction-hook.test.ts +42 -0
  59. package/src/compaction-hook.ts +315 -86
  60. package/src/compaction-observability.integration.test.ts +139 -0
  61. package/src/compaction-observability.test.ts +187 -0
  62. package/src/compaction-observability.ts +324 -0
  63. package/src/compaction-prompt-scorers.test.ts +299 -0
  64. package/src/compaction-prompt-scoring.ts +298 -0
  65. package/src/eval-capture.test.ts +626 -1
  66. package/src/eval-capture.ts +286 -2
  67. package/src/eval-gates.test.ts +306 -0
  68. package/src/eval-gates.ts +218 -0
  69. package/src/eval-history.test.ts +508 -0
  70. package/src/eval-history.ts +214 -0
  71. package/src/eval-learning.test.ts +378 -0
  72. package/src/eval-learning.ts +360 -0
  73. package/src/eval-runner.test.ts +96 -0
  74. package/src/eval-runner.ts +356 -0
  75. package/src/hive.ts +34 -0
  76. package/src/index.ts +115 -2
  77. package/src/memory.test.ts +110 -0
  78. package/src/memory.ts +34 -0
  79. package/src/post-compaction-tracker.test.ts +251 -0
  80. package/src/post-compaction-tracker.ts +237 -0
  81. package/src/swarm-decompose.ts +2 -2
  82. package/src/swarm-orchestrate.ts +2 -2
  83. package/src/swarm-prompts.ts +2 -2
  84. package/src/swarm-review.ts +3 -3
  85. package/dist/beads.d.ts +0 -386
  86. package/dist/beads.d.ts.map +0 -1
  87. package/dist/schemas/bead-events.d.ts +0 -698
  88. package/dist/schemas/bead-events.d.ts.map +0 -1
  89. package/dist/schemas/bead.d.ts +0 -255
  90. package/dist/schemas/bead.d.ts.map +0 -1
  91. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
package/bin/swarm.test.ts CHANGED
@@ -197,6 +197,412 @@ READ-ONLY research agent. Never modifies code - only gathers intel and stores fi
197
197
  // Log Command Tests (TDD)
198
198
  // ============================================================================
199
199
 
200
+ // ============================================================================
201
+ // Session Log Tests (TDD)
202
+ // ============================================================================
203
+
204
+ import type { CoordinatorEvent } from "../src/eval-capture";
205
+
206
+ const TEST_SESSIONS_DIR = join(tmpdir(), "swarm-test-sessions");
207
+
208
+ describe("swarm log sessions", () => {
209
// Make sure the scratch sessions directory is present before each test.
beforeEach(() => {
  const alreadyPresent = existsSync(TEST_SESSIONS_DIR);
  if (alreadyPresent) return;
  mkdirSync(TEST_SESSIONS_DIR, { recursive: true });
});

// Remove the scratch sessions directory (and its contents) after each test.
afterEach(() => {
  const nothingToRemove = !existsSync(TEST_SESSIONS_DIR);
  if (nothingToRemove) return;
  rmSync(TEST_SESSIONS_DIR, { recursive: true, force: true });
});
222
+
223
+ // ========================================================================
224
+ // Helper Functions (to be implemented in swarm.ts)
225
+ // ========================================================================
226
+
227
/**
 * Write a JSONL session fixture named `<sessionId>.jsonl` into TEST_SESSIONS_DIR.
 *
 * The file holds `eventCount` DECISION / worker_spawned events spaced one
 * second apart; the last event is stamped one second before the base time.
 *
 * @param sessionId - session_id stored on every event and used as the file name
 * @param epicId - epic_id stored on every event
 * @param eventCount - number of events (lines) to write
 * @param baseTimestamp - epoch milliseconds the timestamps count back from;
 *                        defaults to the current time when omitted
 */
function createTestSession(
  sessionId: string,
  epicId: string,
  eventCount: number,
  baseTimestamp?: number,
): void {
  const filePath = join(TEST_SESSIONS_DIR, `${sessionId}.jsonl`);
  const lines: string[] = [];
  // `??` instead of `||`: an explicit base of 0 is a real timestamp and must
  // not be silently replaced by "now".
  const base = baseTimestamp ?? Date.now();

  for (let i = 0; i < eventCount; i++) {
    const event: CoordinatorEvent = {
      session_id: sessionId,
      epic_id: epicId,
      // Oldest event first: i = 0 is `eventCount` seconds before the base.
      timestamp: new Date(base - (eventCount - i) * 1000).toISOString(),
      event_type: "DECISION",
      decision_type: "worker_spawned",
      payload: { worker_id: `worker-${i}` },
    };
    lines.push(JSON.stringify(event));
  }

  writeFileSync(filePath, lines.join("\n") + "\n");
}
251
+
252
/**
 * Read a JSONL session file and return the events it contains.
 *
 * Blank lines are ignored and lines that are not valid JSON are dropped
 * without raising.
 *
 * @param filePath - path of the `.jsonl` file to read
 * @returns the parsed lines, in file order
 * @throws Error when `filePath` does not exist
 */
function parseSessionFile(filePath: string): CoordinatorEvent[] {
  const isMissing = !existsSync(filePath);
  if (isMissing) {
    throw new Error(`Session file not found: ${filePath}`);
  }

  const parsedEvents: CoordinatorEvent[] = [];
  const rawLines = readFileSync(filePath, "utf-8").split("\n");

  for (const rawLine of rawLines) {
    if (!rawLine.trim()) continue;
    try {
      parsedEvents.push(JSON.parse(rawLine));
    } catch {
      // Deliberately tolerant: a malformed line is skipped, not fatal.
    }
  }

  return parsedEvents;
}
275
+
276
/**
 * Summarise every `.jsonl` session file found directly inside `dir`.
 *
 * Files are parsed with parseSessionFile. A file that yields no events, or
 * that raises while being summarised, is left out of the result.
 *
 * @param dir - directory to scan; a non-existent directory yields `[]`
 * @returns one summary per usable file, ordered by start_time with the newest
 *          first. end_time is undefined when a session holds a single event.
 */
function listSessionFiles(
  dir: string,
): Array<{
  session_id: string;
  file_path: string;
  event_count: number;
  start_time: string;
  end_time?: string;
}> {
  if (!existsSync(dir)) {
    return [];
  }

  const summaries: ReturnType<typeof listSessionFiles> = [];
  const jsonlNames = readdirSync(dir).filter((name) => name.endsWith(".jsonl"));

  for (const name of jsonlNames) {
    const fullPath = join(dir, name);
    try {
      const events = parseSessionFile(fullPath);
      if (events.length > 0) {
        const epochs = events.map((event) => new Date(event.timestamp).getTime());
        const earliest = new Date(Math.min(...epochs)).toISOString();

        // Only report an end time when there is more than one event.
        let latest: string | undefined;
        if (epochs.length > 1) {
          latest = new Date(Math.max(...epochs)).toISOString();
        }

        summaries.push({
          session_id: events[0].session_id,
          file_path: fullPath,
          event_count: events.length,
          start_time: earliest,
          end_time: latest,
        });
      }
    } catch {
      // Best effort: a file that cannot be summarised is skipped.
    }
  }

  // Newest session first.
  const startMs = (summary: { start_time: string }): number =>
    new Date(summary.start_time).getTime();
  summaries.sort((left, right) => startMs(right) - startMs(left));
  return summaries;
}
329
+
330
/**
 * Return the summary of the most recently started session in `dir`.
 *
 * @param dir - directory handed to listSessionFiles
 * @returns the first entry of listSessionFiles(dir), or null when that list is empty
 */
function getLatestSession(
  dir: string,
): {
  session_id: string;
  file_path: string;
  event_count: number;
  start_time: string;
  end_time?: string;
} | null {
  const ranked = listSessionFiles(dir);
  if (ranked.length === 0) {
    return null;
  }
  // listSessionFiles already orders newest first.
  return ranked[0];
}
345
+
346
/**
 * Keep only the events whose event_type matches `eventType`.
 *
 * Matching is case-insensitive. The special value "all" (in any letter case)
 * disables filtering and returns the input array itself.
 *
 * @param events - events to filter; never mutated
 * @param eventType - event type name, or "all"
 * @returns the matching events, in their original order
 */
function filterEventsByType(
  events: CoordinatorEvent[],
  eventType: string,
): CoordinatorEvent[] {
  // Normalise once so "all" is treated as case-insensitively as real type
  // names are (previously "ALL" matched nothing).
  const wanted = eventType.toUpperCase();
  if (wanted === "ALL") return events;
  return events.filter((e) => e.event_type === wanted);
}
356
+
357
/**
 * Keep only the events stamped within the last `sinceMs` milliseconds.
 *
 * @param events - events to filter; never mutated
 * @param sinceMs - size of the look-back window, measured back from Date.now()
 * @returns events whose timestamp is at or after the cutoff, in original order
 */
function filterEventsSince(
  events: CoordinatorEvent[],
  sinceMs: number,
): CoordinatorEvent[] {
  const earliestAllowed = Date.now() - sinceMs;

  const isRecentEnough = (event: CoordinatorEvent): boolean => {
    const stampedAt = new Date(event.timestamp).getTime();
    return stampedAt >= earliestAllowed;
  };

  return events.filter((event) => isRecentEnough(event));
}
369
+
370
+ // ========================================================================
371
+ // Tests
372
+ // ========================================================================
373
+
374
// Behaviour of listSessionFiles against the scratch sessions directory.
describe("listSessionFiles", () => {
  test("returns empty array when directory doesn't exist", () => {
    const result = listSessionFiles("/nonexistent/directory");
    expect(result).toEqual([]);
  });

  test("returns empty array when directory is empty", () => {
    const result = listSessionFiles(TEST_SESSIONS_DIR);
    expect(result).toEqual([]);
  });

  test("lists all session files with metadata", () => {
    createTestSession("ses_abc123", "epic-1", 5);
    createTestSession("ses_def456", "epic-2", 3);

    const result = listSessionFiles(TEST_SESSIONS_DIR);

    expect(result).toHaveLength(2);
    expect(result[0].session_id).toMatch(/^ses_/);
    expect(result[0].event_count).toBeGreaterThan(0);
    expect(result[0].start_time).toBeTruthy();
  });

  test("calculates event count correctly", () => {
    createTestSession("ses_test", "epic-1", 10);

    const result = listSessionFiles(TEST_SESSIONS_DIR);

    expect(result[0].event_count).toBe(10);
  });

  test("extracts start and end times from events", () => {
    createTestSession("ses_test", "epic-1", 5);

    const result = listSessionFiles(TEST_SESSIONS_DIR);

    expect(result[0].start_time).toBeTruthy();
    const startMs = new Date(result[0].start_time).getTime();
    expect(startMs).toBeLessThan(Date.now());

    // The test name promises end-time coverage too: with several events
    // spaced a second apart, end_time must be set and later than start_time.
    expect(result[0].end_time).toBeTruthy();
    expect(new Date(result[0].end_time!).getTime()).toBeGreaterThan(startMs);
  });

  test("sorts sessions by start time (newest first)", () => {
    // Create sessions with explicit different timestamps
    const oldTime = Date.now() - 60000; // 1 minute ago
    const newTime = Date.now();

    createTestSession("ses_old", "epic-1", 2, oldTime);
    createTestSession("ses_new", "epic-2", 2, newTime);

    const result = listSessionFiles(TEST_SESSIONS_DIR);

    expect(result[0].session_id).toBe("ses_new");
    expect(result[1].session_id).toBe("ses_old");
  });
});
428
+
429
// Behaviour of parseSessionFile on well-formed and messy JSONL input.
describe("parseSessionFile", () => {
  // Raw JSONL lines reused by the hand-written fixture files below.
  const decisionLine =
    '{"session_id":"test","epic_id":"e1","timestamp":"2025-01-01T00:00:00Z","event_type":"DECISION","decision_type":"worker_spawned","payload":{}}';
  const outcomeLine =
    '{"session_id":"test","epic_id":"e1","timestamp":"2025-01-01T00:00:00Z","event_type":"OUTCOME","outcome_type":"subtask_success","payload":{}}';

  test("parses valid JSONL session file", () => {
    createTestSession("ses_parse", "epic-1", 3);

    const parsed = parseSessionFile(join(TEST_SESSIONS_DIR, "ses_parse.jsonl"));

    expect(parsed).toHaveLength(3);
    const [first] = parsed;
    expect(first.session_id).toBe("ses_parse");
    expect(first.event_type).toBe("DECISION");
  });

  test("handles file with trailing newlines", () => {
    const target = join(TEST_SESSIONS_DIR, "ses_trailing.jsonl");
    writeFileSync(target, `${decisionLine}\n\n\n`);

    expect(parseSessionFile(target)).toHaveLength(1);
  });

  test("skips invalid JSON lines", () => {
    const target = join(TEST_SESSIONS_DIR, "ses_invalid.jsonl");
    writeFileSync(target, `${decisionLine}\ninvalid json\n${outcomeLine}\n`);

    expect(parseSessionFile(target)).toHaveLength(2);
  });

  test("throws error for non-existent file", () => {
    const readMissing = () => parseSessionFile("/nonexistent/file.jsonl");
    expect(readMissing).toThrow();
  });
});
469
+
470
// Behaviour of getLatestSession on empty and populated directories.
describe("getLatestSession", () => {
  test("returns null when directory is empty", () => {
    expect(getLatestSession(TEST_SESSIONS_DIR)).toBeNull();
  });

  test("returns the most recent session", () => {
    const newerBase = Date.now();
    const olderBase = Date.now() - 60000; // one minute earlier

    createTestSession("ses_old", "epic-1", 2, olderBase);
    createTestSession("ses_new", "epic-2", 3, newerBase);

    const latest = getLatestSession(TEST_SESSIONS_DIR);

    expect(latest).not.toBeNull();
    expect(latest!.session_id).toBe("ses_new");
  });
});
489
+
490
// Behaviour of filterEventsByType for a concrete type and for "all".
describe("filterEventsByType", () => {
  // Shared read-only fixtures; the function under test never mutates them.
  const spawnDecision: CoordinatorEvent = {
    session_id: "s1",
    epic_id: "e1",
    timestamp: "2025-01-01T00:00:00Z",
    event_type: "DECISION",
    decision_type: "worker_spawned",
    payload: {},
  };
  const editViolation: CoordinatorEvent = {
    session_id: "s1",
    epic_id: "e1",
    timestamp: "2025-01-01T00:01:00Z",
    event_type: "VIOLATION",
    violation_type: "coordinator_edited_file",
    payload: {},
  };
  const reviewDecision: CoordinatorEvent = {
    session_id: "s1",
    epic_id: "e1",
    timestamp: "2025-01-01T00:02:00Z",
    event_type: "DECISION",
    decision_type: "review_completed",
    payload: {},
  };

  test("filters DECISION events only", () => {
    const input: CoordinatorEvent[] = [spawnDecision, editViolation, reviewDecision];

    const kept = filterEventsByType(input, "DECISION");

    expect(kept).toHaveLength(2);
    expect(kept.every((event) => event.event_type === "DECISION")).toBe(true);
  });

  test("returns all events when type is 'all'", () => {
    const input: CoordinatorEvent[] = [spawnDecision, editViolation];

    const kept = filterEventsByType(input, "all");

    expect(kept).toHaveLength(2);
  });
});
550
+
551
// Behaviour of filterEventsSince for narrow and very wide time windows.
describe("filterEventsSince", () => {
  test("filters events within time window", () => {
    const now = Date.now();
    const stampedAgo = (ms: number): string => new Date(now - ms).toISOString();

    const tenSecondsOld: CoordinatorEvent = {
      session_id: "s1",
      epic_id: "e1",
      timestamp: stampedAgo(10000),
      event_type: "DECISION",
      decision_type: "worker_spawned",
      payload: {},
    };
    const oneMinuteOld: CoordinatorEvent = {
      session_id: "s1",
      epic_id: "e1",
      timestamp: stampedAgo(60000),
      event_type: "VIOLATION",
      violation_type: "coordinator_edited_file",
      payload: {},
    };
    const threeSecondsOld: CoordinatorEvent = {
      session_id: "s1",
      epic_id: "e1",
      timestamp: stampedAgo(3000),
      event_type: "OUTCOME",
      outcome_type: "subtask_success",
      payload: {},
    };

    // 30 second window: only the 10s-old and 3s-old events qualify.
    const recent = filterEventsSince(
      [tenSecondsOld, oneMinuteOld, threeSecondsOld],
      30000,
    );

    expect(recent).toHaveLength(2);
  });

  test("returns all events when sinceMs is very large", () => {
    const now = Date.now();
    const justNow: CoordinatorEvent = {
      session_id: "s1",
      epic_id: "e1",
      timestamp: new Date(now - 1000).toISOString(),
      event_type: "DECISION",
      decision_type: "worker_spawned",
      payload: {},
    };

    const oneDayMs = 86400000;
    const recent = filterEventsSince([justNow], oneDayMs);

    expect(recent).toHaveLength(1);
  });
});
604
+ });
605
+
200
606
  // ============================================================================
201
607
  // Cells Command Tests (TDD)
202
608
  // ============================================================================
@@ -639,3 +1045,478 @@ describe("Log command helpers", () => {
639
1045
  });
640
1046
  });
641
1047
  });
1048
+
1049
+ // ============================================================================
1050
+ // Eval Commands Tests (TDD)
1051
+ // ============================================================================
1052
+
1053
+ describe("Eval commands", () => {
1054
// Rendering of the eval status summary in each phase.
describe("formatEvalStatus", () => {
  // Thresholds shared by every case: 10% stabilization, 5% production.
  const thresholds = { stabilization: 0.1, production: 0.05 };

  test("displays phase, thresholds, and recent scores", () => {
    const output = formatEvalStatus({
      phase: "stabilization" as const,
      runCount: 25,
      thresholds,
      recentScores: [
        { timestamp: "2024-12-24T10:00:00.000Z", score: 0.85 },
        { timestamp: "2024-12-24T11:00:00.000Z", score: 0.87 },
        { timestamp: "2024-12-24T12:00:00.000Z", score: 0.82 },
      ],
    });

    const expectedFragments = [
      "stabilization", // phase
      "25", // run count
      "10%", // stabilization threshold
      "5%", // production threshold
      "0.85", // recent scores
      "0.87",
      "0.82",
    ];
    for (const fragment of expectedFragments) {
      expect(output).toContain(fragment);
    }
  });

  test("shows bootstrap phase message", () => {
    const output = formatEvalStatus({
      phase: "bootstrap" as const,
      runCount: 5,
      thresholds,
      recentScores: [],
    });

    expect(output).toContain("bootstrap");
    expect(output).toContain("collecting data");
  });

  test("shows production phase message", () => {
    const output = formatEvalStatus({
      phase: "production" as const,
      runCount: 75,
      thresholds,
      recentScores: [],
    });

    expect(output).toContain("production");
  });
});
1121
+
1122
// Rendering of grouped eval history.
describe("formatEvalHistory", () => {
  test("shows eval entries with timestamps and scores", () => {
    const output = formatEvalHistory([
      {
        timestamp: "2024-12-24T10:00:00.000Z",
        eval_name: "swarm-decomposition",
        score: 0.85,
        run_count: 1,
      },
      {
        timestamp: "2024-12-24T11:00:00.000Z",
        eval_name: "swarm-decomposition",
        score: 0.87,
        run_count: 2,
      },
      {
        timestamp: "2024-12-24T12:00:00.000Z",
        eval_name: "coordinator-behavior",
        score: 0.92,
        run_count: 1,
      },
    ]);

    const expectedFragments = [
      // eval names
      "swarm-decomposition",
      "coordinator-behavior",
      // scores
      "0.85",
      "0.87",
      "0.92",
      // run counts
      "run #1",
      "run #2",
    ];
    for (const fragment of expectedFragments) {
      expect(output).toContain(fragment);
    }
  });

  test("returns empty message for no history", () => {
    expect(formatEvalHistory([])).toContain("No eval history");
  });

  test("formats timestamps as readable dates", () => {
    const isoStamp = "2024-12-24T10:00:00.000Z";

    const output = formatEvalHistory([
      { timestamp: isoStamp, eval_name: "test", score: 0.85, run_count: 1 },
    ]);

    // The raw ISO string must be replaced by a human-readable clock time.
    expect(output).not.toContain(isoStamp);
    expect(output).toMatch(/\d{1,2}:\d{2}/);
  });
});
1183
+
1184
// Shape of the sparkline produced for various score lists.
describe("generateSparkline", () => {
  const BAR_GLYPH = /[▁▂▃▄▅▆▇█]/;

  test("generates sparkline from scores", () => {
    const ascending = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0];

    const rendered = generateSparkline(ascending);

    // Uses bar glyphs, one per input score.
    expect(rendered).toMatch(BAR_GLYPH);
    expect(rendered.length).toBe(ascending.length);

    // Lowest and highest scores map to the shortest and tallest bars.
    expect(rendered).toContain("▁");
    expect(rendered).toContain("█");
  });

  test("handles single score", () => {
    const rendered = generateSparkline([0.5]);

    expect(rendered.length).toBe(1);
    expect(rendered).toMatch(BAR_GLYPH);
  });

  test("handles all same scores", () => {
    const rendered = generateSparkline([0.5, 0.5, 0.5]);

    expect(rendered.length).toBe(3);
    // A flat series renders as one repeated glyph.
    const distinctGlyphs = new Set(rendered.split(""));
    expect(distinctGlyphs.size).toBe(1);
  });

  test("returns empty for empty array", () => {
    expect(generateSparkline([])).toBe("");
  });
});
1218
+
1219
// Rendering of a single gate-check result.
describe("formatEvalRunResult", () => {
  test("shows pass/fail with gate result", () => {
    const output = formatEvalRunResult({
      passed: true,
      phase: "production" as const,
      message: "Production phase: 2.5% regression - acceptable",
      baseline: 0.85,
      currentScore: 0.83,
      regressionPercent: 0.025,
    });

    const expectedFragments = [
      "PASS",
      "production",
      "0.83", // current score
      "2.5%", // regression
    ];
    for (const fragment of expectedFragments) {
      expect(output).toContain(fragment);
    }
  });

  test("shows failure with details", () => {
    const output = formatEvalRunResult({
      passed: false,
      phase: "production" as const,
      message: "Production phase FAIL: 8.0% regression - exceeds 5% threshold",
      baseline: 0.85,
      currentScore: 0.78,
      regressionPercent: 0.08,
    });

    for (const fragment of ["FAIL", "8.0%", "exceeds"]) {
      expect(output).toContain(fragment);
    }
  });

  test("shows bootstrap phase without baseline", () => {
    const output = formatEvalRunResult({
      passed: true,
      phase: "bootstrap" as const,
      message: "Bootstrap phase (5/10 runs) - collecting data",
      currentScore: 0.85,
    });

    expect(output).toContain("bootstrap");
    expect(output).toContain("collecting data");
    // No baseline was supplied, so none may be printed.
    expect(output).not.toContain("baseline");
  });
});
1270
+ });
1271
+
1272
+ // ============================================================================
1273
+ // Eval Command Helpers (Implementation)
1274
+ // ============================================================================
1275
+
1276
/**
 * Render a list of scores as a string of block glyphs, one glyph per score.
 *
 * Scores are scaled between the smallest and largest value in the list, so
 * the minimum always maps to the shortest bar and the maximum to the tallest.
 * A list whose values are all equal renders as a run of the mid-height bar.
 *
 * @param scores - values to plot
 * @returns the sparkline, or "" for an empty list
 */
function generateSparkline(scores: number[]): string {
  if (scores.length === 0) {
    return "";
  }

  const glyphs = ["▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"];
  const tallest = glyphs.length - 1;
  const lowest = Math.min(...scores);
  const spread = Math.max(...scores) - lowest;

  // Flat series: nothing to scale against, use the mid-height bar throughout.
  if (spread === 0) {
    return glyphs[4].repeat(scores.length);
  }

  const bars = scores.map((score) => {
    const fraction = (score - lowest) / spread;
    // fraction === 1 would index one past the end, hence the clamp.
    const bucket = Math.min(Math.floor(fraction * glyphs.length), tallest);
    return glyphs[bucket];
  });
  return bars.join("");
}
1300
+
1301
/**
 * Build the multi-line status summary: phase banner, run count, regression
 * thresholds as whole percentages, and either the recent scores (sparkline
 * plus one line per score) or a "collecting data" notice when there are none.
 *
 * @param status - current phase, number of runs, thresholds expressed as
 *                 fractions, and recent scores in display order
 * @returns the summary, lines joined with "\n"
 */
function formatEvalStatus(status: {
  phase: "bootstrap" | "stabilization" | "production";
  runCount: number;
  thresholds: { stabilization: number; production: number };
  recentScores: Array<{ timestamp: string; score: number }>;
}): string {
  const { phase, runCount, thresholds, recentScores } = status;

  let phaseEmoji: string;
  switch (phase) {
    case "bootstrap":
      phaseEmoji = "🌱";
      break;
    case "stabilization":
      phaseEmoji = "⚙️";
      break;
    default:
      phaseEmoji = "🚀";
  }

  const wholePercent = (fraction: number): string => (fraction * 100).toFixed(0);

  const out: string[] = [
    `${phaseEmoji} Phase: ${phase}`,
    `Runs: ${runCount}`,
    "",
    "Thresholds:",
    ` Stabilization: ${wholePercent(thresholds.stabilization)}% regression warning`,
    ` Production: ${wholePercent(thresholds.production)}% regression failure`,
    "",
  ];

  if (recentScores.length === 0) {
    out.push("No scores yet - collecting data");
    return out.join("\n");
  }

  out.push("Recent scores:");
  out.push(` ${generateSparkline(recentScores.map((entry) => entry.score))}`);
  recentScores.forEach((entry) => {
    // Locale-formatted date and time, followed by the score to two decimals.
    const when = new Date(entry.timestamp).toLocaleString();
    out.push(` ${when}: ${entry.score.toFixed(2)}`);
  });

  return out.join("\n");
}
1339
+
1340
/**
 * Build the multi-line history report, grouped by eval name.
 *
 * Groups appear in the order their eval name first occurs. Each group shows a
 * sparkline over all of its scores, then its last five entries (locale clock
 * time, run number, score), and a count of any older entries not listed.
 *
 * @param history - recorded runs, in the order they should be displayed
 * @returns the report, or "No eval history found" for an empty list
 */
function formatEvalHistory(history: Array<{
  timestamp: string;
  eval_name: string;
  score: number;
  run_count: number;
}>): string {
  if (history.length === 0) {
    return "No eval history found";
  }

  const shownPerEval = 5;

  // Bucket entries per eval; a Map keeps first-seen order of the names.
  const byEval = new Map<string, typeof history>();
  for (const record of history) {
    const bucket = byEval.get(record.eval_name);
    if (bucket) {
      bucket.push(record);
    } else {
      byEval.set(record.eval_name, [record]);
    }
  }

  const out: string[] = ["Eval History:", ""];

  byEval.forEach((records, evalName) => {
    out.push(`${evalName}:`);
    out.push(` Trend: ${generateSparkline(records.map((r) => r.score))}`);

    for (const record of records.slice(-shownPerEval)) {
      const clock = new Date(record.timestamp).toLocaleTimeString();
      out.push(` ${clock} - run #${record.run_count}: ${record.score.toFixed(2)}`);
    }

    const hidden = records.length - shownPerEval;
    if (hidden > 0) {
      out.push(` ... and ${hidden} more`);
    }

    out.push("");
  });

  return out.join("\n");
}
1388
+
1389
/**
 * Build the multi-line report for one gate check: pass/fail banner, phase,
 * current score, then baseline and regression when they are provided, and
 * finally the gate's own message.
 *
 * @param result - gate outcome; `baseline` and `regressionPercent` are
 *                 optional and their lines are omitted when undefined.
 *                 `regressionPercent` is a fraction (0.025 prints as 2.5%).
 * @returns the report, lines joined with "\n"
 */
function formatEvalRunResult(result: {
  passed: boolean;
  phase: "bootstrap" | "stabilization" | "production";
  message: string;
  baseline?: number;
  currentScore: number;
  regressionPercent?: number;
}): string {
  const { passed, phase, message, baseline, currentScore, regressionPercent } = result;

  const out: string[] = [
    passed ? "✅ PASS" : "❌ FAIL",
    "",
    `Phase: ${phase}`,
    `Score: ${currentScore.toFixed(2)}`,
  ];

  if (baseline !== undefined) {
    out.push(`Baseline: ${baseline.toFixed(2)}`);
  }

  if (regressionPercent !== undefined) {
    // Positive values get an explicit "+"; zero and negatives print as-is.
    const prefix = regressionPercent > 0 ? "+" : "";
    const percentText = (regressionPercent * 100).toFixed(1);
    out.push(`Regression: ${prefix}${percentText}%`);
  }

  out.push("", message);
  return out.join("\n");
}
1425
+
1426
+ // ============================================================================
1427
+ // Eval Run Tests
1428
+ // ============================================================================
1429
+
1430
// End-to-end checks of the history + gate modules as used by `eval run` in CI.
describe("Eval Run CI Mode", () => {
  // Fresh scratch project directory per test.
  let testDir: string;

  beforeEach(() => {
    const uniqueName = `eval-run-test-${Date.now()}`;
    testDir = join(tmpdir(), uniqueName);
    mkdirSync(testDir, { recursive: true });
  });

  afterEach(() => {
    if (!existsSync(testDir)) return;
    rmSync(testDir, { recursive: true, force: true });
  });

  test("writes eval results JSON file", async () => {
    // Modules under test are loaded lazily, inside the test.
    const historyModule = await import("../src/eval-history.js");
    const gatesModule = await import("../src/eval-gates.js");
    const hiveModule = await import("../src/hive.js");

    const evalName = "test-eval";
    const mockScore = 0.85;

    hiveModule.ensureHiveDirectory(testDir);

    // Mirror what `eval run` does: read history, then append this run.
    const priorRuns = historyModule.getScoreHistory(testDir, evalName);
    historyModule.recordEvalRun(testDir, {
      timestamp: new Date().toISOString(),
      eval_name: evalName,
      score: mockScore,
      run_count: priorRuns.length + 1,
    });

    const gateResult = gatesModule.checkGate(testDir, evalName, mockScore);

    // CI mode persists the gate results keyed by eval name.
    const resultsPath = join(testDir, ".hive", "eval-results.json");
    writeFileSync(resultsPath, JSON.stringify({ [evalName]: gateResult }, null, 2));

    expect(existsSync(resultsPath)).toBe(true);

    const reloaded = JSON.parse(readFileSync(resultsPath, "utf-8"));
    expect(reloaded).toHaveProperty(evalName);
    expect(reloaded[evalName]).toMatchObject({
      passed: true,
      phase: "bootstrap",
      currentScore: mockScore,
    });
  });

  test("bootstrap phase always passes", async () => {
    const { checkGate } = await import("../src/eval-gates.js");

    // No recorded history, and a deliberately poor score.
    const verdict = checkGate(testDir, "test-eval", 0.1);

    expect(verdict.passed).toBe(true);
    expect(verdict.phase).toBe("bootstrap");
    expect(verdict.message).toContain("Bootstrap phase");
  });

  test("production phase fails on regression", async () => {
    const { recordEvalRun } = await import("../src/eval-history.js");
    const { checkGate } = await import("../src/eval-gates.js");
    const { ensureHiveDirectory } = await import("../src/hive.js");

    ensureHiveDirectory(testDir);

    // Sixty steady runs at 0.9 to move the eval into the production phase.
    for (let runNumber = 1; runNumber <= 60; runNumber++) {
      recordEvalRun(testDir, {
        timestamp: new Date().toISOString(),
        eval_name: "test-eval",
        score: 0.9,
        run_count: runNumber,
      });
    }

    // 0.8 against a 0.9 baseline is roughly an 11% drop, above the 5% limit.
    const regressedScore = 0.8;
    const verdict = checkGate(testDir, "test-eval", regressedScore);

    expect(verdict.passed).toBe(false);
    expect(verdict.phase).toBe("production");
    expect(verdict.message).toContain("FAIL");
  });
});