opencode-swarm-plugin 0.39.1 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
  2. package/.hive/analysis/session-data-quality-audit.md +320 -0
  3. package/.hive/eval-results.json +481 -24
  4. package/.hive/issues.jsonl +76 -11
  5. package/.hive/memories.jsonl +159 -1
  6. package/.opencode/eval-history.jsonl +315 -0
  7. package/.turbo/turbo-build.log +5 -5
  8. package/CHANGELOG.md +207 -0
  9. package/README.md +2 -0
  10. package/SCORER-ANALYSIS.md +598 -0
  11. package/bin/eval-gate.test.ts +158 -0
  12. package/bin/eval-gate.ts +74 -0
  13. package/bin/swarm.test.ts +1054 -719
  14. package/bin/swarm.ts +577 -0
  15. package/dist/compaction-hook.d.ts +10 -1
  16. package/dist/compaction-hook.d.ts.map +1 -1
  17. package/dist/compaction-observability.d.ts +173 -0
  18. package/dist/compaction-observability.d.ts.map +1 -0
  19. package/dist/compaction-prompt-scoring.d.ts +1 -0
  20. package/dist/compaction-prompt-scoring.d.ts.map +1 -1
  21. package/dist/eval-capture.d.ts +93 -0
  22. package/dist/eval-capture.d.ts.map +1 -1
  23. package/dist/eval-runner.d.ts +134 -0
  24. package/dist/eval-runner.d.ts.map +1 -0
  25. package/dist/hive.d.ts.map +1 -1
  26. package/dist/index.d.ts +65 -1
  27. package/dist/index.d.ts.map +1 -1
  28. package/dist/index.js +84043 -28070
  29. package/dist/memory-tools.d.ts +70 -2
  30. package/dist/memory-tools.d.ts.map +1 -1
  31. package/dist/memory.d.ts +37 -0
  32. package/dist/memory.d.ts.map +1 -1
  33. package/dist/observability-tools.d.ts +64 -0
  34. package/dist/observability-tools.d.ts.map +1 -1
  35. package/dist/plugin.js +83570 -27466
  36. package/dist/schemas/task.d.ts +3 -3
  37. package/dist/swarm-orchestrate.d.ts.map +1 -1
  38. package/dist/swarm-prompts.d.ts +32 -1
  39. package/dist/swarm-prompts.d.ts.map +1 -1
  40. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
  41. package/evals/ARCHITECTURE.md +1189 -0
  42. package/evals/README.md +113 -0
  43. package/evals/example.eval.ts +3 -4
  44. package/evals/fixtures/compaction-prompt-cases.ts +6 -0
  45. package/evals/scorers/coordinator-discipline.evalite-test.ts +163 -0
  46. package/evals/scorers/coordinator-discipline.ts +82 -2
  47. package/evals/scorers/index.test.ts +146 -0
  48. package/evals/scorers/index.ts +104 -0
  49. package/evals/swarm-decomposition.eval.ts +13 -4
  50. package/examples/commands/swarm.md +291 -21
  51. package/package.json +4 -3
  52. package/src/compaction-hook.ts +258 -110
  53. package/src/compaction-observability.integration.test.ts +139 -0
  54. package/src/compaction-observability.test.ts +187 -0
  55. package/src/compaction-observability.ts +324 -0
  56. package/src/compaction-prompt-scorers.test.ts +10 -9
  57. package/src/compaction-prompt-scoring.ts +7 -5
  58. package/src/eval-capture.test.ts +204 -1
  59. package/src/eval-capture.ts +194 -2
  60. package/src/eval-runner.test.ts +223 -0
  61. package/src/eval-runner.ts +402 -0
  62. package/src/hive.ts +57 -22
  63. package/src/index.ts +54 -1
  64. package/src/memory-tools.test.ts +84 -0
  65. package/src/memory-tools.ts +68 -3
  66. package/src/memory.test.ts +2 -2
  67. package/src/memory.ts +122 -49
  68. package/src/observability-tools.test.ts +13 -0
  69. package/src/observability-tools.ts +277 -0
  70. package/src/swarm-orchestrate.test.ts +162 -0
  71. package/src/swarm-orchestrate.ts +7 -5
  72. package/src/swarm-prompts.test.ts +168 -4
  73. package/src/swarm-prompts.ts +228 -7
  74. package/.env +0 -2
  75. package/.turbo/turbo-test.log +0 -481
  76. package/.turbo/turbo-typecheck.log +0 -1
  77. package/dist/beads.d.ts +0 -386
  78. package/dist/beads.d.ts.map +0 -1
  79. package/dist/schemas/bead-events.d.ts +0 -698
  80. package/dist/schemas/bead-events.d.ts.map +0 -1
  81. package/dist/schemas/bead.d.ts +0 -255
  82. package/dist/schemas/bead.d.ts.map +0 -1
package/bin/swarm.test.ts CHANGED
@@ -1,11 +1,10 @@
1
1
  #!/usr/bin/env bun
2
2
  /**
3
- * Tests for swarm CLI file operation helpers
3
+ * Tests for swarm CLI helpers
4
4
  *
5
- * These tests verify the verbose output helpers used in `swarm setup`:
6
- * - writeFileWithStatus: logs created/updated/unchanged status
7
- * - mkdirWithStatus: logs directory creation
8
- * - rmWithStatus: logs file removal
5
+ * These tests verify the CLI helpers:
6
+ * - File operation helpers (writeFileWithStatus, mkdirWithStatus, rmWithStatus)
7
+ * - Swarm history helpers (formatSwarmHistory, parseHistoryArgs, filterHistoryByStatus)
9
8
  */
10
9
  import { describe, test, expect, beforeEach, afterEach } from "bun:test";
11
10
  import { mkdirSync, rmSync, writeFileSync, existsSync, readFileSync, readdirSync } from "fs";
@@ -197,6 +196,412 @@ READ-ONLY research agent. Never modifies code - only gathers intel and stores fi
197
196
  // Log Command Tests (TDD)
198
197
  // ============================================================================
199
198
 
199
+ // ============================================================================
200
+ // Session Log Tests (TDD)
201
+ // ============================================================================
202
+
203
+ import type { CoordinatorEvent } from "../src/eval-capture";
204
+
205
+ const TEST_SESSIONS_DIR = join(tmpdir(), "swarm-test-sessions");
206
+
207
+ describe("swarm log sessions", () => {
208
+ beforeEach(() => {
209
+ // Create test sessions directory
210
+ if (!existsSync(TEST_SESSIONS_DIR)) {
211
+ mkdirSync(TEST_SESSIONS_DIR, { recursive: true });
212
+ }
213
+ });
214
+
215
+ afterEach(() => {
216
+ // Cleanup test directory
217
+ if (existsSync(TEST_SESSIONS_DIR)) {
218
+ rmSync(TEST_SESSIONS_DIR, { recursive: true, force: true });
219
+ }
220
+ });
221
+
222
+ // ========================================================================
223
+ // Helper Functions (to be implemented in swarm.ts)
224
+ // ========================================================================
225
+
226
+ function createTestSession(
227
+ sessionId: string,
228
+ epicId: string,
229
+ eventCount: number,
230
+ baseTimestamp?: number,
231
+ ): void {
232
+ const filePath = join(TEST_SESSIONS_DIR, `${sessionId}.jsonl`);
233
+ const lines: string[] = [];
234
+ const base = baseTimestamp || Date.now();
235
+
236
+ for (let i = 0; i < eventCount; i++) {
237
+ const event: CoordinatorEvent = {
238
+ session_id: sessionId,
239
+ epic_id: epicId,
240
+ timestamp: new Date(base - (eventCount - i) * 1000).toISOString(),
241
+ event_type: "DECISION",
242
+ decision_type: "worker_spawned",
243
+ payload: { worker_id: `worker-${i}` },
244
+ };
245
+ lines.push(JSON.stringify(event));
246
+ }
247
+
248
+ writeFileSync(filePath, lines.join("\n") + "\n");
249
+ }
250
+
251
+ /**
252
+ * Parse a session file and return events
253
+ */
254
+ function parseSessionFile(filePath: string): CoordinatorEvent[] {
255
+ if (!existsSync(filePath)) {
256
+ throw new Error(`Session file not found: ${filePath}`);
257
+ }
258
+
259
+ const content = readFileSync(filePath, "utf-8");
260
+ const lines = content.split("\n").filter((line) => line.trim());
261
+ const events: CoordinatorEvent[] = [];
262
+
263
+ for (const line of lines) {
264
+ try {
265
+ const parsed = JSON.parse(line);
266
+ events.push(parsed);
267
+ } catch {
268
+ // Skip invalid JSON lines
269
+ }
270
+ }
271
+
272
+ return events;
273
+ }
274
+
275
+ /**
276
+ * List all session files in a directory
277
+ */
278
+ function listSessionFiles(
279
+ dir: string,
280
+ ): Array<{
281
+ session_id: string;
282
+ file_path: string;
283
+ event_count: number;
284
+ start_time: string;
285
+ end_time?: string;
286
+ }> {
287
+ if (!existsSync(dir)) return [];
288
+
289
+ const files = readdirSync(dir).filter((f) => f.endsWith(".jsonl"));
290
+ const sessions: Array<{
291
+ session_id: string;
292
+ file_path: string;
293
+ event_count: number;
294
+ start_time: string;
295
+ end_time?: string;
296
+ }> = [];
297
+
298
+ for (const file of files) {
299
+ const filePath = join(dir, file);
300
+ try {
301
+ const events = parseSessionFile(filePath);
302
+ if (events.length === 0) continue;
303
+
304
+ const timestamps = events.map((e) => new Date(e.timestamp).getTime());
305
+ const startTime = new Date(Math.min(...timestamps)).toISOString();
306
+ const endTime =
307
+ timestamps.length > 1
308
+ ? new Date(Math.max(...timestamps)).toISOString()
309
+ : undefined;
310
+
311
+ sessions.push({
312
+ session_id: events[0].session_id,
313
+ file_path: filePath,
314
+ event_count: events.length,
315
+ start_time: startTime,
316
+ end_time: endTime,
317
+ });
318
+ } catch {
319
+ // Skip invalid files
320
+ }
321
+ }
322
+
323
+ // Sort by start time (newest first)
324
+ return sessions.sort((a, b) =>
325
+ new Date(b.start_time).getTime() - new Date(a.start_time).getTime()
326
+ );
327
+ }
328
+
329
+ /**
330
+ * Get the latest session file
331
+ */
332
+ function getLatestSession(
333
+ dir: string,
334
+ ): {
335
+ session_id: string;
336
+ file_path: string;
337
+ event_count: number;
338
+ start_time: string;
339
+ end_time?: string;
340
+ } | null {
341
+ const sessions = listSessionFiles(dir);
342
+ return sessions.length > 0 ? sessions[0] : null;
343
+ }
344
+
345
+ /**
346
+ * Filter events by event_type ("all" keeps every event; otherwise the type is upper-cased before matching)
347
+ */
348
+ function filterEventsByType(
349
+ events: CoordinatorEvent[],
350
+ eventType: string,
351
+ ): CoordinatorEvent[] {
352
+ if (eventType === "all") return events;
353
+ return events.filter((e) => e.event_type === eventType.toUpperCase());
354
+ }
355
+
356
+ /**
357
+ * Keep only events whose timestamp falls within the last sinceMs milliseconds
358
+ */
359
+ function filterEventsSince(
360
+ events: CoordinatorEvent[],
361
+ sinceMs: number,
362
+ ): CoordinatorEvent[] {
363
+ const cutoffTime = Date.now() - sinceMs;
364
+ return events.filter((e) =>
365
+ new Date(e.timestamp).getTime() >= cutoffTime
366
+ );
367
+ }
368
+
369
+ // ========================================================================
370
+ // Tests
371
+ // ========================================================================
372
+
373
+ describe("listSessionFiles", () => {
374
+ test("returns empty array when directory doesn't exist", () => {
375
+ const result = listSessionFiles("/nonexistent/directory");
376
+ expect(result).toEqual([]);
377
+ });
378
+
379
+ test("returns empty array when directory is empty", () => {
380
+ const result = listSessionFiles(TEST_SESSIONS_DIR);
381
+ expect(result).toEqual([]);
382
+ });
383
+
384
+ test("lists all session files with metadata", () => {
385
+ createTestSession("ses_abc123", "epic-1", 5);
386
+ createTestSession("ses_def456", "epic-2", 3);
387
+
388
+ const result = listSessionFiles(TEST_SESSIONS_DIR);
389
+
390
+ expect(result).toHaveLength(2);
391
+ expect(result[0].session_id).toMatch(/^ses_/);
392
+ expect(result[0].event_count).toBeGreaterThan(0);
393
+ expect(result[0].start_time).toBeTruthy();
394
+ });
395
+
396
+ test("calculates event count correctly", () => {
397
+ createTestSession("ses_test", "epic-1", 10);
398
+
399
+ const result = listSessionFiles(TEST_SESSIONS_DIR);
400
+
401
+ expect(result[0].event_count).toBe(10);
402
+ });
403
+
404
+ test("extracts start and end times from events", () => {
405
+ createTestSession("ses_test", "epic-1", 5);
406
+
407
+ const result = listSessionFiles(TEST_SESSIONS_DIR);
408
+
409
+ expect(result[0].start_time).toBeTruthy();
410
+ expect(new Date(result[0].start_time).getTime()).toBeLessThan(Date.now());
411
+ });
412
+
413
+ test("sorts sessions by start time (newest first)", () => {
414
+ // Create sessions with explicit different timestamps
415
+ const oldTime = Date.now() - 60000; // 1 minute ago
416
+ const newTime = Date.now();
417
+
418
+ createTestSession("ses_old", "epic-1", 2, oldTime);
419
+ createTestSession("ses_new", "epic-2", 2, newTime);
420
+
421
+ const result = listSessionFiles(TEST_SESSIONS_DIR);
422
+
423
+ expect(result[0].session_id).toBe("ses_new");
424
+ expect(result[1].session_id).toBe("ses_old");
425
+ });
426
+ });
427
+
428
+ describe("parseSessionFile", () => {
429
+ test("parses valid JSONL session file", () => {
430
+ createTestSession("ses_parse", "epic-1", 3);
431
+ const filePath = join(TEST_SESSIONS_DIR, "ses_parse.jsonl");
432
+
433
+ const events = parseSessionFile(filePath);
434
+
435
+ expect(events).toHaveLength(3);
436
+ expect(events[0].session_id).toBe("ses_parse");
437
+ expect(events[0].event_type).toBe("DECISION");
438
+ });
439
+
440
+ test("handles file with trailing newlines", () => {
441
+ const filePath = join(TEST_SESSIONS_DIR, "ses_trailing.jsonl");
442
+ writeFileSync(
443
+ filePath,
444
+ '{"session_id":"test","epic_id":"e1","timestamp":"2025-01-01T00:00:00Z","event_type":"DECISION","decision_type":"worker_spawned","payload":{}}\n\n\n',
445
+ );
446
+
447
+ const events = parseSessionFile(filePath);
448
+
449
+ expect(events).toHaveLength(1);
450
+ });
451
+
452
+ test("skips invalid JSON lines", () => {
453
+ const filePath = join(TEST_SESSIONS_DIR, "ses_invalid.jsonl");
454
+ writeFileSync(
455
+ filePath,
456
+ '{"session_id":"test","epic_id":"e1","timestamp":"2025-01-01T00:00:00Z","event_type":"DECISION","decision_type":"worker_spawned","payload":{}}\ninvalid json\n{"session_id":"test","epic_id":"e1","timestamp":"2025-01-01T00:00:00Z","event_type":"OUTCOME","outcome_type":"subtask_success","payload":{}}\n',
457
+ );
458
+
459
+ const events = parseSessionFile(filePath);
460
+
461
+ expect(events).toHaveLength(2);
462
+ });
463
+
464
+ test("throws error for non-existent file", () => {
465
+ expect(() => parseSessionFile("/nonexistent/file.jsonl")).toThrow();
466
+ });
467
+ });
468
+
469
+ describe("getLatestSession", () => {
470
+ test("returns null when directory is empty", () => {
471
+ const result = getLatestSession(TEST_SESSIONS_DIR);
472
+ expect(result).toBeNull();
473
+ });
474
+
475
+ test("returns the most recent session", () => {
476
+ const oldTime = Date.now() - 60000; // 1 minute ago
477
+ const newTime = Date.now();
478
+
479
+ createTestSession("ses_old", "epic-1", 2, oldTime);
480
+ createTestSession("ses_new", "epic-2", 3, newTime);
481
+
482
+ const result = getLatestSession(TEST_SESSIONS_DIR);
483
+
484
+ expect(result).not.toBeNull();
485
+ expect(result!.session_id).toBe("ses_new");
486
+ });
487
+ });
488
+
489
+ describe("filterEventsByType", () => {
490
+ test("filters DECISION events only", () => {
491
+ const events: CoordinatorEvent[] = [
492
+ {
493
+ session_id: "s1",
494
+ epic_id: "e1",
495
+ timestamp: "2025-01-01T00:00:00Z",
496
+ event_type: "DECISION",
497
+ decision_type: "worker_spawned",
498
+ payload: {},
499
+ },
500
+ {
501
+ session_id: "s1",
502
+ epic_id: "e1",
503
+ timestamp: "2025-01-01T00:00:01Z",
504
+ event_type: "VIOLATION",
505
+ violation_type: "direct_edit",
506
+ payload: {},
507
+ },
508
+ {
509
+ session_id: "s1",
510
+ epic_id: "e1",
511
+ timestamp: "2025-01-01T00:00:02Z",
512
+ event_type: "DECISION",
513
+ decision_type: "worker_spawned",
514
+ payload: {},
515
+ },
516
+ ];
517
+
518
+ const result = filterEventsByType(events, "DECISION");
519
+
520
+ expect(result).toHaveLength(2);
521
+ expect(result.every((e) => e.event_type === "DECISION")).toBe(true);
522
+ });
523
+
524
+ test("returns all events when type is 'all'", () => {
525
+ const events: CoordinatorEvent[] = [
526
+ {
527
+ session_id: "s1",
528
+ epic_id: "e1",
529
+ timestamp: "2025-01-01T00:00:00Z",
530
+ event_type: "DECISION",
531
+ decision_type: "worker_spawned",
532
+ payload: {},
533
+ },
534
+ {
535
+ session_id: "s1",
536
+ epic_id: "e1",
537
+ timestamp: "2025-01-01T00:00:01Z",
538
+ event_type: "VIOLATION",
539
+ violation_type: "direct_edit",
540
+ payload: {},
541
+ },
542
+ ];
543
+
544
+ const result = filterEventsByType(events, "all");
545
+
546
+ expect(result).toHaveLength(2);
547
+ });
548
+ });
549
+
550
+ describe("filterEventsSince", () => {
551
+ test("filters events within time window", () => {
552
+ const now = Date.now();
553
+ const events: CoordinatorEvent[] = [
554
+ {
555
+ session_id: "s1",
556
+ epic_id: "e1",
557
+ timestamp: new Date(now - 5000).toISOString(), // 5s ago
558
+ event_type: "DECISION",
559
+ decision_type: "worker_spawned",
560
+ payload: {},
561
+ },
562
+ {
563
+ session_id: "s1",
564
+ epic_id: "e1",
565
+ timestamp: new Date(now - 10000).toISOString(), // 10s ago
566
+ event_type: "DECISION",
567
+ decision_type: "worker_spawned",
568
+ payload: {},
569
+ },
570
+ {
571
+ session_id: "s1",
572
+ epic_id: "e1",
573
+ timestamp: new Date(now - 60000).toISOString(), // 1min ago
574
+ event_type: "DECISION",
575
+ decision_type: "worker_spawned",
576
+ payload: {},
577
+ },
578
+ ];
579
+
580
+ const result = filterEventsSince(events, 30000); // Last 30s
581
+
582
+ expect(result).toHaveLength(2); // 5s and 10s ago
583
+ });
584
+
585
+ test("returns all events when sinceMs is very large", () => {
586
+ const now = Date.now();
587
+ const events: CoordinatorEvent[] = [
588
+ {
589
+ session_id: "s1",
590
+ epic_id: "e1",
591
+ timestamp: new Date(now - 1000).toISOString(),
592
+ event_type: "DECISION",
593
+ decision_type: "worker_spawned",
594
+ payload: {},
595
+ },
596
+ ];
597
+
598
+ const result = filterEventsSince(events, 86400000); // 1 day
599
+
600
+ expect(result).toHaveLength(1);
601
+ });
602
+ });
603
+ });
604
+
200
605
  // ============================================================================
201
606
  // Cells Command Tests (TDD)
202
607
  // ============================================================================
@@ -276,841 +681,771 @@ describe("Cells command", () => {
276
681
  },
277
682
  ];
278
683
 
279
- const table = formatCellsTable(cells);
280
-
281
- // Should contain headers
282
- expect(table).toContain("ID");
283
- expect(table).toContain("TITLE");
284
- expect(table).toContain("STATUS");
285
- expect(table).toContain("PRIORITY");
286
-
287
- // Should contain cell data
288
- expect(table).toContain("test-abc123-xyz");
289
- expect(table).toContain("Fix bug");
290
- expect(table).toContain("open");
291
- expect(table).toContain("0");
292
-
293
- expect(table).toContain("test-def456-abc");
294
- expect(table).toContain("Add feature");
295
- expect(table).toContain("in_progress");
296
- expect(table).toContain("2");
297
- });
684
+ const result = formatCellsTable(cells);
298
685
 
299
- test("returns 'No cells found' for empty array", () => {
300
- const table = formatCellsTable([]);
301
- expect(table).toBe("No cells found");
686
+ expect(result).toContain("ID");
687
+ expect(result).toContain("TITLE");
688
+ expect(result).toContain("STATUS");
689
+ expect(result).toContain("PRIORITY");
690
+ expect(result).toContain("Fix bug");
691
+ expect(result).toContain("Add feature");
692
+ expect(result).toContain("open");
693
+ expect(result).toContain("in_progress");
302
694
  });
303
- });
304
- });
305
-
306
- describe("Log command helpers", () => {
307
- let testDir: string;
308
-
309
- beforeEach(() => {
310
- testDir = join(tmpdir(), `swarm-log-test-${Date.now()}`);
311
- mkdirSync(testDir, { recursive: true });
312
- });
313
-
314
- afterEach(() => {
315
- if (existsSync(testDir)) {
316
- rmSync(testDir, { recursive: true, force: true });
317
- }
318
- });
319
-
320
- describe("parseLogLine", () => {
321
- function parseLogLine(line: string): { level: number; time: string; module: string; msg: string } | null {
322
- try {
323
- const parsed = JSON.parse(line);
324
- if (typeof parsed.level === "number" && parsed.time && parsed.msg) {
325
- return {
326
- level: parsed.level,
327
- time: parsed.time,
328
- module: parsed.module || "unknown",
329
- msg: parsed.msg,
330
- };
331
- }
332
- } catch {
333
- // Invalid JSON
334
- }
335
- return null;
336
- }
337
695
 
338
- test("parses valid log line", () => {
339
- const line = '{"level":30,"time":"2024-12-24T16:00:00.000Z","module":"compaction","msg":"started"}';
340
- const result = parseLogLine(line);
341
-
342
- expect(result).not.toBeNull();
343
- expect(result?.level).toBe(30);
344
- expect(result?.module).toBe("compaction");
345
- expect(result?.msg).toBe("started");
346
- });
696
+ test("truncates long titles with ellipsis", () => {
697
+ const cells = [
698
+ {
699
+ id: "test-abc",
700
+ title: "A".repeat(100),
701
+ status: "open",
702
+ priority: 0,
703
+ type: "task",
704
+ created_at: 1234567890,
705
+ updated_at: 1234567890,
706
+ },
707
+ ];
347
708
 
348
- test("returns null for invalid JSON", () => {
349
- const line = "not json";
350
- expect(parseLogLine(line)).toBeNull();
351
- });
709
+ const result = formatCellsTable(cells);
352
710
 
353
- test("defaults module to 'unknown' if missing", () => {
354
- const line = '{"level":30,"time":"2024-12-24T16:00:00.000Z","msg":"test"}';
355
- const result = parseLogLine(line);
356
-
357
- expect(result?.module).toBe("unknown");
711
+ expect(result).toContain("...");
712
+ expect(result.split("\n")[2]).toMatch(/A{47}\.\.\./);
358
713
  });
359
- });
360
714
 
361
- describe("filterLogsByLevel", () => {
362
- function filterLogsByLevel(logs: Array<{ level: number }>, minLevel: number): Array<{ level: number }> {
363
- return logs.filter((log) => log.level >= minLevel);
364
- }
715
+ test("returns 'No cells found' for empty array", () => {
716
+ const result = formatCellsTable([]);
365
717
 
366
- test("filters logs by minimum level", () => {
367
- const logs = [
368
- { level: 10 }, // trace
369
- { level: 30 }, // info
370
- { level: 50 }, // error
371
- ];
372
-
373
- const result = filterLogsByLevel(logs, 30);
374
- expect(result).toHaveLength(2);
375
- expect(result[0].level).toBe(30);
376
- expect(result[1].level).toBe(50);
718
+ expect(result).toBe("No cells found");
377
719
  });
378
720
 
379
- test("includes all logs when minLevel is 0", () => {
380
- const logs = [
381
- { level: 10 },
382
- { level: 20 },
383
- { level: 30 },
721
+ test("aligns columns correctly", () => {
722
+ const cells = [
723
+ {
724
+ id: "short",
725
+ title: "T",
726
+ status: "open",
727
+ priority: 0,
728
+ type: "task",
729
+ created_at: 1234567890,
730
+ updated_at: 1234567890,
731
+ },
732
+ {
733
+ id: "very-long-id-here",
734
+ title: "Very long title here",
735
+ status: "in_progress",
736
+ priority: 2,
737
+ type: "task",
738
+ created_at: 1234567890,
739
+ updated_at: 1234567890,
740
+ },
384
741
  ];
385
-
386
- const result = filterLogsByLevel(logs, 0);
387
- expect(result).toHaveLength(3);
388
- });
389
- });
390
742
 
391
- describe("filterLogsByModule", () => {
392
- function filterLogsByModule(logs: Array<{ module: string }>, module: string): Array<{ module: string }> {
393
- return logs.filter((log) => log.module === module);
394
- }
743
+ const result = formatCellsTable(cells);
744
+ const lines = result.split("\n");
395
745
 
396
- test("filters logs by exact module name", () => {
397
- const logs = [
398
- { module: "compaction" },
399
- { module: "swarm" },
400
- { module: "compaction" },
401
- ];
402
-
403
- const result = filterLogsByModule(logs, "compaction");
404
- expect(result).toHaveLength(2);
405
- });
406
-
407
- test("returns empty array when no match", () => {
408
- const logs = [
409
- { module: "compaction" },
410
- ];
411
-
412
- const result = filterLogsByModule(logs, "swarm");
413
- expect(result).toHaveLength(0);
746
+ // All lines should be nearly the same length (aligned to within 2 characters)
747
+ const lengths = lines.map(l => l.length);
748
+ expect(Math.max(...lengths) - Math.min(...lengths)).toBeLessThan(3);
414
749
  });
415
750
  });
751
+ });
416
752
 
417
- describe("filterLogsBySince", () => {
418
- function parseDuration(duration: string): number | null {
419
- const match = duration.match(/^(\d+)([smhd])$/);
420
- if (!match) return null;
421
-
422
- const [, num, unit] = match;
423
- const value = parseInt(num, 10);
424
-
425
- const multipliers: Record<string, number> = {
426
- s: 1000,
427
- m: 60 * 1000,
428
- h: 60 * 60 * 1000,
429
- d: 24 * 60 * 60 * 1000,
430
- };
431
-
432
- return value * multipliers[unit];
433
- }
434
-
435
- function filterLogsBySince(logs: Array<{ time: string }>, sinceMs: number): Array<{ time: string }> {
436
- const cutoffTime = Date.now() - sinceMs;
437
- return logs.filter((log) => new Date(log.time).getTime() >= cutoffTime);
438
- }
439
-
440
- test("parseDuration handles seconds", () => {
441
- expect(parseDuration("30s")).toBe(30 * 1000);
442
- });
753
+ // ============================================================================
754
+ // Eval Gate Tests (TDD)
755
+ // ============================================================================
443
756
 
444
- test("parseDuration handles minutes", () => {
445
- expect(parseDuration("5m")).toBe(5 * 60 * 1000);
446
- });
757
+ interface EvalRunRecord {
758
+ timestamp: string;
759
+ eval_name: string;
760
+ score: number;
761
+ run_count: number;
762
+ }
447
763
 
448
- test("parseDuration handles hours", () => {
449
- expect(parseDuration("2h")).toBe(2 * 60 * 60 * 1000);
450
- });
764
+ interface GateResult {
765
+ passed: boolean;
766
+ phase: "bootstrap" | "stabilization" | "production";
767
+ message: string;
768
+ baseline?: number;
769
+ variance?: number;
770
+ }
451
771
 
452
- test("parseDuration handles days", () => {
453
- expect(parseDuration("1d")).toBe(24 * 60 * 60 * 1000);
454
- });
772
+ /**
773
+ * Calculate variance for phase transitions
774
+ */
775
+ function calculateVariance(scores: number[]): number {
776
+ if (scores.length <= 1) return 0;
455
777
 
456
- test("parseDuration returns null for invalid format", () => {
457
- expect(parseDuration("invalid")).toBeNull();
458
- expect(parseDuration("30x")).toBeNull();
459
- expect(parseDuration("30")).toBeNull();
460
- });
778
+ const mean = scores.reduce((sum, x) => sum + x, 0) / scores.length;
779
+ const squaredDiffs = scores.map((x) => Math.pow(x - mean, 2));
780
+ const variance = squaredDiffs.reduce((sum, x) => sum + x, 0) / scores.length;
461
781
 
462
- test("filterLogsBySince filters old logs", () => {
463
- const now = Date.now();
464
- const logs = [
465
- { time: new Date(now - 10000).toISOString() }, // 10s ago
466
- { time: new Date(now - 120000).toISOString() }, // 2m ago
467
- { time: new Date(now - 1000).toISOString() }, // 1s ago
468
- ];
469
-
470
- const result = filterLogsBySince(logs, 60000); // Last 1m
471
- expect(result).toHaveLength(2); // Only logs within last minute
472
- });
473
- });
782
+ return variance;
783
+ }
474
784
 
475
- describe("formatLogLine", () => {
476
- function levelToName(level: number): string {
477
- if (level >= 60) return "FATAL";
478
- if (level >= 50) return "ERROR";
479
- if (level >= 40) return "WARN ";
480
- if (level >= 30) return "INFO ";
481
- if (level >= 20) return "DEBUG";
482
- return "TRACE";
483
- }
785
+ /**
786
+ * Read all eval run records from .hive/eval-history.jsonl
787
+ */
788
+ function readAllRecords(projectPath: string): EvalRunRecord[] {
789
+ const recordsPath = join(projectPath, ".hive", "eval-history.jsonl");
484
790
 
485
- function formatLogLine(log: { level: number; time: string; module: string; msg: string }): string {
486
- const timestamp = new Date(log.time).toLocaleTimeString();
487
- const levelName = levelToName(log.level);
488
- const module = log.module.padEnd(12);
489
- return `${timestamp} ${levelName} ${module} ${log.msg}`;
490
- }
791
+ if (!existsSync(recordsPath)) {
792
+ return [];
793
+ }
491
794
 
492
- test("formats log line with timestamp and level", () => {
493
- const log = {
494
- level: 30,
495
- time: "2024-12-24T16:00:00.000Z",
496
- module: "compaction",
497
- msg: "started",
498
- };
499
-
500
- const result = formatLogLine(log);
501
- expect(result).toContain("INFO");
502
- expect(result).toContain("compaction");
503
- expect(result).toContain("started");
504
- });
795
+ const content = readFileSync(recordsPath, "utf-8");
796
+ const lines = content.split("\n").filter((line) => line.trim());
505
797
 
506
- test("pads module name for alignment", () => {
507
- const log1 = formatLogLine({ level: 30, time: "2024-12-24T16:00:00.000Z", module: "a", msg: "test" });
508
- const log2 = formatLogLine({ level: 30, time: "2024-12-24T16:00:00.000Z", module: "compaction", msg: "test" });
509
-
510
- // Module names should be padded to 12 chars
511
- expect(log1).toContain("a test"); // 'a' + 11 spaces
512
- expect(log2).toContain("compaction test"); // 'compaction' + 3 spaces (10 chars + 2)
513
- });
798
+ return lines.map((line) => JSON.parse(line) as EvalRunRecord);
799
+ }
514
800
 
515
- test("levelToName maps all levels correctly", () => {
516
- expect(levelToName(10)).toBe("TRACE");
517
- expect(levelToName(20)).toBe("DEBUG");
518
- expect(levelToName(30)).toBe("INFO ");
519
- expect(levelToName(40)).toBe("WARN ");
520
- expect(levelToName(50)).toBe("ERROR");
521
- expect(levelToName(60)).toBe("FATAL");
522
- });
523
- });
801
+ /**
802
+ * Record an eval run to .hive/eval-history.jsonl
803
+ */
804
+ function recordEvalRun(
805
+ projectPath: string,
806
+ record: EvalRunRecord,
807
+ ): void {
808
+ const hivePath = join(projectPath, ".hive");
809
+ const recordsPath = join(hivePath, "eval-history.jsonl");
810
+
811
+ // Ensure .hive directory exists
812
+ if (!existsSync(hivePath)) {
813
+ mkdirSync(hivePath, { recursive: true });
814
+ }
524
815
 
525
- describe("readLogFiles", () => {
526
- test("reads multiple .1log files", () => {
527
- // Create test log files
528
- const log1 = join(testDir, "swarm.1log");
529
- const log2 = join(testDir, "swarm.2log");
530
- const log3 = join(testDir, "compaction.1log");
531
-
532
- writeFileSync(log1, '{"level":30,"time":"2024-12-24T16:00:00.000Z","msg":"line1"}\n');
533
- writeFileSync(log2, '{"level":30,"time":"2024-12-24T16:00:01.000Z","msg":"line2"}\n');
534
- writeFileSync(log3, '{"level":30,"time":"2024-12-24T16:00:02.000Z","module":"compaction","msg":"line3"}\n');
535
-
536
- function readLogFiles(dir: string): string[] {
537
- if (!existsSync(dir)) return [];
538
-
539
- const files = readdirSync(dir)
540
- .filter((f) => /\.\d+log$/.test(f))
541
- .sort() // Sort by filename
542
- .map((f) => join(dir, f));
543
-
544
- const lines: string[] = [];
545
- for (const file of files) {
546
- const content = readFileSync(file, "utf-8");
547
- lines.push(...content.split("\n").filter((line) => line.trim()));
548
- }
549
-
550
- return lines;
551
- }
552
-
553
- const lines = readLogFiles(testDir);
554
- expect(lines).toHaveLength(3);
555
- // Files are sorted alphabetically: compaction.1log, swarm.1log, swarm.2log
556
- expect(lines.some((l) => l.includes("line1"))).toBe(true);
557
- expect(lines.some((l) => l.includes("line2"))).toBe(true);
558
- expect(lines.some((l) => l.includes("line3"))).toBe(true);
559
- });
816
+ // Append record as JSONL
817
+ const line = JSON.stringify(record) + "\n";
560
818
 
561
- test("returns empty array for non-existent directory", () => {
562
- function readLogFiles(dir: string): string[] {
563
- if (!existsSync(dir)) return [];
564
- return [];
565
- }
566
-
567
- const lines = readLogFiles(join(testDir, "nonexistent"));
568
- expect(lines).toHaveLength(0);
569
- });
570
- });
819
+ if (existsSync(recordsPath)) {
820
+ const existingContent = readFileSync(recordsPath, "utf-8");
821
+ writeFileSync(recordsPath, existingContent + line);
822
+ } else {
823
+ writeFileSync(recordsPath, line);
824
+ }
825
+ }
571
826
 
572
- describe("watchLogs", () => {
573
- test("detects new log lines appended to file", async () => {
574
- const logFile = join(testDir, "swarm.1log");
575
- const collectedLines: string[] = [];
576
-
577
- // Create initial log file
578
- writeFileSync(logFile, '{"level":30,"time":"2024-12-24T16:00:00.000Z","msg":"initial"}\n');
579
-
580
- // Import watch utilities
581
- const { watch } = await import("fs");
582
- const { appendFileSync } = await import("fs");
583
-
584
- // Track file position for incremental reads
585
- let lastSize = 0;
586
-
587
- function readNewLines(filePath: string): string[] {
588
- const content = readFileSync(filePath, "utf-8");
589
- const newContent = content.slice(lastSize);
590
- lastSize = content.length;
591
- return newContent.split("\n").filter((line) => line.trim());
592
- }
593
-
594
- // Simulate watch behavior
595
- const watcher = watch(testDir, (eventType, filename) => {
596
- if (filename && /\.\d+log$/.test(filename)) {
597
- const newLines = readNewLines(join(testDir, filename));
598
- collectedLines.push(...newLines);
599
- }
600
- });
601
-
602
- // Wait for watcher to be ready
603
- await new Promise((resolve) => setTimeout(resolve, 100));
604
-
605
- // Append new log line
606
- appendFileSync(logFile, '{"level":30,"time":"2024-12-24T16:00:01.000Z","msg":"appended"}\n');
607
-
608
- // Wait for event to fire
609
- await new Promise((resolve) => setTimeout(resolve, 200));
610
-
611
- watcher.close();
612
-
613
- // Should have detected the new line
614
- expect(collectedLines.some((l) => l.includes("appended"))).toBe(true);
615
- });
827
+ /**
828
+ * Check eval gate for progressive gating
829
+ */
830
+ function checkGate(
831
+ projectPath: string,
832
+ evalName: string,
833
+ currentScore: number,
834
+ ): GateResult {
835
+ const records = readAllRecords(projectPath).filter(
836
+ (r) => r.eval_name === evalName,
837
+ );
616
838
 
617
- test("parseWatchArgs extracts --watch flag", () => {
618
- function parseWatchArgs(args: string[]): { watch: boolean; interval: number } {
619
- let watch = false;
620
- let interval = 1000; // default 1 second
621
-
622
- for (let i = 0; i < args.length; i++) {
623
- const arg = args[i];
624
- if (arg === "--watch" || arg === "-w") {
625
- watch = true;
626
- } else if (arg === "--interval" && i + 1 < args.length) {
627
- interval = parseInt(args[++i], 10);
628
- }
629
- }
630
-
631
- return { watch, interval };
632
- }
633
-
634
- expect(parseWatchArgs(["--watch"])).toEqual({ watch: true, interval: 1000 });
635
- expect(parseWatchArgs(["-w"])).toEqual({ watch: true, interval: 1000 });
636
- expect(parseWatchArgs(["--watch", "--interval", "500"])).toEqual({ watch: true, interval: 500 });
637
- expect(parseWatchArgs(["compaction", "--watch"])).toEqual({ watch: true, interval: 1000 });
638
- expect(parseWatchArgs(["--level", "error"])).toEqual({ watch: false, interval: 1000 });
639
- });
640
- });
641
- });
839
+ if (records.length < 10) {
840
+ return {
841
+ passed: true,
842
+ phase: "bootstrap",
843
+ message: `BOOTSTRAP (${records.length}/10 runs): no gates yet`,
844
+ };
845
+ }
642
846
 
643
- // ============================================================================
644
- // Eval Commands Tests (TDD)
645
- // ============================================================================
847
+ const lastTenScores = records.slice(-10).map((r) => r.score);
848
+ const baseline = lastTenScores.reduce((sum, x) => sum + x, 0) / lastTenScores.length;
849
+ const variance = calculateVariance(lastTenScores);
646
850
 
647
- describe("Eval commands", () => {
648
- describe("formatEvalStatus", () => {
649
- test("displays phase, thresholds, and recent scores", () => {
650
- const status = {
651
- phase: "stabilization" as const,
652
- runCount: 25,
653
- thresholds: {
654
- stabilization: 0.1,
655
- production: 0.05,
656
- },
657
- recentScores: [
658
- { timestamp: "2024-12-24T10:00:00.000Z", score: 0.85 },
659
- { timestamp: "2024-12-24T11:00:00.000Z", score: 0.87 },
660
- { timestamp: "2024-12-24T12:00:00.000Z", score: 0.82 },
661
- ],
851
+ if (records.length < 50) {
852
+ const drop = ((baseline - currentScore) / baseline) * 100;
853
+ if (drop > 5) {
854
+ return {
855
+ passed: false,
856
+ phase: "stabilization",
857
+ message: `WARN: Score dropped ${drop.toFixed(1)}% from baseline ${baseline.toFixed(2)}`,
858
+ baseline,
859
+ variance,
662
860
  };
861
+ }
663
862
 
664
- const output = formatEvalStatus(status);
665
-
666
- // Should show phase
667
- expect(output).toContain("stabilization");
668
-
669
- // Should show run count
670
- expect(output).toContain("25");
671
-
672
- // Should show thresholds
673
- expect(output).toContain("10%"); // stabilization threshold
674
- expect(output).toContain("5%"); // production threshold
675
-
676
- // Should show recent scores
677
- expect(output).toContain("0.85");
678
- expect(output).toContain("0.87");
679
- expect(output).toContain("0.82");
680
- });
863
+ return {
864
+ passed: true,
865
+ phase: "stabilization",
866
+ message: `Stabilization (${records.length}/50 runs): baseline=${baseline.toFixed(2)}`,
867
+ baseline,
868
+ variance,
869
+ };
870
+ }
681
871
 
682
- test("shows bootstrap phase message", () => {
683
- const status = {
684
- phase: "bootstrap" as const,
685
- runCount: 5,
686
- thresholds: {
687
- stabilization: 0.1,
688
- production: 0.05,
689
- },
690
- recentScores: [],
872
+ // Production phase: variance < 0.1 AND score doesn't drop >5%
873
+ if (variance < 0.1) {
874
+ const drop = ((baseline - currentScore) / baseline) * 100;
875
+ if (drop > 5) {
876
+ return {
877
+ passed: false,
878
+ phase: "production",
879
+ message: `FAIL: Score dropped ${drop.toFixed(1)}% from baseline ${baseline.toFixed(2)} (variance=${variance.toFixed(3)})`,
880
+ baseline,
881
+ variance,
691
882
  };
883
+ }
692
884
 
693
- const output = formatEvalStatus(status);
885
+ return {
886
+ passed: true,
887
+ phase: "production",
888
+ message: `PASS: Production phase (variance=${variance.toFixed(3)}, baseline=${baseline.toFixed(2)})`,
889
+ baseline,
890
+ variance,
891
+ };
892
+ }
694
893
 
695
- expect(output).toContain("bootstrap");
696
- expect(output).toContain("collecting data");
697
- });
894
+ // Stuck in stabilization (>50 runs but variance still high)
895
+ return {
896
+ passed: true,
897
+ phase: "stabilization",
898
+ message: `Stabilization: variance too high (${variance.toFixed(3)} > 0.1), need more consistent runs`,
899
+ baseline,
900
+ variance,
901
+ };
902
+ }
698
903
 
699
- test("shows production phase message", () => {
700
- const status = {
701
- phase: "production" as const,
702
- runCount: 75,
703
- thresholds: {
704
- stabilization: 0.1,
705
- production: 0.05,
706
- },
707
- recentScores: [],
708
- };
904
+ /**
905
+ * Ensure .hive directory exists
906
+ */
907
+ function ensureHiveDirectory(projectPath: string): void {
908
+ const hivePath = join(projectPath, ".hive");
909
+ if (!existsSync(hivePath)) {
910
+ mkdirSync(hivePath, { recursive: true });
911
+ }
912
+ }
913
+
914
+ describe("Eval gate", () => {
915
+ let testDir: string;
709
916
 
710
- const output = formatEvalStatus(status);
917
+ beforeEach(() => {
918
+ testDir = join(tmpdir(), `eval-gate-test-${Date.now()}`);
919
+ mkdirSync(testDir, { recursive: true });
920
+ });
711
921
 
712
- expect(output).toContain("production");
713
- });
922
+ afterEach(() => {
923
+ if (existsSync(testDir)) {
924
+ rmSync(testDir, { recursive: true, force: true });
925
+ }
714
926
  });
715
927
 
716
- describe("formatEvalHistory", () => {
717
- test("shows eval entries with timestamps and scores", () => {
718
- const history = [
719
- {
720
- timestamp: "2024-12-24T10:00:00.000Z",
721
- eval_name: "swarm-decomposition",
722
- score: 0.85,
723
- run_count: 1,
724
- },
725
- {
726
- timestamp: "2024-12-24T11:00:00.000Z",
727
- eval_name: "swarm-decomposition",
728
- score: 0.87,
729
- run_count: 2,
730
- },
731
- {
732
- timestamp: "2024-12-24T12:00:00.000Z",
733
- eval_name: "coordinator-behavior",
734
- score: 0.92,
735
- run_count: 1,
736
- },
737
- ];
928
+ describe("Bootstrap phase (<10 runs)", () => {
929
+ test("allows any score", () => {
930
+ ensureHiveDirectory(testDir);
931
+
932
+ // Record 5 runs
933
+ for (let i = 0; i < 5; i++) {
934
+ recordEvalRun(testDir, {
935
+ timestamp: new Date().toISOString(),
936
+ eval_name: "test-eval",
937
+ score: 0.5 + i * 0.1,
938
+ run_count: i + 1,
939
+ });
940
+ }
738
941
 
739
- const output = formatEvalHistory(history);
942
+ const result = checkGate(testDir, "test-eval", 0.3); // Low score
740
943
 
741
- // Should show all eval names
742
- expect(output).toContain("swarm-decomposition");
743
- expect(output).toContain("coordinator-behavior");
744
-
745
- // Should show scores
746
- expect(output).toContain("0.85");
747
- expect(output).toContain("0.87");
748
- expect(output).toContain("0.92");
749
-
750
- // Should show run counts
751
- expect(output).toContain("run #1");
752
- expect(output).toContain("run #2");
944
+ expect(result.passed).toBe(true);
945
+ expect(result.phase).toBe("bootstrap");
946
+ expect(result.message).toContain("BOOTSTRAP");
753
947
  });
754
948
 
755
- test("returns empty message for no history", () => {
756
- const output = formatEvalHistory([]);
757
- expect(output).toContain("No eval history");
758
- });
949
+ test("counts runs correctly", () => {
950
+ ensureHiveDirectory(testDir);
759
951
 
760
- test("formats timestamps as readable dates", () => {
761
- const history = [
762
- {
763
- timestamp: "2024-12-24T10:00:00.000Z",
764
- eval_name: "test",
765
- score: 0.85,
766
- run_count: 1,
767
- },
768
- ];
952
+ for (let i = 0; i < 7; i++) {
953
+ recordEvalRun(testDir, {
954
+ timestamp: new Date().toISOString(),
955
+ eval_name: "test-eval",
956
+ score: 0.8,
957
+ run_count: i + 1,
958
+ });
959
+ }
769
960
 
770
- const output = formatEvalHistory(history);
961
+ const result = checkGate(testDir, "test-eval", 0.8);
771
962
 
772
- // Should contain a formatted date (not raw ISO)
773
- expect(output).not.toContain("2024-12-24T10:00:00.000Z");
774
- expect(output).toMatch(/\d{1,2}:\d{2}/); // Time format
963
+ expect(result.phase).toBe("bootstrap");
964
+ expect(result.message).toContain("7/10");
775
965
  });
776
966
  });
777
967
 
778
- describe("generateSparkline", () => {
779
- test("generates sparkline from scores", () => {
780
- const scores = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0];
781
- const sparkline = generateSparkline(scores);
782
-
783
- // Should use sparkline characters
784
- expect(sparkline).toMatch(/[▁▂▃▄▅▆▇█]/);
785
-
786
- // Length should match input
787
- expect(sparkline.length).toBe(scores.length);
788
-
789
- // Should show ascending trend
790
- expect(sparkline).toContain("▁"); // Low score
791
- expect(sparkline).toContain("█"); // High score
792
- });
968
+ describe("Stabilization phase (10-50 runs)", () => {
969
+ test("warns on >5% regression", () => {
970
+ ensureHiveDirectory(testDir);
971
+
972
+ // Record 20 runs with consistent 0.9 score
973
+ for (let i = 0; i < 20; i++) {
974
+ recordEvalRun(testDir, {
975
+ timestamp: new Date().toISOString(),
976
+ eval_name: "test-eval",
977
+ score: 0.9,
978
+ run_count: i + 1,
979
+ });
980
+ }
793
981
 
794
- test("handles single score", () => {
795
- const sparkline = generateSparkline([0.5]);
796
- expect(sparkline.length).toBe(1);
797
- expect(sparkline).toMatch(/[▁▂▃▄▅▆▇█]/);
798
- });
982
+ // Test with regressed score (>5% drop from 0.9 baseline)
983
+ const regressedScore = 0.85; // 5.5% drop
984
+ const result = checkGate(testDir, "test-eval", regressedScore);
799
985
 
800
- test("handles all same scores", () => {
801
- const sparkline = generateSparkline([0.5, 0.5, 0.5]);
802
- expect(sparkline.length).toBe(3);
803
- // All should be same character
804
- expect(new Set(sparkline.split("")).size).toBe(1);
986
+ expect(result.passed).toBe(false);
987
+ expect(result.phase).toBe("stabilization");
988
+ expect(result.message).toContain("WARN");
989
+ expect(result.baseline).toBeCloseTo(0.9, 2);
805
990
  });
806
991
 
807
- test("returns empty for empty array", () => {
808
- const sparkline = generateSparkline([]);
809
- expect(sparkline).toBe("");
810
- });
811
- });
992
+ test("passes when score is stable", () => {
993
+ ensureHiveDirectory(testDir);
812
994
 
813
- describe("formatEvalRunResult", () => {
814
- test("shows pass/fail with gate result", () => {
815
- const result = {
816
- passed: true,
817
- phase: "production" as const,
818
- message: "Production phase: 2.5% regression - acceptable",
819
- baseline: 0.85,
820
- currentScore: 0.83,
821
- regressionPercent: 0.025,
822
- };
995
+ for (let i = 0; i < 25; i++) {
996
+ recordEvalRun(testDir, {
997
+ timestamp: new Date().toISOString(),
998
+ eval_name: "test-eval",
999
+ score: 0.85,
1000
+ run_count: i + 1,
1001
+ });
1002
+ }
823
1003
 
824
- const output = formatEvalRunResult(result);
1004
+ const result = checkGate(testDir, "test-eval", 0.86);
825
1005
 
826
- expect(output).toContain("PASS");
827
- expect(output).toContain("production");
828
- expect(output).toContain("0.83"); // current score
829
- expect(output).toContain("2.5%"); // regression
1006
+ expect(result.passed).toBe(true);
1007
+ expect(result.phase).toBe("stabilization");
1008
+ expect(result.baseline).toBeCloseTo(0.85, 2);
830
1009
  });
1010
+ });
831
1011
 
832
- test("shows failure with details", () => {
833
- const result = {
834
- passed: false,
835
- phase: "production" as const,
836
- message: "Production phase FAIL: 8.0% regression - exceeds 5% threshold",
837
- baseline: 0.85,
838
- currentScore: 0.78,
839
- regressionPercent: 0.08,
840
- };
1012
+ describe("Production phase (>50 runs, low variance)", () => {
1013
+ test("enters production when variance < 0.1", () => {
1014
+ ensureHiveDirectory(testDir);
1015
+
1016
+ // Simulate 60 runs with consistent scores (low variance)
1017
+ for (let i = 0; i < 60; i++) {
1018
+ recordEvalRun(testDir, {
1019
+ timestamp: new Date().toISOString(),
1020
+ eval_name: "test-eval",
1021
+ score: 0.9, // All same score = zero variance
1022
+ run_count: i + 1,
1023
+ });
1024
+ }
841
1025
 
842
- const output = formatEvalRunResult(result);
1026
+ const result = checkGate(testDir, "test-eval", 0.91);
843
1027
 
844
- expect(output).toContain("FAIL");
845
- expect(output).toContain("8.0%");
846
- expect(output).toContain("exceeds");
1028
+ expect(result.phase).toBe("production");
1029
+ expect(result.variance).toBeLessThan(0.1);
847
1030
  });
848
1031
 
849
- test("shows bootstrap phase without baseline", () => {
850
- const result = {
851
- passed: true,
852
- phase: "bootstrap" as const,
853
- message: "Bootstrap phase (5/10 runs) - collecting data",
854
- currentScore: 0.85,
855
- };
1032
+ test("fails on regression in production", () => {
1033
+ ensureHiveDirectory(testDir);
1034
+
1035
+ // Simulate 60 runs with consistent high scores to reach production phase
1036
+ for (let i = 0; i < 60; i++) {
1037
+ recordEvalRun(testDir, {
1038
+ timestamp: new Date().toISOString(),
1039
+ eval_name: "test-eval",
1040
+ score: 0.9,
1041
+ run_count: i + 1,
1042
+ });
1043
+ }
856
1044
 
857
- const output = formatEvalRunResult(result);
1045
+ // Now test with a regressed score (>5% drop from 0.9 baseline)
1046
+ const regressedScore = 0.8; // 11% drop
1047
+ const result = checkGate(testDir, "test-eval", regressedScore);
858
1048
 
859
- expect(output).toContain("bootstrap");
860
- expect(output).toContain("collecting data");
861
- expect(output).not.toContain("baseline");
1049
+ expect(result.passed).toBe(false);
1050
+ expect(result.phase).toBe("production");
1051
+ expect(result.message).toContain("FAIL");
862
1052
  });
863
1053
  });
864
1054
  });
865
1055
 
866
1056
  // ============================================================================
867
- // Eval Command Helpers (Implementation)
1057
+ // History Command Tests (TDD)
868
1058
  // ============================================================================
869
1059
 
1060
+ interface SwarmHistoryRecord {
1061
+ epic_id: string;
1062
+ epic_title: string;
1063
+ strategy: string;
1064
+ timestamp: string;
1065
+ overall_success: boolean;
1066
+ task_count: number;
1067
+ completed_count: number;
1068
+ }
1069
+
870
1070
  /**
871
- * Generate sparkline from array of scores (0-1 range)
1071
+ * Format relative time (e.g., "2h ago", "1d ago")
872
1072
  */
873
- function generateSparkline(scores: number[]): string {
874
- if (scores.length === 0) return "";
1073
+ function formatRelativeTime(timestamp: string): string {
1074
+ const now = Date.now();
1075
+ const then = new Date(timestamp).getTime();
1076
+ const diffMs = now - then;
1077
+
1078
+ const minutes = Math.floor(diffMs / 60000);
1079
+ const hours = Math.floor(diffMs / 3600000);
1080
+ const days = Math.floor(diffMs / 86400000);
1081
+
1082
+ if (minutes < 60) return `${minutes}m ago`;
1083
+ if (hours < 24) return `${hours}h ago`;
1084
+ return `${days}d ago`;
1085
+ }
1086
+
1087
+ /**
1088
+ * Format swarm history as beautiful CLI table
1089
+ */
1090
+ function formatSwarmHistory(records: SwarmHistoryRecord[]): string {
1091
+ if (records.length === 0) {
1092
+ return "No swarm history found";
1093
+ }
875
1094
 
876
- const chars = ["▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"];
877
- const min = Math.min(...scores);
878
- const max = Math.max(...scores);
879
- const range = max - min;
1095
+ const rows = records.map(r => ({
1096
+ time: formatRelativeTime(r.timestamp),
1097
+ status: r.overall_success ? "✅" : "❌",
1098
+ title: r.epic_title.length > 30 ? r.epic_title.slice(0, 27) + "..." : r.epic_title,
1099
+ strategy: r.strategy,
1100
+ tasks: `${r.completed_count}/${r.task_count} tasks`,
1101
+ }));
880
1102
 
881
- if (range === 0) {
882
- // All scores the same
883
- return chars[4].repeat(scores.length);
1103
+ // Box drawing characters
1104
+ const lines: string[] = [];
1105
+ lines.push("┌─────────────────────────────────────────────────────────────┐");
1106
+ lines.push("│ SWARM HISTORY │");
1107
+ lines.push("├─────────────────────────────────────────────────────────────┤");
1108
+
1109
+ for (const row of rows) {
1110
+ const statusCol = `${row.time.padEnd(8)} ${row.status}`;
1111
+ const titleCol = row.title.padEnd(32);
1112
+ const strategyCol = row.strategy.padEnd(13);
1113
+ const tasksCol = row.tasks;
1114
+
1115
+ const line = `│ ${statusCol} ${titleCol} ${strategyCol} ${tasksCol.padEnd(3)} │`;
1116
+ lines.push(line);
884
1117
  }
885
1118
 
886
- return scores
887
- .map((score) => {
888
- const normalized = (score - min) / range;
889
- const index = Math.min(Math.floor(normalized * chars.length), chars.length - 1);
890
- return chars[index];
891
- })
892
- .join("");
1119
+ lines.push("└─────────────────────────────────────────────────────────────┘");
1120
+
1121
+ return lines.join("\n");
893
1122
  }
894
1123
 
895
1124
  /**
896
- * Format eval status for display
1125
+ * Filter history by status
897
1126
  */
898
- function formatEvalStatus(status: {
899
- phase: "bootstrap" | "stabilization" | "production";
900
- runCount: number;
901
- thresholds: { stabilization: number; production: number };
902
- recentScores: Array<{ timestamp: string; score: number }>;
903
- }): string {
904
- const lines: string[] = [];
905
-
906
- // Phase banner
907
- const phaseEmoji = status.phase === "bootstrap" ? "🌱" : status.phase === "stabilization" ? "⚙️" : "🚀";
908
- lines.push(`${phaseEmoji} Phase: ${status.phase}`);
909
- lines.push(`Runs: ${status.runCount}`);
910
- lines.push("");
911
-
912
- // Thresholds
913
- lines.push("Thresholds:");
914
- lines.push(` Stabilization: ${(status.thresholds.stabilization * 100).toFixed(0)}% regression warning`);
915
- lines.push(` Production: ${(status.thresholds.production * 100).toFixed(0)}% regression failure`);
916
- lines.push("");
917
-
918
- // Recent scores with sparkline
919
- if (status.recentScores.length > 0) {
920
- lines.push("Recent scores:");
921
- const sparkline = generateSparkline(status.recentScores.map((s) => s.score));
922
- lines.push(` ${sparkline}`);
923
- for (const { timestamp, score } of status.recentScores) {
924
- const time = new Date(timestamp).toLocaleString();
925
- lines.push(` ${time}: ${score.toFixed(2)}`);
926
- }
927
- } else {
928
- lines.push("No scores yet - collecting data");
1127
+ function filterHistoryByStatus(
1128
+ records: SwarmHistoryRecord[],
1129
+ status?: "success" | "failed" | "in_progress",
1130
+ ): SwarmHistoryRecord[] {
1131
+ if (!status) return records;
1132
+
1133
+ switch (status) {
1134
+ case "success":
1135
+ return records.filter(r => r.overall_success);
1136
+ case "failed":
1137
+ return records.filter(r => !r.overall_success && r.completed_count === r.task_count);
1138
+ case "in_progress":
1139
+ return records.filter(r => r.completed_count < r.task_count);
1140
+ default:
1141
+ return records;
929
1142
  }
1143
+ }
930
1144
 
931
- return lines.join("\n");
1145
+ /**
1146
+ * Filter history by strategy
1147
+ */
1148
+ function filterHistoryByStrategy(
1149
+ records: SwarmHistoryRecord[],
1150
+ strategy?: "file-based" | "feature-based" | "risk-based",
1151
+ ): SwarmHistoryRecord[] {
1152
+ if (!strategy) return records;
1153
+ return records.filter(r => r.strategy === strategy);
932
1154
  }
933
1155
 
934
1156
  /**
935
- * Format eval history for display
1157
+ * Parse history CLI arguments
936
1158
  */
937
- function formatEvalHistory(history: Array<{
938
- timestamp: string;
939
- eval_name: string;
940
- score: number;
941
- run_count: number;
942
- }>): string {
943
- if (history.length === 0) {
944
- return "No eval history found";
945
- }
1159
+ function parseHistoryArgs(args: string[]): {
1160
+ limit: number;
1161
+ status?: "success" | "failed" | "in_progress";
1162
+ strategy?: "file-based" | "feature-based" | "risk-based";
1163
+ verbose: boolean;
1164
+ } {
1165
+ const result: {
1166
+ limit: number;
1167
+ status?: "success" | "failed" | "in_progress";
1168
+ strategy?: "file-based" | "feature-based" | "risk-based";
1169
+ verbose: boolean;
1170
+ } = {
1171
+ limit: 10,
1172
+ verbose: false,
1173
+ };
946
1174
 
947
- const lines: string[] = [];
948
- lines.push("Eval History:");
949
- lines.push("");
950
-
951
- // Group by eval name
952
- const grouped = new Map<string, typeof history>();
953
- for (const entry of history) {
954
- if (!grouped.has(entry.eval_name)) {
955
- grouped.set(entry.eval_name, []);
956
- }
957
- grouped.get(entry.eval_name)!.push(entry);
958
- }
1175
+ for (let i = 0; i < args.length; i++) {
1176
+ const arg = args[i];
959
1177
 
960
- // Display each eval group
961
- for (const [evalName, entries] of grouped) {
962
- lines.push(`${evalName}:`);
963
- const sparkline = generateSparkline(entries.map((e) => e.score));
964
- lines.push(` Trend: ${sparkline}`);
965
-
966
- // Show latest 5 entries
967
- const latest = entries.slice(-5);
968
- for (const entry of latest) {
969
- const time = new Date(entry.timestamp).toLocaleTimeString();
970
- lines.push(` ${time} - run #${entry.run_count}: ${entry.score.toFixed(2)}`);
971
- }
972
-
973
- if (entries.length > 5) {
974
- lines.push(` ... and ${entries.length - 5} more`);
1178
+ if (arg === "--limit" || arg === "-n") {
1179
+ const limitStr = args[i + 1];
1180
+ if (limitStr && !isNaN(Number(limitStr))) {
1181
+ result.limit = Number(limitStr);
1182
+ i++;
1183
+ }
1184
+ } else if (arg === "--status") {
1185
+ const statusStr = args[i + 1];
1186
+ if (statusStr && ["success", "failed", "in_progress"].includes(statusStr)) {
1187
+ result.status = statusStr as "success" | "failed" | "in_progress";
1188
+ i++;
1189
+ }
1190
+ } else if (arg === "--strategy") {
1191
+ const strategyStr = args[i + 1];
1192
+ if (strategyStr && ["file-based", "feature-based", "risk-based"].includes(strategyStr)) {
1193
+ result.strategy = strategyStr as "file-based" | "feature-based" | "risk-based";
1194
+ i++;
1195
+ }
1196
+ } else if (arg === "--verbose" || arg === "-v") {
1197
+ result.verbose = true;
975
1198
  }
976
-
977
- lines.push("");
978
1199
  }
979
1200
 
980
- return lines.join("\n");
1201
+ return result;
981
1202
  }
982
1203
 
983
- /**
984
- * Format eval run result (gate check)
985
- */
986
- function formatEvalRunResult(result: {
987
- passed: boolean;
988
- phase: "bootstrap" | "stabilization" | "production";
989
- message: string;
990
- baseline?: number;
991
- currentScore: number;
992
- regressionPercent?: number;
993
- }): string {
994
- const lines: string[] = [];
995
-
996
- // Pass/fail banner
997
- const status = result.passed ? "✅ PASS" : "❌ FAIL";
998
- lines.push(status);
999
- lines.push("");
1204
+ describe("swarm history", () => {
1205
+ describe("formatRelativeTime", () => {
1206
+ test("formats minutes ago", () => {
1207
+ const fiveMinutesAgo = new Date(Date.now() - 5 * 60000).toISOString();
1208
+ const result = formatRelativeTime(fiveMinutesAgo);
1209
+ expect(result).toMatch(/5m ago/);
1210
+ });
1000
1211
 
1001
- // Phase and score
1002
- lines.push(`Phase: ${result.phase}`);
1003
- lines.push(`Score: ${result.currentScore.toFixed(2)}`);
1212
+ test("formats hours ago", () => {
1213
+ const threeHoursAgo = new Date(Date.now() - 3 * 3600000).toISOString();
1214
+ const result = formatRelativeTime(threeHoursAgo);
1215
+ expect(result).toMatch(/3h ago/);
1216
+ });
1004
1217
 
1005
- if (result.baseline !== undefined) {
1006
- lines.push(`Baseline: ${result.baseline.toFixed(2)}`);
1007
- }
1218
+ test("formats days ago", () => {
1219
+ const twoDaysAgo = new Date(Date.now() - 2 * 86400000).toISOString();
1220
+ const result = formatRelativeTime(twoDaysAgo);
1221
+ expect(result).toMatch(/2d ago/);
1222
+ });
1223
+ });
1008
1224
 
1009
- if (result.regressionPercent !== undefined) {
1010
- const sign = result.regressionPercent > 0 ? "+" : "";
1011
- lines.push(`Regression: ${sign}${(result.regressionPercent * 100).toFixed(1)}%`);
1012
- }
1225
+ describe("formatSwarmHistory", () => {
1226
+ test("formats history as beautiful box-drawn table", () => {
1227
+ const records: SwarmHistoryRecord[] = [
1228
+ {
1229
+ epic_id: "epic-1",
1230
+ epic_title: "Add auth flow",
1231
+ strategy: "feature-based",
1232
+ timestamp: new Date(Date.now() - 2 * 3600000).toISOString(),
1233
+ overall_success: true,
1234
+ task_count: 4,
1235
+ completed_count: 4,
1236
+ },
1237
+ {
1238
+ epic_id: "epic-2",
1239
+ epic_title: "Refactor DB layer",
1240
+ strategy: "file-based",
1241
+ timestamp: new Date(Date.now() - 5 * 3600000).toISOString(),
1242
+ overall_success: false,
1243
+ task_count: 5,
1244
+ completed_count: 2,
1245
+ },
1246
+ ];
1013
1247
 
1014
- lines.push("");
1015
- lines.push(result.message);
1248
+ const result = formatSwarmHistory(records);
1249
+
1250
+ expect(result).toContain("┌─────");
1251
+ expect(result).toContain("SWARM HISTORY");
1252
+ expect(result).toContain("✅");
1253
+ expect(result).toContain("❌");
1254
+ expect(result).toContain("Add auth flow");
1255
+ expect(result).toContain("Refactor DB layer");
1256
+ expect(result).toContain("feature-based");
1257
+ expect(result).toContain("file-based");
1258
+ expect(result).toContain("4/4 tasks");
1259
+ expect(result).toContain("2/5 tasks");
1260
+ expect(result).toContain("└─────");
1261
+ });
1016
1262
 
1017
- return lines.join("\n");
1018
- }
1263
+ test("truncates long titles with ellipsis", () => {
1264
+ const records: SwarmHistoryRecord[] = [
1265
+ {
1266
+ epic_id: "epic-1",
1267
+ epic_title: "A".repeat(100),
1268
+ strategy: "feature-based",
1269
+ timestamp: new Date(Date.now() - 1000).toISOString(),
1270
+ overall_success: true,
1271
+ task_count: 1,
1272
+ completed_count: 1,
1273
+ },
1274
+ ];
1019
1275
 
1020
- // ============================================================================
1021
- // Eval Run Tests
1022
- // ============================================================================
1276
+ const result = formatSwarmHistory(records);
1023
1277
 
1024
- describe("Eval Run CI Mode", () => {
1025
- let testDir: string;
1278
+ expect(result).toContain("...");
1279
+ expect(result).toMatch(/A{27}\.\.\./);
1280
+ });
1026
1281
 
1027
- beforeEach(() => {
1028
- testDir = join(tmpdir(), `eval-run-test-${Date.now()}`);
1029
- mkdirSync(testDir, { recursive: true });
1282
+ test("returns 'No swarm history found' for empty array", () => {
1283
+ const result = formatSwarmHistory([]);
1284
+ expect(result).toBe("No swarm history found");
1285
+ });
1030
1286
  });
1031
1287
 
1032
- afterEach(() => {
1033
- if (existsSync(testDir)) {
1034
- rmSync(testDir, { recursive: true, force: true });
1035
- }
1036
- });
1288
+ describe("filterHistoryByStatus", () => {
1289
+ const records: SwarmHistoryRecord[] = [
1290
+ {
1291
+ epic_id: "epic-1",
1292
+ epic_title: "Success",
1293
+ strategy: "feature-based",
1294
+ timestamp: "2025-01-01T00:00:00Z",
1295
+ overall_success: true,
1296
+ task_count: 4,
1297
+ completed_count: 4,
1298
+ },
1299
+ {
1300
+ epic_id: "epic-2",
1301
+ epic_title: "Failed",
1302
+ strategy: "file-based",
1303
+ timestamp: "2025-01-01T00:00:00Z",
1304
+ overall_success: false,
1305
+ task_count: 4,
1306
+ completed_count: 4,
1307
+ },
1308
+ {
1309
+ epic_id: "epic-3",
1310
+ epic_title: "In Progress",
1311
+ strategy: "risk-based",
1312
+ timestamp: "2025-01-01T00:00:00Z",
1313
+ overall_success: false,
1314
+ task_count: 5,
1315
+ completed_count: 2,
1316
+ },
1317
+ ];
1318
+
1319
+ test("filters success only", () => {
1320
+ const result = filterHistoryByStatus(records, "success");
1321
+ expect(result).toHaveLength(1);
1322
+ expect(result[0].epic_title).toBe("Success");
1323
+ });
1037
1324
 
1038
- test("writes eval results JSON file", async () => {
1039
- // Import the function we need to test
1040
- const { recordEvalRun, getScoreHistory } = await import("../src/eval-history.js");
1041
- const { checkGate } = await import("../src/eval-gates.js");
1042
- const { ensureHiveDirectory } = await import("../src/hive.js");
1043
-
1044
- // Set up test data
1045
- const evalName = "test-eval";
1046
- const mockScore = 0.85;
1047
-
1048
- // Ensure directory exists
1049
- ensureHiveDirectory(testDir);
1050
-
1051
- // Get history and record run (simulating what eval run does)
1052
- const history = getScoreHistory(testDir, evalName);
1053
- recordEvalRun(testDir, {
1054
- timestamp: new Date().toISOString(),
1055
- eval_name: evalName,
1056
- score: mockScore,
1057
- run_count: history.length + 1,
1325
+ test("filters failed only", () => {
1326
+ const result = filterHistoryByStatus(records, "failed");
1327
+ expect(result).toHaveLength(1);
1328
+ expect(result[0].epic_title).toBe("Failed");
1058
1329
  });
1059
1330
 
1060
- // Check gate
1061
- const gateResult = checkGate(testDir, evalName, mockScore);
1331
+ test("filters in_progress only", () => {
1332
+ const result = filterHistoryByStatus(records, "in_progress");
1333
+ expect(result).toHaveLength(1);
1334
+ expect(result[0].epic_title).toBe("In Progress");
1335
+ });
1062
1336
 
1063
- // Write results file (simulating CI mode)
1064
- const resultsPath = join(testDir, ".hive", "eval-results.json");
1065
- const results = { [evalName]: gateResult };
1066
- writeFileSync(resultsPath, JSON.stringify(results, null, 2));
1337
+ test("returns all when no status filter", () => {
1338
+ const result = filterHistoryByStatus(records);
1339
+ expect(result).toHaveLength(3);
1340
+ });
1341
+ });
1067
1342
 
1068
- // Verify file exists and has correct structure
1069
- expect(existsSync(resultsPath)).toBe(true);
1343
+ describe("filterHistoryByStrategy", () => {
1344
+ const records: SwarmHistoryRecord[] = [
1345
+ {
1346
+ epic_id: "epic-1",
1347
+ epic_title: "File",
1348
+ strategy: "file-based",
1349
+ timestamp: "2025-01-01T00:00:00Z",
1350
+ overall_success: true,
1351
+ task_count: 4,
1352
+ completed_count: 4,
1353
+ },
1354
+ {
1355
+ epic_id: "epic-2",
1356
+ epic_title: "Feature",
1357
+ strategy: "feature-based",
1358
+ timestamp: "2025-01-01T00:00:00Z",
1359
+ overall_success: true,
1360
+ task_count: 4,
1361
+ completed_count: 4,
1362
+ },
1363
+ {
1364
+ epic_id: "epic-3",
1365
+ epic_title: "Risk",
1366
+ strategy: "risk-based",
1367
+ timestamp: "2025-01-01T00:00:00Z",
1368
+ overall_success: true,
1369
+ task_count: 4,
1370
+ completed_count: 4,
1371
+ },
1372
+ ];
1373
+
1374
+ test("filters file-based only", () => {
1375
+ const result = filterHistoryByStrategy(records, "file-based");
1376
+ expect(result).toHaveLength(1);
1377
+ expect(result[0].epic_title).toBe("File");
1378
+ });
1070
1379
 
1071
- const savedResults = JSON.parse(readFileSync(resultsPath, "utf-8"));
1072
- expect(savedResults).toHaveProperty(evalName);
1073
- expect(savedResults[evalName]).toMatchObject({
1074
- passed: true,
1075
- phase: "bootstrap",
1076
- currentScore: mockScore,
1380
+ test("filters feature-based only", () => {
1381
+ const result = filterHistoryByStrategy(records, "feature-based");
1382
+ expect(result).toHaveLength(1);
1383
+ expect(result[0].epic_title).toBe("Feature");
1384
+ });
1385
+
1386
+ test("filters risk-based only", () => {
1387
+ const result = filterHistoryByStrategy(records, "risk-based");
1388
+ expect(result).toHaveLength(1);
1389
+ expect(result[0].epic_title).toBe("Risk");
1390
+ });
1391
+
1392
+ test("returns all when no strategy filter", () => {
1393
+ const result = filterHistoryByStrategy(records);
1394
+ expect(result).toHaveLength(3);
1077
1395
  });
1078
1396
  });
1079
1397
 
1080
- test("bootstrap phase always passes", async () => {
1081
- const { checkGate } = await import("../src/eval-gates.js");
1398
+ describe("parseHistoryArgs", () => {
1399
+ test("parses --limit flag", () => {
1400
+ const result = parseHistoryArgs(["--limit", "20"]);
1401
+ expect(result.limit).toBe(20);
1402
+ });
1082
1403
 
1083
- // Even with a low score, bootstrap phase should pass
1084
- const result = checkGate(testDir, "test-eval", 0.1);
1404
+ test("parses -n shorthand for limit", () => {
1405
+ const result = parseHistoryArgs(["-n", "5"]);
1406
+ expect(result.limit).toBe(5);
1407
+ });
1085
1408
 
1086
- expect(result.passed).toBe(true);
1087
- expect(result.phase).toBe("bootstrap");
1088
- expect(result.message).toContain("Bootstrap phase");
1089
- });
1409
+ test("parses --status flag", () => {
1410
+ const result = parseHistoryArgs(["--status", "success"]);
1411
+ expect(result.status).toBe("success");
1412
+ });
1090
1413
 
1091
- test("production phase fails on regression", async () => {
1092
- const { recordEvalRun } = await import("../src/eval-history.js");
1093
- const { checkGate } = await import("../src/eval-gates.js");
1094
- const { ensureHiveDirectory } = await import("../src/hive.js");
1095
-
1096
- ensureHiveDirectory(testDir);
1097
-
1098
- // Simulate 60 runs with consistent high scores to reach production phase
1099
- for (let i = 0; i < 60; i++) {
1100
- recordEvalRun(testDir, {
1101
- timestamp: new Date().toISOString(),
1102
- eval_name: "test-eval",
1103
- score: 0.9,
1104
- run_count: i + 1,
1105
- });
1106
- }
1414
+ test("parses --strategy flag", () => {
1415
+ const result = parseHistoryArgs(["--strategy", "file-based"]);
1416
+ expect(result.strategy).toBe("file-based");
1417
+ });
1418
+
1419
+ test("parses --verbose flag", () => {
1420
+ const result = parseHistoryArgs(["--verbose"]);
1421
+ expect(result.verbose).toBe(true);
1422
+ });
1423
+
1424
+ test("parses -v shorthand for verbose", () => {
1425
+ const result = parseHistoryArgs(["-v"]);
1426
+ expect(result.verbose).toBe(true);
1427
+ });
1428
+
1429
+ test("parses multiple flags together", () => {
1430
+ const result = parseHistoryArgs(["--limit", "15", "--status", "failed", "--verbose"]);
1431
+ expect(result.limit).toBe(15);
1432
+ expect(result.status).toBe("failed");
1433
+ expect(result.verbose).toBe(true);
1434
+ });
1107
1435
 
1108
- // Now test with a regressed score (>5% drop from 0.9 baseline)
1109
- const regressedScore = 0.8; // 11% drop
1110
- const result = checkGate(testDir, "test-eval", regressedScore);
1436
+ test("uses default limit of 10 when not specified", () => {
1437
+ const result = parseHistoryArgs([]);
1438
+ expect(result.limit).toBe(10);
1439
+ });
1111
1440
 
1112
- expect(result.passed).toBe(false);
1113
- expect(result.phase).toBe("production");
1114
- expect(result.message).toContain("FAIL");
1441
+ test("ignores invalid status values", () => {
1442
+ const result = parseHistoryArgs(["--status", "invalid"]);
1443
+ expect(result.status).toBeUndefined();
1444
+ });
1445
+
1446
+ test("ignores invalid strategy values", () => {
1447
+ const result = parseHistoryArgs(["--strategy", "invalid"]);
1448
+ expect(result.strategy).toBeUndefined();
1449
+ });
1115
1450
  });
1116
1451
  });