opencode-swarm-plugin 0.40.0 → 0.42.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
  2. package/.hive/analysis/session-data-quality-audit.md +320 -0
  3. package/.hive/eval-results.json +481 -24
  4. package/.hive/issues.jsonl +67 -16
  5. package/.hive/memories.jsonl +159 -1
  6. package/.opencode/eval-history.jsonl +315 -0
  7. package/.turbo/turbo-build.log +5 -5
  8. package/CHANGELOG.md +165 -0
  9. package/README.md +2 -0
  10. package/SCORER-ANALYSIS.md +598 -0
  11. package/bin/eval-gate.test.ts +158 -0
  12. package/bin/eval-gate.ts +74 -0
  13. package/bin/swarm.serve.test.ts +46 -0
  14. package/bin/swarm.test.ts +661 -732
  15. package/bin/swarm.ts +335 -0
  16. package/dist/compaction-hook.d.ts +7 -5
  17. package/dist/compaction-hook.d.ts.map +1 -1
  18. package/dist/compaction-prompt-scoring.d.ts +1 -0
  19. package/dist/compaction-prompt-scoring.d.ts.map +1 -1
  20. package/dist/eval-runner.d.ts +134 -0
  21. package/dist/eval-runner.d.ts.map +1 -0
  22. package/dist/hive.d.ts.map +1 -1
  23. package/dist/index.d.ts +29 -0
  24. package/dist/index.d.ts.map +1 -1
  25. package/dist/index.js +99741 -58858
  26. package/dist/memory-tools.d.ts +70 -2
  27. package/dist/memory-tools.d.ts.map +1 -1
  28. package/dist/memory.d.ts +37 -0
  29. package/dist/memory.d.ts.map +1 -1
  30. package/dist/observability-tools.d.ts +64 -0
  31. package/dist/observability-tools.d.ts.map +1 -1
  32. package/dist/plugin.js +99356 -58318
  33. package/dist/swarm-orchestrate.d.ts.map +1 -1
  34. package/dist/swarm-prompts.d.ts +32 -1
  35. package/dist/swarm-prompts.d.ts.map +1 -1
  36. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
  37. package/evals/ARCHITECTURE.md +1189 -0
  38. package/evals/example.eval.ts +3 -4
  39. package/evals/fixtures/compaction-prompt-cases.ts +6 -0
  40. package/evals/scorers/coordinator-discipline.evalite-test.ts +1 -162
  41. package/evals/scorers/coordinator-discipline.ts +0 -323
  42. package/evals/swarm-decomposition.eval.ts +4 -2
  43. package/package.json +4 -3
  44. package/src/compaction-prompt-scorers.test.ts +185 -9
  45. package/src/compaction-prompt-scoring.ts +7 -5
  46. package/src/eval-runner.test.ts +128 -1
  47. package/src/eval-runner.ts +46 -0
  48. package/src/hive.ts +43 -42
  49. package/src/memory-tools.test.ts +84 -0
  50. package/src/memory-tools.ts +68 -3
  51. package/src/memory.test.ts +2 -112
  52. package/src/memory.ts +88 -49
  53. package/src/observability-tools.test.ts +13 -0
  54. package/src/observability-tools.ts +277 -0
  55. package/src/swarm-orchestrate.test.ts +162 -0
  56. package/src/swarm-orchestrate.ts +7 -5
  57. package/src/swarm-prompts.test.ts +168 -4
  58. package/src/swarm-prompts.ts +228 -7
  59. package/.env +0 -2
  60. package/.turbo/turbo-test.log +0 -481
  61. package/.turbo/turbo-typecheck.log +0 -1
package/bin/swarm.test.ts CHANGED
@@ -1,11 +1,10 @@
1
1
  #!/usr/bin/env bun
2
2
  /**
3
- * Tests for swarm CLI file operation helpers
3
+ * Tests for swarm CLI helpers
4
4
  *
5
- * These tests verify the verbose output helpers used in `swarm setup`:
6
- * - writeFileWithStatus: logs created/updated/unchanged status
7
- * - mkdirWithStatus: logs directory creation
8
- * - rmWithStatus: logs file removal
5
+ * These tests verify the CLI helpers:
6
+ * - File operation helpers (writeFileWithStatus, mkdirWithStatus, rmWithStatus)
7
+ * - Swarm history helpers (formatSwarmHistory, parseHistoryArgs, filterHistoryByStatus)
9
8
  */
10
9
  import { describe, test, expect, beforeEach, afterEach } from "bun:test";
11
10
  import { mkdirSync, rmSync, writeFileSync, existsSync, readFileSync, readdirSync } from "fs";
@@ -501,17 +500,17 @@ describe("swarm log sessions", () => {
501
500
  {
502
501
  session_id: "s1",
503
502
  epic_id: "e1",
504
- timestamp: "2025-01-01T00:01:00Z",
503
+ timestamp: "2025-01-01T00:00:01Z",
505
504
  event_type: "VIOLATION",
506
- violation_type: "coordinator_edited_file",
505
+ violation_type: "direct_edit",
507
506
  payload: {},
508
507
  },
509
508
  {
510
509
  session_id: "s1",
511
510
  epic_id: "e1",
512
- timestamp: "2025-01-01T00:02:00Z",
511
+ timestamp: "2025-01-01T00:00:02Z",
513
512
  event_type: "DECISION",
514
- decision_type: "review_completed",
513
+ decision_type: "worker_spawned",
515
514
  payload: {},
516
515
  },
517
516
  ];
@@ -535,9 +534,9 @@ describe("swarm log sessions", () => {
535
534
  {
536
535
  session_id: "s1",
537
536
  epic_id: "e1",
538
- timestamp: "2025-01-01T00:01:00Z",
537
+ timestamp: "2025-01-01T00:00:01Z",
539
538
  event_type: "VIOLATION",
540
- violation_type: "coordinator_edited_file",
539
+ violation_type: "direct_edit",
541
540
  payload: {},
542
541
  },
543
542
  ];
@@ -555,7 +554,7 @@ describe("swarm log sessions", () => {
555
554
  {
556
555
  session_id: "s1",
557
556
  epic_id: "e1",
558
- timestamp: new Date(now - 10000).toISOString(), // 10s ago
557
+ timestamp: new Date(now - 5000).toISOString(), // 5s ago
559
558
  event_type: "DECISION",
560
559
  decision_type: "worker_spawned",
561
560
  payload: {},
@@ -563,17 +562,17 @@ describe("swarm log sessions", () => {
563
562
  {
564
563
  session_id: "s1",
565
564
  epic_id: "e1",
566
- timestamp: new Date(now - 60000).toISOString(), // 1m ago
567
- event_type: "VIOLATION",
568
- violation_type: "coordinator_edited_file",
565
+ timestamp: new Date(now - 10000).toISOString(), // 10s ago
566
+ event_type: "DECISION",
567
+ decision_type: "worker_spawned",
569
568
  payload: {},
570
569
  },
571
570
  {
572
571
  session_id: "s1",
573
572
  epic_id: "e1",
574
- timestamp: new Date(now - 3000).toISOString(), // 3s ago
575
- event_type: "OUTCOME",
576
- outcome_type: "subtask_success",
573
+ timestamp: new Date(now - 60000).toISOString(), // 1min ago
574
+ event_type: "DECISION",
575
+ decision_type: "worker_spawned",
577
576
  payload: {},
578
577
  },
579
578
  ];
@@ -682,841 +681,771 @@ describe("Cells command", () => {
682
681
  },
683
682
  ];
684
683
 
685
- const table = formatCellsTable(cells);
686
-
687
- // Should contain headers
688
- expect(table).toContain("ID");
689
- expect(table).toContain("TITLE");
690
- expect(table).toContain("STATUS");
691
- expect(table).toContain("PRIORITY");
692
-
693
- // Should contain cell data
694
- expect(table).toContain("test-abc123-xyz");
695
- expect(table).toContain("Fix bug");
696
- expect(table).toContain("open");
697
- expect(table).toContain("0");
698
-
699
- expect(table).toContain("test-def456-abc");
700
- expect(table).toContain("Add feature");
701
- expect(table).toContain("in_progress");
702
- expect(table).toContain("2");
703
- });
684
+ const result = formatCellsTable(cells);
704
685
 
705
- test("returns 'No cells found' for empty array", () => {
706
- const table = formatCellsTable([]);
707
- expect(table).toBe("No cells found");
686
+ expect(result).toContain("ID");
687
+ expect(result).toContain("TITLE");
688
+ expect(result).toContain("STATUS");
689
+ expect(result).toContain("PRIORITY");
690
+ expect(result).toContain("Fix bug");
691
+ expect(result).toContain("Add feature");
692
+ expect(result).toContain("open");
693
+ expect(result).toContain("in_progress");
708
694
  });
709
- });
710
- });
711
-
712
- describe("Log command helpers", () => {
713
- let testDir: string;
714
-
715
- beforeEach(() => {
716
- testDir = join(tmpdir(), `swarm-log-test-${Date.now()}`);
717
- mkdirSync(testDir, { recursive: true });
718
- });
719
-
720
- afterEach(() => {
721
- if (existsSync(testDir)) {
722
- rmSync(testDir, { recursive: true, force: true });
723
- }
724
- });
725
-
726
- describe("parseLogLine", () => {
727
- function parseLogLine(line: string): { level: number; time: string; module: string; msg: string } | null {
728
- try {
729
- const parsed = JSON.parse(line);
730
- if (typeof parsed.level === "number" && parsed.time && parsed.msg) {
731
- return {
732
- level: parsed.level,
733
- time: parsed.time,
734
- module: parsed.module || "unknown",
735
- msg: parsed.msg,
736
- };
737
- }
738
- } catch {
739
- // Invalid JSON
740
- }
741
- return null;
742
- }
743
695
 
744
- test("parses valid log line", () => {
745
- const line = '{"level":30,"time":"2024-12-24T16:00:00.000Z","module":"compaction","msg":"started"}';
746
- const result = parseLogLine(line);
747
-
748
- expect(result).not.toBeNull();
749
- expect(result?.level).toBe(30);
750
- expect(result?.module).toBe("compaction");
751
- expect(result?.msg).toBe("started");
752
- });
696
+ test("truncates long titles with ellipsis", () => {
697
+ const cells = [
698
+ {
699
+ id: "test-abc",
700
+ title: "A".repeat(100),
701
+ status: "open",
702
+ priority: 0,
703
+ type: "task",
704
+ created_at: 1234567890,
705
+ updated_at: 1234567890,
706
+ },
707
+ ];
753
708
 
754
- test("returns null for invalid JSON", () => {
755
- const line = "not json";
756
- expect(parseLogLine(line)).toBeNull();
757
- });
709
+ const result = formatCellsTable(cells);
758
710
 
759
- test("defaults module to 'unknown' if missing", () => {
760
- const line = '{"level":30,"time":"2024-12-24T16:00:00.000Z","msg":"test"}';
761
- const result = parseLogLine(line);
762
-
763
- expect(result?.module).toBe("unknown");
711
+ expect(result).toContain("...");
712
+ expect(result.split("\n")[2]).toMatch(/A{47}\.\.\./);
764
713
  });
765
- });
766
714
 
767
- describe("filterLogsByLevel", () => {
768
- function filterLogsByLevel(logs: Array<{ level: number }>, minLevel: number): Array<{ level: number }> {
769
- return logs.filter((log) => log.level >= minLevel);
770
- }
715
+ test("returns 'No cells found' for empty array", () => {
716
+ const result = formatCellsTable([]);
771
717
 
772
- test("filters logs by minimum level", () => {
773
- const logs = [
774
- { level: 10 }, // trace
775
- { level: 30 }, // info
776
- { level: 50 }, // error
777
- ];
778
-
779
- const result = filterLogsByLevel(logs, 30);
780
- expect(result).toHaveLength(2);
781
- expect(result[0].level).toBe(30);
782
- expect(result[1].level).toBe(50);
718
+ expect(result).toBe("No cells found");
783
719
  });
784
720
 
785
- test("includes all logs when minLevel is 0", () => {
786
- const logs = [
787
- { level: 10 },
788
- { level: 20 },
789
- { level: 30 },
721
+ test("aligns columns correctly", () => {
722
+ const cells = [
723
+ {
724
+ id: "short",
725
+ title: "T",
726
+ status: "open",
727
+ priority: 0,
728
+ type: "task",
729
+ created_at: 1234567890,
730
+ updated_at: 1234567890,
731
+ },
732
+ {
733
+ id: "very-long-id-here",
734
+ title: "Very long title here",
735
+ status: "in_progress",
736
+ priority: 2,
737
+ type: "task",
738
+ created_at: 1234567890,
739
+ updated_at: 1234567890,
740
+ },
790
741
  ];
791
-
792
- const result = filterLogsByLevel(logs, 0);
793
- expect(result).toHaveLength(3);
794
- });
795
- });
796
742
 
797
- describe("filterLogsByModule", () => {
798
- function filterLogsByModule(logs: Array<{ module: string }>, module: string): Array<{ module: string }> {
799
- return logs.filter((log) => log.module === module);
800
- }
743
+ const result = formatCellsTable(cells);
744
+ const lines = result.split("\n");
801
745
 
802
- test("filters logs by exact module name", () => {
803
- const logs = [
804
- { module: "compaction" },
805
- { module: "swarm" },
806
- { module: "compaction" },
807
- ];
808
-
809
- const result = filterLogsByModule(logs, "compaction");
810
- expect(result).toHaveLength(2);
811
- });
812
-
813
- test("returns empty array when no match", () => {
814
- const logs = [
815
- { module: "compaction" },
816
- ];
817
-
818
- const result = filterLogsByModule(logs, "swarm");
819
- expect(result).toHaveLength(0);
746
+ // All lines should be same length (aligned)
747
+ const lengths = lines.map(l => l.length);
748
+ expect(Math.max(...lengths) - Math.min(...lengths)).toBeLessThan(3);
820
749
  });
821
750
  });
751
+ });
822
752
 
823
- describe("filterLogsBySince", () => {
824
- function parseDuration(duration: string): number | null {
825
- const match = duration.match(/^(\d+)([smhd])$/);
826
- if (!match) return null;
827
-
828
- const [, num, unit] = match;
829
- const value = parseInt(num, 10);
830
-
831
- const multipliers: Record<string, number> = {
832
- s: 1000,
833
- m: 60 * 1000,
834
- h: 60 * 60 * 1000,
835
- d: 24 * 60 * 60 * 1000,
836
- };
837
-
838
- return value * multipliers[unit];
839
- }
840
-
841
- function filterLogsBySince(logs: Array<{ time: string }>, sinceMs: number): Array<{ time: string }> {
842
- const cutoffTime = Date.now() - sinceMs;
843
- return logs.filter((log) => new Date(log.time).getTime() >= cutoffTime);
844
- }
845
-
846
- test("parseDuration handles seconds", () => {
847
- expect(parseDuration("30s")).toBe(30 * 1000);
848
- });
753
+ // ============================================================================
754
+ // Eval Gate Tests (TDD)
755
+ // ============================================================================
849
756
 
850
- test("parseDuration handles minutes", () => {
851
- expect(parseDuration("5m")).toBe(5 * 60 * 1000);
852
- });
757
+ interface EvalRunRecord {
758
+ timestamp: string;
759
+ eval_name: string;
760
+ score: number;
761
+ run_count: number;
762
+ }
853
763
 
854
- test("parseDuration handles hours", () => {
855
- expect(parseDuration("2h")).toBe(2 * 60 * 60 * 1000);
856
- });
764
+ interface GateResult {
765
+ passed: boolean;
766
+ phase: "bootstrap" | "stabilization" | "production";
767
+ message: string;
768
+ baseline?: number;
769
+ variance?: number;
770
+ }
857
771
 
858
- test("parseDuration handles days", () => {
859
- expect(parseDuration("1d")).toBe(24 * 60 * 60 * 1000);
860
- });
772
+ /**
773
+ * Calculate variance for phase transitions
774
+ */
775
+ function calculateVariance(scores: number[]): number {
776
+ if (scores.length <= 1) return 0;
861
777
 
862
- test("parseDuration returns null for invalid format", () => {
863
- expect(parseDuration("invalid")).toBeNull();
864
- expect(parseDuration("30x")).toBeNull();
865
- expect(parseDuration("30")).toBeNull();
866
- });
778
+ const mean = scores.reduce((sum, x) => sum + x, 0) / scores.length;
779
+ const squaredDiffs = scores.map((x) => Math.pow(x - mean, 2));
780
+ const variance = squaredDiffs.reduce((sum, x) => sum + x, 0) / scores.length;
867
781
 
868
- test("filterLogsBySince filters old logs", () => {
869
- const now = Date.now();
870
- const logs = [
871
- { time: new Date(now - 10000).toISOString() }, // 10s ago
872
- { time: new Date(now - 120000).toISOString() }, // 2m ago
873
- { time: new Date(now - 1000).toISOString() }, // 1s ago
874
- ];
875
-
876
- const result = filterLogsBySince(logs, 60000); // Last 1m
877
- expect(result).toHaveLength(2); // Only logs within last minute
878
- });
879
- });
782
+ return variance;
783
+ }
880
784
 
881
- describe("formatLogLine", () => {
882
- function levelToName(level: number): string {
883
- if (level >= 60) return "FATAL";
884
- if (level >= 50) return "ERROR";
885
- if (level >= 40) return "WARN ";
886
- if (level >= 30) return "INFO ";
887
- if (level >= 20) return "DEBUG";
888
- return "TRACE";
889
- }
785
+ /**
786
+ * Read all eval run records from .hive/eval-history.jsonl
787
+ */
788
+ function readAllRecords(projectPath: string): EvalRunRecord[] {
789
+ const recordsPath = join(projectPath, ".hive", "eval-history.jsonl");
890
790
 
891
- function formatLogLine(log: { level: number; time: string; module: string; msg: string }): string {
892
- const timestamp = new Date(log.time).toLocaleTimeString();
893
- const levelName = levelToName(log.level);
894
- const module = log.module.padEnd(12);
895
- return `${timestamp} ${levelName} ${module} ${log.msg}`;
896
- }
791
+ if (!existsSync(recordsPath)) {
792
+ return [];
793
+ }
897
794
 
898
- test("formats log line with timestamp and level", () => {
899
- const log = {
900
- level: 30,
901
- time: "2024-12-24T16:00:00.000Z",
902
- module: "compaction",
903
- msg: "started",
904
- };
905
-
906
- const result = formatLogLine(log);
907
- expect(result).toContain("INFO");
908
- expect(result).toContain("compaction");
909
- expect(result).toContain("started");
910
- });
795
+ const content = readFileSync(recordsPath, "utf-8");
796
+ const lines = content.split("\n").filter((line) => line.trim());
911
797
 
912
- test("pads module name for alignment", () => {
913
- const log1 = formatLogLine({ level: 30, time: "2024-12-24T16:00:00.000Z", module: "a", msg: "test" });
914
- const log2 = formatLogLine({ level: 30, time: "2024-12-24T16:00:00.000Z", module: "compaction", msg: "test" });
915
-
916
- // Module names should be padded to 12 chars
917
- expect(log1).toContain("a test"); // 'a' + 11 spaces
918
- expect(log2).toContain("compaction test"); // 'compaction' + 3 spaces (10 chars + 2)
919
- });
798
+ return lines.map((line) => JSON.parse(line) as EvalRunRecord);
799
+ }
920
800
 
921
- test("levelToName maps all levels correctly", () => {
922
- expect(levelToName(10)).toBe("TRACE");
923
- expect(levelToName(20)).toBe("DEBUG");
924
- expect(levelToName(30)).toBe("INFO ");
925
- expect(levelToName(40)).toBe("WARN ");
926
- expect(levelToName(50)).toBe("ERROR");
927
- expect(levelToName(60)).toBe("FATAL");
928
- });
929
- });
801
+ /**
802
+ * Record an eval run to .hive/eval-history.jsonl
803
+ */
804
+ function recordEvalRun(
805
+ projectPath: string,
806
+ record: EvalRunRecord,
807
+ ): void {
808
+ const hivePath = join(projectPath, ".hive");
809
+ const recordsPath = join(hivePath, "eval-history.jsonl");
810
+
811
+ // Ensure .hive directory exists
812
+ if (!existsSync(hivePath)) {
813
+ mkdirSync(hivePath, { recursive: true });
814
+ }
930
815
 
931
- describe("readLogFiles", () => {
932
- test("reads multiple .1log files", () => {
933
- // Create test log files
934
- const log1 = join(testDir, "swarm.1log");
935
- const log2 = join(testDir, "swarm.2log");
936
- const log3 = join(testDir, "compaction.1log");
937
-
938
- writeFileSync(log1, '{"level":30,"time":"2024-12-24T16:00:00.000Z","msg":"line1"}\n');
939
- writeFileSync(log2, '{"level":30,"time":"2024-12-24T16:00:01.000Z","msg":"line2"}\n');
940
- writeFileSync(log3, '{"level":30,"time":"2024-12-24T16:00:02.000Z","module":"compaction","msg":"line3"}\n');
941
-
942
- function readLogFiles(dir: string): string[] {
943
- if (!existsSync(dir)) return [];
944
-
945
- const files = readdirSync(dir)
946
- .filter((f) => /\.\d+log$/.test(f))
947
- .sort() // Sort by filename
948
- .map((f) => join(dir, f));
949
-
950
- const lines: string[] = [];
951
- for (const file of files) {
952
- const content = readFileSync(file, "utf-8");
953
- lines.push(...content.split("\n").filter((line) => line.trim()));
954
- }
955
-
956
- return lines;
957
- }
958
-
959
- const lines = readLogFiles(testDir);
960
- expect(lines).toHaveLength(3);
961
- // Files are sorted alphabetically: compaction.1log, swarm.1log, swarm.2log
962
- expect(lines.some((l) => l.includes("line1"))).toBe(true);
963
- expect(lines.some((l) => l.includes("line2"))).toBe(true);
964
- expect(lines.some((l) => l.includes("line3"))).toBe(true);
965
- });
816
+ // Append record as JSONL
817
+ const line = JSON.stringify(record) + "\n";
966
818
 
967
- test("returns empty array for non-existent directory", () => {
968
- function readLogFiles(dir: string): string[] {
969
- if (!existsSync(dir)) return [];
970
- return [];
971
- }
972
-
973
- const lines = readLogFiles(join(testDir, "nonexistent"));
974
- expect(lines).toHaveLength(0);
975
- });
976
- });
819
+ if (existsSync(recordsPath)) {
820
+ const existingContent = readFileSync(recordsPath, "utf-8");
821
+ writeFileSync(recordsPath, existingContent + line);
822
+ } else {
823
+ writeFileSync(recordsPath, line);
824
+ }
825
+ }
977
826
 
978
- describe("watchLogs", () => {
979
- test("detects new log lines appended to file", async () => {
980
- const logFile = join(testDir, "swarm.1log");
981
- const collectedLines: string[] = [];
982
-
983
- // Create initial log file
984
- writeFileSync(logFile, '{"level":30,"time":"2024-12-24T16:00:00.000Z","msg":"initial"}\n');
985
-
986
- // Import watch utilities
987
- const { watch } = await import("fs");
988
- const { appendFileSync } = await import("fs");
989
-
990
- // Track file position for incremental reads
991
- let lastSize = 0;
992
-
993
- function readNewLines(filePath: string): string[] {
994
- const content = readFileSync(filePath, "utf-8");
995
- const newContent = content.slice(lastSize);
996
- lastSize = content.length;
997
- return newContent.split("\n").filter((line) => line.trim());
998
- }
999
-
1000
- // Simulate watch behavior
1001
- const watcher = watch(testDir, (eventType, filename) => {
1002
- if (filename && /\.\d+log$/.test(filename)) {
1003
- const newLines = readNewLines(join(testDir, filename));
1004
- collectedLines.push(...newLines);
1005
- }
1006
- });
1007
-
1008
- // Wait for watcher to be ready
1009
- await new Promise((resolve) => setTimeout(resolve, 100));
1010
-
1011
- // Append new log line
1012
- appendFileSync(logFile, '{"level":30,"time":"2024-12-24T16:00:01.000Z","msg":"appended"}\n');
1013
-
1014
- // Wait for event to fire
1015
- await new Promise((resolve) => setTimeout(resolve, 200));
1016
-
1017
- watcher.close();
1018
-
1019
- // Should have detected the new line
1020
- expect(collectedLines.some((l) => l.includes("appended"))).toBe(true);
1021
- });
827
+ /**
828
+ * Check eval gate for progressive gating
829
+ */
830
+ function checkGate(
831
+ projectPath: string,
832
+ evalName: string,
833
+ currentScore: number,
834
+ ): GateResult {
835
+ const records = readAllRecords(projectPath).filter(
836
+ (r) => r.eval_name === evalName,
837
+ );
1022
838
 
1023
- test("parseWatchArgs extracts --watch flag", () => {
1024
- function parseWatchArgs(args: string[]): { watch: boolean; interval: number } {
1025
- let watch = false;
1026
- let interval = 1000; // default 1 second
1027
-
1028
- for (let i = 0; i < args.length; i++) {
1029
- const arg = args[i];
1030
- if (arg === "--watch" || arg === "-w") {
1031
- watch = true;
1032
- } else if (arg === "--interval" && i + 1 < args.length) {
1033
- interval = parseInt(args[++i], 10);
1034
- }
1035
- }
1036
-
1037
- return { watch, interval };
1038
- }
1039
-
1040
- expect(parseWatchArgs(["--watch"])).toEqual({ watch: true, interval: 1000 });
1041
- expect(parseWatchArgs(["-w"])).toEqual({ watch: true, interval: 1000 });
1042
- expect(parseWatchArgs(["--watch", "--interval", "500"])).toEqual({ watch: true, interval: 500 });
1043
- expect(parseWatchArgs(["compaction", "--watch"])).toEqual({ watch: true, interval: 1000 });
1044
- expect(parseWatchArgs(["--level", "error"])).toEqual({ watch: false, interval: 1000 });
1045
- });
1046
- });
1047
- });
839
+ if (records.length < 10) {
840
+ return {
841
+ passed: true,
842
+ phase: "bootstrap",
843
+ message: `BOOTSTRAP (${records.length}/10 runs): no gates yet`,
844
+ };
845
+ }
1048
846
 
1049
- // ============================================================================
1050
- // Eval Commands Tests (TDD)
1051
- // ============================================================================
847
+ const lastTenScores = records.slice(-10).map((r) => r.score);
848
+ const baseline = lastTenScores.reduce((sum, x) => sum + x, 0) / lastTenScores.length;
849
+ const variance = calculateVariance(lastTenScores);
1052
850
 
1053
- describe("Eval commands", () => {
1054
- describe("formatEvalStatus", () => {
1055
- test("displays phase, thresholds, and recent scores", () => {
1056
- const status = {
1057
- phase: "stabilization" as const,
1058
- runCount: 25,
1059
- thresholds: {
1060
- stabilization: 0.1,
1061
- production: 0.05,
1062
- },
1063
- recentScores: [
1064
- { timestamp: "2024-12-24T10:00:00.000Z", score: 0.85 },
1065
- { timestamp: "2024-12-24T11:00:00.000Z", score: 0.87 },
1066
- { timestamp: "2024-12-24T12:00:00.000Z", score: 0.82 },
1067
- ],
851
+ if (records.length < 50) {
852
+ const drop = ((baseline - currentScore) / baseline) * 100;
853
+ if (drop > 5) {
854
+ return {
855
+ passed: false,
856
+ phase: "stabilization",
857
+ message: `WARN: Score dropped ${drop.toFixed(1)}% from baseline ${baseline.toFixed(2)}`,
858
+ baseline,
859
+ variance,
1068
860
  };
861
+ }
1069
862
 
1070
- const output = formatEvalStatus(status);
1071
-
1072
- // Should show phase
1073
- expect(output).toContain("stabilization");
1074
-
1075
- // Should show run count
1076
- expect(output).toContain("25");
1077
-
1078
- // Should show thresholds
1079
- expect(output).toContain("10%"); // stabilization threshold
1080
- expect(output).toContain("5%"); // production threshold
1081
-
1082
- // Should show recent scores
1083
- expect(output).toContain("0.85");
1084
- expect(output).toContain("0.87");
1085
- expect(output).toContain("0.82");
1086
- });
863
+ return {
864
+ passed: true,
865
+ phase: "stabilization",
866
+ message: `Stabilization (${records.length}/50 runs): baseline=${baseline.toFixed(2)}`,
867
+ baseline,
868
+ variance,
869
+ };
870
+ }
1087
871
 
1088
- test("shows bootstrap phase message", () => {
1089
- const status = {
1090
- phase: "bootstrap" as const,
1091
- runCount: 5,
1092
- thresholds: {
1093
- stabilization: 0.1,
1094
- production: 0.05,
1095
- },
1096
- recentScores: [],
872
+ // Production phase: variance < 0.1 AND score doesn't drop >5%
873
+ if (variance < 0.1) {
874
+ const drop = ((baseline - currentScore) / baseline) * 100;
875
+ if (drop > 5) {
876
+ return {
877
+ passed: false,
878
+ phase: "production",
879
+ message: `FAIL: Score dropped ${drop.toFixed(1)}% from baseline ${baseline.toFixed(2)} (variance=${variance.toFixed(3)})`,
880
+ baseline,
881
+ variance,
1097
882
  };
883
+ }
1098
884
 
1099
- const output = formatEvalStatus(status);
885
+ return {
886
+ passed: true,
887
+ phase: "production",
888
+ message: `PASS: Production phase (variance=${variance.toFixed(3)}, baseline=${baseline.toFixed(2)})`,
889
+ baseline,
890
+ variance,
891
+ };
892
+ }
1100
893
 
1101
- expect(output).toContain("bootstrap");
1102
- expect(output).toContain("collecting data");
1103
- });
894
+ // Stuck in stabilization (>50 runs but variance still high)
895
+ return {
896
+ passed: true,
897
+ phase: "stabilization",
898
+ message: `Stabilization: variance too high (${variance.toFixed(3)} > 0.1), need more consistent runs`,
899
+ baseline,
900
+ variance,
901
+ };
902
+ }
1104
903
 
1105
- test("shows production phase message", () => {
1106
- const status = {
1107
- phase: "production" as const,
1108
- runCount: 75,
1109
- thresholds: {
1110
- stabilization: 0.1,
1111
- production: 0.05,
1112
- },
1113
- recentScores: [],
1114
- };
904
+ /**
905
+ * Ensure .hive directory exists
906
+ */
907
+ function ensureHiveDirectory(projectPath: string): void {
908
+ const hivePath = join(projectPath, ".hive");
909
+ if (!existsSync(hivePath)) {
910
+ mkdirSync(hivePath, { recursive: true });
911
+ }
912
+ }
1115
913
 
1116
- const output = formatEvalStatus(status);
914
+ describe("Eval gate", () => {
915
+ let testDir: string;
1117
916
 
1118
- expect(output).toContain("production");
1119
- });
917
+ beforeEach(() => {
918
+ testDir = join(tmpdir(), `eval-gate-test-${Date.now()}`);
919
+ mkdirSync(testDir, { recursive: true });
1120
920
  });
1121
921
 
1122
- describe("formatEvalHistory", () => {
1123
- test("shows eval entries with timestamps and scores", () => {
1124
- const history = [
1125
- {
1126
- timestamp: "2024-12-24T10:00:00.000Z",
1127
- eval_name: "swarm-decomposition",
1128
- score: 0.85,
1129
- run_count: 1,
1130
- },
1131
- {
1132
- timestamp: "2024-12-24T11:00:00.000Z",
1133
- eval_name: "swarm-decomposition",
1134
- score: 0.87,
1135
- run_count: 2,
1136
- },
1137
- {
1138
- timestamp: "2024-12-24T12:00:00.000Z",
1139
- eval_name: "coordinator-behavior",
1140
- score: 0.92,
1141
- run_count: 1,
1142
- },
1143
- ];
922
+ afterEach(() => {
923
+ if (existsSync(testDir)) {
924
+ rmSync(testDir, { recursive: true, force: true });
925
+ }
926
+ });
1144
927
 
1145
- const output = formatEvalHistory(history);
928
+ describe("Bootstrap phase (<10 runs)", () => {
929
+ test("allows any score", () => {
930
+ ensureHiveDirectory(testDir);
931
+
932
+ // Record 5 runs
933
+ for (let i = 0; i < 5; i++) {
934
+ recordEvalRun(testDir, {
935
+ timestamp: new Date().toISOString(),
936
+ eval_name: "test-eval",
937
+ score: 0.5 + i * 0.1,
938
+ run_count: i + 1,
939
+ });
940
+ }
1146
941
 
1147
- // Should show all eval names
1148
- expect(output).toContain("swarm-decomposition");
1149
- expect(output).toContain("coordinator-behavior");
1150
-
1151
- // Should show scores
1152
- expect(output).toContain("0.85");
1153
- expect(output).toContain("0.87");
1154
- expect(output).toContain("0.92");
1155
-
1156
- // Should show run counts
1157
- expect(output).toContain("run #1");
1158
- expect(output).toContain("run #2");
1159
- });
942
+ const result = checkGate(testDir, "test-eval", 0.3); // Low score
1160
943
 
1161
- test("returns empty message for no history", () => {
1162
- const output = formatEvalHistory([]);
1163
- expect(output).toContain("No eval history");
944
+ expect(result.passed).toBe(true);
945
+ expect(result.phase).toBe("bootstrap");
946
+ expect(result.message).toContain("BOOTSTRAP");
1164
947
  });
1165
948
 
1166
- test("formats timestamps as readable dates", () => {
1167
- const history = [
1168
- {
1169
- timestamp: "2024-12-24T10:00:00.000Z",
1170
- eval_name: "test",
1171
- score: 0.85,
1172
- run_count: 1,
1173
- },
1174
- ];
949
+ test("counts runs correctly", () => {
950
+ ensureHiveDirectory(testDir);
951
+
952
+ for (let i = 0; i < 7; i++) {
953
+ recordEvalRun(testDir, {
954
+ timestamp: new Date().toISOString(),
955
+ eval_name: "test-eval",
956
+ score: 0.8,
957
+ run_count: i + 1,
958
+ });
959
+ }
1175
960
 
1176
- const output = formatEvalHistory(history);
961
+ const result = checkGate(testDir, "test-eval", 0.8);
1177
962
 
1178
- // Should contain a formatted date (not raw ISO)
1179
- expect(output).not.toContain("2024-12-24T10:00:00.000Z");
1180
- expect(output).toMatch(/\d{1,2}:\d{2}/); // Time format
963
+ expect(result.phase).toBe("bootstrap");
964
+ expect(result.message).toContain("7/10");
1181
965
  });
1182
966
  });
1183
967
 
1184
- describe("generateSparkline", () => {
1185
- test("generates sparkline from scores", () => {
1186
- const scores = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0];
1187
- const sparkline = generateSparkline(scores);
1188
-
1189
- // Should use sparkline characters
1190
- expect(sparkline).toMatch(/[▁▂▃▄▅▆▇█]/);
1191
-
1192
- // Length should match input
1193
- expect(sparkline.length).toBe(scores.length);
1194
-
1195
- // Should show ascending trend
1196
- expect(sparkline).toContain("▁"); // Low score
1197
- expect(sparkline).toContain("█"); // High score
1198
- });
968
+ describe("Stabilization phase (10-50 runs)", () => {
969
+ test("warns on >5% regression", () => {
970
+ ensureHiveDirectory(testDir);
971
+
972
+ // Record 20 runs with consistent 0.9 score
973
+ for (let i = 0; i < 20; i++) {
974
+ recordEvalRun(testDir, {
975
+ timestamp: new Date().toISOString(),
976
+ eval_name: "test-eval",
977
+ score: 0.9,
978
+ run_count: i + 1,
979
+ });
980
+ }
1199
981
 
1200
- test("handles single score", () => {
1201
- const sparkline = generateSparkline([0.5]);
1202
- expect(sparkline.length).toBe(1);
1203
- expect(sparkline).toMatch(/[▁▂▃▄▅▆▇█]/);
1204
- });
982
+ // Test with regressed score (>5% drop from 0.9 baseline)
983
+ const regressedScore = 0.85; // 5.5% drop
984
+ const result = checkGate(testDir, "test-eval", regressedScore);
1205
985
 
1206
- test("handles all same scores", () => {
1207
- const sparkline = generateSparkline([0.5, 0.5, 0.5]);
1208
- expect(sparkline.length).toBe(3);
1209
- // All should be same character
1210
- expect(new Set(sparkline.split("")).size).toBe(1);
986
+ expect(result.passed).toBe(false);
987
+ expect(result.phase).toBe("stabilization");
988
+ expect(result.message).toContain("WARN");
989
+ expect(result.baseline).toBeCloseTo(0.9, 2);
1211
990
  });
1212
991
 
1213
- test("returns empty for empty array", () => {
1214
- const sparkline = generateSparkline([]);
1215
- expect(sparkline).toBe("");
1216
- });
1217
- });
992
+ test("passes when score is stable", () => {
993
+ ensureHiveDirectory(testDir);
1218
994
 
1219
- describe("formatEvalRunResult", () => {
1220
- test("shows pass/fail with gate result", () => {
1221
- const result = {
1222
- passed: true,
1223
- phase: "production" as const,
1224
- message: "Production phase: 2.5% regression - acceptable",
1225
- baseline: 0.85,
1226
- currentScore: 0.83,
1227
- regressionPercent: 0.025,
1228
- };
995
+ for (let i = 0; i < 25; i++) {
996
+ recordEvalRun(testDir, {
997
+ timestamp: new Date().toISOString(),
998
+ eval_name: "test-eval",
999
+ score: 0.85,
1000
+ run_count: i + 1,
1001
+ });
1002
+ }
1229
1003
 
1230
- const output = formatEvalRunResult(result);
1004
+ const result = checkGate(testDir, "test-eval", 0.86);
1231
1005
 
1232
- expect(output).toContain("PASS");
1233
- expect(output).toContain("production");
1234
- expect(output).toContain("0.83"); // current score
1235
- expect(output).toContain("2.5%"); // regression
1006
+ expect(result.passed).toBe(true);
1007
+ expect(result.phase).toBe("stabilization");
1008
+ expect(result.baseline).toBeCloseTo(0.85, 2);
1236
1009
  });
1010
+ });
1237
1011
 
1238
- test("shows failure with details", () => {
1239
- const result = {
1240
- passed: false,
1241
- phase: "production" as const,
1242
- message: "Production phase FAIL: 8.0% regression - exceeds 5% threshold",
1243
- baseline: 0.85,
1244
- currentScore: 0.78,
1245
- regressionPercent: 0.08,
1246
- };
1012
+ describe("Production phase (>50 runs, low variance)", () => {
1013
+ test("enters production when variance < 0.1", () => {
1014
+ ensureHiveDirectory(testDir);
1015
+
1016
+ // Simulate 60 runs with consistent scores (low variance)
1017
+ for (let i = 0; i < 60; i++) {
1018
+ recordEvalRun(testDir, {
1019
+ timestamp: new Date().toISOString(),
1020
+ eval_name: "test-eval",
1021
+ score: 0.9, // All same score = zero variance
1022
+ run_count: i + 1,
1023
+ });
1024
+ }
1247
1025
 
1248
- const output = formatEvalRunResult(result);
1026
+ const result = checkGate(testDir, "test-eval", 0.91);
1249
1027
 
1250
- expect(output).toContain("FAIL");
1251
- expect(output).toContain("8.0%");
1252
- expect(output).toContain("exceeds");
1028
+ expect(result.phase).toBe("production");
1029
+ expect(result.variance).toBeLessThan(0.1);
1253
1030
  });
1254
1031
 
1255
- test("shows bootstrap phase without baseline", () => {
1256
- const result = {
1257
- passed: true,
1258
- phase: "bootstrap" as const,
1259
- message: "Bootstrap phase (5/10 runs) - collecting data",
1260
- currentScore: 0.85,
1261
- };
1032
+ test("fails on regression in production", () => {
1033
+ ensureHiveDirectory(testDir);
1034
+
1035
+ // Simulate 60 runs with consistent high scores to reach production phase
1036
+ for (let i = 0; i < 60; i++) {
1037
+ recordEvalRun(testDir, {
1038
+ timestamp: new Date().toISOString(),
1039
+ eval_name: "test-eval",
1040
+ score: 0.9,
1041
+ run_count: i + 1,
1042
+ });
1043
+ }
1262
1044
 
1263
- const output = formatEvalRunResult(result);
1045
+ // Now test with a regressed score (>5% drop from 0.9 baseline)
1046
+ const regressedScore = 0.8; // 11% drop
1047
+ const result = checkGate(testDir, "test-eval", regressedScore);
1264
1048
 
1265
- expect(output).toContain("bootstrap");
1266
- expect(output).toContain("collecting data");
1267
- expect(output).not.toContain("baseline");
1049
+ expect(result.passed).toBe(false);
1050
+ expect(result.phase).toBe("production");
1051
+ expect(result.message).toContain("FAIL");
1268
1052
  });
1269
1053
  });
1270
1054
  });
1271
1055
 
1272
1056
  // ============================================================================
1273
- // Eval Command Helpers (Implementation)
1057
+ // History Command Tests (TDD)
1274
1058
  // ============================================================================
1275
1059
 
1060
+ interface SwarmHistoryRecord {
1061
+ epic_id: string;
1062
+ epic_title: string;
1063
+ strategy: string;
1064
+ timestamp: string;
1065
+ overall_success: boolean;
1066
+ task_count: number;
1067
+ completed_count: number;
1068
+ }
1069
+
1276
1070
  /**
1277
- * Generate sparkline from array of scores (0-1 range)
1071
+ * Format relative time (e.g., "2h ago", "1d ago")
1278
1072
  */
1279
- function generateSparkline(scores: number[]): string {
1280
- if (scores.length === 0) return "";
1073
+ function formatRelativeTime(timestamp: string): string {
1074
+ const now = Date.now();
1075
+ const then = new Date(timestamp).getTime();
1076
+ const diffMs = now - then;
1077
+
1078
+ const minutes = Math.floor(diffMs / 60000);
1079
+ const hours = Math.floor(diffMs / 3600000);
1080
+ const days = Math.floor(diffMs / 86400000);
1081
+
1082
+ if (minutes < 60) return `${minutes}m ago`;
1083
+ if (hours < 24) return `${hours}h ago`;
1084
+ return `${days}d ago`;
1085
+ }
1281
1086
 
1282
- const chars = ["▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"];
1283
- const min = Math.min(...scores);
1284
- const max = Math.max(...scores);
1285
- const range = max - min;
1087
+ /**
1088
+ * Format swarm history as beautiful CLI table
1089
+ */
1090
+ function formatSwarmHistory(records: SwarmHistoryRecord[]): string {
1091
+ if (records.length === 0) {
1092
+ return "No swarm history found";
1093
+ }
1286
1094
 
1287
- if (range === 0) {
1288
- // All scores the same
1289
- return chars[4].repeat(scores.length);
1095
+ const rows = records.map(r => ({
1096
+ time: formatRelativeTime(r.timestamp),
1097
+ status: r.overall_success ? "✅" : "❌",
1098
+ title: r.epic_title.length > 30 ? r.epic_title.slice(0, 27) + "..." : r.epic_title,
1099
+ strategy: r.strategy,
1100
+ tasks: `${r.completed_count}/${r.task_count} tasks`,
1101
+ }));
1102
+
1103
+ // Box drawing characters
1104
+ const lines: string[] = [];
1105
+ lines.push("┌─────────────────────────────────────────────────────────────┐");
1106
+ lines.push("│ SWARM HISTORY │");
1107
+ lines.push("├─────────────────────────────────────────────────────────────┤");
1108
+
1109
+ for (const row of rows) {
1110
+ const statusCol = `${row.time.padEnd(8)} ${row.status}`;
1111
+ const titleCol = row.title.padEnd(32);
1112
+ const strategyCol = row.strategy.padEnd(13);
1113
+ const tasksCol = row.tasks;
1114
+
1115
+ const line = `│ ${statusCol} ${titleCol} ${strategyCol} ${tasksCol.padEnd(3)} │`;
1116
+ lines.push(line);
1290
1117
  }
1291
1118
 
1292
- return scores
1293
- .map((score) => {
1294
- const normalized = (score - min) / range;
1295
- const index = Math.min(Math.floor(normalized * chars.length), chars.length - 1);
1296
- return chars[index];
1297
- })
1298
- .join("");
1119
+ lines.push("└─────────────────────────────────────────────────────────────┘");
1120
+
1121
+ return lines.join("\n");
1299
1122
  }
1300
1123
 
1301
1124
  /**
1302
- * Format eval status for display
1125
+ * Filter history by status
1303
1126
  */
1304
- function formatEvalStatus(status: {
1305
- phase: "bootstrap" | "stabilization" | "production";
1306
- runCount: number;
1307
- thresholds: { stabilization: number; production: number };
1308
- recentScores: Array<{ timestamp: string; score: number }>;
1309
- }): string {
1310
- const lines: string[] = [];
1311
-
1312
- // Phase banner
1313
- const phaseEmoji = status.phase === "bootstrap" ? "🌱" : status.phase === "stabilization" ? "⚙️" : "🚀";
1314
- lines.push(`${phaseEmoji} Phase: ${status.phase}`);
1315
- lines.push(`Runs: ${status.runCount}`);
1316
- lines.push("");
1317
-
1318
- // Thresholds
1319
- lines.push("Thresholds:");
1320
- lines.push(` Stabilization: ${(status.thresholds.stabilization * 100).toFixed(0)}% regression warning`);
1321
- lines.push(` Production: ${(status.thresholds.production * 100).toFixed(0)}% regression failure`);
1322
- lines.push("");
1323
-
1324
- // Recent scores with sparkline
1325
- if (status.recentScores.length > 0) {
1326
- lines.push("Recent scores:");
1327
- const sparkline = generateSparkline(status.recentScores.map((s) => s.score));
1328
- lines.push(` ${sparkline}`);
1329
- for (const { timestamp, score } of status.recentScores) {
1330
- const time = new Date(timestamp).toLocaleString();
1331
- lines.push(` ${time}: ${score.toFixed(2)}`);
1332
- }
1333
- } else {
1334
- lines.push("No scores yet - collecting data");
1127
+ function filterHistoryByStatus(
1128
+ records: SwarmHistoryRecord[],
1129
+ status?: "success" | "failed" | "in_progress",
1130
+ ): SwarmHistoryRecord[] {
1131
+ if (!status) return records;
1132
+
1133
+ switch (status) {
1134
+ case "success":
1135
+ return records.filter(r => r.overall_success);
1136
+ case "failed":
1137
+ return records.filter(r => !r.overall_success && r.completed_count === r.task_count);
1138
+ case "in_progress":
1139
+ return records.filter(r => r.completed_count < r.task_count);
1140
+ default:
1141
+ return records;
1335
1142
  }
1143
+ }
1336
1144
 
1337
- return lines.join("\n");
1145
+ /**
1146
+ * Filter history by strategy
1147
+ */
1148
+ function filterHistoryByStrategy(
1149
+ records: SwarmHistoryRecord[],
1150
+ strategy?: "file-based" | "feature-based" | "risk-based",
1151
+ ): SwarmHistoryRecord[] {
1152
+ if (!strategy) return records;
1153
+ return records.filter(r => r.strategy === strategy);
1338
1154
  }
1339
1155
 
1340
1156
  /**
1341
- * Format eval history for display
1157
+ * Parse history CLI arguments
1342
1158
  */
1343
- function formatEvalHistory(history: Array<{
1344
- timestamp: string;
1345
- eval_name: string;
1346
- score: number;
1347
- run_count: number;
1348
- }>): string {
1349
- if (history.length === 0) {
1350
- return "No eval history found";
1351
- }
1159
+ function parseHistoryArgs(args: string[]): {
1160
+ limit: number;
1161
+ status?: "success" | "failed" | "in_progress";
1162
+ strategy?: "file-based" | "feature-based" | "risk-based";
1163
+ verbose: boolean;
1164
+ } {
1165
+ const result: {
1166
+ limit: number;
1167
+ status?: "success" | "failed" | "in_progress";
1168
+ strategy?: "file-based" | "feature-based" | "risk-based";
1169
+ verbose: boolean;
1170
+ } = {
1171
+ limit: 10,
1172
+ verbose: false,
1173
+ };
1352
1174
 
1353
- const lines: string[] = [];
1354
- lines.push("Eval History:");
1355
- lines.push("");
1356
-
1357
- // Group by eval name
1358
- const grouped = new Map<string, typeof history>();
1359
- for (const entry of history) {
1360
- if (!grouped.has(entry.eval_name)) {
1361
- grouped.set(entry.eval_name, []);
1362
- }
1363
- grouped.get(entry.eval_name)!.push(entry);
1364
- }
1175
+ for (let i = 0; i < args.length; i++) {
1176
+ const arg = args[i];
1365
1177
 
1366
- // Display each eval group
1367
- for (const [evalName, entries] of grouped) {
1368
- lines.push(`${evalName}:`);
1369
- const sparkline = generateSparkline(entries.map((e) => e.score));
1370
- lines.push(` Trend: ${sparkline}`);
1371
-
1372
- // Show latest 5 entries
1373
- const latest = entries.slice(-5);
1374
- for (const entry of latest) {
1375
- const time = new Date(entry.timestamp).toLocaleTimeString();
1376
- lines.push(` ${time} - run #${entry.run_count}: ${entry.score.toFixed(2)}`);
1377
- }
1378
-
1379
- if (entries.length > 5) {
1380
- lines.push(` ... and ${entries.length - 5} more`);
1178
+ if (arg === "--limit" || arg === "-n") {
1179
+ const limitStr = args[i + 1];
1180
+ if (limitStr && !isNaN(Number(limitStr))) {
1181
+ result.limit = Number(limitStr);
1182
+ i++;
1183
+ }
1184
+ } else if (arg === "--status") {
1185
+ const statusStr = args[i + 1];
1186
+ if (statusStr && ["success", "failed", "in_progress"].includes(statusStr)) {
1187
+ result.status = statusStr as "success" | "failed" | "in_progress";
1188
+ i++;
1189
+ }
1190
+ } else if (arg === "--strategy") {
1191
+ const strategyStr = args[i + 1];
1192
+ if (strategyStr && ["file-based", "feature-based", "risk-based"].includes(strategyStr)) {
1193
+ result.strategy = strategyStr as "file-based" | "feature-based" | "risk-based";
1194
+ i++;
1195
+ }
1196
+ } else if (arg === "--verbose" || arg === "-v") {
1197
+ result.verbose = true;
1381
1198
  }
1382
-
1383
- lines.push("");
1384
1199
  }
1385
1200
 
1386
- return lines.join("\n");
1201
+ return result;
1387
1202
  }
1388
1203
 
1389
- /**
1390
- * Format eval run result (gate check)
1391
- */
1392
- function formatEvalRunResult(result: {
1393
- passed: boolean;
1394
- phase: "bootstrap" | "stabilization" | "production";
1395
- message: string;
1396
- baseline?: number;
1397
- currentScore: number;
1398
- regressionPercent?: number;
1399
- }): string {
1400
- const lines: string[] = [];
1401
-
1402
- // Pass/fail banner
1403
- const status = result.passed ? "✅ PASS" : "❌ FAIL";
1404
- lines.push(status);
1405
- lines.push("");
1204
+ describe("swarm history", () => {
1205
+ describe("formatRelativeTime", () => {
1206
+ test("formats minutes ago", () => {
1207
+ const fiveMinutesAgo = new Date(Date.now() - 5 * 60000).toISOString();
1208
+ const result = formatRelativeTime(fiveMinutesAgo);
1209
+ expect(result).toMatch(/5m ago/);
1210
+ });
1406
1211
 
1407
- // Phase and score
1408
- lines.push(`Phase: ${result.phase}`);
1409
- lines.push(`Score: ${result.currentScore.toFixed(2)}`);
1212
+ test("formats hours ago", () => {
1213
+ const threeHoursAgo = new Date(Date.now() - 3 * 3600000).toISOString();
1214
+ const result = formatRelativeTime(threeHoursAgo);
1215
+ expect(result).toMatch(/3h ago/);
1216
+ });
1410
1217
 
1411
- if (result.baseline !== undefined) {
1412
- lines.push(`Baseline: ${result.baseline.toFixed(2)}`);
1413
- }
1218
+ test("formats days ago", () => {
1219
+ const twoDaysAgo = new Date(Date.now() - 2 * 86400000).toISOString();
1220
+ const result = formatRelativeTime(twoDaysAgo);
1221
+ expect(result).toMatch(/2d ago/);
1222
+ });
1223
+ });
1414
1224
 
1415
- if (result.regressionPercent !== undefined) {
1416
- const sign = result.regressionPercent > 0 ? "+" : "";
1417
- lines.push(`Regression: ${sign}${(result.regressionPercent * 100).toFixed(1)}%`);
1418
- }
1225
+ describe("formatSwarmHistory", () => {
1226
+ test("formats history as beautiful box-drawn table", () => {
1227
+ const records: SwarmHistoryRecord[] = [
1228
+ {
1229
+ epic_id: "epic-1",
1230
+ epic_title: "Add auth flow",
1231
+ strategy: "feature-based",
1232
+ timestamp: new Date(Date.now() - 2 * 3600000).toISOString(),
1233
+ overall_success: true,
1234
+ task_count: 4,
1235
+ completed_count: 4,
1236
+ },
1237
+ {
1238
+ epic_id: "epic-2",
1239
+ epic_title: "Refactor DB layer",
1240
+ strategy: "file-based",
1241
+ timestamp: new Date(Date.now() - 5 * 3600000).toISOString(),
1242
+ overall_success: false,
1243
+ task_count: 5,
1244
+ completed_count: 2,
1245
+ },
1246
+ ];
1419
1247
 
1420
- lines.push("");
1421
- lines.push(result.message);
1248
+ const result = formatSwarmHistory(records);
1249
+
1250
+ expect(result).toContain("┌─────");
1251
+ expect(result).toContain("SWARM HISTORY");
1252
+ expect(result).toContain("✅");
1253
+ expect(result).toContain("❌");
1254
+ expect(result).toContain("Add auth flow");
1255
+ expect(result).toContain("Refactor DB layer");
1256
+ expect(result).toContain("feature-based");
1257
+ expect(result).toContain("file-based");
1258
+ expect(result).toContain("4/4 tasks");
1259
+ expect(result).toContain("2/5 tasks");
1260
+ expect(result).toContain("└─────");
1261
+ });
1422
1262
 
1423
- return lines.join("\n");
1424
- }
1263
+ test("truncates long titles with ellipsis", () => {
1264
+ const records: SwarmHistoryRecord[] = [
1265
+ {
1266
+ epic_id: "epic-1",
1267
+ epic_title: "A".repeat(100),
1268
+ strategy: "feature-based",
1269
+ timestamp: new Date(Date.now() - 1000).toISOString(),
1270
+ overall_success: true,
1271
+ task_count: 1,
1272
+ completed_count: 1,
1273
+ },
1274
+ ];
1425
1275
 
1426
- // ============================================================================
1427
- // Eval Run Tests
1428
- // ============================================================================
1276
+ const result = formatSwarmHistory(records);
1429
1277
 
1430
- describe("Eval Run CI Mode", () => {
1431
- let testDir: string;
1278
+ expect(result).toContain("...");
1279
+ expect(result).toMatch(/A{27}\.\.\./);
1280
+ });
1432
1281
 
1433
- beforeEach(() => {
1434
- testDir = join(tmpdir(), `eval-run-test-${Date.now()}`);
1435
- mkdirSync(testDir, { recursive: true });
1282
+ test("returns 'No swarm history found' for empty array", () => {
1283
+ const result = formatSwarmHistory([]);
1284
+ expect(result).toBe("No swarm history found");
1285
+ });
1436
1286
  });
1437
1287
 
1438
- afterEach(() => {
1439
- if (existsSync(testDir)) {
1440
- rmSync(testDir, { recursive: true, force: true });
1441
- }
1442
- });
1288
+ describe("filterHistoryByStatus", () => {
1289
+ const records: SwarmHistoryRecord[] = [
1290
+ {
1291
+ epic_id: "epic-1",
1292
+ epic_title: "Success",
1293
+ strategy: "feature-based",
1294
+ timestamp: "2025-01-01T00:00:00Z",
1295
+ overall_success: true,
1296
+ task_count: 4,
1297
+ completed_count: 4,
1298
+ },
1299
+ {
1300
+ epic_id: "epic-2",
1301
+ epic_title: "Failed",
1302
+ strategy: "file-based",
1303
+ timestamp: "2025-01-01T00:00:00Z",
1304
+ overall_success: false,
1305
+ task_count: 4,
1306
+ completed_count: 4,
1307
+ },
1308
+ {
1309
+ epic_id: "epic-3",
1310
+ epic_title: "In Progress",
1311
+ strategy: "risk-based",
1312
+ timestamp: "2025-01-01T00:00:00Z",
1313
+ overall_success: false,
1314
+ task_count: 5,
1315
+ completed_count: 2,
1316
+ },
1317
+ ];
1318
+
1319
+ test("filters success only", () => {
1320
+ const result = filterHistoryByStatus(records, "success");
1321
+ expect(result).toHaveLength(1);
1322
+ expect(result[0].epic_title).toBe("Success");
1323
+ });
1443
1324
 
1444
- test("writes eval results JSON file", async () => {
1445
- // Import the function we need to test
1446
- const { recordEvalRun, getScoreHistory } = await import("../src/eval-history.js");
1447
- const { checkGate } = await import("../src/eval-gates.js");
1448
- const { ensureHiveDirectory } = await import("../src/hive.js");
1449
-
1450
- // Set up test data
1451
- const evalName = "test-eval";
1452
- const mockScore = 0.85;
1453
-
1454
- // Ensure directory exists
1455
- ensureHiveDirectory(testDir);
1456
-
1457
- // Get history and record run (simulating what eval run does)
1458
- const history = getScoreHistory(testDir, evalName);
1459
- recordEvalRun(testDir, {
1460
- timestamp: new Date().toISOString(),
1461
- eval_name: evalName,
1462
- score: mockScore,
1463
- run_count: history.length + 1,
1325
+ test("filters failed only", () => {
1326
+ const result = filterHistoryByStatus(records, "failed");
1327
+ expect(result).toHaveLength(1);
1328
+ expect(result[0].epic_title).toBe("Failed");
1464
1329
  });
1465
1330
 
1466
- // Check gate
1467
- const gateResult = checkGate(testDir, evalName, mockScore);
1331
+ test("filters in_progress only", () => {
1332
+ const result = filterHistoryByStatus(records, "in_progress");
1333
+ expect(result).toHaveLength(1);
1334
+ expect(result[0].epic_title).toBe("In Progress");
1335
+ });
1468
1336
 
1469
- // Write results file (simulating CI mode)
1470
- const resultsPath = join(testDir, ".hive", "eval-results.json");
1471
- const results = { [evalName]: gateResult };
1472
- writeFileSync(resultsPath, JSON.stringify(results, null, 2));
1337
+ test("returns all when no status filter", () => {
1338
+ const result = filterHistoryByStatus(records);
1339
+ expect(result).toHaveLength(3);
1340
+ });
1341
+ });
1473
1342
 
1474
- // Verify file exists and has correct structure
1475
- expect(existsSync(resultsPath)).toBe(true);
1343
+ describe("filterHistoryByStrategy", () => {
1344
+ const records: SwarmHistoryRecord[] = [
1345
+ {
1346
+ epic_id: "epic-1",
1347
+ epic_title: "File",
1348
+ strategy: "file-based",
1349
+ timestamp: "2025-01-01T00:00:00Z",
1350
+ overall_success: true,
1351
+ task_count: 4,
1352
+ completed_count: 4,
1353
+ },
1354
+ {
1355
+ epic_id: "epic-2",
1356
+ epic_title: "Feature",
1357
+ strategy: "feature-based",
1358
+ timestamp: "2025-01-01T00:00:00Z",
1359
+ overall_success: true,
1360
+ task_count: 4,
1361
+ completed_count: 4,
1362
+ },
1363
+ {
1364
+ epic_id: "epic-3",
1365
+ epic_title: "Risk",
1366
+ strategy: "risk-based",
1367
+ timestamp: "2025-01-01T00:00:00Z",
1368
+ overall_success: true,
1369
+ task_count: 4,
1370
+ completed_count: 4,
1371
+ },
1372
+ ];
1373
+
1374
+ test("filters file-based only", () => {
1375
+ const result = filterHistoryByStrategy(records, "file-based");
1376
+ expect(result).toHaveLength(1);
1377
+ expect(result[0].epic_title).toBe("File");
1378
+ });
1476
1379
 
1477
- const savedResults = JSON.parse(readFileSync(resultsPath, "utf-8"));
1478
- expect(savedResults).toHaveProperty(evalName);
1479
- expect(savedResults[evalName]).toMatchObject({
1480
- passed: true,
1481
- phase: "bootstrap",
1482
- currentScore: mockScore,
1380
+ test("filters feature-based only", () => {
1381
+ const result = filterHistoryByStrategy(records, "feature-based");
1382
+ expect(result).toHaveLength(1);
1383
+ expect(result[0].epic_title).toBe("Feature");
1384
+ });
1385
+
1386
+ test("filters risk-based only", () => {
1387
+ const result = filterHistoryByStrategy(records, "risk-based");
1388
+ expect(result).toHaveLength(1);
1389
+ expect(result[0].epic_title).toBe("Risk");
1390
+ });
1391
+
1392
+ test("returns all when no strategy filter", () => {
1393
+ const result = filterHistoryByStrategy(records);
1394
+ expect(result).toHaveLength(3);
1483
1395
  });
1484
1396
  });
1485
1397
 
1486
- test("bootstrap phase always passes", async () => {
1487
- const { checkGate } = await import("../src/eval-gates.js");
1398
+ describe("parseHistoryArgs", () => {
1399
+ test("parses --limit flag", () => {
1400
+ const result = parseHistoryArgs(["--limit", "20"]);
1401
+ expect(result.limit).toBe(20);
1402
+ });
1488
1403
 
1489
- // Even with a low score, bootstrap phase should pass
1490
- const result = checkGate(testDir, "test-eval", 0.1);
1404
+ test("parses -n shorthand for limit", () => {
1405
+ const result = parseHistoryArgs(["-n", "5"]);
1406
+ expect(result.limit).toBe(5);
1407
+ });
1491
1408
 
1492
- expect(result.passed).toBe(true);
1493
- expect(result.phase).toBe("bootstrap");
1494
- expect(result.message).toContain("Bootstrap phase");
1495
- });
1409
+ test("parses --status flag", () => {
1410
+ const result = parseHistoryArgs(["--status", "success"]);
1411
+ expect(result.status).toBe("success");
1412
+ });
1496
1413
 
1497
- test("production phase fails on regression", async () => {
1498
- const { recordEvalRun } = await import("../src/eval-history.js");
1499
- const { checkGate } = await import("../src/eval-gates.js");
1500
- const { ensureHiveDirectory } = await import("../src/hive.js");
1501
-
1502
- ensureHiveDirectory(testDir);
1503
-
1504
- // Simulate 60 runs with consistent high scores to reach production phase
1505
- for (let i = 0; i < 60; i++) {
1506
- recordEvalRun(testDir, {
1507
- timestamp: new Date().toISOString(),
1508
- eval_name: "test-eval",
1509
- score: 0.9,
1510
- run_count: i + 1,
1511
- });
1512
- }
1414
+ test("parses --strategy flag", () => {
1415
+ const result = parseHistoryArgs(["--strategy", "file-based"]);
1416
+ expect(result.strategy).toBe("file-based");
1417
+ });
1513
1418
 
1514
- // Now test with a regressed score (>5% drop from 0.9 baseline)
1515
- const regressedScore = 0.8; // 11% drop
1516
- const result = checkGate(testDir, "test-eval", regressedScore);
1419
+ test("parses --verbose flag", () => {
1420
+ const result = parseHistoryArgs(["--verbose"]);
1421
+ expect(result.verbose).toBe(true);
1422
+ });
1423
+
1424
+ test("parses -v shorthand for verbose", () => {
1425
+ const result = parseHistoryArgs(["-v"]);
1426
+ expect(result.verbose).toBe(true);
1427
+ });
1428
+
1429
+ test("parses multiple flags together", () => {
1430
+ const result = parseHistoryArgs(["--limit", "15", "--status", "failed", "--verbose"]);
1431
+ expect(result.limit).toBe(15);
1432
+ expect(result.status).toBe("failed");
1433
+ expect(result.verbose).toBe(true);
1434
+ });
1517
1435
 
1518
- expect(result.passed).toBe(false);
1519
- expect(result.phase).toBe("production");
1520
- expect(result.message).toContain("FAIL");
1436
+ test("uses default limit of 10 when not specified", () => {
1437
+ const result = parseHistoryArgs([]);
1438
+ expect(result.limit).toBe(10);
1439
+ });
1440
+
1441
+ test("ignores invalid status values", () => {
1442
+ const result = parseHistoryArgs(["--status", "invalid"]);
1443
+ expect(result.status).toBeUndefined();
1444
+ });
1445
+
1446
+ test("ignores invalid strategy values", () => {
1447
+ const result = parseHistoryArgs(["--strategy", "invalid"]);
1448
+ expect(result.strategy).toBeUndefined();
1449
+ });
1521
1450
  });
1522
1451
  });