npm - opencode-swarm-plugin - Versions diffs - 0.40.0 → 0.42.1 - Mend

opencode-swarm-plugin 0.40.0 → 0.42.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61)

package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
package/.hive/analysis/session-data-quality-audit.md +320 -0
package/.hive/eval-results.json +481 -24
package/.hive/issues.jsonl +67 -16
package/.hive/memories.jsonl +159 -1
package/.opencode/eval-history.jsonl +315 -0
package/.turbo/turbo-build.log +5 -5
package/CHANGELOG.md +165 -0
package/README.md +2 -0
package/SCORER-ANALYSIS.md +598 -0
package/bin/eval-gate.test.ts +158 -0
package/bin/eval-gate.ts +74 -0
package/bin/swarm.serve.test.ts +46 -0
package/bin/swarm.test.ts +661 -732
package/bin/swarm.ts +335 -0
package/dist/compaction-hook.d.ts +7 -5
package/dist/compaction-hook.d.ts.map +1 -1
package/dist/compaction-prompt-scoring.d.ts +1 -0
package/dist/compaction-prompt-scoring.d.ts.map +1 -1
package/dist/eval-runner.d.ts +134 -0
package/dist/eval-runner.d.ts.map +1 -0
package/dist/hive.d.ts.map +1 -1
package/dist/index.d.ts +29 -0
package/dist/index.d.ts.map +1 -1
package/dist/index.js +99741 -58858
package/dist/memory-tools.d.ts +70 -2
package/dist/memory-tools.d.ts.map +1 -1
package/dist/memory.d.ts +37 -0
package/dist/memory.d.ts.map +1 -1
package/dist/observability-tools.d.ts +64 -0
package/dist/observability-tools.d.ts.map +1 -1
package/dist/plugin.js +99356 -58318
package/dist/swarm-orchestrate.d.ts.map +1 -1
package/dist/swarm-prompts.d.ts +32 -1
package/dist/swarm-prompts.d.ts.map +1 -1
package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
package/evals/ARCHITECTURE.md +1189 -0
package/evals/example.eval.ts +3 -4
package/evals/fixtures/compaction-prompt-cases.ts +6 -0
package/evals/scorers/coordinator-discipline.evalite-test.ts +1 -162
package/evals/scorers/coordinator-discipline.ts +0 -323
package/evals/swarm-decomposition.eval.ts +4 -2
package/package.json +4 -3
package/src/compaction-prompt-scorers.test.ts +185 -9
package/src/compaction-prompt-scoring.ts +7 -5
package/src/eval-runner.test.ts +128 -1
package/src/eval-runner.ts +46 -0
package/src/hive.ts +43 -42
package/src/memory-tools.test.ts +84 -0
package/src/memory-tools.ts +68 -3
package/src/memory.test.ts +2 -112
package/src/memory.ts +88 -49
package/src/observability-tools.test.ts +13 -0
package/src/observability-tools.ts +277 -0
package/src/swarm-orchestrate.test.ts +162 -0
package/src/swarm-orchestrate.ts +7 -5
package/src/swarm-prompts.test.ts +168 -4
package/src/swarm-prompts.ts +228 -7
package/.env +0 -2
package/.turbo/turbo-test.log +0 -481
package/.turbo/turbo-typecheck.log +0 -1

package/bin/swarm.test.ts CHANGED

@@ -1,11 +1,10 @@
 #!/usr/bin/env bun
 /**
- * Tests for swarm CLI file operation helpers
+ * Tests for swarm CLI helpers
  *
- * These tests verify the verbose output helpers used in `swarm setup`:
- * - writeFileWithStatus: logs created/updated/unchanged status
- * - mkdirWithStatus: logs directory creation
- * - rmWithStatus: logs file removal
+ * These tests verify the CLI helpers:
+ * - File operation helpers (writeFileWithStatus, mkdirWithStatus, rmWithStatus)
+ * - Swarm history helpers (formatSwarmHistory, parseHistoryArgs, filterHistoryByStatus)
  */
 import { describe, test, expect, beforeEach, afterEach } from "bun:test";
 import { mkdirSync, rmSync, writeFileSync, existsSync, readFileSync, readdirSync } from "fs";
@@ -501,17 +500,17 @@ describe("swarm log sessions", () => {
         {
           session_id: "s1",
           epic_id: "e1",
-          timestamp: "2025-01-01T00:01:00Z",
+          timestamp: "2025-01-01T00:00:01Z",
           event_type: "VIOLATION",
-          violation_type: "coordinator_edited_file",
+          violation_type: "direct_edit",
           payload: {},
         },
         {
           session_id: "s1",
           epic_id: "e1",
-          timestamp: "2025-01-01T00:02:00Z",
+          timestamp: "2025-01-01T00:00:02Z",
           event_type: "DECISION",
-          decision_type: "review_completed",
+          decision_type: "worker_spawned",
           payload: {},
         },
       ];
@@ -535,9 +534,9 @@ describe("swarm log sessions", () => {
         {
           session_id: "s1",
           epic_id: "e1",
-          timestamp: "2025-01-01T00:01:00Z",
+          timestamp: "2025-01-01T00:00:01Z",
           event_type: "VIOLATION",
-          violation_type: "coordinator_edited_file",
+          violation_type: "direct_edit",
           payload: {},
         },
       ];
@@ -555,7 +554,7 @@ describe("swarm log sessions", () => {
         {
           session_id: "s1",
           epic_id: "e1",
-          timestamp: new Date(now - 10000).toISOString(), // 10s ago
+          timestamp: new Date(now - 5000).toISOString(), // 5s ago
           event_type: "DECISION",
           decision_type: "worker_spawned",
           payload: {},
@@ -563,17 +562,17 @@ describe("swarm log sessions", () => {
         {
           session_id: "s1",
           epic_id: "e1",
-          timestamp: new Date(now - 60000).toISOString(), // 1m ago
-          event_type: "VIOLATION",
-          violation_type: "coordinator_edited_file",
+          timestamp: new Date(now - 10000).toISOString(), // 10s ago
+          event_type: "DECISION",
+          decision_type: "worker_spawned",
           payload: {},
         },
         {
           session_id: "s1",
           epic_id: "e1",
-          timestamp: new Date(now - 3000).toISOString(), // 3s ago
-          event_type: "OUTCOME",
-          outcome_type: "subtask_success",
+          timestamp: new Date(now - 60000).toISOString(), // 1min ago
+          event_type: "DECISION",
+          decision_type: "worker_spawned",
           payload: {},
         },
       ];
@@ -682,841 +681,771 @@ describe("Cells command", () => {
         },
       ];
-      const table = formatCellsTable(cells);
-      // Should contain headers
-      expect(table).toContain("ID");
-      expect(table).toContain("TITLE");
-      expect(table).toContain("STATUS");
-      expect(table).toContain("PRIORITY");
-      // Should contain cell data
-      expect(table).toContain("test-abc123-xyz");
-      expect(table).toContain("Fix bug");
-      expect(table).toContain("open");
-      expect(table).toContain("0");
-      expect(table).toContain("test-def456-abc");
-      expect(table).toContain("Add feature");
-      expect(table).toContain("in_progress");
-      expect(table).toContain("2");
-    });
+      const result = formatCellsTable(cells);
-    test("returns 'No cells found' for empty array", () => {
-      const table = formatCellsTable([]);
-      expect(table).toBe("No cells found");
+      expect(result).toContain("ID");
+      expect(result).toContain("TITLE");
+      expect(result).toContain("STATUS");
+      expect(result).toContain("PRIORITY");
+      expect(result).toContain("Fix bug");
+      expect(result).toContain("Add feature");
+      expect(result).toContain("open");
+      expect(result).toContain("in_progress");
     });
-  });
-});
-describe("Log command helpers", () => {
-  let testDir: string;
-  beforeEach(() => {
-    testDir = join(tmpdir(), `swarm-log-test-${Date.now()}`);
-    mkdirSync(testDir, { recursive: true });
-  });
-  afterEach(() => {
-    if (existsSync(testDir)) {
-      rmSync(testDir, { recursive: true, force: true });
-    }
-  });
-  describe("parseLogLine", () => {
-    function parseLogLine(line: string): { level: number; time: string; module: string; msg: string } | null {
-      try {
-        const parsed = JSON.parse(line);
-        if (typeof parsed.level === "number" && parsed.time && parsed.msg) {
-          return {
-            level: parsed.level,
-            time: parsed.time,
-            module: parsed.module || "unknown",
-            msg: parsed.msg,
-          };
-        }
-      } catch {
-        // Invalid JSON
-      }
-      return null;
-    }
-    test("parses valid log line", () => {
-      const line = '{"level":30,"time":"2024-12-24T16:00:00.000Z","module":"compaction","msg":"started"}';
-      const result = parseLogLine(line);
-      expect(result).not.toBeNull();
-      expect(result?.level).toBe(30);
-      expect(result?.module).toBe("compaction");
-      expect(result?.msg).toBe("started");
-    });
+    test("truncates long titles with ellipsis", () => {
+      const cells = [
+        {
+          id: "test-abc",
+          title: "A".repeat(100),
+          status: "open",
+          priority: 0,
+          type: "task",
+          created_at: 1234567890,
+          updated_at: 1234567890,
+        },
+      ];
-    test("returns null for invalid JSON", () => {
-      const line = "not json";
-      expect(parseLogLine(line)).toBeNull();
-    });
+      const result = formatCellsTable(cells);
-    test("defaults module to 'unknown' if missing", () => {
-      const line = '{"level":30,"time":"2024-12-24T16:00:00.000Z","msg":"test"}';
-      const result = parseLogLine(line);
-      expect(result?.module).toBe("unknown");
+      expect(result).toContain("...");
+      expect(result.split("\n")[2]).toMatch(/A{47}\.\.\./);
     });
-  });
-  describe("filterLogsByLevel", () => {
-    function filterLogsByLevel(logs: Array<{ level: number }>, minLevel: number): Array<{ level: number }> {
-      return logs.filter((log) => log.level >= minLevel);
-    }
+    test("returns 'No cells found' for empty array", () => {
+      const result = formatCellsTable([]);
-    test("filters logs by minimum level", () => {
-      const logs = [
-        { level: 10 }, // trace
-        { level: 30 }, // info
-        { level: 50 }, // error
-      ];
-      const result = filterLogsByLevel(logs, 30);
-      expect(result).toHaveLength(2);
-      expect(result[0].level).toBe(30);
-      expect(result[1].level).toBe(50);
+      expect(result).toBe("No cells found");
     });
-    test("includes all logs when minLevel is 0", () => {
-      const logs = [
-        { level: 10 },
-        { level: 20 },
-        { level: 30 },
+    test("aligns columns correctly", () => {
+      const cells = [
+        {
+          id: "short",
+          title: "T",
+          status: "open",
+          priority: 0,
+          type: "task",
+          created_at: 1234567890,
+          updated_at: 1234567890,
+        },
+        {
+          id: "very-long-id-here",
+          title: "Very long title here",
+          status: "in_progress",
+          priority: 2,
+          type: "task",
+          created_at: 1234567890,
+          updated_at: 1234567890,
+        },
       ];
-      const result = filterLogsByLevel(logs, 0);
-      expect(result).toHaveLength(3);
-    });
-  });
-  describe("filterLogsByModule", () => {
-    function filterLogsByModule(logs: Array<{ module: string }>, module: string): Array<{ module: string }> {
-      return logs.filter((log) => log.module === module);
-    }
+      const result = formatCellsTable(cells);
+      const lines = result.split("\n");
-    test("filters logs by exact module name", () => {
-      const logs = [
-        { module: "compaction" },
-        { module: "swarm" },
-        { module: "compaction" },
-      ];
-      const result = filterLogsByModule(logs, "compaction");
-      expect(result).toHaveLength(2);
-    });
-    test("returns empty array when no match", () => {
-      const logs = [
-        { module: "compaction" },
-      ];
-      const result = filterLogsByModule(logs, "swarm");
-      expect(result).toHaveLength(0);
+      // All lines should be same length (aligned)
+      const lengths = lines.map(l => l.length);
+      expect(Math.max(...lengths) - Math.min(...lengths)).toBeLessThan(3);
     });
   });
+});
-  describe("filterLogsBySince", () => {
-    function parseDuration(duration: string): number | null {
-      const match = duration.match(/^(\d+)([smhd])$/);
-      if (!match) return null;
-      const [, num, unit] = match;
-      const value = parseInt(num, 10);
-      const multipliers: Record<string, number> = {
-        s: 1000,
-        m: 60 * 1000,
-        h: 60 * 60 * 1000,
-        d: 24 * 60 * 60 * 1000,
-      };
-      return value * multipliers[unit];
-    }
-    function filterLogsBySince(logs: Array<{ time: string }>, sinceMs: number): Array<{ time: string }> {
-      const cutoffTime = Date.now() - sinceMs;
-      return logs.filter((log) => new Date(log.time).getTime() >= cutoffTime);
-    }
-    test("parseDuration handles seconds", () => {
-      expect(parseDuration("30s")).toBe(30 * 1000);
-    });
+// ============================================================================
+// Eval Gate Tests (TDD)
+// ============================================================================
-    test("parseDuration handles minutes", () => {
-      expect(parseDuration("5m")).toBe(5 * 60 * 1000);
-    });
+interface EvalRunRecord {
+  timestamp: string;
+  eval_name: string;
+  score: number;
+  run_count: number;
+}
-    test("parseDuration handles hours", () => {
-      expect(parseDuration("2h")).toBe(2 * 60 * 60 * 1000);
-    });
+interface GateResult {
+  passed: boolean;
+  phase: "bootstrap" | "stabilization" | "production";
+  message: string;
+  baseline?: number;
+  variance?: number;
+}
-    test("parseDuration handles days", () => {
-      expect(parseDuration("1d")).toBe(24 * 60 * 60 * 1000);
-    });
+/**
+ * Calculate variance for phase transitions
+ */
+function calculateVariance(scores: number[]): number {
+  if (scores.length <= 1) return 0;
-    test("parseDuration returns null for invalid format", () => {
-      expect(parseDuration("invalid")).toBeNull();
-      expect(parseDuration("30x")).toBeNull();
-      expect(parseDuration("30")).toBeNull();
-    });
+  const mean = scores.reduce((sum, x) => sum + x, 0) / scores.length;
+  const squaredDiffs = scores.map((x) => Math.pow(x - mean, 2));
+  const variance = squaredDiffs.reduce((sum, x) => sum + x, 0) / scores.length;
-    test("filterLogsBySince filters old logs", () => {
-      const now = Date.now();
-      const logs = [
-        { time: new Date(now - 10000).toISOString() }, // 10s ago
-        { time: new Date(now - 120000).toISOString() }, // 2m ago
-        { time: new Date(now - 1000).toISOString() }, // 1s ago
-      ];
-      const result = filterLogsBySince(logs, 60000); // Last 1m
-      expect(result).toHaveLength(2); // Only logs within last minute
-    });
-  });
+  return variance;
+}
-  describe("formatLogLine", () => {
-    function levelToName(level: number): string {
-      if (level >= 60) return "FATAL";
-      if (level >= 50) return "ERROR";
-      if (level >= 40) return "WARN ";
-      if (level >= 30) return "INFO ";
-      if (level >= 20) return "DEBUG";
-      return "TRACE";
-    }
+/**
+ * Read all eval run records from .hive/eval-history.jsonl
+ */
+function readAllRecords(projectPath: string): EvalRunRecord[] {
+  const recordsPath = join(projectPath, ".hive", "eval-history.jsonl");
-    function formatLogLine(log: { level: number; time: string; module: string; msg: string }): string {
-      const timestamp = new Date(log.time).toLocaleTimeString();
-      const levelName = levelToName(log.level);
-      const module = log.module.padEnd(12);
-      return `${timestamp} ${levelName} ${module} ${log.msg}`;
-    }
+  if (!existsSync(recordsPath)) {
+    return [];
+  }
-    test("formats log line with timestamp and level", () => {
-      const log = {
-        level: 30,
-        time: "2024-12-24T16:00:00.000Z",
-        module: "compaction",
-        msg: "started",
-      };
-      const result = formatLogLine(log);
-      expect(result).toContain("INFO");
-      expect(result).toContain("compaction");
-      expect(result).toContain("started");
-    });
+  const content = readFileSync(recordsPath, "utf-8");
+  const lines = content.split("\n").filter((line) => line.trim());
-    test("pads module name for alignment", () => {
-      const log1 = formatLogLine({ level: 30, time: "2024-12-24T16:00:00.000Z", module: "a", msg: "test" });
-      const log2 = formatLogLine({ level: 30, time: "2024-12-24T16:00:00.000Z", module: "compaction", msg: "test" });
-      // Module names should be padded to 12 chars
-      expect(log1).toContain("a            test"); // 'a' + 11 spaces
-      expect(log2).toContain("compaction   test"); // 'compaction' + 3 spaces (10 chars + 2)
-    });
+  return lines.map((line) => JSON.parse(line) as EvalRunRecord);
+}
-    test("levelToName maps all levels correctly", () => {
-      expect(levelToName(10)).toBe("TRACE");
-      expect(levelToName(20)).toBe("DEBUG");
-      expect(levelToName(30)).toBe("INFO ");
-      expect(levelToName(40)).toBe("WARN ");
-      expect(levelToName(50)).toBe("ERROR");
-      expect(levelToName(60)).toBe("FATAL");
-    });
-  });
+/**
+ * Record an eval run to .hive/eval-history.jsonl
+ */
+function recordEvalRun(
+  projectPath: string,
+  record: EvalRunRecord,
+): void {
+  const hivePath = join(projectPath, ".hive");
+  const recordsPath = join(hivePath, "eval-history.jsonl");
+  // Ensure .hive directory exists
+  if (!existsSync(hivePath)) {
+    mkdirSync(hivePath, { recursive: true });
+  }
-  describe("readLogFiles", () => {
-    test("reads multiple .1log files", () => {
-      // Create test log files
-      const log1 = join(testDir, "swarm.1log");
-      const log2 = join(testDir, "swarm.2log");
-      const log3 = join(testDir, "compaction.1log");
-      writeFileSync(log1, '{"level":30,"time":"2024-12-24T16:00:00.000Z","msg":"line1"}\n');
-      writeFileSync(log2, '{"level":30,"time":"2024-12-24T16:00:01.000Z","msg":"line2"}\n');
-      writeFileSync(log3, '{"level":30,"time":"2024-12-24T16:00:02.000Z","module":"compaction","msg":"line3"}\n');
-      function readLogFiles(dir: string): string[] {
-        if (!existsSync(dir)) return [];
-        const files = readdirSync(dir)
-          .filter((f) => /\.\d+log$/.test(f))
-          .sort() // Sort by filename
-          .map((f) => join(dir, f));
-        const lines: string[] = [];
-        for (const file of files) {
-          const content = readFileSync(file, "utf-8");
-          lines.push(...content.split("\n").filter((line) => line.trim()));
-        }
-        return lines;
-      }
-      const lines = readLogFiles(testDir);
-      expect(lines).toHaveLength(3);
-      // Files are sorted alphabetically: compaction.1log, swarm.1log, swarm.2log
-      expect(lines.some((l) => l.includes("line1"))).toBe(true);
-      expect(lines.some((l) => l.includes("line2"))).toBe(true);
-      expect(lines.some((l) => l.includes("line3"))).toBe(true);
-    });
+  // Append record as JSONL
+  const line = JSON.stringify(record) + "\n";
-    test("returns empty array for non-existent directory", () => {
-      function readLogFiles(dir: string): string[] {
-        if (!existsSync(dir)) return [];
-        return [];
-      }
-      const lines = readLogFiles(join(testDir, "nonexistent"));
-      expect(lines).toHaveLength(0);
-    });
-  });
+  if (existsSync(recordsPath)) {
+    const existingContent = readFileSync(recordsPath, "utf-8");
+    writeFileSync(recordsPath, existingContent + line);
+  } else {
+    writeFileSync(recordsPath, line);
+  }
+}
-  describe("watchLogs", () => {
-    test("detects new log lines appended to file", async () => {
-      const logFile = join(testDir, "swarm.1log");
-      const collectedLines: string[] = [];
-      // Create initial log file
-      writeFileSync(logFile, '{"level":30,"time":"2024-12-24T16:00:00.000Z","msg":"initial"}\n');
-      // Import watch utilities
-      const { watch } = await import("fs");
-      const { appendFileSync } = await import("fs");
-      // Track file position for incremental reads
-      let lastSize = 0;
-      function readNewLines(filePath: string): string[] {
-        const content = readFileSync(filePath, "utf-8");
-        const newContent = content.slice(lastSize);
-        lastSize = content.length;
-        return newContent.split("\n").filter((line) => line.trim());
-      }
-      // Simulate watch behavior
-      const watcher = watch(testDir, (eventType, filename) => {
-        if (filename && /\.\d+log$/.test(filename)) {
-          const newLines = readNewLines(join(testDir, filename));
-          collectedLines.push(...newLines);
-        }
-      });
-      // Wait for watcher to be ready
-      await new Promise((resolve) => setTimeout(resolve, 100));
-      // Append new log line
-      appendFileSync(logFile, '{"level":30,"time":"2024-12-24T16:00:01.000Z","msg":"appended"}\n');
-      // Wait for event to fire
-      await new Promise((resolve) => setTimeout(resolve, 200));
-      watcher.close();
-      // Should have detected the new line
-      expect(collectedLines.some((l) => l.includes("appended"))).toBe(true);
-    });
+/**
+ * Check eval gate for progressive gating
+ */
+function checkGate(
+  projectPath: string,
+  evalName: string,
+  currentScore: number,
+): GateResult {
+  const records = readAllRecords(projectPath).filter(
+    (r) => r.eval_name === evalName,
+  );
-    test("parseWatchArgs extracts --watch flag", () => {
-      function parseWatchArgs(args: string[]): { watch: boolean; interval: number } {
-        let watch = false;
-        let interval = 1000; // default 1 second
-        for (let i = 0; i < args.length; i++) {
-          const arg = args[i];
-          if (arg === "--watch" || arg === "-w") {
-            watch = true;
-          } else if (arg === "--interval" && i + 1 < args.length) {
-            interval = parseInt(args[++i], 10);
-          }
-        }
-        return { watch, interval };
-      }
-      expect(parseWatchArgs(["--watch"])).toEqual({ watch: true, interval: 1000 });
-      expect(parseWatchArgs(["-w"])).toEqual({ watch: true, interval: 1000 });
-      expect(parseWatchArgs(["--watch", "--interval", "500"])).toEqual({ watch: true, interval: 500 });
-      expect(parseWatchArgs(["compaction", "--watch"])).toEqual({ watch: true, interval: 1000 });
-      expect(parseWatchArgs(["--level", "error"])).toEqual({ watch: false, interval: 1000 });
-    });
-  });
-});
+  if (records.length < 10) {
+    return {
+      passed: true,
+      phase: "bootstrap",
+      message: `BOOTSTRAP (${records.length}/10 runs): no gates yet`,
+    };
+  }
-// ============================================================================
-// Eval Commands Tests (TDD)
-// ============================================================================
+  const lastTenScores = records.slice(-10).map((r) => r.score);
+  const baseline = lastTenScores.reduce((sum, x) => sum + x, 0) / lastTenScores.length;
+  const variance = calculateVariance(lastTenScores);
-describe("Eval commands", () => {
-  describe("formatEvalStatus", () => {
-    test("displays phase, thresholds, and recent scores", () => {
-      const status = {
-        phase: "stabilization" as const,
-        runCount: 25,
-        thresholds: {
-          stabilization: 0.1,
-          production: 0.05,
-        },
-        recentScores: [
-          { timestamp: "2024-12-24T10:00:00.000Z", score: 0.85 },
-          { timestamp: "2024-12-24T11:00:00.000Z", score: 0.87 },
-          { timestamp: "2024-12-24T12:00:00.000Z", score: 0.82 },
-        ],
+  if (records.length < 50) {
+    const drop = ((baseline - currentScore) / baseline) * 100;
+    if (drop > 5) {
+      return {
+        passed: false,
+        phase: "stabilization",
+        message: `WARN: Score dropped ${drop.toFixed(1)}% from baseline ${baseline.toFixed(2)}`,
+        baseline,
+        variance,
       };
+    }
-      const output = formatEvalStatus(status);
-      // Should show phase
-      expect(output).toContain("stabilization");
-      // Should show run count
-      expect(output).toContain("25");
-      // Should show thresholds
-      expect(output).toContain("10%"); // stabilization threshold
-      expect(output).toContain("5%");  // production threshold
-      // Should show recent scores
-      expect(output).toContain("0.85");
-      expect(output).toContain("0.87");
-      expect(output).toContain("0.82");
-    });
+    return {
+      passed: true,
+      phase: "stabilization",
+      message: `Stabilization (${records.length}/50 runs): baseline=${baseline.toFixed(2)}`,
+      baseline,
+      variance,
+    };
+  }
-    test("shows bootstrap phase message", () => {
-      const status = {
-        phase: "bootstrap" as const,
-        runCount: 5,
-        thresholds: {
-          stabilization: 0.1,
-          production: 0.05,
-        },
-        recentScores: [],
+  // Production phase: variance < 0.1 AND score doesn't drop >5%
+  if (variance < 0.1) {
+    const drop = ((baseline - currentScore) / baseline) * 100;
+    if (drop > 5) {
+      return {
+        passed: false,
+        phase: "production",
+        message: `FAIL: Score dropped ${drop.toFixed(1)}% from baseline ${baseline.toFixed(2)} (variance=${variance.toFixed(3)})`,
+        baseline,
+        variance,
       };
+    }
-      const output = formatEvalStatus(status);
+    return {
+      passed: true,
+      phase: "production",
+      message: `PASS: Production phase (variance=${variance.toFixed(3)}, baseline=${baseline.toFixed(2)})`,
+      baseline,
+      variance,
+    };
+  }
-      expect(output).toContain("bootstrap");
-      expect(output).toContain("collecting data");
-    });
+  // Stuck in stabilization (>50 runs but variance still high)
+  return {
+    passed: true,
+    phase: "stabilization",
+    message: `Stabilization: variance too high (${variance.toFixed(3)} > 0.1), need more consistent runs`,
+    baseline,
+    variance,
+  };
+}
-    test("shows production phase message", () => {
-      const status = {
-        phase: "production" as const,
-        runCount: 75,
-        thresholds: {
-          stabilization: 0.1,
-          production: 0.05,
-        },
-        recentScores: [],
-      };
+/**
+ * Ensure .hive directory exists
+ */
+function ensureHiveDirectory(projectPath: string): void {
+  const hivePath = join(projectPath, ".hive");
+  if (!existsSync(hivePath)) {
+    mkdirSync(hivePath, { recursive: true });
+  }
+}
-      const output = formatEvalStatus(status);
+describe("Eval gate", () => {
+  let testDir: string;
-      expect(output).toContain("production");
-    });
+  beforeEach(() => {
+    testDir = join(tmpdir(), `eval-gate-test-${Date.now()}`);
+    mkdirSync(testDir, { recursive: true });
   });
-  describe("formatEvalHistory", () => {
-    test("shows eval entries with timestamps and scores", () => {
-      const history = [
-        {
-          timestamp: "2024-12-24T10:00:00.000Z",
-          eval_name: "swarm-decomposition",
-          score: 0.85,
-          run_count: 1,
-        },
-        {
-          timestamp: "2024-12-24T11:00:00.000Z",
-          eval_name: "swarm-decomposition",
-          score: 0.87,
-          run_count: 2,
-        },
-        {
-          timestamp: "2024-12-24T12:00:00.000Z",
-          eval_name: "coordinator-behavior",
-          score: 0.92,
-          run_count: 1,
-        },
-      ];
+  afterEach(() => {
+    if (existsSync(testDir)) {
+      rmSync(testDir, { recursive: true, force: true });
+    }
+  });
-      const output = formatEvalHistory(history);
+  describe("Bootstrap phase (<10 runs)", () => {
+    test("allows any score", () => {
+      ensureHiveDirectory(testDir);
+      // Record 5 runs
+      for (let i = 0; i < 5; i++) {
+        recordEvalRun(testDir, {
+          timestamp: new Date().toISOString(),
+          eval_name: "test-eval",
+          score: 0.5 + i * 0.1,
+          run_count: i + 1,
+        });
+      }
-      // Should show all eval names
-      expect(output).toContain("swarm-decomposition");
-      expect(output).toContain("coordinator-behavior");
-      // Should show scores
-      expect(output).toContain("0.85");
-      expect(output).toContain("0.87");
-      expect(output).toContain("0.92");
-      // Should show run counts
-      expect(output).toContain("run #1");
-      expect(output).toContain("run #2");
-    });
+      const result = checkGate(testDir, "test-eval", 0.3); // Low score
-    test("returns empty message for no history", () => {
-      const output = formatEvalHistory([]);
-      expect(output).toContain("No eval history");
+      expect(result.passed).toBe(true);
+      expect(result.phase).toBe("bootstrap");
+      expect(result.message).toContain("BOOTSTRAP");
     });
-    test("formats timestamps as readable dates", () => {
-      const history = [
-        {
-          timestamp: "2024-12-24T10:00:00.000Z",
-          eval_name: "test",
-          score: 0.85,
-          run_count: 1,
-        },
-      ];
+    test("counts runs correctly", () => {
+      ensureHiveDirectory(testDir);
+      for (let i = 0; i < 7; i++) {
+        recordEvalRun(testDir, {
+          timestamp: new Date().toISOString(),
+          eval_name: "test-eval",
+          score: 0.8,
+          run_count: i + 1,
+        });
+      }
-      const output = formatEvalHistory(history);
+      const result = checkGate(testDir, "test-eval", 0.8);
-      // Should contain a formatted date (not raw ISO)
-      expect(output).not.toContain("2024-12-24T10:00:00.000Z");
-      expect(output).toMatch(/\d{1,2}:\d{2}/); // Time format
+      expect(result.phase).toBe("bootstrap");
+      expect(result.message).toContain("7/10");
     });
   });
-  describe("generateSparkline", () => {
-    test("generates sparkline from scores", () => {
-      const scores = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0];
-      const sparkline = generateSparkline(scores);
-      // Should use sparkline characters
-      expect(sparkline).toMatch(/[▁▂▃▄▅▆▇█]/);
-      // Length should match input
-      expect(sparkline.length).toBe(scores.length);
-      // Should show ascending trend
-      expect(sparkline).toContain("▁"); // Low score
-      expect(sparkline).toContain("█"); // High score
-    });
+  describe("Stabilization phase (10-50 runs)", () => {
+    test("warns on >5% regression", () => {
+      ensureHiveDirectory(testDir);
+      // Record 20 runs with consistent 0.9 score
+      for (let i = 0; i < 20; i++) {
+        recordEvalRun(testDir, {
+          timestamp: new Date().toISOString(),
+          eval_name: "test-eval",
+          score: 0.9,
+          run_count: i + 1,
+        });
+      }
-    test("handles single score", () => {
-      const sparkline = generateSparkline([0.5]);
-      expect(sparkline.length).toBe(1);
-      expect(sparkline).toMatch(/[▁▂▃▄▅▆▇█]/);
-    });
+      // Test with regressed score (>5% drop from 0.9 baseline)
+      const regressedScore = 0.85; // 5.5% drop
+      const result = checkGate(testDir, "test-eval", regressedScore);
-    test("handles all same scores", () => {
-      const sparkline = generateSparkline([0.5, 0.5, 0.5]);
-      expect(sparkline.length).toBe(3);
-      // All should be same character
-      expect(new Set(sparkline.split("")).size).toBe(1);
+      expect(result.passed).toBe(false);
+      expect(result.phase).toBe("stabilization");
+      expect(result.message).toContain("WARN");
+      expect(result.baseline).toBeCloseTo(0.9, 2);
     });
-    test("returns empty for empty array", () => {
-      const sparkline = generateSparkline([]);
-      expect(sparkline).toBe("");
-    });
-  });
+    test("passes when score is stable", () => {
+      ensureHiveDirectory(testDir);
-  describe("formatEvalRunResult", () => {
-    test("shows pass/fail with gate result", () => {
-      const result = {
-        passed: true,
-        phase: "production" as const,
-        message: "Production phase: 2.5% regression - acceptable",
-        baseline: 0.85,
-        currentScore: 0.83,
-        regressionPercent: 0.025,
-      };
+      for (let i = 0; i < 25; i++) {
+        recordEvalRun(testDir, {
+          timestamp: new Date().toISOString(),
+          eval_name: "test-eval",
+          score: 0.85,
+          run_count: i + 1,
+        });
+      }
-      const output = formatEvalRunResult(result);
+      const result = checkGate(testDir, "test-eval", 0.86);
-      expect(output).toContain("PASS");
-      expect(output).toContain("production");
-      expect(output).toContain("0.83"); // current score
-      expect(output).toContain("2.5%"); // regression
+      expect(result.passed).toBe(true);
+      expect(result.phase).toBe("stabilization");
+      expect(result.baseline).toBeCloseTo(0.85, 2);
     });
+  });
-    test("shows failure with details", () => {
-      const result = {
-        passed: false,
-        phase: "production" as const,
-        message: "Production phase FAIL: 8.0% regression - exceeds 5% threshold",
-        baseline: 0.85,
-        currentScore: 0.78,
-        regressionPercent: 0.08,
-      };
+  describe("Production phase (>50 runs, low variance)", () => {
+    test("enters production when variance < 0.1", () => {
+      ensureHiveDirectory(testDir);
+      // Simulate 60 runs with consistent scores (low variance)
+      for (let i = 0; i < 60; i++) {
+        recordEvalRun(testDir, {
+          timestamp: new Date().toISOString(),
+          eval_name: "test-eval",
+          score: 0.9, // All same score = zero variance
+          run_count: i + 1,
+        });
+      }
-      const output = formatEvalRunResult(result);
+      const result = checkGate(testDir, "test-eval", 0.91);
-      expect(output).toContain("FAIL");
-      expect(output).toContain("8.0%");
-      expect(output).toContain("exceeds");
+      expect(result.phase).toBe("production");
+      expect(result.variance).toBeLessThan(0.1);
     });
-    test("shows bootstrap phase without baseline", () => {
-      const result = {
-        passed: true,
-        phase: "bootstrap" as const,
-        message: "Bootstrap phase (5/10 runs) - collecting data",
-        currentScore: 0.85,
-      };
+    test("fails on regression in production", () => {
+      ensureHiveDirectory(testDir);
+      // Simulate 60 runs with consistent high scores to reach production phase
+      for (let i = 0; i < 60; i++) {
+        recordEvalRun(testDir, {
+          timestamp: new Date().toISOString(),
+          eval_name: "test-eval",
+          score: 0.9,
+          run_count: i + 1,
+        });
+      }
-      const output = formatEvalRunResult(result);
+      // Now test with a regressed score (>5% drop from 0.9 baseline)
+      const regressedScore = 0.8; // 11% drop
+      const result = checkGate(testDir, "test-eval", regressedScore);
-      expect(output).toContain("bootstrap");
-      expect(output).toContain("collecting data");
-      expect(output).not.toContain("baseline");
+      expect(result.passed).toBe(false);
+      expect(result.phase).toBe("production");
+      expect(result.message).toContain("FAIL");
     });
   });
 });
 // ============================================================================
-// Eval Command Helpers (Implementation)
+// History Command Tests (TDD)
 // ============================================================================
+interface SwarmHistoryRecord { // Shape of one swarm-history entry consumed by the history helpers and their tests
+  epic_id: string; // Epic identifier (fixtures use values like "epic-1")
+  epic_title: string; // Display title; formatSwarmHistory shortens it when longer than 30 characters
+  strategy: string; // Strategy label; fixtures use "file-based", "feature-based", "risk-based"
+  timestamp: string; // Date string handed to `new Date(...)`; fixtures use ISO 8601
+  overall_success: boolean; // Rendered as ✅ when true and ❌ when false
+  task_count: number; // Total tasks: the denominator of the "x/y tasks" column
+  completed_count: number; // Completed tasks: the numerator of the "x/y tasks" column
+}
 /**
- * Generate sparkline from array of scores (0-1 range)
+ * Format relative time (e.g., "2h ago", "1d ago")
  */
-function generateSparkline(scores: number[]): string {
-  if (scores.length === 0) return "";
+function formatRelativeTime(timestamp: string): string {
+  // Purpose: turn a parseable date string into a short age label: "<n>m ago"
+  // under one hour, "<n>h ago" under one day, otherwise "<n>d ago".
+  const then = new Date(timestamp).getTime();
+  // An unparseable timestamp yields NaN, which used to fall through every
+  // comparison and render as "NaNd ago". Report it explicitly instead.
+  if (Number.isNaN(then)) return "unknown";
+  // Clamp at zero so clock skew or a future timestamp cannot produce a
+  // negative age such as "-5m ago".
+  const diffMs = Math.max(0, Date.now() - then);
+  const minutes = Math.floor(diffMs / 60000);
+  if (minutes < 60) return `${minutes}m ago`;
+  const hours = Math.floor(diffMs / 3600000);
+  if (hours < 24) return `${hours}h ago`;
+  return `${Math.floor(diffMs / 86400000)}d ago`;
+}
-  const chars = ["▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"];
-  const min = Math.min(...scores);
-  const max = Math.max(...scores);
-  const range = max - min;
+/**
+ * Format swarm history as beautiful CLI table
+ *
+ * Returns the literal "No swarm history found" for an empty list. Otherwise
+ * emits a fixed-width box header followed by one row per record: relative
+ * time, a check or cross mark for overall_success, the title (shortened to
+ * 27 characters plus "..." when longer than 30), the strategy, and the
+ * "completed/total tasks" text. All lines are joined with a newline.
+ *
+ * NOTE(review): rows are padded but never clipped, while the border width is
+ * hard-coded, so long strategy or task strings can overrun the right edge.
+ */
+function formatSwarmHistory(records: SwarmHistoryRecord[]): string {
+  if (records.length === 0) {
+    return "No swarm history found";
+  }
-  if (range === 0) {
-    // All scores the same
-    return chars[4].repeat(scores.length);
+  const rows = records.map(r => ({
+    time: formatRelativeTime(r.timestamp),
+    status: r.overall_success ? "✅" : "❌",
+    title: r.epic_title.length > 30 ? r.epic_title.slice(0, 27) + "..." : r.epic_title,
+    strategy: r.strategy,
+    tasks: `${r.completed_count}/${r.task_count} tasks`,
+  }));
+  // Box drawing characters: top border, centered heading, separator
+  const lines: string[] = [];
+  lines.push("┌─────────────────────────────────────────────────────────────┐");
+  lines.push("│                    SWARM HISTORY                            │");
+  lines.push("├─────────────────────────────────────────────────────────────┤");
+  for (const row of rows) {
+    const statusCol = `${row.time.padEnd(8)} ${row.status}`; // relative time padded to 8 columns, then the status mark
+    const titleCol = row.title.padEnd(32);
+    const strategyCol = row.strategy.padEnd(13);
+    const tasksCol = row.tasks;
+    const line = `│ ${statusCol} ${titleCol} ${strategyCol} ${tasksCol.padEnd(3)} │`; // padEnd(3) never pads: the tasks text always ends in " tasks"
+    lines.push(line);
   }
-  return scores
-    .map((score) => {
-      const normalized = (score - min) / range;
-      const index = Math.min(Math.floor(normalized * chars.length), chars.length - 1);
-      return chars[index];
-    })
-    .join("");
+  lines.push("└─────────────────────────────────────────────────────────────┘");
+  return lines.join("\n");
 }
 /**
- * Format eval status for display
+ * Filter history by status
  */
-function formatEvalStatus(status: {
-  phase: "bootstrap" | "stabilization" | "production";
-  runCount: number;
-  thresholds: { stabilization: number; production: number };
-  recentScores: Array<{ timestamp: string; score: number }>;
-}): string {
-  const lines: string[] = [];
-  // Phase banner
-  const phaseEmoji = status.phase === "bootstrap" ? "🌱" : status.phase === "stabilization" ? "⚙️" : "🚀";
-  lines.push(`${phaseEmoji} Phase: ${status.phase}`);
-  lines.push(`Runs: ${status.runCount}`);
-  lines.push("");
-  // Thresholds
-  lines.push("Thresholds:");
-  lines.push(`  Stabilization: ${(status.thresholds.stabilization * 100).toFixed(0)}% regression warning`);
-  lines.push(`  Production:    ${(status.thresholds.production * 100).toFixed(0)}% regression failure`);
-  lines.push("");
-  // Recent scores with sparkline
-  if (status.recentScores.length > 0) {
-    lines.push("Recent scores:");
-    const sparkline = generateSparkline(status.recentScores.map((s) => s.score));
-    lines.push(`  ${sparkline}`);
-    for (const { timestamp, score } of status.recentScores) {
-      const time = new Date(timestamp).toLocaleString();
-      lines.push(`  ${time}: ${score.toFixed(2)}`);
-    }
-  } else {
-    lines.push("No scores yet - collecting data");
+function filterHistoryByStatus(
+  records: SwarmHistoryRecord[],
+  status?: "success" | "failed" | "in_progress",
+): SwarmHistoryRecord[] {
+  // One predicate per recognised status. Anything else, including an omitted
+  // status, has no entry and therefore leaves the list unfiltered.
+  const predicates = new Map<string, (rec: SwarmHistoryRecord) => boolean>([
+    // Succeeded overall.
+    ["success", (rec) => rec.overall_success],
+    // Did not succeed even though every task was completed.
+    ["failed", (rec) => !(rec.overall_success || rec.completed_count !== rec.task_count)],
+    // Still has tasks left to complete.
+    ["in_progress", (rec) => rec.task_count > rec.completed_count],
+  ]);
+  const keep = status ? predicates.get(status) : undefined;
+  if (keep === undefined) {
+    // No filter applies: hand back the caller's array itself, not a copy.
+    return records;
+  } else {
+    return records.filter(keep);
   }
+}
-  return lines.join("\n");
+/**
+ * Filter history by strategy
+ */
+function filterHistoryByStrategy(
+  records: SwarmHistoryRecord[],
+  strategy?: "file-based" | "feature-based" | "risk-based",
+): SwarmHistoryRecord[] {
+  const wanted = strategy;
+  // A falsy strategy disables filtering and returns the caller's array as-is;
+  // otherwise keep only records whose strategy label is strictly equal.
+  return wanted
+    ? records.filter((rec) => wanted === rec.strategy)
+    : records;
 }
 /**
- * Format eval history for display
+ * Parse history CLI arguments
  */
-function formatEvalHistory(history: Array<{
-  timestamp: string;
-  eval_name: string;
-  score: number;
-  run_count: number;
-}>): string {
-  if (history.length === 0) {
-    return "No eval history found";
-  }
+function parseHistoryArgs(args: string[]): {
+  limit: number;
+  status?: "success" | "failed" | "in_progress";
+  strategy?: "file-based" | "feature-based" | "risk-based";
+  verbose: boolean;
+} {
+  // Purpose: scan CLI tokens for --limit/-n, --status, --strategy and
+  // --verbose/-v. Unknown tokens and invalid values are ignored, leaving the
+  // defaults below (limit 10, verbose false, no status/strategy filter).
+  const result: {
+    limit: number;
+    status?: "success" | "failed" | "in_progress";
+    strategy?: "file-based" | "feature-based" | "risk-based";
+    verbose: boolean;
+  } = {
+    limit: 10,
+    verbose: false,
+  };
-  const lines: string[] = [];
-  lines.push("Eval History:");
-  lines.push("");
-  // Group by eval name
-  const grouped = new Map<string, typeof history>();
-  for (const entry of history) {
-    if (!grouped.has(entry.eval_name)) {
-      grouped.set(entry.eval_name, []);
-    }
-    grouped.get(entry.eval_name)!.push(entry);
-  }
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i];
-  // Display each eval group
-  for (const [evalName, entries] of grouped) {
-    lines.push(`${evalName}:`);
-    const sparkline = generateSparkline(entries.map((e) => e.score));
-    lines.push(`  Trend: ${sparkline}`);
-    // Show latest 5 entries
-    const latest = entries.slice(-5);
-    for (const entry of latest) {
-      const time = new Date(entry.timestamp).toLocaleTimeString();
-      lines.push(`  ${time} - run #${entry.run_count}: ${entry.score.toFixed(2)}`);
-    }
-    if (entries.length > 5) {
-      lines.push(`  ... and ${entries.length - 5} more`);
+    if (arg === "--limit" || arg === "-n") {
+      const limitStr = args[i + 1];
+      // Accept only plain decimal digits. The earlier `!isNaN(Number(...))`
+      // check also let through negatives, fractions, " " (coerced to 0),
+      // "0x10" and "Infinity", none of which is a usable row limit.
+      if (limitStr !== undefined && /^\d+$/.test(limitStr)) {
+        const limit = Number.parseInt(limitStr, 10);
+        if (Number.isSafeInteger(limit)) {
+          result.limit = limit;
+          // Consume the value token so it is not re-read as a flag.
+          i++;
+        }
+      }
+    } else if (arg === "--status") {
+      const statusStr = args[i + 1];
+      // Only the three known status names are accepted; others are ignored.
+      if (statusStr && ["success", "failed", "in_progress"].includes(statusStr)) {
+        result.status = statusStr as "success" | "failed" | "in_progress";
+        i++;
+      }
+    } else if (arg === "--strategy") {
+      const strategyStr = args[i + 1];
+      // Only the three known strategy names are accepted; others are ignored.
+      if (strategyStr && ["file-based", "feature-based", "risk-based"].includes(strategyStr)) {
+        result.strategy = strategyStr as "file-based" | "feature-based" | "risk-based";
+        i++;
+      }
+    } else if (arg === "--verbose" || arg === "-v") {
+      result.verbose = true;
     }
-    lines.push("");
   }
-  return lines.join("\n");
+  return result;
 }
-/**
- * Format eval run result (gate check)
- */
-function formatEvalRunResult(result: {
-  passed: boolean;
-  phase: "bootstrap" | "stabilization" | "production";
-  message: string;
-  baseline?: number;
-  currentScore: number;
-  regressionPercent?: number;
-}): string {
-  const lines: string[] = [];
-  // Pass/fail banner
-  const status = result.passed ? "✅ PASS" : "❌ FAIL";
-  lines.push(status);
-  lines.push("");
+describe("swarm history", () => {
+  describe("formatRelativeTime", () => {
+    test("formats minutes ago", () => {
+      const fiveMinutesAgo = new Date(Date.now() - 5 * 60000).toISOString();
+      const result = formatRelativeTime(fiveMinutesAgo);
+      expect(result).toMatch(/5m ago/);
+    });
-  // Phase and score
-  lines.push(`Phase: ${result.phase}`);
-  lines.push(`Score: ${result.currentScore.toFixed(2)}`);
+    test("formats hours ago", () => {
+      const threeHoursAgo = new Date(Date.now() - 3 * 3600000).toISOString();
+      const result = formatRelativeTime(threeHoursAgo);
+      expect(result).toMatch(/3h ago/);
+    });
-  if (result.baseline !== undefined) {
-    lines.push(`Baseline: ${result.baseline.toFixed(2)}`);
-  }
+    test("formats days ago", () => {
+      const twoDaysAgo = new Date(Date.now() - 2 * 86400000).toISOString();
+      const result = formatRelativeTime(twoDaysAgo);
+      expect(result).toMatch(/2d ago/);
+    });
+  });
-  if (result.regressionPercent !== undefined) {
-    const sign = result.regressionPercent > 0 ? "+" : "";
-    lines.push(`Regression: ${sign}${(result.regressionPercent * 100).toFixed(1)}%`);
-  }
+  describe("formatSwarmHistory", () => {
+    test("formats history as beautiful box-drawn table", () => {
+      const records: SwarmHistoryRecord[] = [
+        {
+          epic_id: "epic-1",
+          epic_title: "Add auth flow",
+          strategy: "feature-based",
+          timestamp: new Date(Date.now() - 2 * 3600000).toISOString(),
+          overall_success: true,
+          task_count: 4,
+          completed_count: 4,
+        },
+        {
+          epic_id: "epic-2",
+          epic_title: "Refactor DB layer",
+          strategy: "file-based",
+          timestamp: new Date(Date.now() - 5 * 3600000).toISOString(),
+          overall_success: false,
+          task_count: 5,
+          completed_count: 2,
+        },
+      ];
-  lines.push("");
-  lines.push(result.message);
+      const result = formatSwarmHistory(records);
+      expect(result).toContain("┌─────");
+      expect(result).toContain("SWARM HISTORY");
+      expect(result).toContain("✅");
+      expect(result).toContain("❌");
+      expect(result).toContain("Add auth flow");
+      expect(result).toContain("Refactor DB layer");
+      expect(result).toContain("feature-based");
+      expect(result).toContain("file-based");
+      expect(result).toContain("4/4 tasks");
+      expect(result).toContain("2/5 tasks");
+      expect(result).toContain("└─────");
+    });
-  return lines.join("\n");
-}
+    test("truncates long titles with ellipsis", () => {
+      const records: SwarmHistoryRecord[] = [
+        {
+          epic_id: "epic-1",
+          epic_title: "A".repeat(100),
+          strategy: "feature-based",
+          timestamp: new Date(Date.now() - 1000).toISOString(),
+          overall_success: true,
+          task_count: 1,
+          completed_count: 1,
+        },
+      ];
-// ============================================================================
-// Eval Run Tests
-// ============================================================================
+      const result = formatSwarmHistory(records);
-describe("Eval Run CI Mode", () => {
-  let testDir: string;
+      expect(result).toContain("...");
+      expect(result).toMatch(/A{27}\.\.\./);
+    });
-  beforeEach(() => {
-    testDir = join(tmpdir(), `eval-run-test-${Date.now()}`);
-    mkdirSync(testDir, { recursive: true });
+    test("returns 'No swarm history found' for empty array", () => {
+      const result = formatSwarmHistory([]);
+      expect(result).toBe("No swarm history found");
+    });
   });
-  afterEach(() => {
-    if (existsSync(testDir)) {
-      rmSync(testDir, { recursive: true, force: true });
-    }
-  });
+  describe("filterHistoryByStatus", () => {
+    const records: SwarmHistoryRecord[] = [
+      {
+        epic_id: "epic-1",
+        epic_title: "Success",
+        strategy: "feature-based",
+        timestamp: "2025-01-01T00:00:00Z",
+        overall_success: true,
+        task_count: 4,
+        completed_count: 4,
+      },
+      {
+        epic_id: "epic-2",
+        epic_title: "Failed",
+        strategy: "file-based",
+        timestamp: "2025-01-01T00:00:00Z",
+        overall_success: false,
+        task_count: 4,
+        completed_count: 4,
+      },
+      {
+        epic_id: "epic-3",
+        epic_title: "In Progress",
+        strategy: "risk-based",
+        timestamp: "2025-01-01T00:00:00Z",
+        overall_success: false,
+        task_count: 5,
+        completed_count: 2,
+      },
+    ];
+    test("filters success only", () => {
+      const result = filterHistoryByStatus(records, "success");
+      expect(result).toHaveLength(1);
+      expect(result[0].epic_title).toBe("Success");
+    });
-  test("writes eval results JSON file", async () => {
-    // Import the function we need to test
-    const { recordEvalRun, getScoreHistory } = await import("../src/eval-history.js");
-    const { checkGate } = await import("../src/eval-gates.js");
-    const { ensureHiveDirectory } = await import("../src/hive.js");
-    // Set up test data
-    const evalName = "test-eval";
-    const mockScore = 0.85;
-    // Ensure directory exists
-    ensureHiveDirectory(testDir);
-    // Get history and record run (simulating what eval run does)
-    const history = getScoreHistory(testDir, evalName);
-    recordEvalRun(testDir, {
-      timestamp: new Date().toISOString(),
-      eval_name: evalName,
-      score: mockScore,
-      run_count: history.length + 1,
+    test("filters failed only", () => {
+      const result = filterHistoryByStatus(records, "failed");
+      expect(result).toHaveLength(1);
+      expect(result[0].epic_title).toBe("Failed");
     });
-    // Check gate
-    const gateResult = checkGate(testDir, evalName, mockScore);
+    test("filters in_progress only", () => {
+      const result = filterHistoryByStatus(records, "in_progress");
+      expect(result).toHaveLength(1);
+      expect(result[0].epic_title).toBe("In Progress");
+    });
-    // Write results file (simulating CI mode)
-    const resultsPath = join(testDir, ".hive", "eval-results.json");
-    const results = { [evalName]: gateResult };
-    writeFileSync(resultsPath, JSON.stringify(results, null, 2));
+    test("returns all when no status filter", () => {
+      const result = filterHistoryByStatus(records);
+      expect(result).toHaveLength(3);
+    });
+  });
-    // Verify file exists and has correct structure
-    expect(existsSync(resultsPath)).toBe(true);
+  describe("filterHistoryByStrategy", () => {
+    const records: SwarmHistoryRecord[] = [
+      {
+        epic_id: "epic-1",
+        epic_title: "File",
+        strategy: "file-based",
+        timestamp: "2025-01-01T00:00:00Z",
+        overall_success: true,
+        task_count: 4,
+        completed_count: 4,
+      },
+      {
+        epic_id: "epic-2",
+        epic_title: "Feature",
+        strategy: "feature-based",
+        timestamp: "2025-01-01T00:00:00Z",
+        overall_success: true,
+        task_count: 4,
+        completed_count: 4,
+      },
+      {
+        epic_id: "epic-3",
+        epic_title: "Risk",
+        strategy: "risk-based",
+        timestamp: "2025-01-01T00:00:00Z",
+        overall_success: true,
+        task_count: 4,
+        completed_count: 4,
+      },
+    ];
+    test("filters file-based only", () => {
+      const result = filterHistoryByStrategy(records, "file-based");
+      expect(result).toHaveLength(1);
+      expect(result[0].epic_title).toBe("File");
+    });
-    const savedResults = JSON.parse(readFileSync(resultsPath, "utf-8"));
-    expect(savedResults).toHaveProperty(evalName);
-    expect(savedResults[evalName]).toMatchObject({
-      passed: true,
-      phase: "bootstrap",
-      currentScore: mockScore,
+    test("filters feature-based only", () => {
+      const result = filterHistoryByStrategy(records, "feature-based");
+      expect(result).toHaveLength(1);
+      expect(result[0].epic_title).toBe("Feature");
+    });
+    test("filters risk-based only", () => {
+      const result = filterHistoryByStrategy(records, "risk-based");
+      expect(result).toHaveLength(1);
+      expect(result[0].epic_title).toBe("Risk");
+    });
+    test("returns all when no strategy filter", () => {
+      const result = filterHistoryByStrategy(records);
+      expect(result).toHaveLength(3);
     });
   });
-  test("bootstrap phase always passes", async () => {
-    const { checkGate } = await import("../src/eval-gates.js");
+  describe("parseHistoryArgs", () => {
+    test("parses --limit flag", () => {
+      const result = parseHistoryArgs(["--limit", "20"]);
+      expect(result.limit).toBe(20);
+    });
-    // Even with a low score, bootstrap phase should pass
-    const result = checkGate(testDir, "test-eval", 0.1);
+    test("parses -n shorthand for limit", () => {
+      const result = parseHistoryArgs(["-n", "5"]);
+      expect(result.limit).toBe(5);
+    });
-    expect(result.passed).toBe(true);
-    expect(result.phase).toBe("bootstrap");
-    expect(result.message).toContain("Bootstrap phase");
-  });
+    test("parses --status flag", () => {
+      const result = parseHistoryArgs(["--status", "success"]);
+      expect(result.status).toBe("success");
+    });
-  test("production phase fails on regression", async () => {
-    const { recordEvalRun } = await import("../src/eval-history.js");
-    const { checkGate } = await import("../src/eval-gates.js");
-    const { ensureHiveDirectory } = await import("../src/hive.js");
-    ensureHiveDirectory(testDir);
-    // Simulate 60 runs with consistent high scores to reach production phase
-    for (let i = 0; i < 60; i++) {
-      recordEvalRun(testDir, {
-        timestamp: new Date().toISOString(),
-        eval_name: "test-eval",
-        score: 0.9,
-        run_count: i + 1,
-      });
-    }
+    test("parses --strategy flag", () => {
+      const result = parseHistoryArgs(["--strategy", "file-based"]);
+      expect(result.strategy).toBe("file-based");
+    });
-    // Now test with a regressed score (>5% drop from 0.9 baseline)
-    const regressedScore = 0.8; // 11% drop
-    const result = checkGate(testDir, "test-eval", regressedScore);
+    test("parses --verbose flag", () => {
+      const result = parseHistoryArgs(["--verbose"]);
+      expect(result.verbose).toBe(true);
+    });
+    test("parses -v shorthand for verbose", () => {
+      const result = parseHistoryArgs(["-v"]);
+      expect(result.verbose).toBe(true);
+    });
+    test("parses multiple flags together", () => {
+      const result = parseHistoryArgs(["--limit", "15", "--status", "failed", "--verbose"]);
+      expect(result.limit).toBe(15);
+      expect(result.status).toBe("failed");
+      expect(result.verbose).toBe(true);
+    });
-    expect(result.passed).toBe(false);
-    expect(result.phase).toBe("production");
-    expect(result.message).toContain("FAIL");
+    test("uses default limit of 10 when not specified", () => {
+      const result = parseHistoryArgs([]);
+      expect(result.limit).toBe(10);
+    });
+    test("ignores invalid status values", () => {
+      const result = parseHistoryArgs(["--status", "invalid"]);
+      expect(result.status).toBeUndefined();
+    });
+    test("ignores invalid strategy values", () => {
+      const result = parseHistoryArgs(["--strategy", "invalid"]);
+      expect(result.strategy).toBeUndefined();
+    });
   });
 });