npm - opencode-swarm-plugin - Versions diffs - 0.38.0 → 0.40.0 - Mend

opencode-swarm-plugin 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91)

package/.env +2 -0
package/.hive/eval-results.json +26 -0
package/.hive/issues.jsonl +27 -0
package/.hive/memories.jsonl +23 -1
package/.opencode/eval-history.jsonl +12 -0
package/CHANGELOG.md +182 -0
package/README.md +29 -12
package/bin/swarm.test.ts +881 -0
package/bin/swarm.ts +686 -0
package/dist/compaction-hook.d.ts +8 -1
package/dist/compaction-hook.d.ts.map +1 -1
package/dist/compaction-observability.d.ts +173 -0
package/dist/compaction-observability.d.ts.map +1 -0
package/dist/compaction-prompt-scoring.d.ts +124 -0
package/dist/compaction-prompt-scoring.d.ts.map +1 -0
package/dist/eval-capture.d.ts +174 -1
package/dist/eval-capture.d.ts.map +1 -1
package/dist/eval-gates.d.ts +84 -0
package/dist/eval-gates.d.ts.map +1 -0
package/dist/eval-history.d.ts +117 -0
package/dist/eval-history.d.ts.map +1 -0
package/dist/eval-learning.d.ts +216 -0
package/dist/eval-learning.d.ts.map +1 -0
package/dist/hive.d.ts.map +1 -1
package/dist/index.d.ts +80 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +16098 -651
package/dist/plugin.js +16012 -756
package/dist/post-compaction-tracker.d.ts +133 -0
package/dist/post-compaction-tracker.d.ts.map +1 -0
package/dist/schemas/task.d.ts +3 -3
package/dist/swarm-orchestrate.d.ts +23 -0
package/dist/swarm-orchestrate.d.ts.map +1 -1
package/dist/swarm-prompts.d.ts +25 -1
package/dist/swarm-prompts.d.ts.map +1 -1
package/dist/swarm.d.ts +4 -0
package/dist/swarm.d.ts.map +1 -1
package/evals/README.md +702 -105
package/evals/compaction-prompt.eval.ts +149 -0
package/evals/coordinator-behavior.eval.ts +8 -8
package/evals/fixtures/compaction-prompt-cases.ts +305 -0
package/evals/lib/compaction-loader.test.ts +248 -0
package/evals/lib/compaction-loader.ts +320 -0
package/evals/lib/data-loader.test.ts +345 -0
package/evals/lib/data-loader.ts +107 -6
package/evals/scorers/compaction-prompt-scorers.ts +145 -0
package/evals/scorers/compaction-scorers.ts +13 -13
package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
package/evals/scorers/coordinator-discipline.ts +348 -15
package/evals/scorers/index.test.ts +146 -0
package/evals/scorers/index.ts +104 -0
package/evals/swarm-decomposition.eval.ts +9 -2
package/examples/commands/swarm.md +291 -21
package/examples/plugin-wrapper-template.ts +117 -0
package/package.json +7 -5
package/scripts/migrate-unknown-sessions.ts +349 -0
package/src/compaction-capture.integration.test.ts +257 -0
package/src/compaction-hook.test.ts +42 -0
package/src/compaction-hook.ts +315 -86
package/src/compaction-observability.integration.test.ts +139 -0
package/src/compaction-observability.test.ts +187 -0
package/src/compaction-observability.ts +324 -0
package/src/compaction-prompt-scorers.test.ts +299 -0
package/src/compaction-prompt-scoring.ts +298 -0
package/src/eval-capture.test.ts +626 -1
package/src/eval-capture.ts +286 -2
package/src/eval-gates.test.ts +306 -0
package/src/eval-gates.ts +218 -0
package/src/eval-history.test.ts +508 -0
package/src/eval-history.ts +214 -0
package/src/eval-learning.test.ts +378 -0
package/src/eval-learning.ts +360 -0
package/src/eval-runner.test.ts +96 -0
package/src/eval-runner.ts +356 -0
package/src/hive.ts +34 -0
package/src/index.ts +115 -2
package/src/memory.test.ts +110 -0
package/src/memory.ts +34 -0
package/src/post-compaction-tracker.test.ts +251 -0
package/src/post-compaction-tracker.ts +237 -0
package/src/swarm-decompose.ts +2 -2
package/src/swarm-orchestrate.ts +2 -2
package/src/swarm-prompts.ts +2 -2
package/src/swarm-review.ts +3 -3
package/dist/beads.d.ts +0 -386
package/dist/beads.d.ts.map +0 -1
package/dist/schemas/bead-events.d.ts +0 -698
package/dist/schemas/bead-events.d.ts.map +0 -1
package/dist/schemas/bead.d.ts +0 -255
package/dist/schemas/bead.d.ts.map +0 -1
/package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0

package/bin/swarm.test.ts CHANGED Viewed

@@ -197,6 +197,412 @@ READ-ONLY research agent. Never modifies code - only gathers intel and stores fi
 // Log Command Tests (TDD)
 // ============================================================================
+// ============================================================================
+// Session Log Tests (TDD)
+// ============================================================================
+import type { CoordinatorEvent } from "../src/eval-capture";
+const TEST_SESSIONS_DIR = join(tmpdir(), "swarm-test-sessions");
+describe("swarm log sessions", () => {
+  beforeEach(() => {
+    // Create the test sessions directory (a fixed path under the OS temp dir) if it is missing
+    if (!existsSync(TEST_SESSIONS_DIR)) {
+      mkdirSync(TEST_SESSIONS_DIR, { recursive: true });
+    }
+  });
+  afterEach(() => {
+    // Remove the test directory and every fixture in it so each test starts from an empty directory
+    if (existsSync(TEST_SESSIONS_DIR)) {
+      rmSync(TEST_SESSIONS_DIR, { recursive: true, force: true });
+    }
+  });
+  // ========================================================================
+  // Helper Functions (to be implemented in swarm.ts)
+  // ========================================================================
+  function createTestSession( // Test fixture: writes TEST_SESSIONS_DIR/<sessionId>.jsonl containing eventCount DECISION events.
+    sessionId: string,
+    epicId: string,
+    eventCount: number,
+    baseTimestamp?: number, // epoch milliseconds; every generated event is stamped before this instant
+  ): void {
+    const filePath = join(TEST_SESSIONS_DIR, `${sessionId}.jsonl`);
+    const lines: string[] = [];
+    const base = baseTimestamp || Date.now(); // NOTE(review): `||` also replaces an explicit 0 with Date.now(); `??` would default only on undefined
+    for (let i = 0; i < eventCount; i++) {
+      const event: CoordinatorEvent = {
+        session_id: sessionId,
+        epic_id: epicId,
+        timestamp: new Date(base - (eventCount - i) * 1000).toISOString(), // 1s apart, ascending: first is base - eventCount s, last is base - 1 s
+        event_type: "DECISION",
+        decision_type: "worker_spawned",
+        payload: { worker_id: `worker-${i}` },
+      };
+      lines.push(JSON.stringify(event));
+    }
+    writeFileSync(filePath, lines.join("\n") + "\n"); // one JSON object per line, plus a trailing newline
+  }
+  /**
+   * Parse a JSONL session file and return its events.
+   *
+   * The file is read as UTF-8 and split on "\n". Blank or whitespace-only
+   * lines are dropped, and every remaining line is passed to JSON.parse.
+   * Lines that fail to parse are skipped silently.
+   *
+   * NOTE(review): parsed values are pushed as-is, so nothing here checks
+   * that a line actually has the CoordinatorEvent shape.
+   *
+   * @param filePath - Path of the session file to read.
+   * @returns The successfully parsed lines, in file order.
+   * @throws Error when `filePath` does not exist.
+   */
+  function parseSessionFile(filePath: string): CoordinatorEvent[] {
+    if (!existsSync(filePath)) {
+      throw new Error(`Session file not found: ${filePath}`);
+    }
+    const content = readFileSync(filePath, "utf-8");
+    const lines = content.split("\n").filter((line) => line.trim()); // drops empty and whitespace-only lines
+    const events: CoordinatorEvent[] = [];
+    for (const line of lines) {
+      try {
+        const parsed = JSON.parse(line);
+        events.push(parsed);
+      } catch {
+        // Skip invalid JSON lines (best effort: one bad line does not discard the file)
+      }
+    }
+    return events;
+  }
+  /**
+   * List all session files in a directory, with summary metadata.
+   *
+   * Only entries whose name ends in ".jsonl" are considered. Each file is
+   * parsed with parseSessionFile; files that yield no events are left out.
+   * For the remaining files:
+   *   - session_id is taken from the first event in the file,
+   *   - start_time is the earliest event timestamp,
+   *   - end_time is the latest event timestamp, and is undefined when the
+   *     file holds a single event.
+   *
+   * Any exception raised while processing a file causes that file to be
+   * skipped rather than failing the whole listing.
+   * NOTE(review): this presumably includes unparsable timestamps, since
+   * toISOString() on an invalid Date throws a RangeError - confirm.
+   *
+   * @param dir - Directory to scan; a missing directory yields [].
+   * @returns Session summaries sorted by start_time, newest first.
+   */
+  function listSessionFiles(
+    dir: string,
+  ): Array<{
+    session_id: string;
+    file_path: string;
+    event_count: number;
+    start_time: string;
+    end_time?: string;
+  }> {
+    if (!existsSync(dir)) return [];
+    const files = readdirSync(dir).filter((f) => f.endsWith(".jsonl"));
+    const sessions: Array<{
+      session_id: string;
+      file_path: string;
+      event_count: number;
+      start_time: string;
+      end_time?: string;
+    }> = [];
+    for (const file of files) {
+      const filePath = join(dir, file);
+      try {
+        const events = parseSessionFile(filePath);
+        if (events.length === 0) continue; // nothing to summarise
+        const timestamps = events.map((e) => new Date(e.timestamp).getTime());
+        const startTime = new Date(Math.min(...timestamps)).toISOString();
+        const endTime =
+          timestamps.length > 1
+            ? new Date(Math.max(...timestamps)).toISOString()
+            : undefined;
+        sessions.push({
+          session_id: events[0].session_id, // taken from the first event, not from the file name
+          file_path: filePath,
+          event_count: events.length,
+          start_time: startTime,
+          end_time: endTime,
+        });
+      } catch {
+        // Skip invalid files
+      }
+    }
+    // Sort by start time (newest first)
+    return sessions.sort((a, b) =>
+      new Date(b.start_time).getTime() - new Date(a.start_time).getTime()
+    );
+  }
+  /**
+   * Get the latest session file.
+   *
+   * "Latest" is the first entry returned by listSessionFiles, i.e. the
+   * session with the most recent start_time.
+   *
+   * @param dir - Directory to scan for ".jsonl" session files.
+   * @returns The newest session summary, or null when listSessionFiles
+   *          finds no sessions in `dir`.
+   */
+  function getLatestSession(
+    dir: string,
+  ): {
+    session_id: string;
+    file_path: string;
+    event_count: number;
+    start_time: string;
+    end_time?: string;
+  } | null {
+    const sessions = listSessionFiles(dir);
+    return sessions.length > 0 ? sessions[0] : null;
+  }
+  /**
+   * Filter events by type.
+   *
+   * The literal string "all" returns the input array itself (no copy).
+   * Any other value is upper-cased before being compared with each
+   * event's event_type, so "decision" matches "DECISION".
+   *
+   * NOTE(review): the "all" sentinel is compared case-sensitively, so
+   * "ALL" is treated as an ordinary event type rather than as the sentinel.
+   *
+   * @param events - Events to filter.
+   * @param eventType - "all", or an event type name in any letter case.
+   * @returns The matching events, in their original order.
+   */
+  function filterEventsByType(
+    events: CoordinatorEvent[],
+    eventType: string,
+  ): CoordinatorEvent[] {
+    if (eventType === "all") return events;
+    return events.filter((e) => e.event_type === eventType.toUpperCase());
+  }
+  /**
+   * Filter events by time.
+   *
+   * Keeps the events whose timestamp is at or after (Date.now() - sinceMs).
+   * The cutoff is computed from the clock at call time and is inclusive.
+   *
+   * @param events - Events to filter.
+   * @param sinceMs - Size of the look-back window, in milliseconds.
+   * @returns The events inside the window, in their original order.
+   */
+  function filterEventsSince(
+    events: CoordinatorEvent[],
+    sinceMs: number,
+  ): CoordinatorEvent[] {
+    const cutoffTime = Date.now() - sinceMs;
+    return events.filter((e) =>
+      new Date(e.timestamp).getTime() >= cutoffTime
+    );
+  }
+  // ========================================================================
+  // Tests
+  // ========================================================================
+  describe("listSessionFiles", () => { // Covers missing and empty directories, metadata extraction and newest-first ordering.
+    test("returns empty array when directory doesn't exist", () => {
+      const result = listSessionFiles("/nonexistent/directory");
+      expect(result).toEqual([]);
+    });
+    test("returns empty array when directory is empty", () => {
+      const result = listSessionFiles(TEST_SESSIONS_DIR);
+      expect(result).toEqual([]);
+    });
+    test("lists all session files with metadata", () => {
+      createTestSession("ses_abc123", "epic-1", 5);
+      createTestSession("ses_def456", "epic-2", 3);
+      const result = listSessionFiles(TEST_SESSIONS_DIR);
+      expect(result).toHaveLength(2);
+      expect(result[0].session_id).toMatch(/^ses_/);
+      expect(result[0].event_count).toBeGreaterThan(0);
+      expect(result[0].start_time).toBeTruthy();
+    });
+    test("calculates event count correctly", () => {
+      createTestSession("ses_test", "epic-1", 10);
+      const result = listSessionFiles(TEST_SESSIONS_DIR);
+      expect(result[0].event_count).toBe(10);
+    });
+    test("extracts start and end times from events", () => {
+      createTestSession("ses_test", "epic-1", 5);
+      const result = listSessionFiles(TEST_SESSIONS_DIR);
+      expect(result[0].start_time).toBeTruthy();
+      expect(new Date(result[0].start_time).getTime()).toBeLessThan(Date.now()); // NOTE(review): end_time is never asserted here despite the test name
+    });
+    test("sorts sessions by start time (newest first)", () => {
+      // Create sessions with explicit different timestamps
+      const oldTime = Date.now() - 60000; // 1 minute ago
+      const newTime = Date.now();
+      createTestSession("ses_old", "epic-1", 2, oldTime);
+      createTestSession("ses_new", "epic-2", 2, newTime);
+      const result = listSessionFiles(TEST_SESSIONS_DIR);
+      expect(result[0].session_id).toBe("ses_new");
+      expect(result[1].session_id).toBe("ses_old");
+    });
+  });
+  describe("parseSessionFile", () => { // Covers valid JSONL, trailing blank lines, malformed lines and a missing file.
+    test("parses valid JSONL session file", () => {
+      createTestSession("ses_parse", "epic-1", 3);
+      const filePath = join(TEST_SESSIONS_DIR, "ses_parse.jsonl");
+      const events = parseSessionFile(filePath);
+      expect(events).toHaveLength(3);
+      expect(events[0].session_id).toBe("ses_parse");
+      expect(events[0].event_type).toBe("DECISION");
+    });
+    test("handles file with trailing newlines", () => {
+      const filePath = join(TEST_SESSIONS_DIR, "ses_trailing.jsonl");
+      writeFileSync(
+        filePath,
+        '{"session_id":"test","epic_id":"e1","timestamp":"2025-01-01T00:00:00Z","event_type":"DECISION","decision_type":"worker_spawned","payload":{}}\n\n\n',
+      );
+      const events = parseSessionFile(filePath);
+      expect(events).toHaveLength(1); // the blank lines produced by the extra newlines are filtered out
+    });
+    test("skips invalid JSON lines", () => {
+      const filePath = join(TEST_SESSIONS_DIR, "ses_invalid.jsonl");
+      writeFileSync(
+        filePath,
+        '{"session_id":"test","epic_id":"e1","timestamp":"2025-01-01T00:00:00Z","event_type":"DECISION","decision_type":"worker_spawned","payload":{}}\ninvalid json\n{"session_id":"test","epic_id":"e1","timestamp":"2025-01-01T00:00:00Z","event_type":"OUTCOME","outcome_type":"subtask_success","payload":{}}\n',
+      );
+      const events = parseSessionFile(filePath);
+      expect(events).toHaveLength(2); // three non-empty lines, the middle one is not JSON
+    });
+    test("throws error for non-existent file", () => {
+      expect(() => parseSessionFile("/nonexistent/file.jsonl")).toThrow();
+    });
+  });
+  describe("getLatestSession", () => { // Covers the empty-directory case and selection of the newest session.
+    test("returns null when directory is empty", () => {
+      const result = getLatestSession(TEST_SESSIONS_DIR);
+      expect(result).toBeNull();
+    });
+    test("returns the most recent session", () => {
+      const oldTime = Date.now() - 60000; // 1 minute ago
+      const newTime = Date.now();
+      createTestSession("ses_old", "epic-1", 2, oldTime);
+      createTestSession("ses_new", "epic-2", 3, newTime);
+      const result = getLatestSession(TEST_SESSIONS_DIR);
+      expect(result).not.toBeNull();
+      expect(result!.session_id).toBe("ses_new");
+    });
+  });
+  describe("filterEventsByType", () => { // Covers an exact upper-case type and the "all" pass-through.
+    test("filters DECISION events only", () => {
+      const events: CoordinatorEvent[] = [
+        {
+          session_id: "s1",
+          epic_id: "e1",
+          timestamp: "2025-01-01T00:00:00Z",
+          event_type: "DECISION",
+          decision_type: "worker_spawned",
+          payload: {},
+        },
+        {
+          session_id: "s1",
+          epic_id: "e1",
+          timestamp: "2025-01-01T00:01:00Z",
+          event_type: "VIOLATION",
+          violation_type: "coordinator_edited_file",
+          payload: {},
+        },
+        {
+          session_id: "s1",
+          epic_id: "e1",
+          timestamp: "2025-01-01T00:02:00Z",
+          event_type: "DECISION",
+          decision_type: "review_completed",
+          payload: {},
+        },
+      ];
+      const result = filterEventsByType(events, "DECISION"); // NOTE(review): a lower-case argument (the toUpperCase path) is not exercised
+      expect(result).toHaveLength(2);
+      expect(result.every((e) => e.event_type === "DECISION")).toBe(true);
+    });
+    test("returns all events when type is 'all'", () => {
+      const events: CoordinatorEvent[] = [
+        {
+          session_id: "s1",
+          epic_id: "e1",
+          timestamp: "2025-01-01T00:00:00Z",
+          event_type: "DECISION",
+          decision_type: "worker_spawned",
+          payload: {},
+        },
+        {
+          session_id: "s1",
+          epic_id: "e1",
+          timestamp: "2025-01-01T00:01:00Z",
+          event_type: "VIOLATION",
+          violation_type: "coordinator_edited_file",
+          payload: {},
+        },
+      ];
+      const result = filterEventsByType(events, "all");
+      expect(result).toHaveLength(2);
+    });
+  });
+  describe("filterEventsSince", () => { // Covers a window that excludes an old event and a window that includes everything.
+    test("filters events within time window", () => {
+      const now = Date.now();
+      const events: CoordinatorEvent[] = [
+        {
+          session_id: "s1",
+          epic_id: "e1",
+          timestamp: new Date(now - 10000).toISOString(), // 10s ago
+          event_type: "DECISION",
+          decision_type: "worker_spawned",
+          payload: {},
+        },
+        {
+          session_id: "s1",
+          epic_id: "e1",
+          timestamp: new Date(now - 60000).toISOString(), // 1m ago
+          event_type: "VIOLATION",
+          violation_type: "coordinator_edited_file",
+          payload: {},
+        },
+        {
+          session_id: "s1",
+          epic_id: "e1",
+          timestamp: new Date(now - 3000).toISOString(), // 3s ago
+          event_type: "OUTCOME",
+          outcome_type: "subtask_success",
+          payload: {},
+        },
+      ];
+      const result = filterEventsSince(events, 30000); // Last 30s; the helper reads Date.now() again, so the cutoff is slightly later than `now` above
+      expect(result).toHaveLength(2); // 10s and 3s ago
+    });
+    test("returns all events when sinceMs is very large", () => {
+      const now = Date.now();
+      const events: CoordinatorEvent[] = [
+        {
+          session_id: "s1",
+          epic_id: "e1",
+          timestamp: new Date(now - 1000).toISOString(),
+          event_type: "DECISION",
+          decision_type: "worker_spawned",
+          payload: {},
+        },
+      ];
+      const result = filterEventsSince(events, 86400000); // 1 day
+      expect(result).toHaveLength(1);
+    });
+  });
+});
 // ============================================================================
 // Cells Command Tests (TDD)
 // ============================================================================
@@ -639,3 +1045,478 @@ describe("Log command helpers", () => {
     });
   });
 });
+// ============================================================================
+// Eval Commands Tests (TDD)
+// ============================================================================
+describe("Eval commands", () => { // Output-format tests for the eval display helpers, which are function declarations later in this file (hoisted).
+  describe("formatEvalStatus", () => {
+    test("displays phase, thresholds, and recent scores", () => {
+      const status = {
+        phase: "stabilization" as const,
+        runCount: 25,
+        thresholds: {
+          stabilization: 0.1,
+          production: 0.05,
+        },
+        recentScores: [
+          { timestamp: "2024-12-24T10:00:00.000Z", score: 0.85 },
+          { timestamp: "2024-12-24T11:00:00.000Z", score: 0.87 },
+          { timestamp: "2024-12-24T12:00:00.000Z", score: 0.82 },
+        ],
+      };
+      const output = formatEvalStatus(status);
+      // Should show phase
+      expect(output).toContain("stabilization");
+      // Should show run count
+      expect(output).toContain("25");
+      // Should show thresholds
+      expect(output).toContain("10%"); // stabilization threshold (0.1 rendered as a whole percent)
+      expect(output).toContain("5%");  // production threshold (0.05 rendered as a whole percent)
+      // Should show recent scores
+      expect(output).toContain("0.85");
+      expect(output).toContain("0.87");
+      expect(output).toContain("0.82");
+    });
+    test("shows bootstrap phase message", () => {
+      const status = {
+        phase: "bootstrap" as const,
+        runCount: 5,
+        thresholds: {
+          stabilization: 0.1,
+          production: 0.05,
+        },
+        recentScores: [],
+      };
+      const output = formatEvalStatus(status);
+      expect(output).toContain("bootstrap");
+      expect(output).toContain("collecting data"); // emitted because recentScores is empty, not because of the phase
+    });
+    test("shows production phase message", () => {
+      const status = {
+        phase: "production" as const,
+        runCount: 75,
+        thresholds: {
+          stabilization: 0.1,
+          production: 0.05,
+        },
+        recentScores: [],
+      };
+      const output = formatEvalStatus(status);
+      expect(output).toContain("production");
+    });
+  });
+  describe("formatEvalHistory", () => {
+    test("shows eval entries with timestamps and scores", () => {
+      const history = [
+        {
+          timestamp: "2024-12-24T10:00:00.000Z",
+          eval_name: "swarm-decomposition",
+          score: 0.85,
+          run_count: 1,
+        },
+        {
+          timestamp: "2024-12-24T11:00:00.000Z",
+          eval_name: "swarm-decomposition",
+          score: 0.87,
+          run_count: 2,
+        },
+        {
+          timestamp: "2024-12-24T12:00:00.000Z",
+          eval_name: "coordinator-behavior",
+          score: 0.92,
+          run_count: 1,
+        },
+      ];
+      const output = formatEvalHistory(history);
+      // Should show all eval names
+      expect(output).toContain("swarm-decomposition");
+      expect(output).toContain("coordinator-behavior");
+      // Should show scores
+      expect(output).toContain("0.85");
+      expect(output).toContain("0.87");
+      expect(output).toContain("0.92");
+      // Should show run counts
+      expect(output).toContain("run #1");
+      expect(output).toContain("run #2");
+    });
+    test("returns empty message for no history", () => {
+      const output = formatEvalHistory([]);
+      expect(output).toContain("No eval history");
+    });
+    test("formats timestamps as readable dates", () => {
+      const history = [
+        {
+          timestamp: "2024-12-24T10:00:00.000Z",
+          eval_name: "test",
+          score: 0.85,
+          run_count: 1,
+        },
+      ];
+      const output = formatEvalHistory(history);
+      // Should contain a formatted date (not raw ISO)
+      expect(output).not.toContain("2024-12-24T10:00:00.000Z");
+      expect(output).toMatch(/\d{1,2}:\d{2}/); // Time format; NOTE(review): relies on the runtime locale rendering toLocaleTimeString() as H:MM - confirm for CI locales
+    });
+  });
+  describe("generateSparkline", () => {
+    test("generates sparkline from scores", () => {
+      const scores = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0];
+      const sparkline = generateSparkline(scores);
+      // Should use sparkline characters
+      expect(sparkline).toMatch(/[▁▂▃▄▅▆▇█]/);
+      // Length should match input
+      expect(sparkline.length).toBe(scores.length);
+      // Should show ascending trend
+      expect(sparkline).toContain("▁"); // Low score (the input minimum always maps to the lowest bar)
+      expect(sparkline).toContain("█"); // High score (the input maximum always maps to the highest bar)
+    });
+    test("handles single score", () => {
+      const sparkline = generateSparkline([0.5]);
+      expect(sparkline.length).toBe(1);
+      expect(sparkline).toMatch(/[▁▂▃▄▅▆▇█]/);
+    });
+    test("handles all same scores", () => {
+      const sparkline = generateSparkline([0.5, 0.5, 0.5]);
+      expect(sparkline.length).toBe(3);
+      // All should be same character
+      expect(new Set(sparkline.split("")).size).toBe(1);
+    });
+    test("returns empty for empty array", () => {
+      const sparkline = generateSparkline([]);
+      expect(sparkline).toBe("");
+    });
+  });
+  describe("formatEvalRunResult", () => {
+    test("shows pass/fail with gate result", () => {
+      const result = {
+        passed: true,
+        phase: "production" as const,
+        message: "Production phase: 2.5% regression - acceptable",
+        baseline: 0.85,
+        currentScore: 0.83,
+        regressionPercent: 0.025,
+      };
+      const output = formatEvalRunResult(result);
+      expect(output).toContain("PASS");
+      expect(output).toContain("production");
+      expect(output).toContain("0.83"); // current score
+      expect(output).toContain("2.5%"); // regression (also present verbatim in the message above)
+    });
+    test("shows failure with details", () => {
+      const result = {
+        passed: false,
+        phase: "production" as const,
+        message: "Production phase FAIL: 8.0% regression - exceeds 5% threshold",
+        baseline: 0.85,
+        currentScore: 0.78,
+        regressionPercent: 0.08,
+      };
+      const output = formatEvalRunResult(result);
+      expect(output).toContain("FAIL");
+      expect(output).toContain("8.0%");
+      expect(output).toContain("exceeds");
+    });
+    test("shows bootstrap phase without baseline", () => {
+      const result = {
+        passed: true,
+        phase: "bootstrap" as const,
+        message: "Bootstrap phase (5/10 runs) - collecting data",
+        currentScore: 0.85,
+      };
+      const output = formatEvalRunResult(result);
+      expect(output).toContain("bootstrap");
+      expect(output).toContain("collecting data");
+      expect(output).not.toContain("baseline"); // case-sensitive check; the formatter's own label is "Baseline:" and is only emitted when baseline is defined
+    });
+  });
+});
+// ============================================================================
+// Eval Command Helpers (Implementation)
+// ============================================================================
+/**
+ * Generate a sparkline string from an array of scores.
+ *
+ * Each score becomes one of eight block characters. Scaling is relative
+ * to the minimum and maximum of the input, not to an absolute 0-1 range,
+ * so the smallest score always maps to the lowest bar and the largest to
+ * the highest bar.
+ *
+ * @param scores - Scores to plot, in display order.
+ * @returns One character per score; "" for an empty array; a run of the
+ *          mid-height bar (chars[4]) when every score is equal.
+ */
+function generateSparkline(scores: number[]): string {
+  if (scores.length === 0) return "";
+  const chars = ["▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"];
+  const min = Math.min(...scores);
+  const max = Math.max(...scores);
+  const range = max - min;
+  if (range === 0) {
+    // All scores the same (also avoids dividing by a zero range below)
+    return chars[4].repeat(scores.length);
+  }
+  return scores
+    .map((score) => {
+      const normalized = (score - min) / range;
+      const index = Math.min(Math.floor(normalized * chars.length), chars.length - 1); // the maximum score would give index 8, so clamp to the last bar
+      return chars[index];
+    })
+    .join("");
+}
+/**
+ * Format eval status for display.
+ *
+ * Builds a newline-joined report containing, in order:
+ *   - the phase, prefixed with a per-phase emoji, and the run count,
+ *   - both regression thresholds rendered as whole percentages,
+ *   - either a sparkline plus one "<local date-time>: <score>" line per
+ *     recent score (two decimals), or "No scores yet - collecting data"
+ *     when recentScores is empty.
+ *
+ * The "collecting data" line depends only on recentScores being empty,
+ * not on the phase.
+ *
+ * @param status - Phase, run count, thresholds (as fractions, e.g. 0.1)
+ *                 and recent scores to render.
+ * @returns The formatted multi-line report.
+ */
+function formatEvalStatus(status: {
+  phase: "bootstrap" | "stabilization" | "production";
+  runCount: number;
+  thresholds: { stabilization: number; production: number };
+  recentScores: Array<{ timestamp: string; score: number }>;
+}): string {
+  const lines: string[] = [];
+  // Phase banner
+  const phaseEmoji = status.phase === "bootstrap" ? "🌱" : status.phase === "stabilization" ? "⚙️" : "🚀";
+  lines.push(`${phaseEmoji} Phase: ${status.phase}`);
+  lines.push(`Runs: ${status.runCount}`);
+  lines.push("");
+  // Thresholds
+  lines.push("Thresholds:");
+  lines.push(`  Stabilization: ${(status.thresholds.stabilization * 100).toFixed(0)}% regression warning`);
+  lines.push(`  Production:    ${(status.thresholds.production * 100).toFixed(0)}% regression failure`);
+  lines.push("");
+  // Recent scores with sparkline
+  if (status.recentScores.length > 0) {
+    lines.push("Recent scores:");
+    const sparkline = generateSparkline(status.recentScores.map((s) => s.score));
+    lines.push(`  ${sparkline}`);
+    for (const { timestamp, score } of status.recentScores) {
+      const time = new Date(timestamp).toLocaleString(); // locale- and time-zone-dependent rendering
+      lines.push(`  ${time}: ${score.toFixed(2)}`);
+    }
+  } else {
+    lines.push("No scores yet - collecting data");
+  }
+  return lines.join("\n");
+}
+/**
+ * Format eval history for display.
+ *
+ * Entries are grouped by eval_name, with groups appearing in the order
+ * their name is first seen. Each group shows a trend sparkline over all
+ * of its scores, followed by at most five entries and, when the group is
+ * larger, an "... and N more" line.
+ *
+ * Entries are not re-sorted: the five "latest" entries are simply the
+ * last five of the group in input order. Timestamps are rendered with
+ * toLocaleTimeString(), i.e. time of day only, without the date.
+ *
+ * @param history - Recorded eval runs.
+ * @returns The formatted multi-line report, or "No eval history found"
+ *          for an empty array.
+ */
+function formatEvalHistory(history: Array<{
+  timestamp: string;
+  eval_name: string;
+  score: number;
+  run_count: number;
+}>): string {
+  if (history.length === 0) {
+    return "No eval history found";
+  }
+  const lines: string[] = [];
+  lines.push("Eval History:");
+  lines.push("");
+  // Group by eval name
+  const grouped = new Map<string, typeof history>();
+  for (const entry of history) {
+    if (!grouped.has(entry.eval_name)) {
+      grouped.set(entry.eval_name, []);
+    }
+    grouped.get(entry.eval_name)!.push(entry); // non-null is safe: the key was set just above if it was missing
+  }
+  // Display each eval group
+  for (const [evalName, entries] of grouped) {
+    lines.push(`${evalName}:`);
+    const sparkline = generateSparkline(entries.map((e) => e.score));
+    lines.push(`  Trend: ${sparkline}`);
+    // Show latest 5 entries (the last five in input order)
+    const latest = entries.slice(-5);
+    for (const entry of latest) {
+      const time = new Date(entry.timestamp).toLocaleTimeString();
+      lines.push(`  ${time} - run #${entry.run_count}: ${entry.score.toFixed(2)}`);
+    }
+    if (entries.length > 5) {
+      lines.push(`  ... and ${entries.length - 5} more`);
+    }
+    lines.push("");
+  }
+  return lines.join("\n");
+}
+/**
+ * Format eval run result (gate check).
+ *
+ * Builds a newline-joined report: a PASS/FAIL banner, the phase, the
+ * current score (two decimals), the baseline when one is provided, the
+ * regression as a percentage with one decimal (prefixed with "+" when
+ * positive) when one is provided, and finally the gate message verbatim.
+ *
+ * @param result - Gate outcome; baseline and regressionPercent are
+ *                 optional and their lines are omitted when undefined.
+ *                 regressionPercent is a fraction (0.025 renders as 2.5%).
+ * @returns The formatted multi-line report.
+ */
+function formatEvalRunResult(result: {
+  passed: boolean;
+  phase: "bootstrap" | "stabilization" | "production";
+  message: string;
+  baseline?: number;
+  currentScore: number;
+  regressionPercent?: number;
+}): string {
+  const lines: string[] = [];
+  // Pass/fail banner
+  const status = result.passed ? "✅ PASS" : "❌ FAIL";
+  lines.push(status);
+  lines.push("");
+  // Phase and score
+  lines.push(`Phase: ${result.phase}`);
+  lines.push(`Score: ${result.currentScore.toFixed(2)}`);
+  if (result.baseline !== undefined) {
+    lines.push(`Baseline: ${result.baseline.toFixed(2)}`);
+  }
+  if (result.regressionPercent !== undefined) {
+    const sign = result.regressionPercent > 0 ? "+" : ""; // negative values already carry their own "-" sign
+    lines.push(`Regression: ${sign}${(result.regressionPercent * 100).toFixed(1)}%`);
+  }
+  lines.push("");
+  lines.push(result.message);
+  return lines.join("\n");
+}
+// ============================================================================
+// Eval Run Tests
+// ============================================================================
+describe("Eval Run CI Mode", () => { // Exercises the history, gate and results-file flow against a throwaway project directory.
+  let testDir: string;
+  beforeEach(() => {
+    testDir = join(tmpdir(), `eval-run-test-${Date.now()}`); // NOTE(review): the name is unique only per millisecond
+    mkdirSync(testDir, { recursive: true });
+  });
+  afterEach(() => {
+    if (existsSync(testDir)) {
+      rmSync(testDir, { recursive: true, force: true });
+    }
+  });
+  test("writes eval results JSON file", async () => {
+    // Import the function we need to test
+    const { recordEvalRun, getScoreHistory } = await import("../src/eval-history.js");
+    const { checkGate } = await import("../src/eval-gates.js");
+    const { ensureHiveDirectory } = await import("../src/hive.js");
+    // Set up test data
+    const evalName = "test-eval";
+    const mockScore = 0.85;
+    // Ensure directory exists (presumably creates <testDir>/.hive, which the writeFileSync below relies on - confirm in src/hive)
+    ensureHiveDirectory(testDir);
+    // Get history and record run (simulating what eval run does)
+    const history = getScoreHistory(testDir, evalName);
+    recordEvalRun(testDir, {
+      timestamp: new Date().toISOString(),
+      eval_name: evalName,
+      score: mockScore,
+      run_count: history.length + 1,
+    });
+    // Check gate
+    const gateResult = checkGate(testDir, evalName, mockScore);
+    // Write results file (simulating CI mode)
+    const resultsPath = join(testDir, ".hive", "eval-results.json");
+    const results = { [evalName]: gateResult };
+    writeFileSync(resultsPath, JSON.stringify(results, null, 2));
+    // Verify file exists and has correct structure
+    expect(existsSync(resultsPath)).toBe(true);
+    const savedResults = JSON.parse(readFileSync(resultsPath, "utf-8"));
+    expect(savedResults).toHaveProperty(evalName);
+    expect(savedResults[evalName]).toMatchObject({
+      passed: true,
+      phase: "bootstrap", // only one run has been recorded in this fresh directory
+      currentScore: mockScore,
+    });
+  });
+  test("bootstrap phase always passes", async () => {
+    const { checkGate } = await import("../src/eval-gates.js");
+    // Even with a low score, bootstrap phase should pass (no runs are recorded in this test)
+    const result = checkGate(testDir, "test-eval", 0.1);
+    expect(result.passed).toBe(true);
+    expect(result.phase).toBe("bootstrap");
+    expect(result.message).toContain("Bootstrap phase");
+  });
+  test("production phase fails on regression", async () => {
+    const { recordEvalRun } = await import("../src/eval-history.js");
+    const { checkGate } = await import("../src/eval-gates.js");
+    const { ensureHiveDirectory } = await import("../src/hive.js");
+    ensureHiveDirectory(testDir);
+    // Simulate 60 runs with consistent high scores to reach production phase
+    for (let i = 0; i < 60; i++) {
+      recordEvalRun(testDir, {
+        timestamp: new Date().toISOString(),
+        eval_name: "test-eval",
+        score: 0.9,
+        run_count: i + 1,
+      });
+    }
+    // Now test with a regressed score (>5% drop from 0.9 baseline)
+    const regressedScore = 0.8; // 11% drop
+    const result = checkGate(testDir, "test-eval", regressedScore);
+    expect(result.passed).toBe(false);
+    expect(result.phase).toBe("production");
+    expect(result.message).toContain("FAIL");
+  });
+});