npm - opencode-swarm-plugin - Versions diffs - 0.38.0 → 0.40.0 - Mend

opencode-swarm-plugin 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91)

package/.env +2 -0
package/.hive/eval-results.json +26 -0
package/.hive/issues.jsonl +27 -0
package/.hive/memories.jsonl +23 -1
package/.opencode/eval-history.jsonl +12 -0
package/CHANGELOG.md +182 -0
package/README.md +29 -12
package/bin/swarm.test.ts +881 -0
package/bin/swarm.ts +686 -0
package/dist/compaction-hook.d.ts +8 -1
package/dist/compaction-hook.d.ts.map +1 -1
package/dist/compaction-observability.d.ts +173 -0
package/dist/compaction-observability.d.ts.map +1 -0
package/dist/compaction-prompt-scoring.d.ts +124 -0
package/dist/compaction-prompt-scoring.d.ts.map +1 -0
package/dist/eval-capture.d.ts +174 -1
package/dist/eval-capture.d.ts.map +1 -1
package/dist/eval-gates.d.ts +84 -0
package/dist/eval-gates.d.ts.map +1 -0
package/dist/eval-history.d.ts +117 -0
package/dist/eval-history.d.ts.map +1 -0
package/dist/eval-learning.d.ts +216 -0
package/dist/eval-learning.d.ts.map +1 -0
package/dist/hive.d.ts.map +1 -1
package/dist/index.d.ts +80 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +16098 -651
package/dist/plugin.js +16012 -756
package/dist/post-compaction-tracker.d.ts +133 -0
package/dist/post-compaction-tracker.d.ts.map +1 -0
package/dist/schemas/task.d.ts +3 -3
package/dist/swarm-orchestrate.d.ts +23 -0
package/dist/swarm-orchestrate.d.ts.map +1 -1
package/dist/swarm-prompts.d.ts +25 -1
package/dist/swarm-prompts.d.ts.map +1 -1
package/dist/swarm.d.ts +4 -0
package/dist/swarm.d.ts.map +1 -1
package/evals/README.md +702 -105
package/evals/compaction-prompt.eval.ts +149 -0
package/evals/coordinator-behavior.eval.ts +8 -8
package/evals/fixtures/compaction-prompt-cases.ts +305 -0
package/evals/lib/compaction-loader.test.ts +248 -0
package/evals/lib/compaction-loader.ts +320 -0
package/evals/lib/data-loader.test.ts +345 -0
package/evals/lib/data-loader.ts +107 -6
package/evals/scorers/compaction-prompt-scorers.ts +145 -0
package/evals/scorers/compaction-scorers.ts +13 -13
package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
package/evals/scorers/coordinator-discipline.ts +348 -15
package/evals/scorers/index.test.ts +146 -0
package/evals/scorers/index.ts +104 -0
package/evals/swarm-decomposition.eval.ts +9 -2
package/examples/commands/swarm.md +291 -21
package/examples/plugin-wrapper-template.ts +117 -0
package/package.json +7 -5
package/scripts/migrate-unknown-sessions.ts +349 -0
package/src/compaction-capture.integration.test.ts +257 -0
package/src/compaction-hook.test.ts +42 -0
package/src/compaction-hook.ts +315 -86
package/src/compaction-observability.integration.test.ts +139 -0
package/src/compaction-observability.test.ts +187 -0
package/src/compaction-observability.ts +324 -0
package/src/compaction-prompt-scorers.test.ts +299 -0
package/src/compaction-prompt-scoring.ts +298 -0
package/src/eval-capture.test.ts +626 -1
package/src/eval-capture.ts +286 -2
package/src/eval-gates.test.ts +306 -0
package/src/eval-gates.ts +218 -0
package/src/eval-history.test.ts +508 -0
package/src/eval-history.ts +214 -0
package/src/eval-learning.test.ts +378 -0
package/src/eval-learning.ts +360 -0
package/src/eval-runner.test.ts +96 -0
package/src/eval-runner.ts +356 -0
package/src/hive.ts +34 -0
package/src/index.ts +115 -2
package/src/memory.test.ts +110 -0
package/src/memory.ts +34 -0
package/src/post-compaction-tracker.test.ts +251 -0
package/src/post-compaction-tracker.ts +237 -0
package/src/swarm-decompose.ts +2 -2
package/src/swarm-orchestrate.ts +2 -2
package/src/swarm-prompts.ts +2 -2
package/src/swarm-review.ts +3 -3
package/dist/beads.d.ts +0 -386
package/dist/beads.d.ts.map +0 -1
package/dist/schemas/bead-events.d.ts +0 -698
package/dist/schemas/bead-events.d.ts.map +0 -1
package/dist/schemas/bead.d.ts +0 -255
package/dist/schemas/bead.d.ts.map +0 -1
package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0

package/src/eval-capture.ts CHANGED

@@ -9,9 +9,15 @@
  * 2. swarm_complete captures: outcome signals per subtask
  * 3. swarm_record_outcome captures: learning signals
  * 4. Human feedback (optional): accept/reject/modify
- * 5. Coordinator events: decisions, violations, outcomes
+ * 5. Coordinator events: decisions, violations, outcomes, compaction
  * 6. Session capture: full coordinator session to ~/.config/swarm-tools/sessions/
  *
+ * Event types:
+ * - DECISION: strategy_selected, worker_spawned, review_completed, decomposition_complete, researcher_spawned, skill_loaded, inbox_checked, blocker_resolved, scope_change_approved, scope_change_rejected
+ * - VIOLATION: coordinator_edited_file, coordinator_ran_tests, coordinator_reserved_files, no_worker_spawned
+ * - OUTCOME: subtask_success, subtask_retry, subtask_failed, epic_complete, blocker_detected
+ * - COMPACTION: detection_complete, prompt_generated, context_injected, resumption_started, tool_call_tracked
+ *
  * @module eval-capture
  */
 import * as fs from "node:fs";
@@ -123,7 +129,7 @@ export type PartialEvalRecord = Partial<EvalRecord> & {
 };
 /**
- * Coordinator Event - captures coordinator decisions, violations, and outcomes
+ * Coordinator Event - captures coordinator decisions, violations, outcomes, and compaction
  */
 export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
   // DECISION events
@@ -137,6 +143,12 @@ export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
       "worker_spawned",
       "review_completed",
       "decomposition_complete",
+      "researcher_spawned",
+      "skill_loaded",
+      "inbox_checked",
+      "blocker_resolved",
+      "scope_change_approved",
+      "scope_change_rejected",
     ]),
     payload: z.any(),
   }),
@@ -165,6 +177,22 @@ export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
       "subtask_retry",
       "subtask_failed",
       "epic_complete",
+      "blocker_detected",
+    ]),
+    payload: z.any(),
+  }),
+  // COMPACTION events
+  z.object({
+    session_id: z.string(),
+    epic_id: z.string(),
+    timestamp: z.string(),
+    event_type: z.literal("COMPACTION"),
+    compaction_type: z.enum([
+      "detection_complete",
+      "prompt_generated",
+      "context_injected",
+      "resumption_started",
+      "tool_call_tracked",
     ]),
     payload: z.any(),
   }),
@@ -595,6 +623,262 @@ export function captureCoordinatorEvent(event: CoordinatorEvent): void {
   fs.appendFileSync(sessionPath, line, "utf-8");
 }
+/**
+ * Capture a compaction event to the session file
+ *
+ * Helper for capturing COMPACTION events with automatic timestamp generation.
+ * Tracks compaction hook lifecycle: detection → prompt generation → context injection → resumption.
+ *
+ * **Part of eval-driven development pipeline:** Compaction events are used by `compaction-prompt.eval.ts`
+ * to score prompt quality (ID specificity, actionability, coordinator identity).
+ *
+ * **Lifecycle stages:**
+ * - `detection_complete` - Compaction detected (confidence level, context type)
+ * - `prompt_generated` - Continuation prompt created (FULL content stored for eval)
+ * - `context_injected` - Prompt injected into OpenCode context
+ * - `resumption_started` - Coordinator resumed from checkpoint
+ * - `tool_call_tracked` - First tool called post-compaction (measures discipline)
+ *
+ * @param params - Compaction event parameters
+ * @param params.session_id - Coordinator session ID
+ * @param params.epic_id - Epic ID being coordinated
+ * @param params.compaction_type - Stage of compaction lifecycle
+ * @param params.payload - Event-specific data (full prompt content, detection results, etc.)
+ *
+ * @example
+ * // Capture detection complete
+ * captureCompactionEvent({
+ *   session_id: "session-123",
+ *   epic_id: "bd-456",
+ *   compaction_type: "detection_complete",
+ *   payload: {
+ *     confidence: "high",
+ *     context_type: "full",
+ *     epic_id: "bd-456",
+ *   },
+ * });
+ *
+ * @example
+ * // Capture prompt generated (with full content for eval)
+ * captureCompactionEvent({
+ *   session_id: "session-123",
+ *   epic_id: "bd-456",
+ *   compaction_type: "prompt_generated",
+ *   payload: {
+ *     prompt_length: 5000,
+ *     full_prompt: "You are a coordinator...", // Full prompt, not truncated - used for quality scoring
+ *     context_type: "full",
+ *   },
+ * });
+ */
+export function captureCompactionEvent(params: {
+  session_id: string;
+  epic_id: string;
+  compaction_type:
+    | "detection_complete"
+    | "prompt_generated"
+    | "context_injected"
+    | "resumption_started"
+    | "tool_call_tracked";
+  payload: any;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "COMPACTION",
+    compaction_type: params.compaction_type,
+    payload: params.payload,
+  };
+  captureCoordinatorEvent(event);
+}
+/**
+ * Capture a researcher spawned event
+ *
+ * Called when coordinator spawns a swarm-researcher to handle unfamiliar technology
+ * or gather documentation before decomposition.
+ */
+export function captureResearcherSpawned(params: {
+  session_id: string;
+  epic_id: string;
+  researcher_id: string;
+  research_topic: string;
+  tools_used?: string[];
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: "researcher_spawned",
+    payload: {
+      researcher_id: params.researcher_id,
+      research_topic: params.research_topic,
+      tools_used: params.tools_used || [],
+    },
+  };
+  captureCoordinatorEvent(event);
+}
+/**
+ * Capture a skill loaded event
+ *
+ * Called when coordinator loads domain knowledge via skills_use().
+ */
+export function captureSkillLoaded(params: {
+  session_id: string;
+  epic_id: string;
+  skill_name: string;
+  context?: string;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: "skill_loaded",
+    payload: {
+      skill_name: params.skill_name,
+      context: params.context,
+    },
+  };
+  captureCoordinatorEvent(event);
+}
+/**
+ * Capture an inbox checked event
+ *
+ * Called when coordinator checks swarmmail inbox for worker messages.
+ * Tracks monitoring frequency and responsiveness.
+ */
+export function captureInboxChecked(params: {
+  session_id: string;
+  epic_id: string;
+  message_count: number;
+  urgent_count: number;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: "inbox_checked",
+    payload: {
+      message_count: params.message_count,
+      urgent_count: params.urgent_count,
+    },
+  };
+  captureCoordinatorEvent(event);
+}
+/**
+ * Capture a blocker resolved event
+ *
+ * Called when coordinator successfully unblocks a worker.
+ */
+export function captureBlockerResolved(params: {
+  session_id: string;
+  epic_id: string;
+  worker_id: string;
+  subtask_id: string;
+  blocker_type: string;
+  resolution: string;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: "blocker_resolved",
+    payload: {
+      worker_id: params.worker_id,
+      subtask_id: params.subtask_id,
+      blocker_type: params.blocker_type,
+      resolution: params.resolution,
+    },
+  };
+  captureCoordinatorEvent(event);
+}
+/**
+ * Capture a scope change decision event
+ *
+ * Called when coordinator approves or rejects a worker's scope expansion request.
+ */
+export function captureScopeChangeDecision(params: {
+  session_id: string;
+  epic_id: string;
+  worker_id: string;
+  subtask_id: string;
+  approved: boolean;
+  original_scope?: string;
+  new_scope?: string;
+  requested_scope?: string;
+  rejection_reason?: string;
+  estimated_time_add?: number;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: params.approved ? "scope_change_approved" : "scope_change_rejected",
+    payload: params.approved
+      ? {
+          worker_id: params.worker_id,
+          subtask_id: params.subtask_id,
+          original_scope: params.original_scope,
+          new_scope: params.new_scope,
+          estimated_time_add: params.estimated_time_add,
+        }
+      : {
+          worker_id: params.worker_id,
+          subtask_id: params.subtask_id,
+          requested_scope: params.requested_scope,
+          rejection_reason: params.rejection_reason,
+        },
+  };
+  captureCoordinatorEvent(event);
+}
+/**
+ * Capture a blocker detected event
+ *
+ * Called when a worker reports being blocked (OUTCOME event, not DECISION).
+ */
+export function captureBlockerDetected(params: {
+  session_id: string;
+  epic_id: string;
+  worker_id: string;
+  subtask_id: string;
+  blocker_type: string;
+  blocker_description: string;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "OUTCOME",
+    outcome_type: "blocker_detected",
+    payload: {
+      worker_id: params.worker_id,
+      subtask_id: params.subtask_id,
+      blocker_type: params.blocker_type,
+      blocker_description: params.blocker_description,
+      reported_at: new Date().toISOString(),
+    },
+  };
+  captureCoordinatorEvent(event);
+}
 /**
  * Read all events from a session file
  */

package/src/eval-gates.test.ts ADDED

@@ -0,0 +1,306 @@
+/**
+ * Tests for progressive eval gates
+ *
+ * TDD approach:
+ * RED: Tests written first, all failing
+ * GREEN: Minimal implementation to pass
+ * REFACTOR: Clean up while keeping tests green
+ */
+import { afterEach, beforeEach, describe, expect, test } from "bun:test";
+import * as fs from "node:fs";
+import { checkGate } from "./eval-gates.js";
+import { recordEvalRun } from "./eval-history.js";
+const TEST_PROJECT = "/tmp/eval-gates-test";
+beforeEach(() => {
+	// Clean slate for each test
+	if (fs.existsSync(TEST_PROJECT)) {
+		fs.rmSync(TEST_PROJECT, { recursive: true });
+	}
+	fs.mkdirSync(TEST_PROJECT, { recursive: true });
+});
+afterEach(() => {
+	// Cleanup
+	if (fs.existsSync(TEST_PROJECT)) {
+		fs.rmSync(TEST_PROJECT, { recursive: true });
+	}
+});
+/**
+ * Helper to create run history
+ */
+function seedHistory(evalName: string, scores: number[]): void {
+	for (let i = 0; i < scores.length; i++) {
+		recordEvalRun(TEST_PROJECT, {
+			timestamp: new Date(Date.now() + i * 1000).toISOString(),
+			eval_name: evalName,
+			score: scores[i],
+			run_count: i + 1,
+		});
+	}
+}
+describe("checkGate - Bootstrap Phase (<10 runs)", () => {
+	test("always passes with 0 runs", () => {
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.5);
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("bootstrap");
+		expect(result.message).toContain("Bootstrap phase");
+	});
+	test("always passes with 9 runs, even with score drop", () => {
+		seedHistory("my-eval", [0.9, 0.88, 0.87, 0.86, 0.85, 0.84, 0.83, 0.82, 0.81]);
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.5); // 50% drop
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("bootstrap");
+		expect(result.message).toContain("Bootstrap phase");
+	});
+	test("provides run count in message", () => {
+		seedHistory("my-eval", [0.8, 0.8, 0.8]);
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.75);
+		expect(result.message).toContain("3/10");
+	});
+});
+describe("checkGate - Stabilization Phase (10-50 runs)", () => {
+	test("exactly 10 runs enters stabilization", () => {
+		seedHistory("my-eval", Array(10).fill(0.85));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.84);
+		expect(result.phase).toBe("stabilization");
+	});
+	test("passes with <10% regression", () => {
+		seedHistory("my-eval", Array(15).fill(0.9));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.82); // 8.8% drop
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("stabilization");
+		expect(result.message).toContain("acceptable");
+	});
+	test("WARNS on >10% regression but still passes", () => {
+		seedHistory("my-eval", Array(15).fill(0.9));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.8); // 11.1% drop
+		expect(result.passed).toBe(true); // Still passes in stabilization
+		expect(result.phase).toBe("stabilization");
+		expect(result.message).toContain("regression");
+		expect(result.message).toMatch(/10%|11%/); // Should mention threshold
+	});
+	test("edge case: exactly 10% regression", () => {
+		seedHistory("my-eval", Array(20).fill(0.9));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.81); // Exactly 10% drop
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("stabilization");
+	});
+	test("passes with score improvement", () => {
+		seedHistory("my-eval", Array(25).fill(0.8));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.95);
+		expect(result.passed).toBe(true);
+		expect(result.message).toContain("acceptable");
+	});
+	test("exactly 50 runs still in stabilization", () => {
+		seedHistory("my-eval", Array(50).fill(0.85));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.84);
+		expect(result.phase).toBe("stabilization");
+	});
+});
+describe("checkGate - Production Phase (>50 runs + variance <0.1)", () => {
+	test("enters production with 51 stable runs", () => {
+		seedHistory("my-eval", Array(51).fill(0.85));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.84);
+		expect(result.phase).toBe("production");
+	});
+	test("FAILS on >5% regression in production", () => {
+		seedHistory("my-eval", Array(60).fill(0.9));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.84); // 6.7% drop
+		expect(result.passed).toBe(false);
+		expect(result.phase).toBe("production");
+		expect(result.message).toContain("FAIL");
+		expect(result.message).toMatch(/5%|6%/);
+	});
+	test("passes with <5% regression in production", () => {
+		seedHistory("my-eval", Array(60).fill(0.9));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.86); // 4.4% drop
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("production");
+		expect(result.message).toContain("acceptable");
+	});
+	test("edge case: exactly 5% regression", () => {
+		seedHistory("my-eval", Array(60).fill(0.9));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.855); // Exactly 5% drop
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("production");
+	});
+	test("stays in stabilization if variance too high (>0.1) despite >50 runs", () => {
+		// Need significant wild variance to push above 0.1
+		// From memory: 60 stable + 50 alternating wild = variance ~0.103
+		const stableRuns = Array(60).fill(0.85);
+		const wildRuns = Array(50)
+			.fill(0)
+			.map((_, i) => (i % 2 === 0 ? 0.1 : 0.9));
+		seedHistory("my-eval", [...stableRuns, ...wildRuns]);
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.84);
+		expect(result.phase).toBe("stabilization");
+		expect(result.message).toContain("variance");
+	});
+	test("passes with score improvement in production", () => {
+		seedHistory("my-eval", Array(60).fill(0.8));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.95);
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("production");
+	});
+});
+describe("checkGate - Baseline Calculation", () => {
+	test("uses mean of all historical scores as baseline", () => {
+		// Need 10+ runs to exit bootstrap and see baseline in message
+		seedHistory("my-eval", [0.8, 0.85, 0.9, 0.95, 1.0, 0.9, 0.9, 0.9, 0.9, 0.9]); // mean = 0.9
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.88);
+		// 0.88 is ~2.2% drop from 0.9 mean (within stabilization tolerance)
+		expect(result.passed).toBe(true);
+		expect(result.message).toContain("0.90"); // Should show baseline
+	});
+	test("handles different eval names independently", () => {
+		seedHistory("eval-a", Array(15).fill(0.9));
+		seedHistory("eval-b", Array(15).fill(0.5));
+		const resultA = checkGate(TEST_PROJECT, "eval-a", 0.88);
+		const resultB = checkGate(TEST_PROJECT, "eval-b", 0.48);
+		expect(resultA.passed).toBe(true);
+		expect(resultB.passed).toBe(true);
+	});
+});
+describe("checkGate - Edge Cases", () => {
+	test("handles score of 0", () => {
+		seedHistory("my-eval", Array(15).fill(0.8));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0);
+		expect(result.passed).toBe(true); // Still passes in stabilization with warning
+		expect(result.message).toContain("regression");
+	});
+	test("handles perfect score of 1.0", () => {
+		seedHistory("my-eval", Array(15).fill(0.9));
+		const result = checkGate(TEST_PROJECT, "my-eval", 1.0);
+		expect(result.passed).toBe(true);
+	});
+	test("handles no history file (first run)", () => {
+		// No seedHistory call - empty project
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.75);
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("bootstrap");
+	});
+	test("handles baseline of 0 (avoid division by zero)", () => {
+		seedHistory("my-eval", Array(15).fill(0));
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.5);
+		expect(result.passed).toBe(true);
+		expect(result.message).not.toContain("NaN");
+		expect(result.message).not.toContain("Infinity");
+	});
+});
+describe("checkGate - Configurable Thresholds", () => {
+	test("accepts custom stabilization threshold", () => {
+		seedHistory("my-eval", Array(15).fill(0.9));
+		// 15% regression with custom 20% threshold - should pass
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.765, {
+			stabilizationThreshold: 0.2, // 20% instead of default 10%
+		});
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("stabilization");
+		expect(result.message).toContain("acceptable");
+	});
+	test("accepts custom production threshold", () => {
+		seedHistory("my-eval", Array(60).fill(0.9));
+		// 7% regression with custom 10% threshold - should pass
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.837, {
+			productionThreshold: 0.1, // 10% instead of default 5%
+		});
+		expect(result.passed).toBe(true);
+		expect(result.phase).toBe("production");
+	});
+	test("custom threshold makes test fail when exceeded", () => {
+		seedHistory("my-eval", Array(60).fill(0.9));
+		// 7% regression with custom 3% threshold - should fail
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.837, {
+			productionThreshold: 0.03, // 3% instead of default 5%
+		});
+		expect(result.passed).toBe(false);
+		expect(result.phase).toBe("production");
+		expect(result.message).toContain("FAIL");
+	});
+	test("partial config uses defaults for unspecified thresholds", () => {
+		seedHistory("my-eval", Array(15).fill(0.9));
+		// Only override production threshold
+		const result = checkGate(TEST_PROJECT, "my-eval", 0.88, {
+			productionThreshold: 0.01,
+			// stabilizationThreshold not specified - uses default 0.1
+		});
+		expect(result.passed).toBe(true); // 2.2% regression < 10% stabilization default
+	});
+});