opencode-swarm-plugin 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +27 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +182 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +881 -0
  9. package/bin/swarm.ts +686 -0
  10. package/dist/compaction-hook.d.ts +8 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-observability.d.ts +173 -0
  13. package/dist/compaction-observability.d.ts.map +1 -0
  14. package/dist/compaction-prompt-scoring.d.ts +124 -0
  15. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  16. package/dist/eval-capture.d.ts +174 -1
  17. package/dist/eval-capture.d.ts.map +1 -1
  18. package/dist/eval-gates.d.ts +84 -0
  19. package/dist/eval-gates.d.ts.map +1 -0
  20. package/dist/eval-history.d.ts +117 -0
  21. package/dist/eval-history.d.ts.map +1 -0
  22. package/dist/eval-learning.d.ts +216 -0
  23. package/dist/eval-learning.d.ts.map +1 -0
  24. package/dist/hive.d.ts.map +1 -1
  25. package/dist/index.d.ts +80 -1
  26. package/dist/index.d.ts.map +1 -1
  27. package/dist/index.js +16098 -651
  28. package/dist/plugin.js +16012 -756
  29. package/dist/post-compaction-tracker.d.ts +133 -0
  30. package/dist/post-compaction-tracker.d.ts.map +1 -0
  31. package/dist/schemas/task.d.ts +3 -3
  32. package/dist/swarm-orchestrate.d.ts +23 -0
  33. package/dist/swarm-orchestrate.d.ts.map +1 -1
  34. package/dist/swarm-prompts.d.ts +25 -1
  35. package/dist/swarm-prompts.d.ts.map +1 -1
  36. package/dist/swarm.d.ts +4 -0
  37. package/dist/swarm.d.ts.map +1 -1
  38. package/evals/README.md +702 -105
  39. package/evals/compaction-prompt.eval.ts +149 -0
  40. package/evals/coordinator-behavior.eval.ts +8 -8
  41. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  42. package/evals/lib/compaction-loader.test.ts +248 -0
  43. package/evals/lib/compaction-loader.ts +320 -0
  44. package/evals/lib/data-loader.test.ts +345 -0
  45. package/evals/lib/data-loader.ts +107 -6
  46. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  47. package/evals/scorers/compaction-scorers.ts +13 -13
  48. package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
  49. package/evals/scorers/coordinator-discipline.ts +348 -15
  50. package/evals/scorers/index.test.ts +146 -0
  51. package/evals/scorers/index.ts +104 -0
  52. package/evals/swarm-decomposition.eval.ts +9 -2
  53. package/examples/commands/swarm.md +291 -21
  54. package/examples/plugin-wrapper-template.ts +117 -0
  55. package/package.json +7 -5
  56. package/scripts/migrate-unknown-sessions.ts +349 -0
  57. package/src/compaction-capture.integration.test.ts +257 -0
  58. package/src/compaction-hook.test.ts +42 -0
  59. package/src/compaction-hook.ts +315 -86
  60. package/src/compaction-observability.integration.test.ts +139 -0
  61. package/src/compaction-observability.test.ts +187 -0
  62. package/src/compaction-observability.ts +324 -0
  63. package/src/compaction-prompt-scorers.test.ts +299 -0
  64. package/src/compaction-prompt-scoring.ts +298 -0
  65. package/src/eval-capture.test.ts +626 -1
  66. package/src/eval-capture.ts +286 -2
  67. package/src/eval-gates.test.ts +306 -0
  68. package/src/eval-gates.ts +218 -0
  69. package/src/eval-history.test.ts +508 -0
  70. package/src/eval-history.ts +214 -0
  71. package/src/eval-learning.test.ts +378 -0
  72. package/src/eval-learning.ts +360 -0
  73. package/src/eval-runner.test.ts +96 -0
  74. package/src/eval-runner.ts +356 -0
  75. package/src/hive.ts +34 -0
  76. package/src/index.ts +115 -2
  77. package/src/memory.test.ts +110 -0
  78. package/src/memory.ts +34 -0
  79. package/src/post-compaction-tracker.test.ts +251 -0
  80. package/src/post-compaction-tracker.ts +237 -0
  81. package/src/swarm-decompose.ts +2 -2
  82. package/src/swarm-orchestrate.ts +2 -2
  83. package/src/swarm-prompts.ts +2 -2
  84. package/src/swarm-review.ts +3 -3
  85. package/dist/beads.d.ts +0 -386
  86. package/dist/beads.d.ts.map +0 -1
  87. package/dist/schemas/bead-events.d.ts +0 -698
  88. package/dist/schemas/bead-events.d.ts.map +0 -1
  89. package/dist/schemas/bead.d.ts +0 -255
  90. package/dist/schemas/bead.d.ts.map +0 -1
  91. package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
@@ -9,9 +9,15 @@
9
9
  * 2. swarm_complete captures: outcome signals per subtask
10
10
  * 3. swarm_record_outcome captures: learning signals
11
11
  * 4. Human feedback (optional): accept/reject/modify
12
- * 5. Coordinator events: decisions, violations, outcomes
12
+ * 5. Coordinator events: decisions, violations, outcomes, compaction
13
13
  * 6. Session capture: full coordinator session to ~/.config/swarm-tools/sessions/
14
14
  *
15
+ * Event types:
16
+ * - DECISION: strategy_selected, worker_spawned, review_completed, decomposition_complete, researcher_spawned, skill_loaded, inbox_checked, blocker_resolved, scope_change_approved, scope_change_rejected
17
+ * - VIOLATION: coordinator_edited_file, coordinator_ran_tests, coordinator_reserved_files, no_worker_spawned
18
+ * - OUTCOME: subtask_success, subtask_retry, subtask_failed, epic_complete, blocker_detected
19
+ * - COMPACTION: detection_complete, prompt_generated, context_injected, resumption_started, tool_call_tracked
20
+ *
15
21
  * @module eval-capture
16
22
  */
17
23
  import * as fs from "node:fs";
@@ -123,7 +129,7 @@ export type PartialEvalRecord = Partial<EvalRecord> & {
123
129
  };
124
130
 
125
131
  /**
126
- * Coordinator Event - captures coordinator decisions, violations, and outcomes
132
+ * Coordinator Event - captures coordinator decisions, violations, outcomes, and compaction
127
133
  */
128
134
  export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
129
135
  // DECISION events
@@ -137,6 +143,12 @@ export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
137
143
  "worker_spawned",
138
144
  "review_completed",
139
145
  "decomposition_complete",
146
+ "researcher_spawned",
147
+ "skill_loaded",
148
+ "inbox_checked",
149
+ "blocker_resolved",
150
+ "scope_change_approved",
151
+ "scope_change_rejected",
140
152
  ]),
141
153
  payload: z.any(),
142
154
  }),
@@ -165,6 +177,22 @@ export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
165
177
  "subtask_retry",
166
178
  "subtask_failed",
167
179
  "epic_complete",
180
+ "blocker_detected",
181
+ ]),
182
+ payload: z.any(),
183
+ }),
184
+ // COMPACTION events
185
+ z.object({
186
+ session_id: z.string(),
187
+ epic_id: z.string(),
188
+ timestamp: z.string(),
189
+ event_type: z.literal("COMPACTION"),
190
+ compaction_type: z.enum([
191
+ "detection_complete",
192
+ "prompt_generated",
193
+ "context_injected",
194
+ "resumption_started",
195
+ "tool_call_tracked",
168
196
  ]),
169
197
  payload: z.any(),
170
198
  }),
@@ -595,6 +623,262 @@ export function captureCoordinatorEvent(event: CoordinatorEvent): void {
595
623
  fs.appendFileSync(sessionPath, line, "utf-8");
596
624
  }
597
625
 
626
+ /**
627
+ * Capture a compaction event to the session file
628
+ *
629
+ * Helper for capturing COMPACTION events with automatic timestamp generation.
630
+ * Tracks compaction hook lifecycle: detection → prompt generation → context injection → resumption → first post-compaction tool call.
631
+ *
632
+ * **Part of eval-driven development pipeline:** Compaction events are used by `compaction-prompt.eval.ts`
633
+ * to score prompt quality (ID specificity, actionability, coordinator identity).
634
+ *
635
+ * **Lifecycle stages:**
636
+ * - `detection_complete` - Compaction detected (confidence level, context type)
637
+ * - `prompt_generated` - Continuation prompt created (FULL content stored for eval)
638
+ * - `context_injected` - Prompt injected into OpenCode context
639
+ * - `resumption_started` - Coordinator resumed from checkpoint
640
+ * - `tool_call_tracked` - First tool called post-compaction (measures discipline)
641
+ *
642
+ * @param params - Compaction event parameters
643
+ * @param params.session_id - Coordinator session ID
644
+ * @param params.epic_id - Epic ID being coordinated
645
+ * @param params.compaction_type - Stage of compaction lifecycle
646
+ * @param params.payload - Event-specific data (full prompt content, detection results, etc.)
647
+ *
648
+ * @example
649
+ * // Capture detection complete
650
+ * captureCompactionEvent({
651
+ * session_id: "session-123",
652
+ * epic_id: "bd-456",
653
+ * compaction_type: "detection_complete",
654
+ * payload: {
655
+ * confidence: "high",
656
+ * context_type: "full",
657
+ * epic_id: "bd-456",
658
+ * },
659
+ * });
660
+ *
661
+ * @example
662
+ * // Capture prompt generated (with full content for eval)
663
+ * captureCompactionEvent({
664
+ * session_id: "session-123",
665
+ * epic_id: "bd-456",
666
+ * compaction_type: "prompt_generated",
667
+ * payload: {
668
+ * prompt_length: 5000,
669
+ * full_prompt: "You are a coordinator...", // Full prompt, not truncated - used for quality scoring
670
+ * context_type: "full",
671
+ * },
672
+ * });
673
+ */
674
+ export function captureCompactionEvent(params: {
675
+ session_id: string;
676
+ epic_id: string;
677
+ compaction_type:
678
+ | "detection_complete"
679
+ | "prompt_generated"
680
+ | "context_injected"
681
+ | "resumption_started"
682
+ | "tool_call_tracked";
683
+ payload: any;
684
+ }): void {
685
+ const event: CoordinatorEvent = {
686
+ session_id: params.session_id,
687
+ epic_id: params.epic_id,
688
+ timestamp: new Date().toISOString(),
689
+ event_type: "COMPACTION",
690
+ compaction_type: params.compaction_type,
691
+ payload: params.payload,
692
+ };
693
+
694
+ captureCoordinatorEvent(event);
695
+ }
696
+
697
+ /**
698
+ * Capture a researcher spawned event
699
+ *
700
+ * Called when coordinator spawns a swarm-researcher to handle unfamiliar technology
701
+ * or gather documentation before decomposition.
702
+ */
703
+ export function captureResearcherSpawned(params: {
704
+ session_id: string;
705
+ epic_id: string;
706
+ researcher_id: string;
707
+ research_topic: string;
708
+ tools_used?: string[];
709
+ }): void {
710
+ const event: CoordinatorEvent = {
711
+ session_id: params.session_id,
712
+ epic_id: params.epic_id,
713
+ timestamp: new Date().toISOString(),
714
+ event_type: "DECISION",
715
+ decision_type: "researcher_spawned",
716
+ payload: {
717
+ researcher_id: params.researcher_id,
718
+ research_topic: params.research_topic,
719
+ tools_used: params.tools_used || [],
720
+ },
721
+ };
722
+
723
+ captureCoordinatorEvent(event);
724
+ }
725
+
726
+ /**
727
+ * Capture a skill loaded event
728
+ *
729
+ * Called when coordinator loads domain knowledge via skills_use().
730
+ */
731
+ export function captureSkillLoaded(params: {
732
+ session_id: string;
733
+ epic_id: string;
734
+ skill_name: string;
735
+ context?: string;
736
+ }): void {
737
+ const event: CoordinatorEvent = {
738
+ session_id: params.session_id,
739
+ epic_id: params.epic_id,
740
+ timestamp: new Date().toISOString(),
741
+ event_type: "DECISION",
742
+ decision_type: "skill_loaded",
743
+ payload: {
744
+ skill_name: params.skill_name,
745
+ context: params.context,
746
+ },
747
+ };
748
+
749
+ captureCoordinatorEvent(event);
750
+ }
751
+
752
+ /**
753
+ * Capture an inbox checked event
754
+ *
755
+ * Called when coordinator checks swarmmail inbox for worker messages.
756
+ * Tracks monitoring frequency and responsiveness.
757
+ */
758
+ export function captureInboxChecked(params: {
759
+ session_id: string;
760
+ epic_id: string;
761
+ message_count: number;
762
+ urgent_count: number;
763
+ }): void {
764
+ const event: CoordinatorEvent = {
765
+ session_id: params.session_id,
766
+ epic_id: params.epic_id,
767
+ timestamp: new Date().toISOString(),
768
+ event_type: "DECISION",
769
+ decision_type: "inbox_checked",
770
+ payload: {
771
+ message_count: params.message_count,
772
+ urgent_count: params.urgent_count,
773
+ },
774
+ };
775
+
776
+ captureCoordinatorEvent(event);
777
+ }
778
+
779
+ /**
780
+ * Capture a blocker resolved event
781
+ *
782
+ * Called when coordinator successfully unblocks a worker.
783
+ */
784
+ export function captureBlockerResolved(params: {
785
+ session_id: string;
786
+ epic_id: string;
787
+ worker_id: string;
788
+ subtask_id: string;
789
+ blocker_type: string;
790
+ resolution: string;
791
+ }): void {
792
+ const event: CoordinatorEvent = {
793
+ session_id: params.session_id,
794
+ epic_id: params.epic_id,
795
+ timestamp: new Date().toISOString(),
796
+ event_type: "DECISION",
797
+ decision_type: "blocker_resolved",
798
+ payload: {
799
+ worker_id: params.worker_id,
800
+ subtask_id: params.subtask_id,
801
+ blocker_type: params.blocker_type,
802
+ resolution: params.resolution,
803
+ },
804
+ };
805
+
806
+ captureCoordinatorEvent(event);
807
+ }
808
+
809
+ /**
810
+ * Capture a scope change decision event
811
+ *
812
+ * Called when coordinator approves or rejects a worker's scope expansion request.
813
+ */
814
+ export function captureScopeChangeDecision(params: {
815
+ session_id: string;
816
+ epic_id: string;
817
+ worker_id: string;
818
+ subtask_id: string;
819
+ approved: boolean;
820
+ original_scope?: string;
821
+ new_scope?: string;
822
+ requested_scope?: string;
823
+ rejection_reason?: string;
824
+ estimated_time_add?: number;
825
+ }): void {
826
+ const event: CoordinatorEvent = {
827
+ session_id: params.session_id,
828
+ epic_id: params.epic_id,
829
+ timestamp: new Date().toISOString(),
830
+ event_type: "DECISION",
831
+ decision_type: params.approved ? "scope_change_approved" : "scope_change_rejected",
832
+ payload: params.approved
833
+ ? {
834
+ worker_id: params.worker_id,
835
+ subtask_id: params.subtask_id,
836
+ original_scope: params.original_scope,
837
+ new_scope: params.new_scope,
838
+ estimated_time_add: params.estimated_time_add,
839
+ }
840
+ : {
841
+ worker_id: params.worker_id,
842
+ subtask_id: params.subtask_id,
843
+ requested_scope: params.requested_scope,
844
+ rejection_reason: params.rejection_reason,
845
+ },
846
+ };
847
+
848
+ captureCoordinatorEvent(event);
849
+ }
850
+
851
+ /**
852
+ * Capture a blocker detected event
853
+ *
854
+ * Called when a worker reports being blocked (OUTCOME event, not DECISION).
855
+ */
856
+ export function captureBlockerDetected(params: {
857
+ session_id: string;
858
+ epic_id: string;
859
+ worker_id: string;
860
+ subtask_id: string;
861
+ blocker_type: string;
862
+ blocker_description: string;
863
+ }): void {
864
+ const event: CoordinatorEvent = {
865
+ session_id: params.session_id,
866
+ epic_id: params.epic_id,
867
+ timestamp: new Date().toISOString(),
868
+ event_type: "OUTCOME",
869
+ outcome_type: "blocker_detected",
870
+ payload: {
871
+ worker_id: params.worker_id,
872
+ subtask_id: params.subtask_id,
873
+ blocker_type: params.blocker_type,
874
+ blocker_description: params.blocker_description,
875
+ reported_at: new Date().toISOString(),
876
+ },
877
+ };
878
+
879
+ captureCoordinatorEvent(event);
880
+ }
881
+
598
882
  /**
599
883
  * Read all events from a session file
600
884
  */
@@ -0,0 +1,306 @@
1
+ /**
2
+ * Tests for progressive eval gates
3
+ *
4
+ * TDD approach:
5
+ * RED: Tests written first, all failing
6
+ * GREEN: Minimal implementation to pass
7
+ * REFACTOR: Clean up while keeping tests green
8
+ */
9
+ import { afterEach, beforeEach, describe, expect, test } from "bun:test";
10
+ import * as fs from "node:fs";
11
+ import { checkGate } from "./eval-gates.js";
12
+ import { recordEvalRun } from "./eval-history.js";
13
+
14
+ const TEST_PROJECT = "/tmp/eval-gates-test";
15
+
16
+ beforeEach(() => {
17
+ // Clean slate for each test
18
+ if (fs.existsSync(TEST_PROJECT)) {
19
+ fs.rmSync(TEST_PROJECT, { recursive: true });
20
+ }
21
+ fs.mkdirSync(TEST_PROJECT, { recursive: true });
22
+ });
23
+
24
+ afterEach(() => {
25
+ // Cleanup
26
+ if (fs.existsSync(TEST_PROJECT)) {
27
+ fs.rmSync(TEST_PROJECT, { recursive: true });
28
+ }
29
+ });
30
+
31
+ /**
32
+ * Helper to create run history
33
+ */
34
+ function seedHistory(evalName: string, scores: number[]): void {
35
+ for (let i = 0; i < scores.length; i++) {
36
+ recordEvalRun(TEST_PROJECT, {
37
+ timestamp: new Date(Date.now() + i * 1000).toISOString(),
38
+ eval_name: evalName,
39
+ score: scores[i],
40
+ run_count: i + 1,
41
+ });
42
+ }
43
+ }
44
+
45
+ describe("checkGate - Bootstrap Phase (<10 runs)", () => {
46
+ test("always passes with 0 runs", () => {
47
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.5);
48
+
49
+ expect(result.passed).toBe(true);
50
+ expect(result.phase).toBe("bootstrap");
51
+ expect(result.message).toContain("Bootstrap phase");
52
+ });
53
+
54
+ test("always passes with 9 runs, even with score drop", () => {
55
+ seedHistory("my-eval", [0.9, 0.88, 0.87, 0.86, 0.85, 0.84, 0.83, 0.82, 0.81]);
56
+
57
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.5); // ~41% drop from the ~0.85 mean
58
+
59
+ expect(result.passed).toBe(true);
60
+ expect(result.phase).toBe("bootstrap");
61
+ expect(result.message).toContain("Bootstrap phase");
62
+ });
63
+
64
+ test("provides run count in message", () => {
65
+ seedHistory("my-eval", [0.8, 0.8, 0.8]);
66
+
67
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.75);
68
+
69
+ expect(result.message).toContain("3/10");
70
+ });
71
+ });
72
+
73
+ describe("checkGate - Stabilization Phase (10-50 runs)", () => {
74
+ test("exactly 10 runs enters stabilization", () => {
75
+ seedHistory("my-eval", Array(10).fill(0.85));
76
+
77
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.84);
78
+
79
+ expect(result.phase).toBe("stabilization");
80
+ });
81
+
82
+ test("passes with <10% regression", () => {
83
+ seedHistory("my-eval", Array(15).fill(0.9));
84
+
85
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.82); // 8.9% drop
86
+
87
+ expect(result.passed).toBe(true);
88
+ expect(result.phase).toBe("stabilization");
89
+ expect(result.message).toContain("acceptable");
90
+ });
91
+
92
+ test("WARNS on >10% regression but still passes", () => {
93
+ seedHistory("my-eval", Array(15).fill(0.9));
94
+
95
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.8); // 11.1% drop
96
+
97
+ expect(result.passed).toBe(true); // Still passes in stabilization
98
+ expect(result.phase).toBe("stabilization");
99
+ expect(result.message).toContain("regression");
100
+ expect(result.message).toMatch(/10%|11%/); // Should mention threshold
101
+ });
102
+
103
+ test("edge case: exactly 10% regression", () => {
104
+ seedHistory("my-eval", Array(20).fill(0.9));
105
+
106
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.81); // Exactly 10% drop
107
+
108
+ expect(result.passed).toBe(true);
109
+ expect(result.phase).toBe("stabilization");
110
+ });
111
+
112
+ test("passes with score improvement", () => {
113
+ seedHistory("my-eval", Array(25).fill(0.8));
114
+
115
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.95);
116
+
117
+ expect(result.passed).toBe(true);
118
+ expect(result.message).toContain("acceptable");
119
+ });
120
+
121
+ test("exactly 50 runs still in stabilization", () => {
122
+ seedHistory("my-eval", Array(50).fill(0.85));
123
+
124
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.84);
125
+
126
+ expect(result.phase).toBe("stabilization");
127
+ });
128
+ });
129
+
130
+ describe("checkGate - Production Phase (>50 runs + variance <0.1)", () => {
131
+ test("enters production with 51 stable runs", () => {
132
+ seedHistory("my-eval", Array(51).fill(0.85));
133
+
134
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.84);
135
+
136
+ expect(result.phase).toBe("production");
137
+ });
138
+
139
+ test("FAILS on >5% regression in production", () => {
140
+ seedHistory("my-eval", Array(60).fill(0.9));
141
+
142
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.84); // 6.7% drop
143
+
144
+ expect(result.passed).toBe(false);
145
+ expect(result.phase).toBe("production");
146
+ expect(result.message).toContain("FAIL");
147
+ expect(result.message).toMatch(/5%|6%/);
148
+ });
149
+
150
+ test("passes with <5% regression in production", () => {
151
+ seedHistory("my-eval", Array(60).fill(0.9));
152
+
153
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.86); // 4.4% drop
154
+
155
+ expect(result.passed).toBe(true);
156
+ expect(result.phase).toBe("production");
157
+ expect(result.message).toContain("acceptable");
158
+ });
159
+
160
+ test("edge case: exactly 5% regression", () => {
161
+ seedHistory("my-eval", Array(60).fill(0.9));
162
+
163
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.855); // Exactly 5% drop
164
+
165
+ expect(result.passed).toBe(true);
166
+ expect(result.phase).toBe("production");
167
+ });
168
+
169
+ test("stays in stabilization if variance too high (>0.1) despite >50 runs", () => {
170
+ // Need significant wild variance to push above 0.1
171
+ // 60 stable runs at 0.85 plus 50 runs alternating 0.1/0.9 give a variance of ~0.103 (just above the 0.1 cutoff)
172
+ const stableRuns = Array(60).fill(0.85);
173
+ const wildRuns = Array(50)
174
+ .fill(0)
175
+ .map((_, i) => (i % 2 === 0 ? 0.1 : 0.9));
176
+ seedHistory("my-eval", [...stableRuns, ...wildRuns]);
177
+
178
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.84);
179
+
180
+ expect(result.phase).toBe("stabilization");
181
+ expect(result.message).toContain("variance");
182
+ });
183
+
184
+ test("passes with score improvement in production", () => {
185
+ seedHistory("my-eval", Array(60).fill(0.8));
186
+
187
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.95);
188
+
189
+ expect(result.passed).toBe(true);
190
+ expect(result.phase).toBe("production");
191
+ });
192
+ });
193
+
194
+ describe("checkGate - Baseline Calculation", () => {
195
+ test("uses mean of all historical scores as baseline", () => {
196
+ // Need 10+ runs to exit bootstrap and see baseline in message
197
+ seedHistory("my-eval", [0.8, 0.85, 0.9, 0.95, 1.0, 0.9, 0.9, 0.9, 0.9, 0.9]); // mean = 0.9
198
+
199
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.88);
200
+
201
+ // 0.88 is ~2.2% drop from 0.9 mean (within stabilization tolerance)
202
+ expect(result.passed).toBe(true);
203
+ expect(result.message).toContain("0.90"); // Should show baseline
204
+ });
205
+
206
+ test("handles different eval names independently", () => {
207
+ seedHistory("eval-a", Array(15).fill(0.9));
208
+ seedHistory("eval-b", Array(15).fill(0.5));
209
+
210
+ const resultA = checkGate(TEST_PROJECT, "eval-a", 0.88);
211
+ const resultB = checkGate(TEST_PROJECT, "eval-b", 0.48);
212
+
213
+ expect(resultA.passed).toBe(true);
214
+ expect(resultB.passed).toBe(true);
215
+ });
216
+ });
217
+
218
+ describe("checkGate - Edge Cases", () => {
219
+ test("handles score of 0", () => {
220
+ seedHistory("my-eval", Array(15).fill(0.8));
221
+
222
+ const result = checkGate(TEST_PROJECT, "my-eval", 0);
223
+
224
+ expect(result.passed).toBe(true); // Still passes in stabilization with warning
225
+ expect(result.message).toContain("regression");
226
+ });
227
+
228
+ test("handles perfect score of 1.0", () => {
229
+ seedHistory("my-eval", Array(15).fill(0.9));
230
+
231
+ const result = checkGate(TEST_PROJECT, "my-eval", 1.0);
232
+
233
+ expect(result.passed).toBe(true);
234
+ });
235
+
236
+ test("handles no history file (first run)", () => {
237
+ // No seedHistory call - empty project
238
+
239
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.75);
240
+
241
+ expect(result.passed).toBe(true);
242
+ expect(result.phase).toBe("bootstrap");
243
+ });
244
+
245
+ test("handles baseline of 0 (avoid division by zero)", () => {
246
+ seedHistory("my-eval", Array(15).fill(0));
247
+
248
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.5);
249
+
250
+ expect(result.passed).toBe(true);
251
+ expect(result.message).not.toContain("NaN");
252
+ expect(result.message).not.toContain("Infinity");
253
+ });
254
+ });
255
+
256
+ describe("checkGate - Configurable Thresholds", () => {
257
+ test("accepts custom stabilization threshold", () => {
258
+ seedHistory("my-eval", Array(15).fill(0.9));
259
+
260
+ // 15% regression with custom 20% threshold - should pass
261
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.765, {
262
+ stabilizationThreshold: 0.2, // 20% instead of default 10%
263
+ });
264
+
265
+ expect(result.passed).toBe(true);
266
+ expect(result.phase).toBe("stabilization");
267
+ expect(result.message).toContain("acceptable");
268
+ });
269
+
270
+ test("accepts custom production threshold", () => {
271
+ seedHistory("my-eval", Array(60).fill(0.9));
272
+
273
+ // 7% regression with custom 10% threshold - should pass
274
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.837, {
275
+ productionThreshold: 0.1, // 10% instead of default 5%
276
+ });
277
+
278
+ expect(result.passed).toBe(true);
279
+ expect(result.phase).toBe("production");
280
+ });
281
+
282
+ test("custom threshold makes test fail when exceeded", () => {
283
+ seedHistory("my-eval", Array(60).fill(0.9));
284
+
285
+ // 7% regression with custom 3% threshold - should fail
286
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.837, {
287
+ productionThreshold: 0.03, // 3% instead of default 5%
288
+ });
289
+
290
+ expect(result.passed).toBe(false);
291
+ expect(result.phase).toBe("production");
292
+ expect(result.message).toContain("FAIL");
293
+ });
294
+
295
+ test("partial config uses defaults for unspecified thresholds", () => {
296
+ seedHistory("my-eval", Array(15).fill(0.9));
297
+
298
+ // Only override production threshold
299
+ const result = checkGate(TEST_PROJECT, "my-eval", 0.88, {
300
+ productionThreshold: 0.01,
301
+ // stabilizationThreshold not specified - uses default 0.1
302
+ });
303
+
304
+ expect(result.passed).toBe(true); // 2.2% regression < 10% stabilization default
305
+ });
306
+ });