opencode-swarm-plugin 0.38.0 → 0.40.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env +2 -0
- package/.hive/eval-results.json +26 -0
- package/.hive/issues.jsonl +27 -0
- package/.hive/memories.jsonl +23 -1
- package/.opencode/eval-history.jsonl +12 -0
- package/CHANGELOG.md +182 -0
- package/README.md +29 -12
- package/bin/swarm.test.ts +881 -0
- package/bin/swarm.ts +686 -0
- package/dist/compaction-hook.d.ts +8 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-observability.d.ts +173 -0
- package/dist/compaction-observability.d.ts.map +1 -0
- package/dist/compaction-prompt-scoring.d.ts +124 -0
- package/dist/compaction-prompt-scoring.d.ts.map +1 -0
- package/dist/eval-capture.d.ts +174 -1
- package/dist/eval-capture.d.ts.map +1 -1
- package/dist/eval-gates.d.ts +84 -0
- package/dist/eval-gates.d.ts.map +1 -0
- package/dist/eval-history.d.ts +117 -0
- package/dist/eval-history.d.ts.map +1 -0
- package/dist/eval-learning.d.ts +216 -0
- package/dist/eval-learning.d.ts.map +1 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +80 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +16098 -651
- package/dist/plugin.js +16012 -756
- package/dist/post-compaction-tracker.d.ts +133 -0
- package/dist/post-compaction-tracker.d.ts.map +1 -0
- package/dist/schemas/task.d.ts +3 -3
- package/dist/swarm-orchestrate.d.ts +23 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +25 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm.d.ts +4 -0
- package/dist/swarm.d.ts.map +1 -1
- package/evals/README.md +702 -105
- package/evals/compaction-prompt.eval.ts +149 -0
- package/evals/coordinator-behavior.eval.ts +8 -8
- package/evals/fixtures/compaction-prompt-cases.ts +305 -0
- package/evals/lib/compaction-loader.test.ts +248 -0
- package/evals/lib/compaction-loader.ts +320 -0
- package/evals/lib/data-loader.test.ts +345 -0
- package/evals/lib/data-loader.ts +107 -6
- package/evals/scorers/compaction-prompt-scorers.ts +145 -0
- package/evals/scorers/compaction-scorers.ts +13 -13
- package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
- package/evals/scorers/coordinator-discipline.ts +348 -15
- package/evals/scorers/index.test.ts +146 -0
- package/evals/scorers/index.ts +104 -0
- package/evals/swarm-decomposition.eval.ts +9 -2
- package/examples/commands/swarm.md +291 -21
- package/examples/plugin-wrapper-template.ts +117 -0
- package/package.json +7 -5
- package/scripts/migrate-unknown-sessions.ts +349 -0
- package/src/compaction-capture.integration.test.ts +257 -0
- package/src/compaction-hook.test.ts +42 -0
- package/src/compaction-hook.ts +315 -86
- package/src/compaction-observability.integration.test.ts +139 -0
- package/src/compaction-observability.test.ts +187 -0
- package/src/compaction-observability.ts +324 -0
- package/src/compaction-prompt-scorers.test.ts +299 -0
- package/src/compaction-prompt-scoring.ts +298 -0
- package/src/eval-capture.test.ts +626 -1
- package/src/eval-capture.ts +286 -2
- package/src/eval-gates.test.ts +306 -0
- package/src/eval-gates.ts +218 -0
- package/src/eval-history.test.ts +508 -0
- package/src/eval-history.ts +214 -0
- package/src/eval-learning.test.ts +378 -0
- package/src/eval-learning.ts +360 -0
- package/src/eval-runner.test.ts +96 -0
- package/src/eval-runner.ts +356 -0
- package/src/hive.ts +34 -0
- package/src/index.ts +115 -2
- package/src/memory.test.ts +110 -0
- package/src/memory.ts +34 -0
- package/src/post-compaction-tracker.test.ts +251 -0
- package/src/post-compaction-tracker.ts +237 -0
- package/src/swarm-decompose.ts +2 -2
- package/src/swarm-orchestrate.ts +2 -2
- package/src/swarm-prompts.ts +2 -2
- package/src/swarm-review.ts +3 -3
- package/dist/beads.d.ts +0 -386
- package/dist/beads.d.ts.map +0 -1
- package/dist/schemas/bead-events.d.ts +0 -698
- package/dist/schemas/bead-events.d.ts.map +0 -1
- package/dist/schemas/bead.d.ts +0 -255
- package/dist/schemas/bead.d.ts.map +0 -1
- /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
package/src/eval-capture.ts
CHANGED
|
@@ -9,9 +9,15 @@
|
|
|
9
9
|
* 2. swarm_complete captures: outcome signals per subtask
|
|
10
10
|
* 3. swarm_record_outcome captures: learning signals
|
|
11
11
|
* 4. Human feedback (optional): accept/reject/modify
|
|
12
|
-
* 5. Coordinator events: decisions, violations, outcomes
|
|
12
|
+
* 5. Coordinator events: decisions, violations, outcomes, compaction
|
|
13
13
|
* 6. Session capture: full coordinator session to ~/.config/swarm-tools/sessions/
|
|
14
14
|
*
|
|
15
|
+
* Event types:
|
|
16
|
+
* - DECISION: strategy_selected, worker_spawned, review_completed, decomposition_complete, researcher_spawned, skill_loaded, inbox_checked, blocker_resolved, scope_change_approved, scope_change_rejected
|
|
17
|
+
* - VIOLATION: coordinator_edited_file, coordinator_ran_tests, coordinator_reserved_files, no_worker_spawned
|
|
18
|
+
* - OUTCOME: subtask_success, subtask_retry, subtask_failed, epic_complete, blocker_detected
|
|
19
|
+
* - COMPACTION: detection_complete, prompt_generated, context_injected, resumption_started, tool_call_tracked
|
|
20
|
+
*
|
|
15
21
|
* @module eval-capture
|
|
16
22
|
*/
|
|
17
23
|
import * as fs from "node:fs";
|
|
@@ -123,7 +129,7 @@ export type PartialEvalRecord = Partial<EvalRecord> & {
|
|
|
123
129
|
};
|
|
124
130
|
|
|
125
131
|
/**
|
|
126
|
-
* Coordinator Event - captures coordinator decisions, violations, and
|
|
132
|
+
* Coordinator Event - captures coordinator decisions, violations, outcomes, and compaction
|
|
127
133
|
*/
|
|
128
134
|
export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
|
|
129
135
|
// DECISION events
|
|
@@ -137,6 +143,12 @@ export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
|
|
|
137
143
|
"worker_spawned",
|
|
138
144
|
"review_completed",
|
|
139
145
|
"decomposition_complete",
|
|
146
|
+
"researcher_spawned",
|
|
147
|
+
"skill_loaded",
|
|
148
|
+
"inbox_checked",
|
|
149
|
+
"blocker_resolved",
|
|
150
|
+
"scope_change_approved",
|
|
151
|
+
"scope_change_rejected",
|
|
140
152
|
]),
|
|
141
153
|
payload: z.any(),
|
|
142
154
|
}),
|
|
@@ -165,6 +177,22 @@ export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
|
|
|
165
177
|
"subtask_retry",
|
|
166
178
|
"subtask_failed",
|
|
167
179
|
"epic_complete",
|
|
180
|
+
"blocker_detected",
|
|
181
|
+
]),
|
|
182
|
+
payload: z.any(),
|
|
183
|
+
}),
|
|
184
|
+
// COMPACTION events
|
|
185
|
+
z.object({
|
|
186
|
+
session_id: z.string(),
|
|
187
|
+
epic_id: z.string(),
|
|
188
|
+
timestamp: z.string(),
|
|
189
|
+
event_type: z.literal("COMPACTION"),
|
|
190
|
+
compaction_type: z.enum([
|
|
191
|
+
"detection_complete",
|
|
192
|
+
"prompt_generated",
|
|
193
|
+
"context_injected",
|
|
194
|
+
"resumption_started",
|
|
195
|
+
"tool_call_tracked",
|
|
168
196
|
]),
|
|
169
197
|
payload: z.any(),
|
|
170
198
|
}),
|
|
@@ -595,6 +623,262 @@ export function captureCoordinatorEvent(event: CoordinatorEvent): void {
|
|
|
595
623
|
fs.appendFileSync(sessionPath, line, "utf-8");
|
|
596
624
|
}
|
|
597
625
|
|
|
626
|
+
/**
 * Record one stage of the compaction hook lifecycle as a COMPACTION event.
 *
 * The supplied fields are wrapped in a `CoordinatorEvent`, stamped with the
 * current time as an ISO 8601 string, and handed to `captureCoordinatorEvent`.
 *
 * **Eval pipeline:** `compaction-prompt.eval.ts` consumes these events to score
 * prompt quality (ID specificity, actionability, coordinator identity).
 *
 * **Stages (detection → prompt generation → context injection → resumption):**
 * - `detection_complete` - compaction detected (confidence level, context type)
 * - `prompt_generated` - continuation prompt created (FULL content stored for eval)
 * - `context_injected` - prompt injected into OpenCode context
 * - `resumption_started` - coordinator resumed from checkpoint
 * - `tool_call_tracked` - first tool called post-compaction (measures discipline)
 *
 * @param params - Compaction event parameters
 * @param params.session_id - Coordinator session ID
 * @param params.epic_id - Epic ID being coordinated
 * @param params.compaction_type - Stage of the compaction lifecycle
 * @param params.payload - Stage-specific data (full prompt content, detection results, etc.);
 *   stored on the event unchanged
 *
 * @example
 * // Detection finished
 * captureCompactionEvent({
 *   session_id: "session-123",
 *   epic_id: "bd-456",
 *   compaction_type: "detection_complete",
 *   payload: {
 *     confidence: "high",
 *     context_type: "full",
 *     epic_id: "bd-456",
 *   },
 * });
 *
 * @example
 * // Prompt generated (keep the whole prompt so it can be scored later)
 * captureCompactionEvent({
 *   session_id: "session-123",
 *   epic_id: "bd-456",
 *   compaction_type: "prompt_generated",
 *   payload: {
 *     prompt_length: 5000,
 *     full_prompt: "You are a coordinator...", // not truncated - used for quality scoring
 *     context_type: "full",
 *   },
 * });
 */
export function captureCompactionEvent(params: {
  session_id: string;
  epic_id: string;
  compaction_type:
    | "detection_complete"
    | "prompt_generated"
    | "context_injected"
    | "resumption_started"
    | "tool_call_tracked";
  payload: any;
}): void {
  const { session_id, epic_id, compaction_type, payload } = params;

  // Field order is kept stable so the serialized record layout does not change.
  const compactionEvent: CoordinatorEvent = {
    session_id,
    epic_id,
    timestamp: new Date().toISOString(),
    event_type: "COMPACTION",
    compaction_type,
    payload,
  };

  captureCoordinatorEvent(compactionEvent);
}
|
|
696
|
+
|
|
697
|
+
/**
 * Record that the coordinator spawned a swarm-researcher.
 *
 * Emitted as a DECISION event with `decision_type: "researcher_spawned"`.
 * Meant for the moment the coordinator delegates research on unfamiliar
 * technology, or documentation gathering, ahead of decomposition.
 *
 * @param params.session_id - Coordinator session ID
 * @param params.epic_id - Epic ID being coordinated
 * @param params.researcher_id - Copied into the payload as `researcher_id`
 * @param params.research_topic - Copied into the payload as `research_topic`
 * @param params.tools_used - Optional; the payload gets `[]` when this is falsy
 */
export function captureResearcherSpawned(params: {
  session_id: string;
  epic_id: string;
  researcher_id: string;
  research_topic: string;
  tools_used?: string[];
}): void {
  const { session_id, epic_id, researcher_id, research_topic } = params;
  // An omitted (or otherwise falsy) tool list is recorded as an empty array.
  const tools_used = params.tools_used || [];

  const spawnEvent: CoordinatorEvent = {
    session_id,
    epic_id,
    timestamp: new Date().toISOString(),
    event_type: "DECISION",
    decision_type: "researcher_spawned",
    payload: { researcher_id, research_topic, tools_used },
  };

  captureCoordinatorEvent(spawnEvent);
}
|
|
725
|
+
|
|
726
|
+
/**
 * Record that the coordinator loaded a skill.
 *
 * Emitted as a DECISION event with `decision_type: "skill_loaded"`; intended
 * to be called when domain knowledge is pulled in via skills_use().
 *
 * @param params.session_id - Coordinator session ID
 * @param params.epic_id - Epic ID being coordinated
 * @param params.skill_name - Copied into the payload as `skill_name`
 * @param params.context - Optional; copied into the payload as-is (may be undefined)
 */
export function captureSkillLoaded(params: {
  session_id: string;
  epic_id: string;
  skill_name: string;
  context?: string;
}): void {
  const { session_id, epic_id, skill_name, context } = params;

  const skillEvent: CoordinatorEvent = {
    session_id,
    epic_id,
    timestamp: new Date().toISOString(),
    event_type: "DECISION",
    decision_type: "skill_loaded",
    payload: { skill_name, context },
  };

  captureCoordinatorEvent(skillEvent);
}
|
|
751
|
+
|
|
752
|
+
/**
 * Record that the coordinator checked its swarmmail inbox.
 *
 * Emitted as a DECISION event with `decision_type: "inbox_checked"`. The
 * counts let later analysis track how often the coordinator monitors worker
 * messages and how responsive it is.
 *
 * @param params.session_id - Coordinator session ID
 * @param params.epic_id - Epic ID being coordinated
 * @param params.message_count - Copied into the payload as `message_count`
 * @param params.urgent_count - Copied into the payload as `urgent_count`
 */
export function captureInboxChecked(params: {
  session_id: string;
  epic_id: string;
  message_count: number;
  urgent_count: number;
}): void {
  const { session_id, epic_id, message_count, urgent_count } = params;

  const inboxEvent: CoordinatorEvent = {
    session_id,
    epic_id,
    timestamp: new Date().toISOString(),
    event_type: "DECISION",
    decision_type: "inbox_checked",
    payload: { message_count, urgent_count },
  };

  captureCoordinatorEvent(inboxEvent);
}
|
|
778
|
+
|
|
779
|
+
/**
 * Record that the coordinator unblocked a worker.
 *
 * Emitted as a DECISION event with `decision_type: "blocker_resolved"`.
 *
 * @param params.session_id - Coordinator session ID
 * @param params.epic_id - Epic ID being coordinated
 * @param params.worker_id - Copied into the payload as `worker_id`
 * @param params.subtask_id - Copied into the payload as `subtask_id`
 * @param params.blocker_type - Copied into the payload as `blocker_type`
 * @param params.resolution - Copied into the payload as `resolution`
 */
export function captureBlockerResolved(params: {
  session_id: string;
  epic_id: string;
  worker_id: string;
  subtask_id: string;
  blocker_type: string;
  resolution: string;
}): void {
  const { session_id, epic_id, worker_id, subtask_id, blocker_type, resolution } =
    params;

  const resolvedEvent: CoordinatorEvent = {
    session_id,
    epic_id,
    timestamp: new Date().toISOString(),
    event_type: "DECISION",
    decision_type: "blocker_resolved",
    payload: { worker_id, subtask_id, blocker_type, resolution },
  };

  captureCoordinatorEvent(resolvedEvent);
}
|
|
808
|
+
|
|
809
|
+
/**
 * Record the coordinator's verdict on a worker's scope expansion request.
 *
 * Emitted as a DECISION event. `approved` selects both the decision type and
 * the payload shape:
 * - approved  → `scope_change_approved` with `original_scope`, `new_scope`,
 *   `estimated_time_add`
 * - otherwise → `scope_change_rejected` with `requested_scope`,
 *   `rejection_reason`
 *
 * Both payloads also carry `worker_id` and `subtask_id`. Optional fields that
 * belong to the other branch are ignored.
 *
 * @param params.session_id - Coordinator session ID
 * @param params.epic_id - Epic ID being coordinated
 * @param params.worker_id - Worker that asked for the scope change
 * @param params.subtask_id - Subtask the request applies to
 * @param params.approved - Whether the request was approved
 * @param params.original_scope - Approved branch only
 * @param params.new_scope - Approved branch only
 * @param params.requested_scope - Rejected branch only
 * @param params.rejection_reason - Rejected branch only
 * @param params.estimated_time_add - Approved branch only
 */
export function captureScopeChangeDecision(params: {
  session_id: string;
  epic_id: string;
  worker_id: string;
  subtask_id: string;
  approved: boolean;
  original_scope?: string;
  new_scope?: string;
  requested_scope?: string;
  rejection_reason?: string;
  estimated_time_add?: number;
}): void {
  const { session_id, epic_id, worker_id, subtask_id } = params;

  let decision_type: "scope_change_approved" | "scope_change_rejected";
  let payload: Record<string, unknown>;

  if (params.approved) {
    decision_type = "scope_change_approved";
    payload = {
      worker_id,
      subtask_id,
      original_scope: params.original_scope,
      new_scope: params.new_scope,
      estimated_time_add: params.estimated_time_add,
    };
  } else {
    decision_type = "scope_change_rejected";
    payload = {
      worker_id,
      subtask_id,
      requested_scope: params.requested_scope,
      rejection_reason: params.rejection_reason,
    };
  }

  const scopeEvent: CoordinatorEvent = {
    session_id,
    epic_id,
    timestamp: new Date().toISOString(),
    event_type: "DECISION",
    decision_type,
    payload,
  };

  captureCoordinatorEvent(scopeEvent);
}
|
|
850
|
+
|
|
851
|
+
/**
 * Capture a blocker detected event
 *
 * Called when a worker reports being blocked (OUTCOME event, not DECISION).
 *
 * The event `timestamp` and the payload's `reported_at` are taken from a
 * single clock read, so the two fields always carry the same instant.
 *
 * @param params.session_id - Coordinator session ID
 * @param params.epic_id - Epic ID being coordinated
 * @param params.worker_id - Copied into the payload as `worker_id`
 * @param params.subtask_id - Copied into the payload as `subtask_id`
 * @param params.blocker_type - Copied into the payload as `blocker_type`
 * @param params.blocker_description - Copied into the payload as `blocker_description`
 */
export function captureBlockerDetected(params: {
  session_id: string;
  epic_id: string;
  worker_id: string;
  subtask_id: string;
  blocker_type: string;
  blocker_description: string;
}): void {
  // Read the clock once: two separate `new Date()` calls can straddle a
  // millisecond boundary and leave `timestamp` and `reported_at` out of sync.
  const reportedAt = new Date().toISOString();

  const event: CoordinatorEvent = {
    session_id: params.session_id,
    epic_id: params.epic_id,
    timestamp: reportedAt,
    event_type: "OUTCOME",
    outcome_type: "blocker_detected",
    payload: {
      worker_id: params.worker_id,
      subtask_id: params.subtask_id,
      blocker_type: params.blocker_type,
      blocker_description: params.blocker_description,
      reported_at: reportedAt,
    },
  };

  captureCoordinatorEvent(event);
}
|
|
881
|
+
|
|
598
882
|
/**
|
|
599
883
|
* Read all events from a session file
|
|
600
884
|
*/
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for progressive eval gates
|
|
3
|
+
*
|
|
4
|
+
* TDD approach:
|
|
5
|
+
* RED: Tests written first, all failing
|
|
6
|
+
* GREEN: Minimal implementation to pass
|
|
7
|
+
* REFACTOR: Clean up while keeping tests green
|
|
8
|
+
*/
|
|
9
|
+
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
|
|
10
|
+
import * as fs from "node:fs";
|
|
11
|
+
import { checkGate } from "./eval-gates.js";
|
|
12
|
+
import { recordEvalRun } from "./eval-history.js";
|
|
13
|
+
|
|
14
|
+
const TEST_PROJECT = "/tmp/eval-gates-test";

/** Delete the scratch project directory if it is present on disk. */
function removeTestProject(): void {
  if (!fs.existsSync(TEST_PROJECT)) {
    return;
  }
  fs.rmSync(TEST_PROJECT, { recursive: true });
}

// Each test begins with an empty, freshly created project directory.
beforeEach(() => {
  removeTestProject();
  fs.mkdirSync(TEST_PROJECT, { recursive: true });
});

// Leave nothing behind once a test has finished.
afterEach(() => {
  removeTestProject();
});
|
|
30
|
+
|
|
31
|
+
/**
 * Seed the test project's eval history with one recorded run per score.
 *
 * Runs are written in array order. Run `i` (0-based) gets `run_count` of
 * `i + 1` and a timestamp `i` seconds after the moment it is recorded, so
 * later scores carry later timestamps.
 *
 * @param evalName - Value stored as `eval_name` on every run
 * @param scores - Scores to record, oldest first
 */
function seedHistory(evalName: string, scores: number[]): void {
  for (const [index, score] of scores.entries()) {
    const offsetMs = index * 1000;
    recordEvalRun(TEST_PROJECT, {
      timestamp: new Date(Date.now() + offsetMs).toISOString(),
      eval_name: evalName,
      score,
      run_count: index + 1,
    });
  }
}
|
|
44
|
+
|
|
45
|
+
// Bootstrap: with fewer than 10 recorded runs the gate is expected to pass
// unconditionally and report how many runs it has seen.
describe("checkGate - Bootstrap Phase (<10 runs)", () => {
  test("always passes with 0 runs", () => {
    // Nothing seeded, so there is no history to compare against.
    const { passed, phase, message } = checkGate(TEST_PROJECT, "my-eval", 0.5);

    expect(passed).toBe(true);
    expect(phase).toBe("bootstrap");
    expect(message).toContain("Bootstrap phase");
  });

  test("always passes with 9 runs, even with score drop", () => {
    const decliningScores = [0.9, 0.88, 0.87, 0.86, 0.85, 0.84, 0.83, 0.82, 0.81];
    seedHistory("my-eval", decliningScores);

    // 0.5 is a large drop relative to every seeded score.
    const { passed, phase, message } = checkGate(TEST_PROJECT, "my-eval", 0.5);

    expect(passed).toBe(true);
    expect(phase).toBe("bootstrap");
    expect(message).toContain("Bootstrap phase");
  });

  test("provides run count in message", () => {
    seedHistory("my-eval", [0.8, 0.8, 0.8]);

    const gate = checkGate(TEST_PROJECT, "my-eval", 0.75);

    // Three runs seeded out of the ten needed to leave bootstrap.
    expect(gate.message).toContain("3/10");
  });
});
|
|
72
|
+
|
|
73
|
+
// Stabilization: between 10 and 50 runs the gate is expected to warn about
// regressions beyond 10% but never to fail.
describe("checkGate - Stabilization Phase (10-50 runs)", () => {
  test("exactly 10 runs enters stabilization", () => {
    seedHistory("my-eval", new Array(10).fill(0.85));

    const gate = checkGate(TEST_PROJECT, "my-eval", 0.84);

    expect(gate.phase).toBe("stabilization");
  });

  test("passes with <10% regression", () => {
    seedHistory("my-eval", new Array(15).fill(0.9));

    // 0.9 -> 0.82 is a drop of roughly 8.9%.
    const { passed, phase, message } = checkGate(TEST_PROJECT, "my-eval", 0.82);

    expect(passed).toBe(true);
    expect(phase).toBe("stabilization");
    expect(message).toContain("acceptable");
  });

  test("WARNS on >10% regression but still passes", () => {
    seedHistory("my-eval", new Array(15).fill(0.9));

    // 0.9 -> 0.8 is a drop of roughly 11.1%.
    const { passed, phase, message } = checkGate(TEST_PROJECT, "my-eval", 0.8);

    // Stabilization only warns; it does not fail the gate.
    expect(passed).toBe(true);
    expect(phase).toBe("stabilization");
    expect(message).toContain("regression");
    // The message should quote either the threshold or the rounded drop.
    expect(message).toMatch(/10%|11%/);
  });

  test("edge case: exactly 10% regression", () => {
    seedHistory("my-eval", new Array(20).fill(0.9));

    // 0.9 -> 0.81 sits right on the 10% boundary.
    const { passed, phase } = checkGate(TEST_PROJECT, "my-eval", 0.81);

    expect(passed).toBe(true);
    expect(phase).toBe("stabilization");
  });

  test("passes with score improvement", () => {
    seedHistory("my-eval", new Array(25).fill(0.8));

    const { passed, message } = checkGate(TEST_PROJECT, "my-eval", 0.95);

    expect(passed).toBe(true);
    expect(message).toContain("acceptable");
  });

  test("exactly 50 runs still in stabilization", () => {
    seedHistory("my-eval", new Array(50).fill(0.85));

    const gate = checkGate(TEST_PROJECT, "my-eval", 0.84);

    expect(gate.phase).toBe("stabilization");
  });
});
|
|
129
|
+
|
|
130
|
+
// Production: more than 50 runs with low score variance. Here a regression
// beyond 5% is expected to fail the gate outright.
describe("checkGate - Production Phase (>50 runs + variance <0.1)", () => {
  test("enters production with 51 stable runs", () => {
    seedHistory("my-eval", new Array(51).fill(0.85));

    const gate = checkGate(TEST_PROJECT, "my-eval", 0.84);

    expect(gate.phase).toBe("production");
  });

  test("FAILS on >5% regression in production", () => {
    seedHistory("my-eval", new Array(60).fill(0.9));

    // 0.9 -> 0.84 is a drop of roughly 6.7%.
    const { passed, phase, message } = checkGate(TEST_PROJECT, "my-eval", 0.84);

    expect(passed).toBe(false);
    expect(phase).toBe("production");
    expect(message).toContain("FAIL");
    expect(message).toMatch(/5%|6%/);
  });

  test("passes with <5% regression in production", () => {
    seedHistory("my-eval", new Array(60).fill(0.9));

    // 0.9 -> 0.86 is a drop of roughly 4.4%.
    const { passed, phase, message } = checkGate(TEST_PROJECT, "my-eval", 0.86);

    expect(passed).toBe(true);
    expect(phase).toBe("production");
    expect(message).toContain("acceptable");
  });

  test("edge case: exactly 5% regression", () => {
    seedHistory("my-eval", new Array(60).fill(0.9));

    // 0.9 -> 0.855 sits right on the 5% boundary.
    const { passed, phase } = checkGate(TEST_PROJECT, "my-eval", 0.855);

    expect(passed).toBe(true);
    expect(phase).toBe("production");
  });

  test("stays in stabilization if variance too high (>0.1) despite >50 runs", () => {
    // 60 runs at 0.85 followed by 50 runs alternating 0.1 / 0.9: the combined
    // scores have a population variance of about 0.103, just over 0.1.
    const steadyScores: number[] = new Array(60).fill(0.85);
    const erraticScores = Array.from({ length: 50 }, (_, i) =>
      i % 2 === 0 ? 0.1 : 0.9,
    );
    seedHistory("my-eval", [...steadyScores, ...erraticScores]);

    const { phase, message } = checkGate(TEST_PROJECT, "my-eval", 0.84);

    expect(phase).toBe("stabilization");
    expect(message).toContain("variance");
  });

  test("passes with score improvement in production", () => {
    seedHistory("my-eval", new Array(60).fill(0.8));

    const { passed, phase } = checkGate(TEST_PROJECT, "my-eval", 0.95);

    expect(passed).toBe(true);
    expect(phase).toBe("production");
  });
});
|
|
193
|
+
|
|
194
|
+
// Baseline: the reference score is expected to be the mean of the recorded
// history, tracked separately for each eval name.
describe("checkGate - Baseline Calculation", () => {
  test("uses mean of all historical scores as baseline", () => {
    // Ten runs are needed to leave bootstrap, where the baseline is reported.
    // These ten scores sum to 9.0, so their mean is 0.9.
    const history = [0.8, 0.85, 0.9, 0.95, 1.0, 0.9, 0.9, 0.9, 0.9, 0.9];
    seedHistory("my-eval", history);

    const { passed, message } = checkGate(TEST_PROJECT, "my-eval", 0.88);

    // 0.88 is about 2.2% below the 0.9 mean, inside the stabilization tolerance.
    expect(passed).toBe(true);
    // The baseline should be printed with two decimals.
    expect(message).toContain("0.90");
  });

  test("handles different eval names independently", () => {
    seedHistory("eval-a", new Array(15).fill(0.9));
    seedHistory("eval-b", new Array(15).fill(0.5));

    const gateA = checkGate(TEST_PROJECT, "eval-a", 0.88);
    const gateB = checkGate(TEST_PROJECT, "eval-b", 0.48);

    expect(gateA.passed).toBe(true);
    expect(gateB.passed).toBe(true);
  });
});
|
|
217
|
+
|
|
218
|
+
// Boundary inputs: extreme scores, missing history, and an all-zero baseline.
describe("checkGate - Edge Cases", () => {
  test("handles score of 0", () => {
    seedHistory("my-eval", new Array(15).fill(0.8));

    const { passed, message } = checkGate(TEST_PROJECT, "my-eval", 0);

    // 15 runs is still stabilization, so the gate warns rather than fails.
    expect(passed).toBe(true);
    expect(message).toContain("regression");
  });

  test("handles perfect score of 1.0", () => {
    seedHistory("my-eval", new Array(15).fill(0.9));

    const gate = checkGate(TEST_PROJECT, "my-eval", 1.0);

    expect(gate.passed).toBe(true);
  });

  test("handles no history file (first run)", () => {
    // Deliberately no seedHistory call: the project directory is empty.
    const { passed, phase } = checkGate(TEST_PROJECT, "my-eval", 0.75);

    expect(passed).toBe(true);
    expect(phase).toBe("bootstrap");
  });

  test("handles baseline of 0 (avoid division by zero)", () => {
    seedHistory("my-eval", new Array(15).fill(0));

    const { passed, message } = checkGate(TEST_PROJECT, "my-eval", 0.5);

    expect(passed).toBe(true);
    // A zero baseline must not leak a broken ratio into the message.
    expect(message).not.toContain("NaN");
    expect(message).not.toContain("Infinity");
  });
});
|
|
255
|
+
|
|
256
|
+
// Threshold overrides passed as the optional fourth argument to checkGate.
describe("checkGate - Configurable Thresholds", () => {
  test("accepts custom stabilization threshold", () => {
    seedHistory("my-eval", new Array(15).fill(0.9));

    // 0.9 -> 0.765 is a 15% drop; a 20% threshold (default 10%) should allow it.
    const overrides = { stabilizationThreshold: 0.2 };
    const { passed, phase, message } = checkGate(
      TEST_PROJECT,
      "my-eval",
      0.765,
      overrides,
    );

    expect(passed).toBe(true);
    expect(phase).toBe("stabilization");
    expect(message).toContain("acceptable");
  });

  test("accepts custom production threshold", () => {
    seedHistory("my-eval", new Array(60).fill(0.9));

    // 0.9 -> 0.837 is a 7% drop; a 10% threshold (default 5%) should allow it.
    const overrides = { productionThreshold: 0.1 };
    const { passed, phase } = checkGate(TEST_PROJECT, "my-eval", 0.837, overrides);

    expect(passed).toBe(true);
    expect(phase).toBe("production");
  });

  test("custom threshold makes test fail when exceeded", () => {
    seedHistory("my-eval", new Array(60).fill(0.9));

    // Same 7% drop, but a 3% threshold (default 5%) should reject it.
    const overrides = { productionThreshold: 0.03 };
    const { passed, phase, message } = checkGate(
      TEST_PROJECT,
      "my-eval",
      0.837,
      overrides,
    );

    expect(passed).toBe(false);
    expect(phase).toBe("production");
    expect(message).toContain("FAIL");
  });

  test("partial config uses defaults for unspecified thresholds", () => {
    seedHistory("my-eval", new Array(15).fill(0.9));

    // Only the production threshold is overridden; stabilizationThreshold is
    // left out and should fall back to its 0.1 default.
    const overrides = { productionThreshold: 0.01 };
    const gate = checkGate(TEST_PROJECT, "my-eval", 0.88, overrides);

    // About 2.2% regression, below the 10% stabilization default.
    expect(gate.passed).toBe(true);
  });
});
|