opencode-swarm-plugin 0.37.0 → 0.39.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env +2 -0
- package/.hive/eval-results.json +26 -0
- package/.hive/issues.jsonl +20 -5
- package/.hive/memories.jsonl +35 -1
- package/.opencode/eval-history.jsonl +12 -0
- package/.turbo/turbo-build.log +4 -4
- package/.turbo/turbo-test.log +319 -319
- package/CHANGELOG.md +258 -0
- package/README.md +50 -0
- package/bin/swarm.test.ts +475 -0
- package/bin/swarm.ts +385 -208
- package/dist/compaction-hook.d.ts +1 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-prompt-scoring.d.ts +124 -0
- package/dist/compaction-prompt-scoring.d.ts.map +1 -0
- package/dist/eval-capture.d.ts +81 -1
- package/dist/eval-capture.d.ts.map +1 -1
- package/dist/eval-gates.d.ts +84 -0
- package/dist/eval-gates.d.ts.map +1 -0
- package/dist/eval-history.d.ts +117 -0
- package/dist/eval-history.d.ts.map +1 -0
- package/dist/eval-learning.d.ts +216 -0
- package/dist/eval-learning.d.ts.map +1 -0
- package/dist/hive.d.ts +59 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +87 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +823 -131
- package/dist/plugin.js +655 -131
- package/dist/post-compaction-tracker.d.ts +133 -0
- package/dist/post-compaction-tracker.d.ts.map +1 -0
- package/dist/swarm-decompose.d.ts +30 -0
- package/dist/swarm-decompose.d.ts.map +1 -1
- package/dist/swarm-orchestrate.d.ts +23 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +25 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm.d.ts +19 -0
- package/dist/swarm.d.ts.map +1 -1
- package/evals/README.md +595 -94
- package/evals/compaction-prompt.eval.ts +149 -0
- package/evals/coordinator-behavior.eval.ts +8 -8
- package/evals/fixtures/compaction-prompt-cases.ts +305 -0
- package/evals/lib/compaction-loader.test.ts +248 -0
- package/evals/lib/compaction-loader.ts +320 -0
- package/evals/lib/data-loader.test.ts +345 -0
- package/evals/lib/data-loader.ts +107 -6
- package/evals/scorers/compaction-prompt-scorers.ts +145 -0
- package/evals/scorers/compaction-scorers.ts +13 -13
- package/evals/scorers/coordinator-discipline.evalite-test.ts +3 -2
- package/evals/scorers/coordinator-discipline.ts +13 -13
- package/examples/plugin-wrapper-template.ts +177 -8
- package/package.json +7 -2
- package/scripts/migrate-unknown-sessions.ts +349 -0
- package/src/compaction-capture.integration.test.ts +257 -0
- package/src/compaction-hook.test.ts +139 -2
- package/src/compaction-hook.ts +113 -2
- package/src/compaction-prompt-scorers.test.ts +299 -0
- package/src/compaction-prompt-scoring.ts +298 -0
- package/src/eval-capture.test.ts +422 -0
- package/src/eval-capture.ts +94 -2
- package/src/eval-gates.test.ts +306 -0
- package/src/eval-gates.ts +218 -0
- package/src/eval-history.test.ts +508 -0
- package/src/eval-history.ts +214 -0
- package/src/eval-learning.test.ts +378 -0
- package/src/eval-learning.ts +360 -0
- package/src/index.ts +61 -1
- package/src/post-compaction-tracker.test.ts +251 -0
- package/src/post-compaction-tracker.ts +237 -0
- package/src/swarm-decompose.test.ts +40 -47
- package/src/swarm-decompose.ts +2 -2
- package/src/swarm-orchestrate.test.ts +270 -7
- package/src/swarm-orchestrate.ts +100 -13
- package/src/swarm-prompts.test.ts +121 -0
- package/src/swarm-prompts.ts +297 -4
- package/src/swarm-research.integration.test.ts +157 -0
- package/src/swarm-review.ts +3 -3
- /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
package/bin/swarm.test.ts
CHANGED
|
@@ -639,3 +639,478 @@ describe("Log command helpers", () => {
|
|
|
639
639
|
});
|
|
640
640
|
});
|
|
641
641
|
});
|
|
642
|
+
|
|
643
|
+
// ============================================================================
|
|
644
|
+
// Eval Commands Tests (TDD)
|
|
645
|
+
// ============================================================================
|
|
646
|
+
|
|
647
|
+
describe("Eval commands", () => {
  // Regression thresholds shared by every formatEvalStatus fixture below.
  const thresholds = { stabilization: 0.1, production: 0.05 };

  // Matches any one of the eight block characters a sparkline is drawn with.
  const sparkChar = /[▁▂▃▄▅▆▇█]/;

  // Builds one history record; keeps the fixtures below on a single line each.
  const historyEntry = (
    timestamp: string,
    eval_name: string,
    score: number,
    run_count: number,
  ) => ({ timestamp, eval_name, score, run_count });

  describe("formatEvalStatus", () => {
    test("displays phase, thresholds, and recent scores", () => {
      const output = formatEvalStatus({
        phase: "stabilization" as const,
        runCount: 25,
        thresholds,
        recentScores: [
          { timestamp: "2024-12-24T10:00:00.000Z", score: 0.85 },
          { timestamp: "2024-12-24T11:00:00.000Z", score: 0.87 },
          { timestamp: "2024-12-24T12:00:00.000Z", score: 0.82 },
        ],
      });

      // Phase name and run count
      expect(output).toContain("stabilization");
      expect(output).toContain("25");

      // Thresholds rendered as percentages (stabilization, then production)
      expect(output).toContain("10%");
      expect(output).toContain("5%");

      // Every recent score is listed
      for (const shownScore of ["0.85", "0.87", "0.82"]) {
        expect(output).toContain(shownScore);
      }
    });

    test("shows bootstrap phase message", () => {
      const output = formatEvalStatus({
        phase: "bootstrap" as const,
        runCount: 5,
        thresholds,
        recentScores: [],
      });

      expect(output).toContain("bootstrap");
      expect(output).toContain("collecting data");
    });

    test("shows production phase message", () => {
      const output = formatEvalStatus({
        phase: "production" as const,
        runCount: 75,
        thresholds,
        recentScores: [],
      });

      expect(output).toContain("production");
    });
  });

  describe("formatEvalHistory", () => {
    test("shows eval entries with timestamps and scores", () => {
      const output = formatEvalHistory([
        historyEntry("2024-12-24T10:00:00.000Z", "swarm-decomposition", 0.85, 1),
        historyEntry("2024-12-24T11:00:00.000Z", "swarm-decomposition", 0.87, 2),
        historyEntry("2024-12-24T12:00:00.000Z", "coordinator-behavior", 0.92, 1),
      ]);

      // Both eval names appear
      expect(output).toContain("swarm-decomposition");
      expect(output).toContain("coordinator-behavior");

      // All three scores appear
      expect(output).toContain("0.85");
      expect(output).toContain("0.87");
      expect(output).toContain("0.92");

      // Run counters appear
      expect(output).toContain("run #1");
      expect(output).toContain("run #2");
    });

    test("returns empty message for no history", () => {
      expect(formatEvalHistory([])).toContain("No eval history");
    });

    test("formats timestamps as readable dates", () => {
      const rawTimestamp = "2024-12-24T10:00:00.000Z";
      const output = formatEvalHistory([historyEntry(rawTimestamp, "test", 0.85, 1)]);

      // The raw ISO string must be replaced by a human-readable time
      expect(output).not.toContain("2024-12-24T10:00:00.000Z");
      expect(output).toMatch(/\d{1,2}:\d{2}/); // Time format
    });
  });

  describe("generateSparkline", () => {
    test("generates sparkline from scores", () => {
      const scores = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0];
      const sparkline = generateSparkline(scores);

      // Drawn with block characters, one per input score
      expect(sparkline).toMatch(sparkChar);
      expect(sparkline.length).toBe(scores.length);

      // Ascending input spans the lowest and the highest bar
      expect(sparkline).toContain("▁"); // Low score
      expect(sparkline).toContain("█"); // High score
    });

    test("handles single score", () => {
      const sparkline = generateSparkline([0.5]);

      expect(sparkline.length).toBe(1);
      expect(sparkline).toMatch(sparkChar);
    });

    test("handles all same scores", () => {
      const sparkline = generateSparkline([0.5, 0.5, 0.5]);

      expect(sparkline.length).toBe(3);
      // Identical scores must all use one and the same character
      const distinctChars = new Set(sparkline.split(""));
      expect(distinctChars.size).toBe(1);
    });

    test("returns empty for empty array", () => {
      expect(generateSparkline([])).toBe("");
    });
  });

  describe("formatEvalRunResult", () => {
    test("shows pass/fail with gate result", () => {
      const output = formatEvalRunResult({
        passed: true,
        phase: "production" as const,
        message: "Production phase: 2.5% regression - acceptable",
        baseline: 0.85,
        currentScore: 0.83,
        regressionPercent: 0.025,
      });

      expect(output).toContain("PASS");
      expect(output).toContain("production");
      expect(output).toContain("0.83"); // current score
      expect(output).toContain("2.5%"); // regression
    });

    test("shows failure with details", () => {
      const output = formatEvalRunResult({
        passed: false,
        phase: "production" as const,
        message: "Production phase FAIL: 8.0% regression - exceeds 5% threshold",
        baseline: 0.85,
        currentScore: 0.78,
        regressionPercent: 0.08,
      });

      expect(output).toContain("FAIL");
      expect(output).toContain("8.0%");
      expect(output).toContain("exceeds");
    });

    test("shows bootstrap phase without baseline", () => {
      const output = formatEvalRunResult({
        passed: true,
        phase: "bootstrap" as const,
        message: "Bootstrap phase (5/10 runs) - collecting data",
        currentScore: 0.85,
      });

      expect(output).toContain("bootstrap");
      expect(output).toContain("collecting data");
      expect(output).not.toContain("baseline");
    });
  });
});
|
|
865
|
+
|
|
866
|
+
// ============================================================================
|
|
867
|
+
// Eval Command Helpers (Implementation)
|
|
868
|
+
// ============================================================================
|
|
869
|
+
|
|
870
|
+
/**
 * Render a list of scores as a one-line Unicode sparkline.
 *
 * Bars are scaled relative to the smallest and largest finite score in the
 * input (not to an absolute 0-1 axis): the lowest score maps to "▁" and the
 * highest to "█". When all finite scores are equal, every one of them is drawn
 * with the same mid-height bar.
 *
 * Non-finite scores (NaN, ±Infinity) cannot be placed on the scale, so each is
 * drawn as a single space. This keeps exactly one output character per input.
 *
 * @param scores - Scores to plot, in display order.
 * @returns A string of `scores.length` characters ("" when there are no scores).
 */
function generateSparkline(scores: number[]): string {
  if (scores.length === 0) return "";

  const chars = ["▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"];
  const gap = " ";

  // One explicit pass instead of Math.min(...scores) / Math.max(...scores):
  // spreading a very large array into call arguments can throw a RangeError,
  // and a single NaN would turn both results into NaN.
  let min = Infinity;
  let max = -Infinity;
  for (const score of scores) {
    if (!Number.isFinite(score)) continue;
    if (score < min) min = score;
    if (score > max) max = score;
  }

  const range = max - min;
  // True when all finite scores are equal, or when there are no finite scores.
  const isFlat = !(range > 0);

  return scores
    .map((score) => {
      if (!Number.isFinite(score)) return gap;
      if (isFlat) return chars[4];

      const normalized = (score - min) / range;
      // normalized === 1 would index one past the end, so clamp to the top bar.
      const index = Math.min(Math.floor(normalized * chars.length), chars.length - 1);
      return chars[index];
    })
    .join("");
}
|
|
894
|
+
|
|
895
|
+
/**
 * Build the multi-line text shown for the current eval status.
 *
 * The output lists the phase (with an emoji banner), the run count, both
 * regression thresholds as whole percentages, and then either a sparkline
 * followed by one line per recent score, or a placeholder line when there are
 * no recent scores.
 *
 * @param status - Phase, run count, thresholds (as fractions) and recent scores.
 * @returns The report as a single newline-joined string.
 */
function formatEvalStatus(status: {
  phase: "bootstrap" | "stabilization" | "production";
  runCount: number;
  thresholds: { stabilization: number; production: number };
  recentScores: Array<{ timestamp: string; score: number }>;
}): string {
  const { phase, runCount, thresholds, recentScores } = status;

  // Thresholds arrive as fractions and are printed as whole percentages.
  const asPercent = (fraction: number): string => (fraction * 100).toFixed(0);

  let banner: string;
  switch (phase) {
    case "bootstrap":
      banner = "🌱";
      break;
    case "stabilization":
      banner = "⚙️";
      break;
    default:
      banner = "🚀";
  }

  const header: string[] = [
    `${banner} Phase: ${phase}`,
    `Runs: ${runCount}`,
    "",
    "Thresholds:",
    ` Stabilization: ${asPercent(thresholds.stabilization)}% regression warning`,
    ` Production: ${asPercent(thresholds.production)}% regression failure`,
    "",
  ];

  if (recentScores.length === 0) {
    return [...header, "No scores yet - collecting data"].join("\n");
  }

  const trend = generateSparkline(recentScores.map((recent) => recent.score));
  const scoreRows = recentScores.map(
    ({ timestamp, score }) => ` ${new Date(timestamp).toLocaleString()}: ${score.toFixed(2)}`,
  );

  return [...header, "Recent scores:", ` ${trend}`, ...scoreRows].join("\n");
}
|
|
933
|
+
|
|
934
|
+
/**
 * Build the multi-line text shown for recorded eval runs, grouped by eval name.
 *
 * Groups appear in the order their eval name first occurs in `history`. Each
 * group shows a sparkline over all of its scores, its last five entries
 * (local time, run number, score) and, when there are more than five, a
 * count of the entries not shown.
 *
 * @param history - Recorded runs, in the order they should be displayed.
 * @returns The report, or "No eval history found" when `history` is empty.
 */
function formatEvalHistory(history: Array<{
  timestamp: string;
  eval_name: string;
  score: number;
  run_count: number;
}>): string {
  if (history.length === 0) {
    return "No eval history found";
  }

  const VISIBLE_ENTRIES = 5;

  // Bucket the records per eval name; a Map preserves first-seen order.
  const byEval = new Map<string, typeof history>();
  history.forEach((record) => {
    const bucket = byEval.get(record.eval_name);
    if (bucket) {
      bucket.push(record);
    } else {
      byEval.set(record.eval_name, [record]);
    }
  });

  const report: string[] = ["Eval History:", ""];

  byEval.forEach((records, name) => {
    const trend = generateSparkline(records.map((record) => record.score));
    report.push(`${name}:`, ` Trend: ${trend}`);

    records.slice(-VISIBLE_ENTRIES).forEach((record) => {
      const clock = new Date(record.timestamp).toLocaleTimeString();
      report.push(` ${clock} - run #${record.run_count}: ${record.score.toFixed(2)}`);
    });

    const hiddenCount = records.length - VISIBLE_ENTRIES;
    if (hiddenCount > 0) {
      report.push(` ... and ${hiddenCount} more`);
    }

    // Blank line between groups
    report.push("");
  });

  return report.join("\n");
}
|
|
982
|
+
|
|
983
|
+
/**
 * Build the multi-line text shown for the outcome of a gate check.
 *
 * Layout: a PASS/FAIL banner, a blank line, the phase and current score, the
 * baseline and regression lines when those values are present, a blank line,
 * and finally the gate's own message.
 *
 * @param result - Gate outcome; `baseline` and `regressionPercent` are optional
 *   and their lines are omitted when undefined. `regressionPercent` is a
 *   fraction and is printed as a percentage with one decimal.
 * @returns The report as a single newline-joined string.
 */
function formatEvalRunResult(result: {
  passed: boolean;
  phase: "bootstrap" | "stabilization" | "production";
  message: string;
  baseline?: number;
  currentScore: number;
  regressionPercent?: number;
}): string {
  const { passed, phase, message, baseline, currentScore, regressionPercent } = result;

  const report: string[] = [
    passed ? "✅ PASS" : "❌ FAIL",
    "",
    `Phase: ${phase}`,
    `Score: ${currentScore.toFixed(2)}`,
  ];

  if (baseline !== undefined) {
    report.push(`Baseline: ${baseline.toFixed(2)}`);
  }

  if (regressionPercent !== undefined) {
    // Negative values already carry their "-"; only positives get an explicit sign.
    const prefix = regressionPercent > 0 ? "+" : "";
    const percentage = (regressionPercent * 100).toFixed(1);
    report.push(`Regression: ${prefix}${percentage}%`);
  }

  report.push("", message);

  return report.join("\n");
}
|
|
1019
|
+
|
|
1020
|
+
// ============================================================================
|
|
1021
|
+
// Eval Run Tests
|
|
1022
|
+
// ============================================================================
|
|
1023
|
+
|
|
1024
|
+
describe("Eval Run CI Mode", () => {
  // Scratch project directory, recreated for every test.
  let testDir: string;

  beforeEach(() => {
    // Date.now() alone repeats within a millisecond, and mkdirSync with
    // `recursive: true` does not fail on an existing path, so two runs could
    // end up sharing (and deleting) one directory. A random suffix keeps each
    // directory unique.
    const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
    testDir = join(tmpdir(), `eval-run-test-${uniqueSuffix}`);
    mkdirSync(testDir, { recursive: true });
  });

  afterEach(() => {
    if (existsSync(testDir)) {
      rmSync(testDir, { recursive: true, force: true });
    }
  });

  test("writes eval results JSON file", async () => {
    // Import the functions this test exercises
    const { recordEvalRun, getScoreHistory } = await import("../src/eval-history.js");
    const { checkGate } = await import("../src/eval-gates.js");
    const { ensureHiveDirectory } = await import("../src/hive.js");

    // Set up test data
    const evalName = "test-eval";
    const mockScore = 0.85;

    // Ensure directory exists
    ensureHiveDirectory(testDir);

    // Get history and record run (simulating what eval run does)
    const history = getScoreHistory(testDir, evalName);
    recordEvalRun(testDir, {
      timestamp: new Date().toISOString(),
      eval_name: evalName,
      score: mockScore,
      run_count: history.length + 1,
    });

    // Check gate
    const gateResult = checkGate(testDir, evalName, mockScore);

    // Write results file (simulating CI mode)
    const resultsPath = join(testDir, ".hive", "eval-results.json");
    const results = { [evalName]: gateResult };
    writeFileSync(resultsPath, JSON.stringify(results, null, 2));

    // Verify file exists and has correct structure
    expect(existsSync(resultsPath)).toBe(true);

    const savedResults = JSON.parse(readFileSync(resultsPath, "utf-8"));
    expect(savedResults).toHaveProperty(evalName);
    // After a single recorded run the gate is expected to report the bootstrap phase.
    expect(savedResults[evalName]).toMatchObject({
      passed: true,
      phase: "bootstrap",
      currentScore: mockScore,
    });
  });

  test("bootstrap phase always passes", async () => {
    const { checkGate } = await import("../src/eval-gates.js");

    // Even with a low score, bootstrap phase should pass
    const result = checkGate(testDir, "test-eval", 0.1);

    expect(result.passed).toBe(true);
    expect(result.phase).toBe("bootstrap");
    expect(result.message).toContain("Bootstrap phase");
  });

  test("production phase fails on regression", async () => {
    const { recordEvalRun } = await import("../src/eval-history.js");
    const { checkGate } = await import("../src/eval-gates.js");
    const { ensureHiveDirectory } = await import("../src/hive.js");

    ensureHiveDirectory(testDir);

    // Simulate 60 runs with consistent high scores to reach production phase
    for (let i = 0; i < 60; i++) {
      recordEvalRun(testDir, {
        timestamp: new Date().toISOString(),
        eval_name: "test-eval",
        score: 0.9,
        run_count: i + 1,
      });
    }

    // Now test with a regressed score (>5% drop from 0.9 baseline)
    const regressedScore = 0.8; // 11% drop
    const result = checkGate(testDir, "test-eval", regressedScore);

    expect(result.passed).toBe(false);
    expect(result.phase).toBe("production");
    expect(result.message).toContain("FAIL");
  });
});
|