opencode-swarm-plugin 0.38.0 → 0.39.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +11 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +130 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +475 -0
  9. package/bin/swarm.ts +383 -0
  10. package/dist/compaction-hook.d.ts +1 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-prompt-scoring.d.ts +124 -0
  13. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  14. package/dist/eval-capture.d.ts +81 -1
  15. package/dist/eval-capture.d.ts.map +1 -1
  16. package/dist/eval-gates.d.ts +84 -0
  17. package/dist/eval-gates.d.ts.map +1 -0
  18. package/dist/eval-history.d.ts +117 -0
  19. package/dist/eval-history.d.ts.map +1 -0
  20. package/dist/eval-learning.d.ts +216 -0
  21. package/dist/eval-learning.d.ts.map +1 -0
  22. package/dist/index.d.ts +44 -0
  23. package/dist/index.d.ts.map +1 -1
  24. package/dist/index.js +370 -13
  25. package/dist/plugin.js +203 -13
  26. package/dist/post-compaction-tracker.d.ts +133 -0
  27. package/dist/post-compaction-tracker.d.ts.map +1 -0
  28. package/dist/swarm-orchestrate.d.ts +23 -0
  29. package/dist/swarm-orchestrate.d.ts.map +1 -1
  30. package/dist/swarm-prompts.d.ts +25 -1
  31. package/dist/swarm-prompts.d.ts.map +1 -1
  32. package/dist/swarm.d.ts +4 -0
  33. package/dist/swarm.d.ts.map +1 -1
  34. package/evals/README.md +589 -105
  35. package/evals/compaction-prompt.eval.ts +149 -0
  36. package/evals/coordinator-behavior.eval.ts +8 -8
  37. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  38. package/evals/lib/compaction-loader.test.ts +248 -0
  39. package/evals/lib/compaction-loader.ts +320 -0
  40. package/evals/lib/data-loader.test.ts +345 -0
  41. package/evals/lib/data-loader.ts +107 -6
  42. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  43. package/evals/scorers/compaction-scorers.ts +13 -13
  44. package/evals/scorers/coordinator-discipline.evalite-test.ts +3 -2
  45. package/evals/scorers/coordinator-discipline.ts +13 -13
  46. package/examples/plugin-wrapper-template.ts +117 -0
  47. package/package.json +7 -5
  48. package/scripts/migrate-unknown-sessions.ts +349 -0
  49. package/src/compaction-capture.integration.test.ts +257 -0
  50. package/src/compaction-hook.test.ts +42 -0
  51. package/src/compaction-hook.ts +81 -0
  52. package/src/compaction-prompt-scorers.test.ts +299 -0
  53. package/src/compaction-prompt-scoring.ts +298 -0
  54. package/src/eval-capture.test.ts +422 -0
  55. package/src/eval-capture.ts +94 -2
  56. package/src/eval-gates.test.ts +306 -0
  57. package/src/eval-gates.ts +218 -0
  58. package/src/eval-history.test.ts +508 -0
  59. package/src/eval-history.ts +214 -0
  60. package/src/eval-learning.test.ts +378 -0
  61. package/src/eval-learning.ts +360 -0
  62. package/src/index.ts +61 -1
  63. package/src/post-compaction-tracker.test.ts +251 -0
  64. package/src/post-compaction-tracker.ts +237 -0
  65. package/src/swarm-decompose.ts +2 -2
  66. package/src/swarm-orchestrate.ts +2 -2
  67. package/src/swarm-prompts.ts +2 -2
  68. package/src/swarm-review.ts +3 -3
  69. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
package/bin/swarm.test.ts CHANGED
@@ -639,3 +639,478 @@ describe("Log command helpers", () => {
639
639
  });
640
640
  });
641
641
  });
642
+
643
+ // ============================================================================
644
+ // Eval Commands Tests (TDD)
645
+ // ============================================================================
646
+
647
+ describe("Eval commands", () => {
648
describe("formatEvalStatus", () => {
  // Fresh threshold fixture per call: 0.1 stabilization, 0.05 production.
  const makeThresholds = () => ({ stabilization: 0.1, production: 0.05 });

  test("displays phase, thresholds, and recent scores", () => {
    const output = formatEvalStatus({
      phase: "stabilization",
      runCount: 25,
      thresholds: makeThresholds(),
      recentScores: [
        { timestamp: "2024-12-24T10:00:00.000Z", score: 0.85 },
        { timestamp: "2024-12-24T11:00:00.000Z", score: 0.87 },
        { timestamp: "2024-12-24T12:00:00.000Z", score: 0.82 },
      ],
    });

    const expectedFragments = [
      "stabilization", // phase
      "25", // run count
      "10%", // stabilization threshold
      "5%", // production threshold
      "0.85", // recent scores
      "0.87",
      "0.82",
    ];
    for (const fragment of expectedFragments) {
      expect(output).toContain(fragment);
    }
  });

  test("shows bootstrap phase message", () => {
    const output = formatEvalStatus({
      phase: "bootstrap",
      runCount: 5,
      thresholds: makeThresholds(),
      recentScores: [],
    });

    for (const fragment of ["bootstrap", "collecting data"]) {
      expect(output).toContain(fragment);
    }
  });

  test("shows production phase message", () => {
    const output = formatEvalStatus({
      phase: "production",
      runCount: 75,
      thresholds: makeThresholds(),
      recentScores: [],
    });

    expect(output).toContain("production");
  });
});
715
+
716
describe("formatEvalHistory", () => {
  // Positional builder for one history record passed to formatEvalHistory.
  const record = (
    timestamp: string,
    eval_name: string,
    score: number,
    run_count: number,
  ) => ({ timestamp, eval_name, score, run_count });

  test("shows eval entries with timestamps and scores", () => {
    const output = formatEvalHistory([
      record("2024-12-24T10:00:00.000Z", "swarm-decomposition", 0.85, 1),
      record("2024-12-24T11:00:00.000Z", "swarm-decomposition", 0.87, 2),
      record("2024-12-24T12:00:00.000Z", "coordinator-behavior", 0.92, 1),
    ]);

    // Every eval name appears
    for (const evalName of ["swarm-decomposition", "coordinator-behavior"]) {
      expect(output).toContain(evalName);
    }

    // Every score appears
    for (const score of ["0.85", "0.87", "0.92"]) {
      expect(output).toContain(score);
    }

    // Run counters appear
    for (const runLabel of ["run #1", "run #2"]) {
      expect(output).toContain(runLabel);
    }
  });

  test("returns empty message for no history", () => {
    expect(formatEvalHistory([])).toContain("No eval history");
  });

  test("formats timestamps as readable dates", () => {
    const rawTimestamp = "2024-12-24T10:00:00.000Z";
    const output = formatEvalHistory([record(rawTimestamp, "test", 0.85, 1)]);

    // The raw ISO string must not leak through; a clock-style time must be present.
    expect(output).not.toContain(rawTimestamp);
    expect(output).toMatch(/\d{1,2}:\d{2}/); // Time format
  });
});
777
+
778
describe("generateSparkline", () => {
  // Matches any one of the eight block characters a sparkline may contain.
  const ANY_BAR = /[▁▂▃▄▅▆▇█]/;

  test("generates sparkline from scores", () => {
    const ascending = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0];
    const rendered = generateSparkline(ascending);

    // Built from sparkline characters, one per input score
    expect(rendered).toMatch(ANY_BAR);
    expect(rendered.length).toBe(ascending.length);

    // Extremes of the ascending series hit the lowest and highest bars
    expect(rendered).toContain("▁"); // Low score
    expect(rendered).toContain("█"); // High score
  });

  test("handles single score", () => {
    const rendered = generateSparkline([0.5]);
    expect(rendered.length).toBe(1);
    expect(rendered).toMatch(ANY_BAR);
  });

  test("handles all same scores", () => {
    const rendered = generateSparkline([0.5, 0.5, 0.5]);
    expect(rendered.length).toBe(3);
    // Identical scores must collapse to a single distinct character
    const distinctBars = new Set(rendered.split(""));
    expect(distinctBars.size).toBe(1);
  });

  test("returns empty for empty array", () => {
    expect(generateSparkline([])).toBe("");
  });
});
812
+
813
describe("formatEvalRunResult", () => {
  test("shows pass/fail with gate result", () => {
    const output = formatEvalRunResult({
      passed: true,
      phase: "production",
      message: "Production phase: 2.5% regression - acceptable",
      baseline: 0.85,
      currentScore: 0.83,
      regressionPercent: 0.025,
    });

    const expectedFragments = [
      "PASS",
      "production",
      "0.83", // current score
      "2.5%", // regression
    ];
    for (const fragment of expectedFragments) {
      expect(output).toContain(fragment);
    }
  });

  test("shows failure with details", () => {
    const output = formatEvalRunResult({
      passed: false,
      phase: "production",
      message: "Production phase FAIL: 8.0% regression - exceeds 5% threshold",
      baseline: 0.85,
      currentScore: 0.78,
      regressionPercent: 0.08,
    });

    for (const fragment of ["FAIL", "8.0%", "exceeds"]) {
      expect(output).toContain(fragment);
    }
  });

  test("shows bootstrap phase without baseline", () => {
    // No baseline / regressionPercent supplied for this result.
    const output = formatEvalRunResult({
      passed: true,
      phase: "bootstrap",
      message: "Bootstrap phase (5/10 runs) - collecting data",
      currentScore: 0.85,
    });

    for (const fragment of ["bootstrap", "collecting data"]) {
      expect(output).toContain(fragment);
    }
    expect(output).not.toContain("baseline");
  });
});
864
+ });
865
+
866
+ // ============================================================================
867
+ // Eval Command Helpers (Implementation)
868
+ // ============================================================================
869
+
870
/**
 * Render a series of scores as a one-line Unicode sparkline.
 *
 * Bars are scaled between the smallest and largest finite value in `scores`
 * (not against a fixed 0-1 axis), so the lowest score maps to "▁" and the
 * highest to "█". When all finite scores are equal, the mid-height bar "▅"
 * is used for each of them.
 *
 * Non-finite entries (NaN, ±Infinity) are excluded from the scale and drawn
 * as a blank space, so the result always holds exactly one character per
 * input score instead of collapsing when a single bad value is present.
 *
 * @param scores - Scores to plot, in display order.
 * @returns One character per score, or "" for an empty input.
 */
function generateSparkline(scores: number[]): string {
  const BARS = "▁▂▃▄▅▆▇█";
  const FLAT_BAR = "▅";
  const GAP = " ";

  if (scores.length === 0) return "";

  // Explicit loop instead of Math.min(...scores): spreading a very large
  // array into call arguments can exceed the engine's argument limit.
  let min = Infinity;
  let max = -Infinity;
  for (const score of scores) {
    if (!Number.isFinite(score)) continue;
    if (score < min) min = score;
    if (score > max) max = score;
  }
  const range = max - min;

  return scores
    .map((score) => {
      if (!Number.isFinite(score)) return GAP;
      if (range === 0) return FLAT_BAR;
      const normalized = (score - min) / range;
      // normalized === 1 would index one past the end, so clamp to the top bar.
      const index = Math.min(Math.floor(normalized * BARS.length), BARS.length - 1);
      return BARS.charAt(index);
    })
    .join("");
}
894
+
895
/**
 * Build the multi-line, human-readable eval status report.
 *
 * Layout: a phase banner with an emoji, the run count, both regression
 * thresholds as whole percentages, then either a sparkline plus one line per
 * recent score or a "no scores yet" notice when `recentScores` is empty.
 * Timestamps are rendered with `Date.prototype.toLocaleString`, so the exact
 * text depends on the runtime locale and time zone.
 *
 * @param status - Phase, run count, thresholds (as fractions) and recent scores.
 * @returns The report, with lines joined by "\n".
 */
function formatEvalStatus(status: {
  phase: "bootstrap" | "stabilization" | "production";
  runCount: number;
  thresholds: { stabilization: number; production: number };
  recentScores: Array<{ timestamp: string; score: number }>;
}): string {
  const { phase, runCount, thresholds, recentScores } = status;

  let phaseIcon: string;
  if (phase === "bootstrap") {
    phaseIcon = "🌱";
  } else if (phase === "stabilization") {
    phaseIcon = "⚙️";
  } else {
    phaseIcon = "🚀";
  }

  // Thresholds are stored as fractions; show them as rounded whole percents.
  const wholePercent = (fraction: number): string => (fraction * 100).toFixed(0);

  const report: string[] = [
    `${phaseIcon} Phase: ${phase}`,
    `Runs: ${runCount}`,
    "",
    "Thresholds:",
    ` Stabilization: ${wholePercent(thresholds.stabilization)}% regression warning`,
    ` Production: ${wholePercent(thresholds.production)}% regression failure`,
    "",
  ];

  if (recentScores.length === 0) {
    report.push("No scores yet - collecting data");
    return report.join("\n");
  }

  report.push("Recent scores:");
  report.push(` ${generateSparkline(recentScores.map((entry) => entry.score))}`);
  recentScores.forEach(({ timestamp, score }) => {
    const when = new Date(timestamp).toLocaleString();
    report.push(` ${when}: ${score.toFixed(2)}`);
  });

  return report.join("\n");
}
933
+
934
/**
 * Build the multi-line eval history listing, grouped by eval name.
 *
 * Groups appear in the order their eval name is first seen in `history`, and
 * entries keep their input order within a group. Each group shows a sparkline
 * over all of its scores, its last five entries (time of day via
 * `toLocaleTimeString`, run number, score to two decimals) and, when more
 * than five exist, a count of the entries not listed.
 *
 * @param history - Recorded eval runs.
 * @returns The listing joined by "\n", or "No eval history found" when empty.
 */
function formatEvalHistory(history: Array<{
  timestamp: string;
  eval_name: string;
  score: number;
  run_count: number;
}>): string {
  if (history.length === 0) {
    return "No eval history found";
  }

  const MAX_LISTED = 5;

  // Bucket records per eval name; Map iteration preserves first-seen order.
  const byEval = new Map<string, typeof history>();
  for (const record of history) {
    const bucket = byEval.get(record.eval_name);
    if (bucket === undefined) {
      byEval.set(record.eval_name, [record]);
    } else {
      bucket.push(record);
    }
  }

  const out: string[] = ["Eval History:", ""];

  byEval.forEach((records, name) => {
    out.push(`${name}:`);
    out.push(` Trend: ${generateSparkline(records.map((r) => r.score))}`);

    // Only the most recent MAX_LISTED records are itemised.
    records.slice(-MAX_LISTED).forEach((r) => {
      const clock = new Date(r.timestamp).toLocaleTimeString();
      out.push(` ${clock} - run #${r.run_count}: ${r.score.toFixed(2)}`);
    });

    const unlisted = records.length - MAX_LISTED;
    if (unlisted > 0) {
      out.push(` ... and ${unlisted} more`);
    }

    out.push("");
  });

  return out.join("\n");
}
982
+
983
/**
 * Build the multi-line summary of a single eval gate check.
 *
 * Layout: a PASS/FAIL banner, a blank line, the phase and current score, the
 * baseline and regression lines when those optional fields are present, a
 * blank line, then the gate's own message. A regression greater than zero is
 * prefixed with "+"; zero and negative values carry whatever sign `toFixed`
 * produces.
 *
 * @param result - Gate outcome; `baseline` and `regressionPercent` (a fraction) are optional.
 * @returns The summary, with lines joined by "\n".
 */
function formatEvalRunResult(result: {
  passed: boolean;
  phase: "bootstrap" | "stabilization" | "production";
  message: string;
  baseline?: number;
  currentScore: number;
  regressionPercent?: number;
}): string {
  const { passed, phase, message, baseline, currentScore, regressionPercent } = result;

  const details: string[] = [
    `Phase: ${phase}`,
    `Score: ${currentScore.toFixed(2)}`,
  ];

  if (baseline !== undefined) {
    details.push(`Baseline: ${baseline.toFixed(2)}`);
  }

  if (regressionPercent !== undefined) {
    let prefix = "";
    if (regressionPercent > 0) {
      prefix = "+";
    }
    details.push(`Regression: ${prefix}${(regressionPercent * 100).toFixed(1)}%`);
  }

  const banner = passed ? "✅ PASS" : "❌ FAIL";
  return [banner, "", ...details, "", message].join("\n");
}
1019
+
1020
+ // ============================================================================
1021
+ // Eval Run Tests
1022
+ // ============================================================================
1023
+
1024
describe("Eval Run CI Mode", () => {
  const EVAL_NAME = "test-eval";
  let testDir: string;

  // Each test gets its own scratch directory under the OS temp dir.
  beforeEach(() => {
    testDir = join(tmpdir(), `eval-run-test-${Date.now()}`);
    mkdirSync(testDir, { recursive: true });
  });

  afterEach(() => {
    if (!existsSync(testDir)) return;
    rmSync(testDir, { recursive: true, force: true });
  });

  test("writes eval results JSON file", async () => {
    // Modules are loaded lazily inside the test, as in the CLI flow being simulated.
    const { recordEvalRun, getScoreHistory } = await import("../src/eval-history.js");
    const { checkGate } = await import("../src/eval-gates.js");
    const { ensureHiveDirectory } = await import("../src/hive.js");

    const score = 0.85;

    ensureHiveDirectory(testDir);

    // Record one run on top of whatever history exists (simulating what eval run does)
    const priorRuns = getScoreHistory(testDir, EVAL_NAME);
    recordEvalRun(testDir, {
      timestamp: new Date().toISOString(),
      eval_name: EVAL_NAME,
      score,
      run_count: priorRuns.length + 1,
    });

    const gate = checkGate(testDir, EVAL_NAME, score);

    // Persist the gate result keyed by eval name (simulating CI mode)
    const resultsFile = join(testDir, ".hive", "eval-results.json");
    writeFileSync(resultsFile, JSON.stringify({ [EVAL_NAME]: gate }, null, 2));

    expect(existsSync(resultsFile)).toBe(true);

    const persisted = JSON.parse(readFileSync(resultsFile, "utf-8"));
    expect(persisted).toHaveProperty(EVAL_NAME);
    expect(persisted[EVAL_NAME]).toMatchObject({
      passed: true,
      phase: "bootstrap",
      currentScore: score,
    });
  });

  test("bootstrap phase always passes", async () => {
    const { checkGate } = await import("../src/eval-gates.js");

    // No runs recorded yet, and a deliberately low score
    const lowScore = 0.1;
    const gate = checkGate(testDir, EVAL_NAME, lowScore);

    expect(gate.passed).toBe(true);
    expect(gate.phase).toBe("bootstrap");
    expect(gate.message).toContain("Bootstrap phase");
  });

  test("production phase fails on regression", async () => {
    const { recordEvalRun } = await import("../src/eval-history.js");
    const { checkGate } = await import("../src/eval-gates.js");
    const { ensureHiveDirectory } = await import("../src/hive.js");

    ensureHiveDirectory(testDir);

    // Seed 60 runs at a steady 0.9 so the gate reports the production phase
    for (let runNumber = 1; runNumber <= 60; runNumber++) {
      recordEvalRun(testDir, {
        timestamp: new Date().toISOString(),
        eval_name: EVAL_NAME,
        score: 0.9,
        run_count: runNumber,
      });
    }

    // 0.8 against a 0.9 baseline is roughly an 11% drop (>5%)
    const gate = checkGate(testDir, EVAL_NAME, 0.8);

    expect(gate.passed).toBe(false);
    expect(gate.phase).toBe("production");
    expect(gate.message).toContain("FAIL");
  });
});