opencode-swarm-plugin 0.44.0 → 0.44.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. package/bin/swarm.serve.test.ts +6 -4
  2. package/bin/swarm.ts +16 -10
  3. package/dist/compaction-prompt-scoring.js +139 -0
  4. package/dist/eval-capture.js +12811 -0
  5. package/dist/hive.d.ts.map +1 -1
  6. package/dist/index.js +7644 -62599
  7. package/dist/plugin.js +23766 -78721
  8. package/dist/swarm-orchestrate.d.ts.map +1 -1
  9. package/dist/swarm-prompts.d.ts.map +1 -1
  10. package/dist/swarm-review.d.ts.map +1 -1
  11. package/package.json +17 -5
  12. package/.changeset/swarm-insights-data-layer.md +0 -63
  13. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
  14. package/.hive/analysis/session-data-quality-audit.md +0 -320
  15. package/.hive/eval-results.json +0 -483
  16. package/.hive/issues.jsonl +0 -138
  17. package/.hive/memories.jsonl +0 -729
  18. package/.opencode/eval-history.jsonl +0 -327
  19. package/.turbo/turbo-build.log +0 -9
  20. package/CHANGELOG.md +0 -2286
  21. package/SCORER-ANALYSIS.md +0 -598
  22. package/docs/analysis/subagent-coordination-patterns.md +0 -902
  23. package/docs/analysis-socratic-planner-pattern.md +0 -504
  24. package/docs/planning/ADR-001-monorepo-structure.md +0 -171
  25. package/docs/planning/ADR-002-package-extraction.md +0 -393
  26. package/docs/planning/ADR-003-performance-improvements.md +0 -451
  27. package/docs/planning/ADR-004-message-queue-features.md +0 -187
  28. package/docs/planning/ADR-005-devtools-observability.md +0 -202
  29. package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
  30. package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
  31. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
  32. package/docs/planning/ADR-010-cass-inhousing.md +0 -1215
  33. package/docs/planning/ROADMAP.md +0 -368
  34. package/docs/semantic-memory-cli-syntax.md +0 -123
  35. package/docs/swarm-mail-architecture.md +0 -1147
  36. package/docs/testing/context-recovery-test.md +0 -470
  37. package/evals/ARCHITECTURE.md +0 -1189
  38. package/evals/README.md +0 -768
  39. package/evals/compaction-prompt.eval.ts +0 -149
  40. package/evals/compaction-resumption.eval.ts +0 -289
  41. package/evals/coordinator-behavior.eval.ts +0 -307
  42. package/evals/coordinator-session.eval.ts +0 -154
  43. package/evals/evalite.config.ts.bak +0 -15
  44. package/evals/example.eval.ts +0 -31
  45. package/evals/fixtures/cass-baseline.ts +0 -217
  46. package/evals/fixtures/compaction-cases.ts +0 -350
  47. package/evals/fixtures/compaction-prompt-cases.ts +0 -311
  48. package/evals/fixtures/coordinator-sessions.ts +0 -328
  49. package/evals/fixtures/decomposition-cases.ts +0 -105
  50. package/evals/lib/compaction-loader.test.ts +0 -248
  51. package/evals/lib/compaction-loader.ts +0 -320
  52. package/evals/lib/data-loader.evalite-test.ts +0 -289
  53. package/evals/lib/data-loader.test.ts +0 -345
  54. package/evals/lib/data-loader.ts +0 -281
  55. package/evals/lib/llm.ts +0 -115
  56. package/evals/scorers/compaction-prompt-scorers.ts +0 -145
  57. package/evals/scorers/compaction-scorers.ts +0 -305
  58. package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
  59. package/evals/scorers/coordinator-discipline.ts +0 -325
  60. package/evals/scorers/index.test.ts +0 -146
  61. package/evals/scorers/index.ts +0 -328
  62. package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
  63. package/evals/scorers/outcome-scorers.ts +0 -349
  64. package/evals/swarm-decomposition.eval.ts +0 -121
  65. package/examples/commands/swarm.md +0 -745
  66. package/examples/plugin-wrapper-template.ts +0 -2515
  67. package/examples/skills/hive-workflow/SKILL.md +0 -212
  68. package/examples/skills/skill-creator/SKILL.md +0 -223
  69. package/examples/skills/swarm-coordination/SKILL.md +0 -292
  70. package/global-skills/cli-builder/SKILL.md +0 -344
  71. package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
  72. package/global-skills/learning-systems/SKILL.md +0 -644
  73. package/global-skills/skill-creator/LICENSE.txt +0 -202
  74. package/global-skills/skill-creator/SKILL.md +0 -352
  75. package/global-skills/skill-creator/references/output-patterns.md +0 -82
  76. package/global-skills/skill-creator/references/workflows.md +0 -28
  77. package/global-skills/swarm-coordination/SKILL.md +0 -995
  78. package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
  79. package/global-skills/swarm-coordination/references/strategies.md +0 -138
  80. package/global-skills/system-design/SKILL.md +0 -213
  81. package/global-skills/testing-patterns/SKILL.md +0 -430
  82. package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
  83. package/opencode-swarm-plugin-0.30.7.tgz +0 -0
  84. package/opencode-swarm-plugin-0.31.0.tgz +0 -0
  85. package/scripts/cleanup-test-memories.ts +0 -346
  86. package/scripts/init-skill.ts +0 -222
  87. package/scripts/migrate-unknown-sessions.ts +0 -349
  88. package/scripts/validate-skill.ts +0 -204
  89. package/src/agent-mail.ts +0 -1724
  90. package/src/anti-patterns.test.ts +0 -1167
  91. package/src/anti-patterns.ts +0 -448
  92. package/src/compaction-capture.integration.test.ts +0 -257
  93. package/src/compaction-hook.test.ts +0 -838
  94. package/src/compaction-hook.ts +0 -1204
  95. package/src/compaction-observability.integration.test.ts +0 -139
  96. package/src/compaction-observability.test.ts +0 -187
  97. package/src/compaction-observability.ts +0 -324
  98. package/src/compaction-prompt-scorers.test.ts +0 -475
  99. package/src/compaction-prompt-scoring.ts +0 -300
  100. package/src/contributor-tools.test.ts +0 -133
  101. package/src/contributor-tools.ts +0 -201
  102. package/src/dashboard.test.ts +0 -611
  103. package/src/dashboard.ts +0 -462
  104. package/src/error-enrichment.test.ts +0 -403
  105. package/src/error-enrichment.ts +0 -219
  106. package/src/eval-capture.test.ts +0 -1015
  107. package/src/eval-capture.ts +0 -929
  108. package/src/eval-gates.test.ts +0 -306
  109. package/src/eval-gates.ts +0 -218
  110. package/src/eval-history.test.ts +0 -508
  111. package/src/eval-history.ts +0 -214
  112. package/src/eval-learning.test.ts +0 -378
  113. package/src/eval-learning.ts +0 -360
  114. package/src/eval-runner.test.ts +0 -223
  115. package/src/eval-runner.ts +0 -402
  116. package/src/export-tools.test.ts +0 -476
  117. package/src/export-tools.ts +0 -257
  118. package/src/hive.integration.test.ts +0 -2241
  119. package/src/hive.ts +0 -1628
  120. package/src/index.ts +0 -940
  121. package/src/learning.integration.test.ts +0 -1815
  122. package/src/learning.ts +0 -1079
  123. package/src/logger.test.ts +0 -189
  124. package/src/logger.ts +0 -135
  125. package/src/mandate-promotion.test.ts +0 -473
  126. package/src/mandate-promotion.ts +0 -239
  127. package/src/mandate-storage.integration.test.ts +0 -601
  128. package/src/mandate-storage.test.ts +0 -578
  129. package/src/mandate-storage.ts +0 -794
  130. package/src/mandates.ts +0 -540
  131. package/src/memory-tools.test.ts +0 -195
  132. package/src/memory-tools.ts +0 -344
  133. package/src/memory.integration.test.ts +0 -334
  134. package/src/memory.test.ts +0 -158
  135. package/src/memory.ts +0 -527
  136. package/src/model-selection.test.ts +0 -188
  137. package/src/model-selection.ts +0 -68
  138. package/src/observability-tools.test.ts +0 -359
  139. package/src/observability-tools.ts +0 -871
  140. package/src/output-guardrails.test.ts +0 -438
  141. package/src/output-guardrails.ts +0 -381
  142. package/src/pattern-maturity.test.ts +0 -1160
  143. package/src/pattern-maturity.ts +0 -525
  144. package/src/planning-guardrails.test.ts +0 -491
  145. package/src/planning-guardrails.ts +0 -438
  146. package/src/plugin.ts +0 -23
  147. package/src/post-compaction-tracker.test.ts +0 -251
  148. package/src/post-compaction-tracker.ts +0 -237
  149. package/src/query-tools.test.ts +0 -636
  150. package/src/query-tools.ts +0 -324
  151. package/src/rate-limiter.integration.test.ts +0 -466
  152. package/src/rate-limiter.ts +0 -774
  153. package/src/replay-tools.test.ts +0 -496
  154. package/src/replay-tools.ts +0 -240
  155. package/src/repo-crawl.integration.test.ts +0 -441
  156. package/src/repo-crawl.ts +0 -610
  157. package/src/schemas/cell-events.test.ts +0 -347
  158. package/src/schemas/cell-events.ts +0 -807
  159. package/src/schemas/cell.ts +0 -257
  160. package/src/schemas/evaluation.ts +0 -166
  161. package/src/schemas/index.test.ts +0 -199
  162. package/src/schemas/index.ts +0 -286
  163. package/src/schemas/mandate.ts +0 -232
  164. package/src/schemas/swarm-context.ts +0 -115
  165. package/src/schemas/task.ts +0 -161
  166. package/src/schemas/worker-handoff.test.ts +0 -302
  167. package/src/schemas/worker-handoff.ts +0 -131
  168. package/src/sessions/agent-discovery.test.ts +0 -137
  169. package/src/sessions/agent-discovery.ts +0 -112
  170. package/src/sessions/index.ts +0 -15
  171. package/src/skills.integration.test.ts +0 -1192
  172. package/src/skills.test.ts +0 -643
  173. package/src/skills.ts +0 -1549
  174. package/src/storage.integration.test.ts +0 -341
  175. package/src/storage.ts +0 -884
  176. package/src/structured.integration.test.ts +0 -817
  177. package/src/structured.test.ts +0 -1046
  178. package/src/structured.ts +0 -762
  179. package/src/swarm-decompose.test.ts +0 -188
  180. package/src/swarm-decompose.ts +0 -1302
  181. package/src/swarm-deferred.integration.test.ts +0 -157
  182. package/src/swarm-deferred.test.ts +0 -38
  183. package/src/swarm-insights.test.ts +0 -214
  184. package/src/swarm-insights.ts +0 -459
  185. package/src/swarm-mail.integration.test.ts +0 -970
  186. package/src/swarm-mail.ts +0 -739
  187. package/src/swarm-orchestrate.integration.test.ts +0 -282
  188. package/src/swarm-orchestrate.test.ts +0 -548
  189. package/src/swarm-orchestrate.ts +0 -3084
  190. package/src/swarm-prompts.test.ts +0 -1270
  191. package/src/swarm-prompts.ts +0 -2077
  192. package/src/swarm-research.integration.test.ts +0 -701
  193. package/src/swarm-research.test.ts +0 -698
  194. package/src/swarm-research.ts +0 -472
  195. package/src/swarm-review.integration.test.ts +0 -285
  196. package/src/swarm-review.test.ts +0 -879
  197. package/src/swarm-review.ts +0 -709
  198. package/src/swarm-strategies.ts +0 -407
  199. package/src/swarm-worktree.test.ts +0 -501
  200. package/src/swarm-worktree.ts +0 -575
  201. package/src/swarm.integration.test.ts +0 -2377
  202. package/src/swarm.ts +0 -38
  203. package/src/tool-adapter.integration.test.ts +0 -1221
  204. package/src/tool-availability.ts +0 -461
  205. package/tsconfig.json +0 -28
@@ -1,214 +0,0 @@
1
- /**
2
- * Eval History Tracker - Progressive gates based on run history
3
- *
4
- * Tracks eval run scores over time and calculates the current phase:
5
- * - Bootstrap (<10 runs): No gates, just collect data
6
- * - Stabilization (10-50 runs): Warn on >10% regression
7
- * - Production (>50 runs + variance <0.1): Fail on >5% regression
8
- *
9
- * @module eval-history
10
- */
11
- import * as fs from "node:fs";
12
- import * as path from "node:path";
13
-
14
/**
 * Maturity stage of an eval's quality gate.
 *
 * The stage is derived from how many runs have been recorded for the eval
 * and how much their scores vary.
 */
export type Phase =
  | "bootstrap"
  | "stabilization"
  | "production";
18
-
19
/**
 * One recorded eval run.
 */
export interface EvalRunRecord {
  /** When the run happened, as an ISO-8601 string */
  timestamp: string;
  /** Identifier of the eval, e.g. "swarm-decomposition" */
  eval_name: string;
  /** Score achieved by the run (typically within the 0-1 range) */
  score: number;
  /** Per-eval run counter, expected to increase monotonically */
  run_count: number;
}
32
-
33
/** Location of the eval history file, relative to the project root */
export const DEFAULT_EVAL_HISTORY_PATH = ".opencode/eval-history.jsonl";

/** Score variance must stay below this value for an eval to reach the production phase */
export const VARIANCE_THRESHOLD = 0.1;

/** With fewer recorded runs than this, an eval is still in the bootstrap phase */
export const BOOTSTRAP_THRESHOLD = 10;

/** Up to and including this many recorded runs, an eval remains in stabilization */
export const STABILIZATION_THRESHOLD = 50;
48
-
49
/**
 * Resolve where the eval history file lives for a given project.
 *
 * @param projectPath - Project root directory
 * @returns `projectPath` joined with `DEFAULT_EVAL_HISTORY_PATH`
 */
export function getEvalHistoryPath(projectPath: string): string {
  const historyFile = path.join(projectPath, DEFAULT_EVAL_HISTORY_PATH);
  return historyFile;
}
55
-
56
/**
 * Make sure the directory that holds the eval history file exists.
 *
 * Creates the directory (and any missing parents) when it is absent and does
 * nothing when it is already there, so the call is safe to repeat.
 *
 * @param projectPath - Project root directory
 */
export function ensureEvalHistoryDir(projectPath: string): void {
  const historyDir = path.dirname(getEvalHistoryPath(projectPath));
  // With `recursive: true`, mkdirSync does not fail for an existing directory,
  // so no existsSync pre-check is needed. Dropping the check also removes the
  // window in which another process could create the directory between the
  // check and the mkdir call.
  fs.mkdirSync(historyDir, { recursive: true });
}
66
-
67
/**
 * Record an eval run in the JSONL history file.
 *
 * Serializes `run` as one JSON object followed by a newline and appends it to
 * the history file returned by `getEvalHistoryPath(projectPath)`. The
 * containing directory is created first if it does not exist.
 *
 * **Concurrency**: the append is a single synchronous `appendFileSync` call.
 * Node.js does not document an atomicity guarantee for appends made by several
 * processes at once, so treat concurrent writers as best-effort rather than
 * guaranteed-safe.
 *
 * @param projectPath - Project root directory
 * @param run - Eval run record with timestamp, eval_name, score, run_count
 *
 * @example
 * ```typescript
 * import { recordEvalRun } from "./eval-history.js";
 *
 * recordEvalRun("/path/to/project", {
 *   timestamp: new Date().toISOString(),
 *   eval_name: "swarm-decomposition",
 *   score: 0.92,
 *   run_count: 15,
 * });
 * ```
 */
export function recordEvalRun(
  projectPath: string,
  run: EvalRunRecord,
): void {
  ensureEvalHistoryDir(projectPath);
  const historyPath = getEvalHistoryPath(projectPath);
  // One record per line; the trailing newline keeps the file valid JSONL.
  const line = `${JSON.stringify(run)}\n`;
  fs.appendFileSync(historyPath, line, "utf-8");
}
104
-
105
/**
 * Type guard: does a parsed JSONL value carry every field of an eval run record?
 */
function isEvalRunRecord(value: unknown): value is EvalRunRecord {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.timestamp === "string" &&
    typeof candidate.eval_name === "string" &&
    typeof candidate.score === "number" &&
    typeof candidate.run_count === "number"
  );
}

/**
 * Read all eval run records from the JSONL history file.
 *
 * Internal helper for parsing the history file. Lines are returned in file
 * order. Blank lines, lines that are not valid JSON (for example a partially
 * written trailing line) and values that do not have the record fields are
 * skipped, so one damaged line cannot break every reader of the history.
 *
 * @param projectPath - Project root directory
 * @returns The valid records found, or an empty array when the file is missing
 */
function readAllRecords(projectPath: string): EvalRunRecord[] {
  const historyPath = getEvalHistoryPath(projectPath);

  if (!fs.existsSync(historyPath)) {
    return [];
  }

  const content = fs.readFileSync(historyPath, "utf-8");
  const records: EvalRunRecord[] = [];

  for (const line of content.split("\n")) {
    if (line.trim() === "") {
      continue;
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Corrupt or truncated line: ignore it and keep the rest of the history.
      continue;
    }

    if (isEvalRunRecord(parsed)) {
      records.push(parsed);
    }
  }

  return records;
}
122
-
123
/**
 * Collect the recorded runs that belong to one eval.
 *
 * Records come back in the order they appear in the history file (append
 * order, i.e. oldest first when runs are recorded as they happen).
 *
 * @param projectPath - Project root directory
 * @param evalName - Name of the eval whose runs should be returned
 * @returns The matching records; empty when the eval has no recorded runs
 */
export function getScoreHistory(
  projectPath: string,
  evalName: string,
): EvalRunRecord[] {
  const matching: EvalRunRecord[] = [];
  for (const record of readAllRecords(projectPath)) {
    if (record.eval_name === evalName) {
      matching.push(record);
    }
  }
  return matching;
}
136
-
137
/**
 * Compute the population variance of a list of scores.
 *
 * This is the average squared distance from the mean: Σ((x - μ)²) / n.
 * Lists with zero or one element have no spread and yield 0.
 *
 * @param scores - Scores to analyse
 * @returns The population variance of `scores`
 */
export function calculateVariance(scores: number[]): number {
  const count = scores.length;
  if (count < 2) {
    return 0;
  }

  // First pass: mean of all scores.
  let total = 0;
  scores.forEach((score) => {
    total += score;
  });
  const mean = total / count;

  // Second pass: accumulate squared deviations from that mean.
  let squaredDeviationSum = 0;
  scores.forEach((score) => {
    const delta = score - mean;
    squaredDeviationSum += delta * delta;
  });

  return squaredDeviationSum / count;
}
157
-
158
/**
 * Determine which gate phase an eval is in, from its recorded run history.
 *
 * The phase follows the amount and stability of the collected data:
 *
 * - **Bootstrap**: fewer than `BOOTSTRAP_THRESHOLD` (10) runs
 * - **Stabilization**: 10 to `STABILIZATION_THRESHOLD` (50) runs, or more than
 *   50 runs whose score variance is at or above `VARIANCE_THRESHOLD` (0.1)
 * - **Production**: more than 50 runs and score variance below 0.1
 *
 * The variance is calculated over every recorded score of the eval, so an
 * eval whose scores still swing widely stays in stabilization regardless of
 * how many runs it has.
 *
 * @param projectPath - Project root directory (contains `.opencode/eval-history.jsonl`)
 * @param evalName - Name of the eval (e.g., "swarm-decomposition")
 * @returns Current phase: "bootstrap" | "stabilization" | "production"
 *
 * @example
 * ```typescript
 * import { getPhase } from "./eval-history.js";
 *
 * const phase = getPhase("/path/to/project", "swarm-decomposition");
 * if (phase === "production") {
 *   console.log("strict gates enabled");
 * }
 * ```
 */
export function getPhase(projectPath: string, evalName: string): Phase {
  const runs = getScoreHistory(projectPath, evalName);
  const runCount = runs.length;

  if (runCount < BOOTSTRAP_THRESHOLD) {
    return "bootstrap";
  }
  if (runCount <= STABILIZATION_THRESHOLD) {
    return "stabilization";
  }

  // Enough runs for production; it is granted only when scores are stable.
  const variance = calculateVariance(runs.map((run) => run.score));
  return variance < VARIANCE_THRESHOLD ? "production" : "stabilization";
}
@@ -1,378 +0,0 @@
1
- /**
2
- * Tests for eval-learning.ts - Eval-to-Learning Feedback Loop
3
- *
4
- * TDD RED phase: Write failing tests first, then implement.
5
- *
6
- * Core behavior:
7
- * - Detect significant eval score drops (>15% from rolling average)
8
- * - Store failure context to semantic-memory with structured tags
9
- * - Ignore minor fluctuations (<15% drop from the rolling average)
10
- * - Configurable threshold for sensitivity tuning
11
- */
12
- import { describe, test, expect, beforeEach, mock } from "bun:test";
13
- import {
14
- learnFromEvalFailure,
15
- type EvalLearningConfig,
16
- calculateRollingAverage,
17
- isSignificantDrop,
18
- formatFailureContext,
19
- createLearningConfig,
20
- DEFAULT_EVAL_LEARNING_CONFIG,
21
- } from "./eval-learning";
22
- import type { EvalRunRecord } from "./eval-history";
23
- import type { MemoryAdapter } from "./memory-tools";
24
-
25
- // ============================================================================
26
- // Mock Memory Adapter
27
- // ============================================================================
28
-
29
/**
 * Build an in-memory stand-in for the memory adapter used by these tests.
 *
 * `store()` appends its argument to a local array instead of touching real
 * storage; that array is exposed through the extra `getStoredMemories()`
 * accessor. Every other method is a mock resolving to a fixed empty/success
 * value.
 */
function createMockMemoryAdapter(): MemoryAdapter {
  type StoredMemory = {
    information: string;
    tags?: string;
    metadata?: string;
  };
  const captured: StoredMemory[] = [];

  const adapter = {
    store: mock(async (args) => {
      captured.push(args);
      return { id: `mem_${Date.now()}`, message: "Stored successfully" };
    }),
    find: mock(async () => ({ results: [], total: 0 })),
    get: mock(async () => null),
    remove: mock(async () => ({ success: true, message: "Removed" })),
    validate: mock(async () => ({ success: true, message: "Validated" })),
    list: mock(async () => []),
    stats: mock(async () => ({
      total_memories: 0,
      total_embeddings: 0,
      collections: {},
    })),
    checkHealth: mock(async () => ({ ready: true, message: "OK" })),
    // Test-only accessor; not part of the MemoryAdapter interface.
    getStoredMemories: () => captured,
  };

  return adapter as any;
}
63
-
64
- // ============================================================================
65
- // Tests: Rolling Average Calculation
66
- // ============================================================================
67
-
68
describe("calculateRollingAverage", () => {
  // Shorthand for a history entry of the "test" eval.
  const run = (
    score: number,
    timestamp: string,
    run_count: number,
  ): EvalRunRecord => ({ eval_name: "test", score, timestamp, run_count });

  test("returns 0 for empty history", () => {
    expect(calculateRollingAverage([])).toBe(0);
  });

  test("returns single score for history of 1", () => {
    const history = [run(0.85, "2024-12-01T00:00:00Z", 1)];

    expect(calculateRollingAverage(history)).toBe(0.85);
  });

  test("calculates average of last N runs (default 5)", () => {
    const history = [
      run(0.8, "2024-12-01", 1),
      run(0.82, "2024-12-02", 2),
      run(0.84, "2024-12-03", 3),
      run(0.86, "2024-12-04", 4),
      run(0.88, "2024-12-05", 5),
      run(0.9, "2024-12-06", 6),
    ];

    // The default window covers the five newest scores (0.82 .. 0.9), mean 0.86.
    expect(calculateRollingAverage(history)).toBeCloseTo(0.86, 2);
  });

  test("uses custom window size", () => {
    const history = [
      run(0.8, "2024-12-01", 1),
      run(0.85, "2024-12-02", 2),
      run(0.9, "2024-12-03", 3),
    ];

    // Window of 2 -> (0.85 + 0.9) / 2 = 0.875
    expect(calculateRollingAverage(history, 2)).toBeCloseTo(0.875, 3);
  });

  test("handles window larger than history", () => {
    const history = [run(0.8, "2024-12-01", 1), run(0.9, "2024-12-02", 2)];

    // Only two runs exist, so both are averaged: 0.85
    expect(calculateRollingAverage(history, 10)).toBeCloseTo(0.85, 2);
  });
});
126
-
127
- // ============================================================================
128
- // Tests: Significant Drop Detection
129
- // ============================================================================
130
-
131
describe("isSignificantDrop", () => {
  // Every case except the zero-baseline one compares against this baseline.
  const baseline = 0.85;

  test("returns false when current equals baseline", () => {
    expect(isSignificantDrop(baseline, baseline)).toBe(false);
  });

  test("returns false when current is higher than baseline", () => {
    expect(isSignificantDrop(0.9, baseline)).toBe(false);
  });

  test("returns false for drop below threshold (default 15%)", () => {
    // 0.765 is 90% of the baseline, i.e. a 10% drop.
    expect(isSignificantDrop(0.765, baseline)).toBe(false);
  });

  test("returns true for drop at threshold (15%)", () => {
    // An exact 15% drop would be 0.7225; 0.722 sits just below it so that
    // floating point rounding cannot flip the result.
    expect(isSignificantDrop(0.722, baseline)).toBe(true);
  });

  test("returns true for drop above threshold (20%)", () => {
    // 0.68 is 80% of the baseline, i.e. a 20% drop.
    expect(isSignificantDrop(0.68, baseline)).toBe(true);
  });

  test("uses custom threshold", () => {
    // 0.782 is 92% of the baseline, i.e. an 8% drop.
    const current = 0.782;

    // Not significant under the default 15% threshold...
    expect(isSignificantDrop(current, baseline)).toBe(false);
    // ...but significant once the threshold is lowered to 5%.
    expect(isSignificantDrop(current, baseline, 0.05)).toBe(true);
  });

  test("returns false when baseline is 0 (avoid division by zero)", () => {
    for (const current of [0, 0.5]) {
      expect(isSignificantDrop(current, 0)).toBe(false);
    }
  });
});
170
-
171
- // ============================================================================
172
- // Tests: Failure Context Formatting
173
- // ============================================================================
174
-
175
describe("formatFailureContext", () => {
  test("includes eval name, scores, and drop percentage", () => {
    const text = formatFailureContext("compaction-test", 0.68, 0.85);

    // (0.85 - 0.68) / 0.85 = 20%
    for (const fragment of ["compaction-test", "0.68", "0.85", "20.0%"]) {
      expect(text).toContain(fragment);
    }
  });

  test("includes optional scorer context", () => {
    const scorerContext = "violationCount scorer failed: 5 violations detected";

    const text = formatFailureContext("coordinator-behavior", 0.5, 0.8, scorerContext);

    expect(text).toContain("coordinator-behavior");
    expect(text).toContain(scorerContext);
  });

  test("handles baseline of 0 gracefully", () => {
    const text = formatFailureContext("test", 0.5, 0);

    // A zero baseline must not leak a division artefact into the message.
    expect(text).not.toContain("NaN");
    expect(text).not.toContain("Infinity");
  });
});
204
-
205
- // ============================================================================
206
- // Tests: Main learnFromEvalFailure Function
207
- // ============================================================================
208
-
209
describe("learnFromEvalFailure", () => {
  let mockAdapter: MemoryAdapter;

  // Shorthand for a history entry of the "test" eval.
  const run = (
    score: number,
    timestamp: string,
    run_count: number,
  ): EvalRunRecord => ({ eval_name: "test", score, timestamp, run_count });

  // History consisting of one run that scored 0.9.
  const singleRunHistory = (): EvalRunRecord[] => [run(0.9, "2024-12-01", 1)];

  // First memory captured by the mock adapter's store().
  const firstStoredMemory = () => (mockAdapter as any).getStoredMemories()[0];

  beforeEach(() => {
    mockAdapter = createMockMemoryAdapter();
  });

  test("stores memory when score drops significantly", async () => {
    const history = [
      run(0.85, "2024-12-01", 1),
      run(0.84, "2024-12-02", 2),
      run(0.86, "2024-12-03", 3),
      run(0.85, "2024-12-04", 4),
      run(0.84, "2024-12-05", 5),
    ];
    // Roughly 20% below the rolling average of the history above.
    const currentScore = 0.68;

    const result = await learnFromEvalFailure(
      "test-eval",
      currentScore,
      history,
      mockAdapter,
    );

    expect(result.triggered).toBe(true);
    expect(result.baseline).toBeCloseTo(0.848, 2);
    expect(result.drop_percentage).toBeCloseTo(0.198, 2);

    // Exactly one memory must have been written.
    expect(mockAdapter.store).toHaveBeenCalledTimes(1);

    const stored = firstStoredMemory();
    expect(stored.information).toContain("test-eval");
    expect(stored.information).toContain("0.68");
    expect(stored.tags).toContain("eval-failure");
    expect(stored.tags).toContain("test-eval");
  });

  test("does not store memory for minor fluctuations", async () => {
    const history = [run(0.85, "2024-12-01", 1), run(0.84, "2024-12-02", 2)];
    // About 5% lower: well inside the default 15% tolerance.
    const currentScore = 0.8;

    const result = await learnFromEvalFailure(
      "test-eval",
      currentScore,
      history,
      mockAdapter,
    );

    expect(result.triggered).toBe(false);
    expect(mockAdapter.store).not.toHaveBeenCalled();
  });

  test("includes scorer context in memory if provided", async () => {
    const history = singleRunHistory();
    // About 22% below the single recorded score.
    const currentScore = 0.7;
    const scorerContext = "violationCount: 8 protocol violations";

    await learnFromEvalFailure(
      "coordinator-behavior",
      currentScore,
      history,
      mockAdapter,
      { scorerContext },
    );

    expect(firstStoredMemory().information).toContain(scorerContext);
  });

  test("uses custom threshold when provided", async () => {
    const history = singleRunHistory();
    // About 5.5% lower: only significant with the tightened threshold below.
    const currentScore = 0.85;
    const customConfig: EvalLearningConfig = {
      ...DEFAULT_EVAL_LEARNING_CONFIG,
      dropThreshold: 0.05,
    };

    const result = await learnFromEvalFailure(
      "test-eval",
      currentScore,
      history,
      mockAdapter,
      { config: customConfig },
    );

    expect(result.triggered).toBe(true);
    expect(mockAdapter.store).toHaveBeenCalledTimes(1);
  });

  test("handles empty history gracefully", async () => {
    const result = await learnFromEvalFailure("test-eval", 0.5, [], mockAdapter);

    expect(result.triggered).toBe(false);
    expect(result.baseline).toBe(0);
    expect(mockAdapter.store).not.toHaveBeenCalled();
  });

  test("generates structured tags for semantic search", async () => {
    const history = singleRunHistory();
    const currentScore = 0.7;

    await learnFromEvalFailure(
      "compaction-test",
      currentScore,
      history,
      mockAdapter,
    );

    const tags = firstStoredMemory().tags;
    for (const expectedTag of ["eval-failure", "compaction-test", "regression"]) {
      expect(tags).toContain(expectedTag);
    }
  });

  test("stores metadata for future prompt generation", async () => {
    const history = singleRunHistory();
    const currentScore = 0.7;

    await learnFromEvalFailure("test-eval", currentScore, history, mockAdapter);

    const stored = firstStoredMemory();
    expect(stored.metadata).toBeDefined();

    // Metadata is stored as a JSON string.
    const metadata = JSON.parse(stored.metadata!);
    expect(metadata.eval_name).toBe("test-eval");
    expect(metadata.baseline_score).toBeCloseTo(0.9, 2);
    expect(metadata.current_score).toBe(0.7);
    expect(metadata.drop_percentage).toBeCloseTo(0.222, 2);
  });
});
359
-
360
- // ============================================================================
361
- // Tests: Convenience Helpers
362
- // ============================================================================
363
-
364
describe("createLearningConfig", () => {
  test("creates config with custom threshold", () => {
    const { dropThreshold, windowSize } = createLearningConfig(0.1);

    expect(dropThreshold).toBe(0.1);
    // With no window size supplied, the default one is used.
    expect(windowSize).toBe(DEFAULT_EVAL_LEARNING_CONFIG.windowSize);
  });

  test("accepts custom window size", () => {
    const { dropThreshold, windowSize } = createLearningConfig(0.2, 10);

    expect(dropThreshold).toBe(0.2);
    expect(windowSize).toBe(10);
  });
});