opencode-swarm-plugin 0.43.0 → 0.44.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208) hide show
  1. package/bin/cass.characterization.test.ts +422 -0
  2. package/bin/swarm.serve.test.ts +6 -4
  3. package/bin/swarm.test.ts +68 -0
  4. package/bin/swarm.ts +81 -8
  5. package/dist/compaction-prompt-scoring.js +139 -0
  6. package/dist/contributor-tools.d.ts +42 -0
  7. package/dist/contributor-tools.d.ts.map +1 -0
  8. package/dist/eval-capture.js +12811 -0
  9. package/dist/hive.d.ts.map +1 -1
  10. package/dist/index.d.ts +12 -0
  11. package/dist/index.d.ts.map +1 -1
  12. package/dist/index.js +7728 -62590
  13. package/dist/plugin.js +23833 -78695
  14. package/dist/sessions/agent-discovery.d.ts +59 -0
  15. package/dist/sessions/agent-discovery.d.ts.map +1 -0
  16. package/dist/sessions/index.d.ts +10 -0
  17. package/dist/sessions/index.d.ts.map +1 -0
  18. package/dist/swarm-orchestrate.d.ts.map +1 -1
  19. package/dist/swarm-prompts.d.ts.map +1 -1
  20. package/dist/swarm-review.d.ts.map +1 -1
  21. package/package.json +17 -5
  22. package/.changeset/swarm-insights-data-layer.md +0 -63
  23. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
  24. package/.hive/analysis/session-data-quality-audit.md +0 -320
  25. package/.hive/eval-results.json +0 -483
  26. package/.hive/issues.jsonl +0 -138
  27. package/.hive/memories.jsonl +0 -729
  28. package/.opencode/eval-history.jsonl +0 -327
  29. package/.turbo/turbo-build.log +0 -9
  30. package/CHANGELOG.md +0 -2255
  31. package/SCORER-ANALYSIS.md +0 -598
  32. package/docs/analysis/subagent-coordination-patterns.md +0 -902
  33. package/docs/analysis-socratic-planner-pattern.md +0 -504
  34. package/docs/planning/ADR-001-monorepo-structure.md +0 -171
  35. package/docs/planning/ADR-002-package-extraction.md +0 -393
  36. package/docs/planning/ADR-003-performance-improvements.md +0 -451
  37. package/docs/planning/ADR-004-message-queue-features.md +0 -187
  38. package/docs/planning/ADR-005-devtools-observability.md +0 -202
  39. package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
  40. package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
  41. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
  42. package/docs/planning/ROADMAP.md +0 -368
  43. package/docs/semantic-memory-cli-syntax.md +0 -123
  44. package/docs/swarm-mail-architecture.md +0 -1147
  45. package/docs/testing/context-recovery-test.md +0 -470
  46. package/evals/ARCHITECTURE.md +0 -1189
  47. package/evals/README.md +0 -768
  48. package/evals/compaction-prompt.eval.ts +0 -149
  49. package/evals/compaction-resumption.eval.ts +0 -289
  50. package/evals/coordinator-behavior.eval.ts +0 -307
  51. package/evals/coordinator-session.eval.ts +0 -154
  52. package/evals/evalite.config.ts.bak +0 -15
  53. package/evals/example.eval.ts +0 -31
  54. package/evals/fixtures/compaction-cases.ts +0 -350
  55. package/evals/fixtures/compaction-prompt-cases.ts +0 -311
  56. package/evals/fixtures/coordinator-sessions.ts +0 -328
  57. package/evals/fixtures/decomposition-cases.ts +0 -105
  58. package/evals/lib/compaction-loader.test.ts +0 -248
  59. package/evals/lib/compaction-loader.ts +0 -320
  60. package/evals/lib/data-loader.evalite-test.ts +0 -289
  61. package/evals/lib/data-loader.test.ts +0 -345
  62. package/evals/lib/data-loader.ts +0 -281
  63. package/evals/lib/llm.ts +0 -115
  64. package/evals/scorers/compaction-prompt-scorers.ts +0 -145
  65. package/evals/scorers/compaction-scorers.ts +0 -305
  66. package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
  67. package/evals/scorers/coordinator-discipline.ts +0 -325
  68. package/evals/scorers/index.test.ts +0 -146
  69. package/evals/scorers/index.ts +0 -328
  70. package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
  71. package/evals/scorers/outcome-scorers.ts +0 -349
  72. package/evals/swarm-decomposition.eval.ts +0 -121
  73. package/examples/commands/swarm.md +0 -745
  74. package/examples/plugin-wrapper-template.ts +0 -2426
  75. package/examples/skills/hive-workflow/SKILL.md +0 -212
  76. package/examples/skills/skill-creator/SKILL.md +0 -223
  77. package/examples/skills/swarm-coordination/SKILL.md +0 -292
  78. package/global-skills/cli-builder/SKILL.md +0 -344
  79. package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
  80. package/global-skills/learning-systems/SKILL.md +0 -644
  81. package/global-skills/skill-creator/LICENSE.txt +0 -202
  82. package/global-skills/skill-creator/SKILL.md +0 -352
  83. package/global-skills/skill-creator/references/output-patterns.md +0 -82
  84. package/global-skills/skill-creator/references/workflows.md +0 -28
  85. package/global-skills/swarm-coordination/SKILL.md +0 -995
  86. package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
  87. package/global-skills/swarm-coordination/references/strategies.md +0 -138
  88. package/global-skills/system-design/SKILL.md +0 -213
  89. package/global-skills/testing-patterns/SKILL.md +0 -430
  90. package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
  91. package/opencode-swarm-plugin-0.30.7.tgz +0 -0
  92. package/opencode-swarm-plugin-0.31.0.tgz +0 -0
  93. package/scripts/cleanup-test-memories.ts +0 -346
  94. package/scripts/init-skill.ts +0 -222
  95. package/scripts/migrate-unknown-sessions.ts +0 -349
  96. package/scripts/validate-skill.ts +0 -204
  97. package/src/agent-mail.ts +0 -1724
  98. package/src/anti-patterns.test.ts +0 -1167
  99. package/src/anti-patterns.ts +0 -448
  100. package/src/compaction-capture.integration.test.ts +0 -257
  101. package/src/compaction-hook.test.ts +0 -838
  102. package/src/compaction-hook.ts +0 -1204
  103. package/src/compaction-observability.integration.test.ts +0 -139
  104. package/src/compaction-observability.test.ts +0 -187
  105. package/src/compaction-observability.ts +0 -324
  106. package/src/compaction-prompt-scorers.test.ts +0 -475
  107. package/src/compaction-prompt-scoring.ts +0 -300
  108. package/src/dashboard.test.ts +0 -611
  109. package/src/dashboard.ts +0 -462
  110. package/src/error-enrichment.test.ts +0 -403
  111. package/src/error-enrichment.ts +0 -219
  112. package/src/eval-capture.test.ts +0 -1015
  113. package/src/eval-capture.ts +0 -929
  114. package/src/eval-gates.test.ts +0 -306
  115. package/src/eval-gates.ts +0 -218
  116. package/src/eval-history.test.ts +0 -508
  117. package/src/eval-history.ts +0 -214
  118. package/src/eval-learning.test.ts +0 -378
  119. package/src/eval-learning.ts +0 -360
  120. package/src/eval-runner.test.ts +0 -223
  121. package/src/eval-runner.ts +0 -402
  122. package/src/export-tools.test.ts +0 -476
  123. package/src/export-tools.ts +0 -257
  124. package/src/hive.integration.test.ts +0 -2241
  125. package/src/hive.ts +0 -1628
  126. package/src/index.ts +0 -935
  127. package/src/learning.integration.test.ts +0 -1815
  128. package/src/learning.ts +0 -1079
  129. package/src/logger.test.ts +0 -189
  130. package/src/logger.ts +0 -135
  131. package/src/mandate-promotion.test.ts +0 -473
  132. package/src/mandate-promotion.ts +0 -239
  133. package/src/mandate-storage.integration.test.ts +0 -601
  134. package/src/mandate-storage.test.ts +0 -578
  135. package/src/mandate-storage.ts +0 -794
  136. package/src/mandates.ts +0 -540
  137. package/src/memory-tools.test.ts +0 -195
  138. package/src/memory-tools.ts +0 -344
  139. package/src/memory.integration.test.ts +0 -334
  140. package/src/memory.test.ts +0 -158
  141. package/src/memory.ts +0 -527
  142. package/src/model-selection.test.ts +0 -188
  143. package/src/model-selection.ts +0 -68
  144. package/src/observability-tools.test.ts +0 -359
  145. package/src/observability-tools.ts +0 -871
  146. package/src/output-guardrails.test.ts +0 -438
  147. package/src/output-guardrails.ts +0 -381
  148. package/src/pattern-maturity.test.ts +0 -1160
  149. package/src/pattern-maturity.ts +0 -525
  150. package/src/planning-guardrails.test.ts +0 -491
  151. package/src/planning-guardrails.ts +0 -438
  152. package/src/plugin.ts +0 -23
  153. package/src/post-compaction-tracker.test.ts +0 -251
  154. package/src/post-compaction-tracker.ts +0 -237
  155. package/src/query-tools.test.ts +0 -636
  156. package/src/query-tools.ts +0 -324
  157. package/src/rate-limiter.integration.test.ts +0 -466
  158. package/src/rate-limiter.ts +0 -774
  159. package/src/replay-tools.test.ts +0 -496
  160. package/src/replay-tools.ts +0 -240
  161. package/src/repo-crawl.integration.test.ts +0 -441
  162. package/src/repo-crawl.ts +0 -610
  163. package/src/schemas/cell-events.test.ts +0 -347
  164. package/src/schemas/cell-events.ts +0 -807
  165. package/src/schemas/cell.ts +0 -257
  166. package/src/schemas/evaluation.ts +0 -166
  167. package/src/schemas/index.test.ts +0 -199
  168. package/src/schemas/index.ts +0 -286
  169. package/src/schemas/mandate.ts +0 -232
  170. package/src/schemas/swarm-context.ts +0 -115
  171. package/src/schemas/task.ts +0 -161
  172. package/src/schemas/worker-handoff.test.ts +0 -302
  173. package/src/schemas/worker-handoff.ts +0 -131
  174. package/src/skills.integration.test.ts +0 -1192
  175. package/src/skills.test.ts +0 -643
  176. package/src/skills.ts +0 -1549
  177. package/src/storage.integration.test.ts +0 -341
  178. package/src/storage.ts +0 -884
  179. package/src/structured.integration.test.ts +0 -817
  180. package/src/structured.test.ts +0 -1046
  181. package/src/structured.ts +0 -762
  182. package/src/swarm-decompose.test.ts +0 -188
  183. package/src/swarm-decompose.ts +0 -1302
  184. package/src/swarm-deferred.integration.test.ts +0 -157
  185. package/src/swarm-deferred.test.ts +0 -38
  186. package/src/swarm-insights.test.ts +0 -214
  187. package/src/swarm-insights.ts +0 -459
  188. package/src/swarm-mail.integration.test.ts +0 -970
  189. package/src/swarm-mail.ts +0 -739
  190. package/src/swarm-orchestrate.integration.test.ts +0 -282
  191. package/src/swarm-orchestrate.test.ts +0 -548
  192. package/src/swarm-orchestrate.ts +0 -3084
  193. package/src/swarm-prompts.test.ts +0 -1270
  194. package/src/swarm-prompts.ts +0 -2077
  195. package/src/swarm-research.integration.test.ts +0 -701
  196. package/src/swarm-research.test.ts +0 -698
  197. package/src/swarm-research.ts +0 -472
  198. package/src/swarm-review.integration.test.ts +0 -285
  199. package/src/swarm-review.test.ts +0 -879
  200. package/src/swarm-review.ts +0 -709
  201. package/src/swarm-strategies.ts +0 -407
  202. package/src/swarm-worktree.test.ts +0 -501
  203. package/src/swarm-worktree.ts +0 -575
  204. package/src/swarm.integration.test.ts +0 -2377
  205. package/src/swarm.ts +0 -38
  206. package/src/tool-adapter.integration.test.ts +0 -1221
  207. package/src/tool-availability.ts +0 -461
  208. package/tsconfig.json +0 -28
@@ -1,214 +0,0 @@
1
- /**
2
- * Eval History Tracker - Progressive gates based on run history
3
- *
4
- * Tracks eval run scores over time and calculates the current phase:
5
- * - Bootstrap (<10 runs): No gates, just collect data
6
- * - Stabilization (10-50 runs): Warn on >10% regression
7
- * - Production (>50 runs + variance <0.1): Fail on >5% regression
8
- *
9
- * @module eval-history
10
- */
11
- import * as fs from "node:fs";
12
- import * as path from "node:path";
13
-
14
/**
 * Gate phase an eval is currently in.
 *
 * The phase is derived from how many runs have been recorded for the eval and
 * from the variance of their scores.
 */
export type Phase =
  | "bootstrap"
  | "stabilization"
  | "production";
18
-
19
/**
 * One recorded eval run.
 *
 * Each record is serialized as a single JSON line in the eval history file.
 */
export interface EvalRunRecord {
  /** When the run happened, as an ISO-8601 string. */
  timestamp: string;
  /** Identifier of the eval, e.g. "swarm-decomposition". */
  eval_name: string;
  /** Score of the run (typically within the 0-1 range). */
  score: number;
  /** Per-eval run counter; expected to increase monotonically. */
  run_count: number;
}
32
-
33
/** Location of the JSONL eval history file, relative to the project root. */
export const DEFAULT_EVAL_HISTORY_PATH = ".opencode/eval-history.jsonl";

/** Score variance must stay below this value for an eval to reach the production phase. */
export const VARIANCE_THRESHOLD = 0.1;

/** Evals with fewer recorded runs than this are in the bootstrap phase. */
export const BOOTSTRAP_THRESHOLD = 10;

/** Evals need more recorded runs than this before they can leave stabilization. */
export const STABILIZATION_THRESHOLD = 50;
48
-
49
/**
 * Resolve the location of the eval history file for a project.
 *
 * @param projectPath - Project root directory
 * @returns `projectPath` joined with `DEFAULT_EVAL_HISTORY_PATH`
 */
export function getEvalHistoryPath(projectPath: string): string {
  const historyFile = path.join(projectPath, DEFAULT_EVAL_HISTORY_PATH);
  return historyFile;
}
55
-
56
/**
 * Make sure the directory that holds the eval history file exists.
 *
 * @param projectPath - Project root directory
 */
export function ensureEvalHistoryDir(projectPath: string): void {
  const dir = path.dirname(getEvalHistoryPath(projectPath));
  // With `recursive: true`, mkdirSync succeeds when the directory is already
  // present, so a separate existsSync check is redundant and would only add a
  // window between the check and the creation.
  fs.mkdirSync(dir, { recursive: true });
}
66
-
67
/**
 * Append one eval run to the project's JSONL history file.
 *
 * The record is serialized with `JSON.stringify` and written as a single
 * newline-terminated line through `fs.appendFileSync`. The directory holding
 * the history file is created first when it is missing.
 *
 * NOTE(review): the record is written with one synchronous append call; whether
 * appends from several processes can interleave depends on the OS and
 * filesystem — confirm before relying on concurrent writers.
 *
 * @param projectPath - Project root directory
 * @param run - Record to append (timestamp, eval_name, score, run_count)
 *
 * @example
 * ```typescript
 * recordEvalRun("/path/to/project", {
 *   timestamp: new Date().toISOString(),
 *   eval_name: "swarm-decomposition",
 *   score: 0.92,
 *   run_count: 15,
 * });
 * ```
 */
export function recordEvalRun(
  projectPath: string,
  run: EvalRunRecord,
): void {
  ensureEvalHistoryDir(projectPath);
  const serialized = JSON.stringify(run);
  fs.appendFileSync(getEvalHistoryPath(projectPath), `${serialized}\n`, "utf-8");
}
104
-
105
/**
 * Load every eval run record stored in the project's JSONL history file.
 *
 * Returns an empty array when the history file does not exist. Blank and
 * whitespace-only lines are skipped, and both LF and CRLF line endings are
 * accepted, so a stray empty line no longer aborts the whole read.
 *
 * NOTE(review): parsed values are cast to `EvalRunRecord` without shape
 * validation — callers presumably trust the file contents; confirm.
 *
 * @param projectPath - Project root directory
 * @returns Records in the order they appear in the file
 * @throws SyntaxError when a non-blank line is not valid JSON; the message
 *   names the file and the 1-based line number
 */
function readAllRecords(projectPath: string): EvalRunRecord[] {
  const historyPath = getEvalHistoryPath(projectPath);

  if (!fs.existsSync(historyPath)) {
    return [];
  }

  const rawLines = fs.readFileSync(historyPath, "utf-8").split(/\r?\n/);
  const records: EvalRunRecord[] = [];

  for (let index = 0; index < rawLines.length; index++) {
    const line = rawLines[index].trim();
    if (line === "") {
      continue;
    }

    try {
      records.push(JSON.parse(line) as EvalRunRecord);
    } catch (error: unknown) {
      // Keep the SyntaxError type callers would already see, but say where the
      // bad line is so a corrupted history file can be repaired.
      const reason = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(
        `Invalid JSON in ${historyPath} at line ${index + 1}: ${reason}`,
      );
    }
  }

  return records;
}
122
-
123
/**
 * Collect the recorded runs that belong to one eval.
 *
 * Records keep the order in which they appear in the history file; since new
 * runs are appended, that is oldest first as long as runs were recorded
 * chronologically.
 *
 * @param projectPath - Project root directory
 * @param evalName - Eval whose runs should be returned
 * @returns Matching records, possibly empty
 */
export function getScoreHistory(
  projectPath: string,
  evalName: string,
): EvalRunRecord[] {
  const matching: EvalRunRecord[] = [];
  for (const record of readAllRecords(projectPath)) {
    if (record.eval_name === evalName) {
      matching.push(record);
    }
  }
  return matching;
}
136
-
137
/**
 * Population variance of a list of scores.
 *
 * Computed as the mean of the squared deviations from the mean,
 * i.e. Σ((x - μ)²) / n. Lists with fewer than two scores yield 0.
 *
 * @param scores - Scores to analyse
 * @returns The population variance, or 0 for zero or one score
 */
export function calculateVariance(scores: number[]): number {
  const count = scores.length;
  if (count < 2) {
    return 0;
  }

  let total = 0;
  for (const value of scores) {
    total += value;
  }
  const mean = total / count;

  let squaredDeviationSum = 0;
  for (const value of scores) {
    const delta = value - mean;
    squaredDeviationSum += delta * delta;
  }

  return squaredDeviationSum / count;
}
157
-
158
/**
 * Determine which gate phase an eval is in, from its recorded history.
 *
 * - fewer than `BOOTSTRAP_THRESHOLD` runs → "bootstrap"
 * - up to and including `STABILIZATION_THRESHOLD` runs → "stabilization"
 * - more runs than that → "production" when the score variance is below
 *   `VARIANCE_THRESHOLD`, otherwise still "stabilization"
 *
 * The variance check keeps an eval whose scores still fluctuate out of the
 * production phase even after many runs.
 *
 * @param projectPath - Project root directory (holds the eval history file)
 * @param evalName - Name of the eval, e.g. "swarm-decomposition"
 * @returns The current phase of the eval
 *
 * @example
 * ```typescript
 * const phase = getPhase("/path/to/project", "swarm-decomposition");
 * if (phase === "production") {
 *   // strictest handling
 * }
 * ```
 */
export function getPhase(projectPath: string, evalName: string): Phase {
  const runs = getScoreHistory(projectPath, evalName);
  const runCount = runs.length;

  if (runCount < BOOTSTRAP_THRESHOLD) {
    return "bootstrap";
  }
  if (runCount <= STABILIZATION_THRESHOLD) {
    return "stabilization";
  }

  // Enough runs for production; only promote when the scores are stable.
  const variance = calculateVariance(runs.map(({ score }) => score));
  return variance < VARIANCE_THRESHOLD ? "production" : "stabilization";
}
@@ -1,378 +0,0 @@
1
- /**
2
- * Tests for eval-learning.ts - Eval-to-Learning Feedback Loop
3
- *
4
- * TDD RED phase: Write failing tests first, then implement.
5
- *
6
- * Core behavior:
7
- * - Detect significant eval score drops (>15% from rolling average)
8
- * - Store failure context to semantic-memory with structured tags
9
- * - Ignore minor fluctuations (<15% variance)
10
- * - Configurable threshold for sensitivity tuning
11
- */
12
- import { describe, test, expect, beforeEach, mock } from "bun:test";
13
- import {
14
- learnFromEvalFailure,
15
- type EvalLearningConfig,
16
- calculateRollingAverage,
17
- isSignificantDrop,
18
- formatFailureContext,
19
- createLearningConfig,
20
- DEFAULT_EVAL_LEARNING_CONFIG,
21
- } from "./eval-learning";
22
- import type { EvalRunRecord } from "./eval-history";
23
- import type { MemoryAdapter } from "./memory-tools";
24
-
25
- // ============================================================================
26
- // Mock Memory Adapter
27
- // ============================================================================
28
-
29
/**
 * Build an in-memory stand-in for the memory adapter.
 *
 * Every method is a `mock`; `store` additionally records its argument so tests
 * can inspect it through the extra `getStoredMemories()` accessor. Nothing
 * touches real storage.
 */
function createMockMemoryAdapter(): MemoryAdapter {
  type StoredMemory = {
    information: string;
    tags?: string;
    metadata?: string;
  };
  const captured: StoredMemory[] = [];

  const store = mock(async (args) => {
    captured.push(args);
    return {
      id: `mem_${Date.now()}`,
      message: "Stored successfully",
    };
  });

  const adapter = {
    store,
    find: mock(async () => ({ results: [], total: 0 })),
    get: mock(async () => null),
    remove: mock(async () => ({ success: true, message: "Removed" })),
    validate: mock(async () => ({ success: true, message: "Validated" })),
    list: mock(async () => []),
    stats: mock(async () => ({
      total_memories: 0,
      total_embeddings: 0,
      collections: {},
    })),
    checkHealth: mock(async () => ({ ready: true, message: "OK" })),
    // Test-only accessor, not part of the adapter interface (hence the cast).
    getStoredMemories: () => captured,
  };

  return adapter as any;
}
63
-
64
- // ============================================================================
65
- // Tests: Rolling Average Calculation
66
- // ============================================================================
67
-
68
describe("calculateRollingAverage", () => {
  // Builds a history for eval "test": one record per score, dated on
  // consecutive days starting 2024-12-01 and numbered from 1.
  const historyOf = (scores: number[]): EvalRunRecord[] =>
    scores.map((score, i) => ({
      eval_name: "test",
      score,
      timestamp: `2024-12-0${i + 1}`,
      run_count: i + 1,
    }));

  test("returns 0 for empty history", () => {
    expect(calculateRollingAverage([])).toBe(0);
  });

  test("returns single score for history of 1", () => {
    const onlyRun: EvalRunRecord = {
      eval_name: "test",
      score: 0.85,
      timestamp: "2024-12-01T00:00:00Z",
      run_count: 1,
    };

    expect(calculateRollingAverage([onlyRun])).toBe(0.85);
  });

  test("calculates average of last N runs (default 5)", () => {
    const history = historyOf([0.8, 0.82, 0.84, 0.86, 0.88, 0.9]);

    // The window covers the newest five scores (0.82 … 0.9); their mean is 0.86.
    expect(calculateRollingAverage(history)).toBeCloseTo(0.86, 2);
  });

  test("uses custom window size", () => {
    const history = historyOf([0.8, 0.85, 0.9]);

    // Newest two scores: (0.85 + 0.9) / 2 = 0.875.
    expect(calculateRollingAverage(history, 2)).toBeCloseTo(0.875, 3);
  });

  test("handles window larger than history", () => {
    const history = historyOf([0.8, 0.9]);

    // Only two runs exist, so both are averaged: (0.8 + 0.9) / 2 = 0.85.
    expect(calculateRollingAverage(history, 10)).toBeCloseTo(0.85, 2);
  });
});
126
-
127
- // ============================================================================
128
- // Tests: Significant Drop Detection
129
- // ============================================================================
130
-
131
describe("isSignificantDrop", () => {
  // Baseline shared by most cases; each current score is a fraction of it.
  const baseline = 0.85;

  test("returns false when current equals baseline", () => {
    const verdict = isSignificantDrop(0.85, baseline);
    expect(verdict).toBe(false);
  });

  test("returns false when current is higher than baseline", () => {
    const verdict = isSignificantDrop(0.9, baseline);
    expect(verdict).toBe(false);
  });

  test("returns false for drop below threshold (default 15%)", () => {
    // 0.765 is 90% of the baseline, i.e. a 10% drop.
    const verdict = isSignificantDrop(0.765, baseline);
    expect(verdict).toBe(false);
  });

  test("returns true for drop at threshold (15%)", () => {
    // An exact 15% drop would be 0.7225; 0.722 sits just past it so floating
    // point rounding cannot flip the result.
    const verdict = isSignificantDrop(0.722, baseline);
    expect(verdict).toBe(true);
  });

  test("returns true for drop above threshold (20%)", () => {
    // 0.68 is 80% of the baseline, i.e. a 20% drop.
    const verdict = isSignificantDrop(0.68, baseline);
    expect(verdict).toBe(true);
  });

  test("uses custom threshold", () => {
    // 0.782 is 92% of the baseline, i.e. an 8% drop.
    const current = 0.782;

    // Not significant under the default 15% threshold…
    expect(isSignificantDrop(current, baseline)).toBe(false);
    // …but significant once the threshold is lowered to 5%.
    expect(isSignificantDrop(current, baseline, 0.05)).toBe(true);
  });

  test("returns false when baseline is 0 (avoid division by zero)", () => {
    for (const current of [0, 0.5]) {
      expect(isSignificantDrop(current, 0)).toBe(false);
    }
  });
});
170
-
171
- // ============================================================================
172
- // Tests: Failure Context Formatting
173
- // ============================================================================
174
-
175
describe("formatFailureContext", () => {
  test("includes eval name, scores, and drop percentage", () => {
    const context = formatFailureContext("compaction-test", 0.68, 0.85);

    // "20.0%" is the relative drop: (0.85 - 0.68) / 0.85.
    const expectedFragments = ["compaction-test", "0.68", "0.85", "20.0%"];
    for (const fragment of expectedFragments) {
      expect(context).toContain(fragment);
    }
  });

  test("includes optional scorer context", () => {
    const scorerContext = "violationCount scorer failed: 5 violations detected";

    const context = formatFailureContext(
      "coordinator-behavior",
      0.5,
      0.8,
      scorerContext,
    );

    expect(context).toContain("coordinator-behavior");
    expect(context).toContain(scorerContext);
  });

  test("handles baseline of 0 gracefully", () => {
    const context = formatFailureContext("test", 0.5, 0);

    // A zero baseline must not leak a division artefact into the text.
    for (const artefact of ["NaN", "Infinity"]) {
      expect(context).not.toContain(artefact);
    }
  });
});
204
-
205
- // ============================================================================
206
- // Tests: Main learnFromEvalFailure Function
207
- // ============================================================================
208
-
209
describe("learnFromEvalFailure", () => {
  let mockAdapter: MemoryAdapter;

  // Builds a history for eval "test": one record per score, dated on
  // consecutive days starting 2024-12-01 and numbered from 1.
  const historyOf = (scores: number[]): EvalRunRecord[] =>
    scores.map((score, i) => ({
      eval_name: "test",
      score,
      timestamp: `2024-12-0${i + 1}`,
      run_count: i + 1,
    }));

  // First memory captured by the mock adapter's store().
  const firstStoredMemory = () => (mockAdapter as any).getStoredMemories()[0];

  beforeEach(() => {
    mockAdapter = createMockMemoryAdapter();
  });

  test("stores memory when score drops significantly", async () => {
    const history = historyOf([0.85, 0.84, 0.86, 0.85, 0.84]);
    const currentScore = 0.68; // roughly 20% under the rolling average

    const result = await learnFromEvalFailure(
      "test-eval",
      currentScore,
      history,
      mockAdapter,
    );

    expect(result.triggered).toBe(true);
    expect(result.baseline).toBeCloseTo(0.848, 2);
    expect(result.drop_percentage).toBeCloseTo(0.198, 2); // ~20%

    // Exactly one memory must have been written.
    expect(mockAdapter.store).toHaveBeenCalledTimes(1);

    const { information, tags } = firstStoredMemory();
    expect(information).toContain("test-eval");
    expect(information).toContain("0.68");
    expect(tags).toContain("eval-failure");
    expect(tags).toContain("test-eval");
  });

  test("does not store memory for minor fluctuations", async () => {
    const history = historyOf([0.85, 0.84]);
    const currentScore = 0.8; // about a 5% drop, under the 15% default

    const result = await learnFromEvalFailure(
      "test-eval",
      currentScore,
      history,
      mockAdapter,
    );

    expect(result.triggered).toBe(false);
    expect(mockAdapter.store).not.toHaveBeenCalled();
  });

  test("includes scorer context in memory if provided", async () => {
    const history = historyOf([0.9]);
    const currentScore = 0.7; // about a 22% drop
    const scorerContext = "violationCount: 8 protocol violations";

    await learnFromEvalFailure(
      "coordinator-behavior",
      currentScore,
      history,
      mockAdapter,
      { scorerContext },
    );

    expect(firstStoredMemory().information).toContain(scorerContext);
  });

  test("uses custom threshold when provided", async () => {
    const history = historyOf([0.9]);
    const currentScore = 0.85; // about a 5.5% drop

    // Lower the sensitivity threshold to 5% so this small drop counts.
    const customConfig: EvalLearningConfig = {
      ...DEFAULT_EVAL_LEARNING_CONFIG,
      dropThreshold: 0.05,
    };

    const result = await learnFromEvalFailure(
      "test-eval",
      currentScore,
      history,
      mockAdapter,
      { config: customConfig },
    );

    expect(result.triggered).toBe(true);
    expect(mockAdapter.store).toHaveBeenCalledTimes(1);
  });

  test("handles empty history gracefully", async () => {
    const result = await learnFromEvalFailure(
      "test-eval",
      0.5,
      [],
      mockAdapter,
    );

    expect(result.triggered).toBe(false);
    expect(result.baseline).toBe(0);
    expect(mockAdapter.store).not.toHaveBeenCalled();
  });

  test("generates structured tags for semantic search", async () => {
    const history = historyOf([0.9]);
    const currentScore = 0.7; // significant drop

    await learnFromEvalFailure(
      "compaction-test",
      currentScore,
      history,
      mockAdapter,
    );

    const { tags } = firstStoredMemory();
    for (const expectedTag of ["eval-failure", "compaction-test", "regression"]) {
      expect(tags).toContain(expectedTag);
    }
  });

  test("stores metadata for future prompt generation", async () => {
    const history = historyOf([0.9]);
    const currentScore = 0.7;

    await learnFromEvalFailure("test-eval", currentScore, history, mockAdapter);

    const storedMemory = firstStoredMemory();
    expect(storedMemory.metadata).toBeDefined();

    // Metadata is stored as a JSON string; decode it before checking fields.
    const metadata = JSON.parse(storedMemory.metadata!);
    expect(metadata.eval_name).toBe("test-eval");
    expect(metadata.baseline_score).toBeCloseTo(0.9, 2);
    expect(metadata.current_score).toBe(0.7);
    expect(metadata.drop_percentage).toBeCloseTo(0.222, 2);
  });
});
359
-
360
- // ============================================================================
361
- // Tests: Convenience Helpers
362
- // ============================================================================
363
-
364
describe("createLearningConfig", () => {
  test("creates config with custom threshold", () => {
    const { dropThreshold, windowSize } = createLearningConfig(0.1);

    expect(dropThreshold).toBe(0.1);
    // Window size was not supplied, so it must fall back to the default.
    expect(windowSize).toBe(DEFAULT_EVAL_LEARNING_CONFIG.windowSize);
  });

  test("accepts custom window size", () => {
    const { dropThreshold, windowSize } = createLearningConfig(0.2, 10);

    expect(dropThreshold).toBe(0.2);
    expect(windowSize).toBe(10);
  });
});