opencode-swarm-plugin 0.44.0 → 0.44.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. package/bin/swarm.serve.test.ts +6 -4
  2. package/bin/swarm.ts +18 -12
  3. package/dist/compaction-prompt-scoring.js +139 -0
  4. package/dist/eval-capture.js +12811 -0
  5. package/dist/hive.d.ts.map +1 -1
  6. package/dist/hive.js +14834 -0
  7. package/dist/index.d.ts +18 -0
  8. package/dist/index.d.ts.map +1 -1
  9. package/dist/index.js +7743 -62593
  10. package/dist/plugin.js +24052 -78907
  11. package/dist/swarm-orchestrate.d.ts.map +1 -1
  12. package/dist/swarm-prompts.d.ts.map +1 -1
  13. package/dist/swarm-prompts.js +39407 -0
  14. package/dist/swarm-review.d.ts.map +1 -1
  15. package/dist/swarm-validation.d.ts +127 -0
  16. package/dist/swarm-validation.d.ts.map +1 -0
  17. package/dist/validators/index.d.ts +7 -0
  18. package/dist/validators/index.d.ts.map +1 -0
  19. package/dist/validators/schema-validator.d.ts +58 -0
  20. package/dist/validators/schema-validator.d.ts.map +1 -0
  21. package/package.json +17 -5
  22. package/.changeset/swarm-insights-data-layer.md +0 -63
  23. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
  24. package/.hive/analysis/session-data-quality-audit.md +0 -320
  25. package/.hive/eval-results.json +0 -483
  26. package/.hive/issues.jsonl +0 -138
  27. package/.hive/memories.jsonl +0 -729
  28. package/.opencode/eval-history.jsonl +0 -327
  29. package/.turbo/turbo-build.log +0 -9
  30. package/CHANGELOG.md +0 -2286
  31. package/SCORER-ANALYSIS.md +0 -598
  32. package/docs/analysis/subagent-coordination-patterns.md +0 -902
  33. package/docs/analysis-socratic-planner-pattern.md +0 -504
  34. package/docs/planning/ADR-001-monorepo-structure.md +0 -171
  35. package/docs/planning/ADR-002-package-extraction.md +0 -393
  36. package/docs/planning/ADR-003-performance-improvements.md +0 -451
  37. package/docs/planning/ADR-004-message-queue-features.md +0 -187
  38. package/docs/planning/ADR-005-devtools-observability.md +0 -202
  39. package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
  40. package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
  41. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
  42. package/docs/planning/ADR-010-cass-inhousing.md +0 -1215
  43. package/docs/planning/ROADMAP.md +0 -368
  44. package/docs/semantic-memory-cli-syntax.md +0 -123
  45. package/docs/swarm-mail-architecture.md +0 -1147
  46. package/docs/testing/context-recovery-test.md +0 -470
  47. package/evals/ARCHITECTURE.md +0 -1189
  48. package/evals/README.md +0 -768
  49. package/evals/compaction-prompt.eval.ts +0 -149
  50. package/evals/compaction-resumption.eval.ts +0 -289
  51. package/evals/coordinator-behavior.eval.ts +0 -307
  52. package/evals/coordinator-session.eval.ts +0 -154
  53. package/evals/evalite.config.ts.bak +0 -15
  54. package/evals/example.eval.ts +0 -31
  55. package/evals/fixtures/cass-baseline.ts +0 -217
  56. package/evals/fixtures/compaction-cases.ts +0 -350
  57. package/evals/fixtures/compaction-prompt-cases.ts +0 -311
  58. package/evals/fixtures/coordinator-sessions.ts +0 -328
  59. package/evals/fixtures/decomposition-cases.ts +0 -105
  60. package/evals/lib/compaction-loader.test.ts +0 -248
  61. package/evals/lib/compaction-loader.ts +0 -320
  62. package/evals/lib/data-loader.evalite-test.ts +0 -289
  63. package/evals/lib/data-loader.test.ts +0 -345
  64. package/evals/lib/data-loader.ts +0 -281
  65. package/evals/lib/llm.ts +0 -115
  66. package/evals/scorers/compaction-prompt-scorers.ts +0 -145
  67. package/evals/scorers/compaction-scorers.ts +0 -305
  68. package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
  69. package/evals/scorers/coordinator-discipline.ts +0 -325
  70. package/evals/scorers/index.test.ts +0 -146
  71. package/evals/scorers/index.ts +0 -328
  72. package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
  73. package/evals/scorers/outcome-scorers.ts +0 -349
  74. package/evals/swarm-decomposition.eval.ts +0 -121
  75. package/examples/commands/swarm.md +0 -745
  76. package/examples/plugin-wrapper-template.ts +0 -2515
  77. package/examples/skills/hive-workflow/SKILL.md +0 -212
  78. package/examples/skills/skill-creator/SKILL.md +0 -223
  79. package/examples/skills/swarm-coordination/SKILL.md +0 -292
  80. package/global-skills/cli-builder/SKILL.md +0 -344
  81. package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
  82. package/global-skills/learning-systems/SKILL.md +0 -644
  83. package/global-skills/skill-creator/LICENSE.txt +0 -202
  84. package/global-skills/skill-creator/SKILL.md +0 -352
  85. package/global-skills/skill-creator/references/output-patterns.md +0 -82
  86. package/global-skills/skill-creator/references/workflows.md +0 -28
  87. package/global-skills/swarm-coordination/SKILL.md +0 -995
  88. package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
  89. package/global-skills/swarm-coordination/references/strategies.md +0 -138
  90. package/global-skills/system-design/SKILL.md +0 -213
  91. package/global-skills/testing-patterns/SKILL.md +0 -430
  92. package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
  93. package/opencode-swarm-plugin-0.30.7.tgz +0 -0
  94. package/opencode-swarm-plugin-0.31.0.tgz +0 -0
  95. package/scripts/cleanup-test-memories.ts +0 -346
  96. package/scripts/init-skill.ts +0 -222
  97. package/scripts/migrate-unknown-sessions.ts +0 -349
  98. package/scripts/validate-skill.ts +0 -204
  99. package/src/agent-mail.ts +0 -1724
  100. package/src/anti-patterns.test.ts +0 -1167
  101. package/src/anti-patterns.ts +0 -448
  102. package/src/compaction-capture.integration.test.ts +0 -257
  103. package/src/compaction-hook.test.ts +0 -838
  104. package/src/compaction-hook.ts +0 -1204
  105. package/src/compaction-observability.integration.test.ts +0 -139
  106. package/src/compaction-observability.test.ts +0 -187
  107. package/src/compaction-observability.ts +0 -324
  108. package/src/compaction-prompt-scorers.test.ts +0 -475
  109. package/src/compaction-prompt-scoring.ts +0 -300
  110. package/src/contributor-tools.test.ts +0 -133
  111. package/src/contributor-tools.ts +0 -201
  112. package/src/dashboard.test.ts +0 -611
  113. package/src/dashboard.ts +0 -462
  114. package/src/error-enrichment.test.ts +0 -403
  115. package/src/error-enrichment.ts +0 -219
  116. package/src/eval-capture.test.ts +0 -1015
  117. package/src/eval-capture.ts +0 -929
  118. package/src/eval-gates.test.ts +0 -306
  119. package/src/eval-gates.ts +0 -218
  120. package/src/eval-history.test.ts +0 -508
  121. package/src/eval-history.ts +0 -214
  122. package/src/eval-learning.test.ts +0 -378
  123. package/src/eval-learning.ts +0 -360
  124. package/src/eval-runner.test.ts +0 -223
  125. package/src/eval-runner.ts +0 -402
  126. package/src/export-tools.test.ts +0 -476
  127. package/src/export-tools.ts +0 -257
  128. package/src/hive.integration.test.ts +0 -2241
  129. package/src/hive.ts +0 -1628
  130. package/src/index.ts +0 -940
  131. package/src/learning.integration.test.ts +0 -1815
  132. package/src/learning.ts +0 -1079
  133. package/src/logger.test.ts +0 -189
  134. package/src/logger.ts +0 -135
  135. package/src/mandate-promotion.test.ts +0 -473
  136. package/src/mandate-promotion.ts +0 -239
  137. package/src/mandate-storage.integration.test.ts +0 -601
  138. package/src/mandate-storage.test.ts +0 -578
  139. package/src/mandate-storage.ts +0 -794
  140. package/src/mandates.ts +0 -540
  141. package/src/memory-tools.test.ts +0 -195
  142. package/src/memory-tools.ts +0 -344
  143. package/src/memory.integration.test.ts +0 -334
  144. package/src/memory.test.ts +0 -158
  145. package/src/memory.ts +0 -527
  146. package/src/model-selection.test.ts +0 -188
  147. package/src/model-selection.ts +0 -68
  148. package/src/observability-tools.test.ts +0 -359
  149. package/src/observability-tools.ts +0 -871
  150. package/src/output-guardrails.test.ts +0 -438
  151. package/src/output-guardrails.ts +0 -381
  152. package/src/pattern-maturity.test.ts +0 -1160
  153. package/src/pattern-maturity.ts +0 -525
  154. package/src/planning-guardrails.test.ts +0 -491
  155. package/src/planning-guardrails.ts +0 -438
  156. package/src/plugin.ts +0 -23
  157. package/src/post-compaction-tracker.test.ts +0 -251
  158. package/src/post-compaction-tracker.ts +0 -237
  159. package/src/query-tools.test.ts +0 -636
  160. package/src/query-tools.ts +0 -324
  161. package/src/rate-limiter.integration.test.ts +0 -466
  162. package/src/rate-limiter.ts +0 -774
  163. package/src/replay-tools.test.ts +0 -496
  164. package/src/replay-tools.ts +0 -240
  165. package/src/repo-crawl.integration.test.ts +0 -441
  166. package/src/repo-crawl.ts +0 -610
  167. package/src/schemas/cell-events.test.ts +0 -347
  168. package/src/schemas/cell-events.ts +0 -807
  169. package/src/schemas/cell.ts +0 -257
  170. package/src/schemas/evaluation.ts +0 -166
  171. package/src/schemas/index.test.ts +0 -199
  172. package/src/schemas/index.ts +0 -286
  173. package/src/schemas/mandate.ts +0 -232
  174. package/src/schemas/swarm-context.ts +0 -115
  175. package/src/schemas/task.ts +0 -161
  176. package/src/schemas/worker-handoff.test.ts +0 -302
  177. package/src/schemas/worker-handoff.ts +0 -131
  178. package/src/sessions/agent-discovery.test.ts +0 -137
  179. package/src/sessions/agent-discovery.ts +0 -112
  180. package/src/sessions/index.ts +0 -15
  181. package/src/skills.integration.test.ts +0 -1192
  182. package/src/skills.test.ts +0 -643
  183. package/src/skills.ts +0 -1549
  184. package/src/storage.integration.test.ts +0 -341
  185. package/src/storage.ts +0 -884
  186. package/src/structured.integration.test.ts +0 -817
  187. package/src/structured.test.ts +0 -1046
  188. package/src/structured.ts +0 -762
  189. package/src/swarm-decompose.test.ts +0 -188
  190. package/src/swarm-decompose.ts +0 -1302
  191. package/src/swarm-deferred.integration.test.ts +0 -157
  192. package/src/swarm-deferred.test.ts +0 -38
  193. package/src/swarm-insights.test.ts +0 -214
  194. package/src/swarm-insights.ts +0 -459
  195. package/src/swarm-mail.integration.test.ts +0 -970
  196. package/src/swarm-mail.ts +0 -739
  197. package/src/swarm-orchestrate.integration.test.ts +0 -282
  198. package/src/swarm-orchestrate.test.ts +0 -548
  199. package/src/swarm-orchestrate.ts +0 -3084
  200. package/src/swarm-prompts.test.ts +0 -1270
  201. package/src/swarm-prompts.ts +0 -2077
  202. package/src/swarm-research.integration.test.ts +0 -701
  203. package/src/swarm-research.test.ts +0 -698
  204. package/src/swarm-research.ts +0 -472
  205. package/src/swarm-review.integration.test.ts +0 -285
  206. package/src/swarm-review.test.ts +0 -879
  207. package/src/swarm-review.ts +0 -709
  208. package/src/swarm-strategies.ts +0 -407
  209. package/src/swarm-worktree.test.ts +0 -501
  210. package/src/swarm-worktree.ts +0 -575
  211. package/src/swarm.integration.test.ts +0 -2377
  212. package/src/swarm.ts +0 -38
  213. package/src/tool-adapter.integration.test.ts +0 -1221
  214. package/src/tool-availability.ts +0 -461
  215. package/tsconfig.json +0 -28
@@ -1,214 +0,0 @@
1
- /**
2
- * Eval History Tracker - Progressive gates based on run history
3
- *
4
- * Tracks eval run scores over time and calculates the current phase:
5
- * - Bootstrap (<10 runs): No gates, just collect data
6
- * - Stabilization (10-50 runs): Warn on >10% regression
7
- * - Production (>50 runs + variance <0.1): Fail on >5% regression
8
- *
9
- * @module eval-history
10
- */
11
- import * as fs from "node:fs";
12
- import * as path from "node:path";
13
-
/**
 * Maturity phase of an eval, listed from least to most strict.
 *
 * The phase is derived from how many runs have been recorded for the eval
 * and how much their scores vary.
 */
export type Phase =
  | "bootstrap"
  | "stabilization"
  | "production";
18
-
/**
 * One persisted eval run. Each record is serialized as a single JSONL line.
 */
export interface EvalRunRecord {
  /** Name of the eval this run belongs to (e.g., "swarm-decomposition") */
  eval_name: string;
  /** When the run happened, as an ISO-8601 string */
  timestamp: string;
  /** Score achieved by the run (typically within the 0-1 range) */
  score: number;
  /** Per-eval run counter, expected to increase monotonically */
  run_count: number;
}
32
-
/**
 * Location of the JSONL history file, relative to the project root.
 */
export const DEFAULT_EVAL_HISTORY_PATH = ".opencode/eval-history.jsonl";

/**
 * Score variance must be strictly below this value for an eval to reach
 * the production phase.
 */
export const VARIANCE_THRESHOLD = 0.1;

/**
 * Run-count boundaries between phases:
 * - fewer than BOOTSTRAP_THRESHOLD runs        => bootstrap
 * - up to and including STABILIZATION_THRESHOLD => stabilization
 * - more than STABILIZATION_THRESHOLD           => eligible for production
 */
export const BOOTSTRAP_THRESHOLD = 10;
export const STABILIZATION_THRESHOLD = 50;
48
-
/**
 * Resolve where the eval history file lives for a given project.
 *
 * @param projectPath - Project root directory
 * @returns `projectPath` joined with `DEFAULT_EVAL_HISTORY_PATH`
 */
export function getEvalHistoryPath(projectPath: string): string {
  const historyFile = path.join(projectPath, DEFAULT_EVAL_HISTORY_PATH);
  return historyFile;
}
55
-
/**
 * Make sure the directory that holds the eval history file exists.
 *
 * Missing parent directories are created as well. Calling this when the
 * directory is already present is a no-op.
 *
 * @param projectPath - Project root directory
 * @throws If the directory cannot be created (e.g. a regular file occupies
 *   the path, or permissions are insufficient)
 */
export function ensureEvalHistoryDir(projectPath: string): void {
  const dir = path.dirname(getEvalHistoryPath(projectPath));
  // `recursive: true` already succeeds when the directory exists, so no
  // existsSync pre-check is needed. Skipping the check also removes the
  // window in which another process could create the directory between
  // the check and the mkdir.
  fs.mkdirSync(dir, { recursive: true });
}
66
-
/**
 * Persist one eval run by appending it to the project's JSONL history file.
 *
 * The record is serialized with `JSON.stringify`, terminated with a newline
 * and appended with `fs.appendFileSync`. The directory that holds the
 * history file is created first when it does not exist yet.
 *
 * NOTE(review): this function takes no lock. Whether appends issued by
 * concurrent processes can interleave depends on the OS and filesystem;
 * confirm before relying on it for parallel eval runs.
 *
 * @param projectPath - Absolute path to project root
 * @param run - Eval run record with timestamp, eval_name, score, run_count
 *
 * @example
 * ```typescript
 * import { recordEvalRun } from "./eval-history.js";
 *
 * recordEvalRun("/path/to/project", {
 *   timestamp: new Date().toISOString(),
 *   eval_name: "swarm-decomposition",
 *   score: 0.92,
 *   run_count: 15,
 * });
 * ```
 */
export function recordEvalRun(projectPath: string, run: EvalRunRecord): void {
  ensureEvalHistoryDir(projectPath);

  const serialized = JSON.stringify(run);
  fs.appendFileSync(
    getEvalHistoryPath(projectPath),
    `${serialized}\n`,
    "utf-8",
  );
}
104
-
/**
 * Load every eval run record from the project's JSONL history file.
 *
 * Internal helper. Records are returned in file order.
 *
 * - A missing history file yields an empty array.
 * - Blank lines, lines that are not valid JSON, and JSON values that lack a
 *   string `eval_name` or a numeric `score` are skipped. A single torn or
 *   hand-edited line therefore cannot make the whole history unreadable.
 *
 * @param projectPath - Project root directory
 * @returns All usable records found in the history file
 * @throws Any read error other than "file not found" (e.g. permissions)
 */
function readAllRecords(projectPath: string): EvalRunRecord[] {
  const historyPath = getEvalHistoryPath(projectPath);

  let content: string;
  try {
    // Read directly instead of existsSync + read: the file could disappear
    // between the two calls, and ENOENT already tells us it is missing.
    content = fs.readFileSync(historyPath, "utf-8");
  } catch (error: unknown) {
    const code =
      typeof error === "object" && error !== null
        ? (error as { code?: unknown }).code
        : undefined;
    if (code === "ENOENT") {
      return [];
    }
    throw error;
  }

  const records: EvalRunRecord[] = [];
  for (const rawLine of content.split("\n")) {
    const line = rawLine.trim();
    if (line === "") {
      continue;
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Malformed line (e.g. a partially written append) - ignore it.
      continue;
    }

    if (hasEvalRunEssentials(parsed)) {
      records.push(parsed as EvalRunRecord);
    }
  }
  return records;
}

/**
 * Check that a parsed JSONL value carries the fields the history logic
 * depends on: a string `eval_name` and a numeric `score`.
 */
function hasEvalRunEssentials(value: unknown): boolean {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.eval_name === "string" &&
    typeof candidate.score === "number"
  );
}
122
-
/**
 * Collect the recorded runs that belong to one eval.
 *
 * Records come back in the order they appear in the history file. No
 * sorting is applied here, so the result is oldest-first only insofar as
 * records were appended in chronological order.
 *
 * @param projectPath - Project root directory
 * @param evalName - Name of the eval to select (exact match on `eval_name`)
 * @returns Matching records, possibly empty
 */
export function getScoreHistory(
  projectPath: string,
  evalName: string,
): EvalRunRecord[] {
  const matching: EvalRunRecord[] = [];
  for (const record of readAllRecords(projectPath)) {
    if (record.eval_name === evalName) {
      matching.push(record);
    }
  }
  return matching;
}
136
-
/**
 * Population variance of a list of scores.
 *
 * Computed as the mean of the squared deviations from the mean:
 * Σ((x - μ)²) / n. Lists with zero or one element have variance 0.
 *
 * @param scores - Scores to measure
 * @returns The population variance (never negative for finite input)
 */
export function calculateVariance(scores: number[]): number {
  const count = scores.length;
  if (count < 2) {
    return 0;
  }

  // First pass: arithmetic mean.
  let total = 0;
  scores.forEach((value) => {
    total += value;
  });
  const mean = total / count;

  // Second pass: accumulate squared distance from the mean.
  let squaredDeviationSum = 0;
  scores.forEach((value) => {
    const delta = value - mean;
    squaredDeviationSum += delta * delta;
  });

  return squaredDeviationSum / count;
}
157
-
/**
 * Determine which phase an eval is in, from its recorded run history.
 *
 * Decision rules, evaluated in order:
 *
 * 1. Fewer than `BOOTSTRAP_THRESHOLD` (10) runs => **bootstrap**
 * 2. At most `STABILIZATION_THRESHOLD` (50) runs => **stabilization**
 * 3. More than 50 runs: **production** when the variance of all recorded
 *    scores for this eval is below `VARIANCE_THRESHOLD` (0.1), otherwise
 *    still **stabilization**
 *
 * The variance check keeps an eval whose scores still fluctuate out of the
 * strictest phase even after many runs.
 *
 * This function only classifies. The file header describes the intended
 * gates (no gates / warn on >10% regression / fail on >5% regression), but
 * they are not enforced here - presumably callers apply them.
 *
 * @param projectPath - Absolute path to project root (contains `.opencode/eval-history.jsonl`)
 * @param evalName - Name of the eval (e.g., "swarm-decomposition")
 * @returns Current phase: "bootstrap" | "stabilization" | "production"
 *
 * @example
 * ```typescript
 * import { getPhase } from "./eval-history.js";
 *
 * const phase = getPhase("/path/to/project", "swarm-decomposition");
 * if (phase === "production") {
 *   console.log("strict gates enabled");
 * }
 * ```
 */
export function getPhase(projectPath: string, evalName: string): Phase {
  const runs = getScoreHistory(projectPath, evalName);
  const runCount = runs.length;

  if (runCount < BOOTSTRAP_THRESHOLD) {
    return "bootstrap";
  }
  if (runCount <= STABILIZATION_THRESHOLD) {
    return "stabilization";
  }

  // Enough runs for production - but only if the scores have settled.
  const variance = calculateVariance(runs.map(({ score }) => score));
  return variance < VARIANCE_THRESHOLD ? "production" : "stabilization";
}
@@ -1,378 +0,0 @@
1
- /**
2
- * Tests for eval-learning.ts - Eval-to-Learning Feedback Loop
3
- *
4
- * TDD RED phase: Write failing tests first, then implement.
5
- *
6
- * Core behavior:
7
- * - Detect significant eval score drops (>15% from rolling average)
8
- * - Store failure context to semantic-memory with structured tags
9
- * - Ignore minor fluctuations (<15% variance)
10
- * - Configurable threshold for sensitivity tuning
11
- */
12
- import { describe, test, expect, beforeEach, mock } from "bun:test";
13
- import {
14
- learnFromEvalFailure,
15
- type EvalLearningConfig,
16
- calculateRollingAverage,
17
- isSignificantDrop,
18
- formatFailureContext,
19
- createLearningConfig,
20
- DEFAULT_EVAL_LEARNING_CONFIG,
21
- } from "./eval-learning";
22
- import type { EvalRunRecord } from "./eval-history";
23
- import type { MemoryAdapter } from "./memory-tools";
24
-
25
- // ============================================================================
26
- // Mock Memory Adapter
27
- // ============================================================================
28
-
/**
 * Build an in-memory stand-in for the memory adapter.
 *
 * Every method is a bun `mock`. `store()` additionally records its
 * argument so tests can inspect it through the extra `getStoredMemories()`
 * accessor; nothing touches real storage.
 */
function createMockMemoryAdapter(): MemoryAdapter {
  type StoredMemory = {
    information: string;
    tags?: string;
    metadata?: string;
  };
  const captured: StoredMemory[] = [];

  const store = mock(async (args) => {
    captured.push(args);
    return {
      id: `mem_${Date.now()}`,
      message: "Stored successfully",
    };
  });

  const adapter = {
    store,
    find: mock(async () => ({ results: [], total: 0 })),
    get: mock(async () => null),
    remove: mock(async () => ({ success: true, message: "Removed" })),
    validate: mock(async () => ({ success: true, message: "Validated" })),
    list: mock(async () => []),
    stats: mock(async () => ({
      total_memories: 0,
      total_embeddings: 0,
      collections: {},
    })),
    checkHealth: mock(async () => ({ ready: true, message: "OK" })),
    // Test-only accessor; not part of the MemoryAdapter interface.
    getStoredMemories: () => captured,
  };

  return adapter as any;
}
63
-
64
- // ============================================================================
65
- // Tests: Rolling Average Calculation
66
- // ============================================================================
67
-
describe("calculateRollingAverage", () => {
  /** Build a "test" eval record for day `day` of December 2024; run_count mirrors the day. */
  const runOn = (day: number, score: number): EvalRunRecord => ({
    eval_name: "test",
    score,
    timestamp: `2024-12-0${day}`,
    run_count: day,
  });

  /** Turn a list of scores into consecutive daily records starting on day 1. */
  const historyOf = (scores: number[]): EvalRunRecord[] =>
    scores.map((score, index) => runOn(index + 1, score));

  test("returns 0 for empty history", () => {
    expect(calculateRollingAverage([])).toBe(0);
  });

  test("returns single score for history of 1", () => {
    const onlyRun: EvalRunRecord = {
      eval_name: "test",
      score: 0.85,
      timestamp: "2024-12-01T00:00:00Z",
      run_count: 1,
    };

    expect(calculateRollingAverage([onlyRun])).toBe(0.85);
  });

  test("calculates average of last N runs (default 5)", () => {
    const history = historyOf([0.8, 0.82, 0.84, 0.86, 0.88, 0.9]);

    // The default window drops the oldest run: mean(0.82..0.9) = 0.86
    expect(calculateRollingAverage(history)).toBeCloseTo(0.86, 2);
  });

  test("uses custom window size", () => {
    const history = historyOf([0.8, 0.85, 0.9]);

    // Window of 2 keeps 0.85 and 0.9 => 0.875
    expect(calculateRollingAverage(history, 2)).toBeCloseTo(0.875, 3);
  });

  test("handles window larger than history", () => {
    const history = historyOf([0.8, 0.9]);

    // Only two runs exist, so both are averaged: 0.85
    expect(calculateRollingAverage(history, 10)).toBeCloseTo(0.85, 2);
  });
});
126
-
127
- // ============================================================================
128
- // Tests: Significant Drop Detection
129
- // ============================================================================
130
-
describe("isSignificantDrop", () => {
  // Shared baseline for every case except the zero-baseline guard.
  const baseline = 0.85;

  test("returns false when current equals baseline", () => {
    const dropped = isSignificantDrop(0.85, baseline);
    expect(dropped).toBe(false);
  });

  test("returns false when current is higher than baseline", () => {
    const dropped = isSignificantDrop(0.9, baseline);
    expect(dropped).toBe(false);
  });

  test("returns false for drop below threshold (default 15%)", () => {
    // 0.765 is 90% of the baseline, i.e. a 10% drop
    const dropped = isSignificantDrop(0.765, baseline);
    expect(dropped).toBe(false);
  });

  test("returns true for drop at threshold (15%)", () => {
    // An exact 15% drop would be 0.7225; 0.722 sits just past it so that
    // floating point rounding cannot flip the outcome.
    const dropped = isSignificantDrop(0.722, baseline);
    expect(dropped).toBe(true);
  });

  test("returns true for drop above threshold (20%)", () => {
    // 0.68 is 80% of the baseline, i.e. a 20% drop
    const dropped = isSignificantDrop(0.68, baseline);
    expect(dropped).toBe(true);
  });

  test("uses custom threshold", () => {
    // 0.782 is an 8% drop: not significant at the default 15% ...
    const withDefault = isSignificantDrop(0.782, baseline);
    expect(withDefault).toBe(false);

    // ... but significant once the threshold is lowered to 5%
    const withCustom = isSignificantDrop(0.782, baseline, 0.05);
    expect(withCustom).toBe(true);
  });

  test("returns false when baseline is 0 (avoid division by zero)", () => {
    const zeroToZero = isSignificantDrop(0, 0);
    expect(zeroToZero).toBe(false);

    const halfFromZero = isSignificantDrop(0.5, 0);
    expect(halfFromZero).toBe(false);
  });
});
170
-
171
- // ============================================================================
172
- // Tests: Failure Context Formatting
173
- // ============================================================================
174
-
describe("formatFailureContext", () => {
  test("includes eval name, scores, and drop percentage", () => {
    const context = formatFailureContext("compaction-test", 0.68, 0.85);

    // "20.0%" comes from (0.85 - 0.68) / 0.85
    const expectedFragments = ["compaction-test", "0.68", "0.85", "20.0%"];
    for (const fragment of expectedFragments) {
      expect(context).toContain(fragment);
    }
  });

  test("includes optional scorer context", () => {
    const scorerContext = "violationCount scorer failed: 5 violations detected";

    const context = formatFailureContext(
      "coordinator-behavior",
      0.5,
      0.8,
      scorerContext,
    );

    for (const fragment of ["coordinator-behavior", scorerContext]) {
      expect(context).toContain(fragment);
    }
  });

  test("handles baseline of 0 gracefully", () => {
    const context = formatFailureContext("test", 0.5, 0);

    // A zero baseline must not leak a division artifact into the text.
    for (const artifact of ["NaN", "Infinity"]) {
      expect(context).not.toContain(artifact);
    }
  });
});
204
-
205
- // ============================================================================
206
- // Tests: Main learnFromEvalFailure Function
207
- // ============================================================================
208
-
describe("learnFromEvalFailure", () => {
  let mockAdapter: MemoryAdapter;

  /** Turn a list of scores into consecutive daily "test" records starting on 2024-12-01. */
  const historyOf = (scores: number[]): EvalRunRecord[] =>
    scores.map((score, index) => ({
      eval_name: "test",
      score,
      timestamp: `2024-12-0${index + 1}`,
      run_count: index + 1,
    }));

  /** First argument object that was handed to the mock adapter's store(). */
  const firstStoredMemory = () => (mockAdapter as any).getStoredMemories()[0];

  beforeEach(() => {
    mockAdapter = createMockMemoryAdapter();
  });

  test("stores memory when score drops significantly", async () => {
    const history = historyOf([0.85, 0.84, 0.86, 0.85, 0.84]);
    const currentScore = 0.68; // roughly 20% under the rolling average

    const result = await learnFromEvalFailure(
      "test-eval",
      currentScore,
      history,
      mockAdapter,
    );

    expect(result.triggered).toBe(true);
    expect(result.baseline).toBeCloseTo(0.848, 2);
    expect(result.drop_percentage).toBeCloseTo(0.198, 2); // ~20%

    // Exactly one memory must have been written
    expect(mockAdapter.store).toHaveBeenCalledTimes(1);

    const { information, tags } = firstStoredMemory();
    expect(information).toContain("test-eval");
    expect(information).toContain("0.68");
    expect(tags).toContain("eval-failure");
    expect(tags).toContain("test-eval");
  });

  test("does not store memory for minor fluctuations", async () => {
    const history = historyOf([0.85, 0.84]);
    const currentScore = 0.8; // ~5% drop, under the 15% default threshold

    const result = await learnFromEvalFailure(
      "test-eval",
      currentScore,
      history,
      mockAdapter,
    );

    expect(result.triggered).toBe(false);
    expect(mockAdapter.store).not.toHaveBeenCalled();
  });

  test("includes scorer context in memory if provided", async () => {
    const history = historyOf([0.9]);
    const currentScore = 0.7; // ~22% drop
    const scorerContext = "violationCount: 8 protocol violations";

    await learnFromEvalFailure(
      "coordinator-behavior",
      currentScore,
      history,
      mockAdapter,
      { scorerContext },
    );

    expect(firstStoredMemory().information).toContain(scorerContext);
  });

  test("uses custom threshold when provided", async () => {
    const history = historyOf([0.9]);
    const currentScore = 0.85; // ~5.5% drop

    // Lower the trigger to 5% so the small drop above counts
    const customConfig: EvalLearningConfig = {
      ...DEFAULT_EVAL_LEARNING_CONFIG,
      dropThreshold: 0.05,
    };

    const result = await learnFromEvalFailure(
      "test-eval",
      currentScore,
      history,
      mockAdapter,
      { config: customConfig },
    );

    expect(result.triggered).toBe(true);
    expect(mockAdapter.store).toHaveBeenCalledTimes(1);
  });

  test("handles empty history gracefully", async () => {
    const noHistory: EvalRunRecord[] = [];

    const result = await learnFromEvalFailure(
      "test-eval",
      0.5,
      noHistory,
      mockAdapter,
    );

    expect(result.triggered).toBe(false);
    expect(result.baseline).toBe(0);
    expect(mockAdapter.store).not.toHaveBeenCalled();
  });

  test("generates structured tags for semantic search", async () => {
    const history = historyOf([0.9]);
    const currentScore = 0.7; // large enough to trigger

    await learnFromEvalFailure(
      "compaction-test",
      currentScore,
      history,
      mockAdapter,
    );

    const { tags } = firstStoredMemory();
    for (const expectedTag of ["eval-failure", "compaction-test", "regression"]) {
      expect(tags).toContain(expectedTag);
    }
  });

  test("stores metadata for future prompt generation", async () => {
    const history = historyOf([0.9]);
    const currentScore = 0.7;

    await learnFromEvalFailure("test-eval", currentScore, history, mockAdapter);

    const stored = firstStoredMemory();
    expect(stored.metadata).toBeDefined();

    // Metadata is stored as a JSON string
    const metadata = JSON.parse(stored.metadata!);
    expect(metadata.eval_name).toBe("test-eval");
    expect(metadata.baseline_score).toBeCloseTo(0.9, 2);
    expect(metadata.current_score).toBe(0.7);
    expect(metadata.drop_percentage).toBeCloseTo(0.222, 2);
  });
});
359
-
360
- // ============================================================================
361
- // Tests: Convenience Helpers
362
- // ============================================================================
363
-
describe("createLearningConfig", () => {
  test("creates config with custom threshold", () => {
    const { dropThreshold, windowSize } = createLearningConfig(0.1);

    expect(dropThreshold).toBe(0.1);
    // Window size falls back to the default when not supplied
    expect(windowSize).toBe(DEFAULT_EVAL_LEARNING_CONFIG.windowSize);
  });

  test("accepts custom window size", () => {
    const { dropThreshold, windowSize } = createLearningConfig(0.2, 10);

    expect(dropThreshold).toBe(0.2);
    expect(windowSize).toBe(10);
  });
});