opencode-swarm-plugin 0.37.0 → 0.39.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env +2 -0
- package/.hive/eval-results.json +26 -0
- package/.hive/issues.jsonl +20 -5
- package/.hive/memories.jsonl +35 -1
- package/.opencode/eval-history.jsonl +12 -0
- package/.turbo/turbo-build.log +4 -4
- package/.turbo/turbo-test.log +319 -319
- package/CHANGELOG.md +258 -0
- package/README.md +50 -0
- package/bin/swarm.test.ts +475 -0
- package/bin/swarm.ts +385 -208
- package/dist/compaction-hook.d.ts +1 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-prompt-scoring.d.ts +124 -0
- package/dist/compaction-prompt-scoring.d.ts.map +1 -0
- package/dist/eval-capture.d.ts +81 -1
- package/dist/eval-capture.d.ts.map +1 -1
- package/dist/eval-gates.d.ts +84 -0
- package/dist/eval-gates.d.ts.map +1 -0
- package/dist/eval-history.d.ts +117 -0
- package/dist/eval-history.d.ts.map +1 -0
- package/dist/eval-learning.d.ts +216 -0
- package/dist/eval-learning.d.ts.map +1 -0
- package/dist/hive.d.ts +59 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +87 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +823 -131
- package/dist/plugin.js +655 -131
- package/dist/post-compaction-tracker.d.ts +133 -0
- package/dist/post-compaction-tracker.d.ts.map +1 -0
- package/dist/swarm-decompose.d.ts +30 -0
- package/dist/swarm-decompose.d.ts.map +1 -1
- package/dist/swarm-orchestrate.d.ts +23 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +25 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm.d.ts +19 -0
- package/dist/swarm.d.ts.map +1 -1
- package/evals/README.md +595 -94
- package/evals/compaction-prompt.eval.ts +149 -0
- package/evals/coordinator-behavior.eval.ts +8 -8
- package/evals/fixtures/compaction-prompt-cases.ts +305 -0
- package/evals/lib/compaction-loader.test.ts +248 -0
- package/evals/lib/compaction-loader.ts +320 -0
- package/evals/lib/data-loader.test.ts +345 -0
- package/evals/lib/data-loader.ts +107 -6
- package/evals/scorers/compaction-prompt-scorers.ts +145 -0
- package/evals/scorers/compaction-scorers.ts +13 -13
- package/evals/scorers/coordinator-discipline.evalite-test.ts +3 -2
- package/evals/scorers/coordinator-discipline.ts +13 -13
- package/examples/plugin-wrapper-template.ts +177 -8
- package/package.json +7 -2
- package/scripts/migrate-unknown-sessions.ts +349 -0
- package/src/compaction-capture.integration.test.ts +257 -0
- package/src/compaction-hook.test.ts +139 -2
- package/src/compaction-hook.ts +113 -2
- package/src/compaction-prompt-scorers.test.ts +299 -0
- package/src/compaction-prompt-scoring.ts +298 -0
- package/src/eval-capture.test.ts +422 -0
- package/src/eval-capture.ts +94 -2
- package/src/eval-gates.test.ts +306 -0
- package/src/eval-gates.ts +218 -0
- package/src/eval-history.test.ts +508 -0
- package/src/eval-history.ts +214 -0
- package/src/eval-learning.test.ts +378 -0
- package/src/eval-learning.ts +360 -0
- package/src/index.ts +61 -1
- package/src/post-compaction-tracker.test.ts +251 -0
- package/src/post-compaction-tracker.ts +237 -0
- package/src/swarm-decompose.test.ts +40 -47
- package/src/swarm-decompose.ts +2 -2
- package/src/swarm-orchestrate.test.ts +270 -7
- package/src/swarm-orchestrate.ts +100 -13
- package/src/swarm-prompts.test.ts +121 -0
- package/src/swarm-prompts.ts +297 -4
- package/src/swarm-research.integration.test.ts +157 -0
- package/src/swarm-review.ts +3 -3
- /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
|
@@ -0,0 +1,508 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for eval-history - tracks eval run scores and calculates progressive phases
|
|
3
|
+
*
|
|
4
|
+
* TDD: RED phase - all tests should fail initially
|
|
5
|
+
*/
|
|
6
|
+
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
|
|
7
|
+
import * as fs from "node:fs";
|
|
8
|
+
import * as path from "node:path";
|
|
9
|
+
import {
|
|
10
|
+
type EvalRunRecord,
|
|
11
|
+
type Phase,
|
|
12
|
+
calculateVariance,
|
|
13
|
+
getPhase,
|
|
14
|
+
getScoreHistory,
|
|
15
|
+
recordEvalRun,
|
|
16
|
+
} from "./eval-history.js";
|
|
17
|
+
|
|
18
|
+
describe("eval-history", () => {
|
|
19
|
+
const testDir = path.join(import.meta.dir, ".test-eval-history");
|
|
20
|
+
const testProjectPath = path.join(testDir, "test-project");
|
|
21
|
+
|
|
22
|
+
beforeEach(() => {
|
|
23
|
+
// Clean slate for each test
|
|
24
|
+
if (fs.existsSync(testDir)) {
|
|
25
|
+
fs.rmSync(testDir, { recursive: true });
|
|
26
|
+
}
|
|
27
|
+
fs.mkdirSync(testProjectPath, { recursive: true });
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
afterEach(() => {
|
|
31
|
+
// Cleanup
|
|
32
|
+
if (fs.existsSync(testDir)) {
|
|
33
|
+
fs.rmSync(testDir, { recursive: true });
|
|
34
|
+
}
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
describe("recordEvalRun", () => {
|
|
38
|
+
test("appends eval run to JSONL file", () => {
|
|
39
|
+
const run: EvalRunRecord = {
|
|
40
|
+
timestamp: new Date().toISOString(),
|
|
41
|
+
eval_name: "swarm-decomposition",
|
|
42
|
+
score: 0.85,
|
|
43
|
+
run_count: 1,
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
recordEvalRun(testProjectPath, run);
|
|
47
|
+
|
|
48
|
+
const historyPath = path.join(testProjectPath, ".opencode/eval-history.jsonl");
|
|
49
|
+
expect(fs.existsSync(historyPath)).toBe(true);
|
|
50
|
+
|
|
51
|
+
const content = fs.readFileSync(historyPath, "utf-8");
|
|
52
|
+
const lines = content.trim().split("\n");
|
|
53
|
+
expect(lines).toHaveLength(1);
|
|
54
|
+
|
|
55
|
+
const parsed = JSON.parse(lines[0]);
|
|
56
|
+
expect(parsed.eval_name).toBe("swarm-decomposition");
|
|
57
|
+
expect(parsed.score).toBe(0.85);
|
|
58
|
+
expect(parsed.run_count).toBe(1);
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
test("appends multiple runs sequentially", () => {
|
|
62
|
+
const run1: EvalRunRecord = {
|
|
63
|
+
timestamp: new Date().toISOString(),
|
|
64
|
+
eval_name: "swarm-decomposition",
|
|
65
|
+
score: 0.80,
|
|
66
|
+
run_count: 1,
|
|
67
|
+
};
|
|
68
|
+
const run2: EvalRunRecord = {
|
|
69
|
+
timestamp: new Date().toISOString(),
|
|
70
|
+
eval_name: "swarm-decomposition",
|
|
71
|
+
score: 0.85,
|
|
72
|
+
run_count: 2,
|
|
73
|
+
};
|
|
74
|
+
|
|
75
|
+
recordEvalRun(testProjectPath, run1);
|
|
76
|
+
recordEvalRun(testProjectPath, run2);
|
|
77
|
+
|
|
78
|
+
const historyPath = path.join(testProjectPath, ".opencode/eval-history.jsonl");
|
|
79
|
+
const content = fs.readFileSync(historyPath, "utf-8");
|
|
80
|
+
const lines = content.trim().split("\n");
|
|
81
|
+
expect(lines).toHaveLength(2);
|
|
82
|
+
|
|
83
|
+
const parsed1 = JSON.parse(lines[0]);
|
|
84
|
+
const parsed2 = JSON.parse(lines[1]);
|
|
85
|
+
expect(parsed1.score).toBe(0.80);
|
|
86
|
+
expect(parsed2.score).toBe(0.85);
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
test("creates directory if it doesn't exist", () => {
|
|
90
|
+
const run: EvalRunRecord = {
|
|
91
|
+
timestamp: new Date().toISOString(),
|
|
92
|
+
eval_name: "test-eval",
|
|
93
|
+
score: 0.90,
|
|
94
|
+
run_count: 1,
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
// Directory doesn't exist yet
|
|
98
|
+
const opencodePath = path.join(testProjectPath, ".opencode");
|
|
99
|
+
expect(fs.existsSync(opencodePath)).toBe(false);
|
|
100
|
+
|
|
101
|
+
recordEvalRun(testProjectPath, run);
|
|
102
|
+
|
|
103
|
+
// Directory should be created
|
|
104
|
+
expect(fs.existsSync(opencodePath)).toBe(true);
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
test("supports different eval names in same history", () => {
|
|
108
|
+
const run1: EvalRunRecord = {
|
|
109
|
+
timestamp: new Date().toISOString(),
|
|
110
|
+
eval_name: "swarm-decomposition",
|
|
111
|
+
score: 0.85,
|
|
112
|
+
run_count: 1,
|
|
113
|
+
};
|
|
114
|
+
const run2: EvalRunRecord = {
|
|
115
|
+
timestamp: new Date().toISOString(),
|
|
116
|
+
eval_name: "coordinator-session",
|
|
117
|
+
score: 0.75,
|
|
118
|
+
run_count: 1,
|
|
119
|
+
};
|
|
120
|
+
|
|
121
|
+
recordEvalRun(testProjectPath, run1);
|
|
122
|
+
recordEvalRun(testProjectPath, run2);
|
|
123
|
+
|
|
124
|
+
const historyPath = path.join(testProjectPath, ".opencode/eval-history.jsonl");
|
|
125
|
+
const content = fs.readFileSync(historyPath, "utf-8");
|
|
126
|
+
const lines = content.trim().split("\n");
|
|
127
|
+
expect(lines).toHaveLength(2);
|
|
128
|
+
|
|
129
|
+
const parsed1 = JSON.parse(lines[0]);
|
|
130
|
+
const parsed2 = JSON.parse(lines[1]);
|
|
131
|
+
expect(parsed1.eval_name).toBe("swarm-decomposition");
|
|
132
|
+
expect(parsed2.eval_name).toBe("coordinator-session");
|
|
133
|
+
});
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
describe("getScoreHistory", () => {
|
|
137
|
+
test("returns empty array when no history exists", () => {
|
|
138
|
+
const history = getScoreHistory(testProjectPath, "swarm-decomposition");
|
|
139
|
+
expect(history).toEqual([]);
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
test("returns all runs for a specific eval", () => {
|
|
143
|
+
const runs: EvalRunRecord[] = [
|
|
144
|
+
{
|
|
145
|
+
timestamp: new Date().toISOString(),
|
|
146
|
+
eval_name: "swarm-decomposition",
|
|
147
|
+
score: 0.80,
|
|
148
|
+
run_count: 1,
|
|
149
|
+
},
|
|
150
|
+
{
|
|
151
|
+
timestamp: new Date().toISOString(),
|
|
152
|
+
eval_name: "swarm-decomposition",
|
|
153
|
+
score: 0.85,
|
|
154
|
+
run_count: 2,
|
|
155
|
+
},
|
|
156
|
+
{
|
|
157
|
+
timestamp: new Date().toISOString(),
|
|
158
|
+
eval_name: "coordinator-session",
|
|
159
|
+
score: 0.70,
|
|
160
|
+
run_count: 1,
|
|
161
|
+
},
|
|
162
|
+
];
|
|
163
|
+
|
|
164
|
+
for (const run of runs) {
|
|
165
|
+
recordEvalRun(testProjectPath, run);
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
const history = getScoreHistory(testProjectPath, "swarm-decomposition");
|
|
169
|
+
expect(history).toHaveLength(2);
|
|
170
|
+
expect(history[0].score).toBe(0.80);
|
|
171
|
+
expect(history[1].score).toBe(0.85);
|
|
172
|
+
});
|
|
173
|
+
|
|
174
|
+
test("filters by eval_name correctly", () => {
|
|
175
|
+
const runs: EvalRunRecord[] = [
|
|
176
|
+
{
|
|
177
|
+
timestamp: new Date().toISOString(),
|
|
178
|
+
eval_name: "swarm-decomposition",
|
|
179
|
+
score: 0.80,
|
|
180
|
+
run_count: 1,
|
|
181
|
+
},
|
|
182
|
+
{
|
|
183
|
+
timestamp: new Date().toISOString(),
|
|
184
|
+
eval_name: "coordinator-session",
|
|
185
|
+
score: 0.70,
|
|
186
|
+
run_count: 1,
|
|
187
|
+
},
|
|
188
|
+
{
|
|
189
|
+
timestamp: new Date().toISOString(),
|
|
190
|
+
eval_name: "swarm-decomposition",
|
|
191
|
+
score: 0.85,
|
|
192
|
+
run_count: 2,
|
|
193
|
+
},
|
|
194
|
+
];
|
|
195
|
+
|
|
196
|
+
for (const run of runs) {
|
|
197
|
+
recordEvalRun(testProjectPath, run);
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
const decompositionHistory = getScoreHistory(testProjectPath, "swarm-decomposition");
|
|
201
|
+
const coordinatorHistory = getScoreHistory(testProjectPath, "coordinator-session");
|
|
202
|
+
|
|
203
|
+
expect(decompositionHistory).toHaveLength(2);
|
|
204
|
+
expect(coordinatorHistory).toHaveLength(1);
|
|
205
|
+
expect(coordinatorHistory[0].score).toBe(0.70);
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
test("returns runs in chronological order", () => {
|
|
209
|
+
const baseTime = Date.now();
|
|
210
|
+
const runs: EvalRunRecord[] = [
|
|
211
|
+
{
|
|
212
|
+
timestamp: new Date(baseTime).toISOString(),
|
|
213
|
+
eval_name: "test-eval",
|
|
214
|
+
score: 0.80,
|
|
215
|
+
run_count: 1,
|
|
216
|
+
},
|
|
217
|
+
{
|
|
218
|
+
timestamp: new Date(baseTime + 1000).toISOString(),
|
|
219
|
+
eval_name: "test-eval",
|
|
220
|
+
score: 0.85,
|
|
221
|
+
run_count: 2,
|
|
222
|
+
},
|
|
223
|
+
{
|
|
224
|
+
timestamp: new Date(baseTime + 2000).toISOString(),
|
|
225
|
+
eval_name: "test-eval",
|
|
226
|
+
score: 0.90,
|
|
227
|
+
run_count: 3,
|
|
228
|
+
},
|
|
229
|
+
];
|
|
230
|
+
|
|
231
|
+
for (const run of runs) {
|
|
232
|
+
recordEvalRun(testProjectPath, run);
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
const history = getScoreHistory(testProjectPath, "test-eval");
|
|
236
|
+
expect(history).toHaveLength(3);
|
|
237
|
+
expect(history[0].score).toBe(0.80);
|
|
238
|
+
expect(history[1].score).toBe(0.85);
|
|
239
|
+
expect(history[2].score).toBe(0.90);
|
|
240
|
+
});
|
|
241
|
+
});
|
|
242
|
+
|
|
243
|
+
describe("calculateVariance", () => {
|
|
244
|
+
test("returns 0 for single score", () => {
|
|
245
|
+
const variance = calculateVariance([0.85]);
|
|
246
|
+
expect(variance).toBe(0);
|
|
247
|
+
});
|
|
248
|
+
|
|
249
|
+
test("returns 0 for identical scores", () => {
|
|
250
|
+
const variance = calculateVariance([0.80, 0.80, 0.80, 0.80]);
|
|
251
|
+
expect(variance).toBe(0);
|
|
252
|
+
});
|
|
253
|
+
|
|
254
|
+
test("calculates variance for varying scores", () => {
|
|
255
|
+
// Scores: 0.70, 0.80, 0.90
|
|
256
|
+
// Mean: 0.80
|
|
257
|
+
// Deviations: -0.10, 0, 0.10
|
|
258
|
+
// Squared deviations: 0.01, 0, 0.01
|
|
259
|
+
// Variance: 0.02 / 3 = 0.00666...
|
|
260
|
+
const variance = calculateVariance([0.70, 0.80, 0.90]);
|
|
261
|
+
expect(variance).toBeCloseTo(0.00667, 5);
|
|
262
|
+
});
|
|
263
|
+
|
|
264
|
+
test("calculates variance for larger dataset", () => {
|
|
265
|
+
// 10 scores with controlled variance
|
|
266
|
+
const scores = [0.75, 0.76, 0.77, 0.78, 0.79, 0.80, 0.81, 0.82, 0.83, 0.84];
|
|
267
|
+
const variance = calculateVariance(scores);
|
|
268
|
+
expect(variance).toBeGreaterThan(0);
|
|
269
|
+
expect(variance).toBeLessThan(0.01); // Should be small but not zero
|
|
270
|
+
});
|
|
271
|
+
|
|
272
|
+
test("handles empty array", () => {
|
|
273
|
+
const variance = calculateVariance([]);
|
|
274
|
+
expect(variance).toBe(0);
|
|
275
|
+
});
|
|
276
|
+
|
|
277
|
+
test("handles high variance scores", () => {
|
|
278
|
+
const scores = [0.10, 0.50, 0.90];
|
|
279
|
+
const variance = calculateVariance(scores);
|
|
280
|
+
expect(variance).toBeGreaterThan(0.05);
|
|
281
|
+
});
|
|
282
|
+
});
|
|
283
|
+
|
|
284
|
+
describe("getPhase", () => {
|
|
285
|
+
test("returns bootstrap phase for <10 runs", () => {
|
|
286
|
+
const runs: EvalRunRecord[] = [];
|
|
287
|
+
for (let i = 1; i <= 9; i++) {
|
|
288
|
+
runs.push({
|
|
289
|
+
timestamp: new Date().toISOString(),
|
|
290
|
+
eval_name: "test-eval",
|
|
291
|
+
score: 0.80 + Math.random() * 0.1,
|
|
292
|
+
run_count: i,
|
|
293
|
+
});
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
for (const run of runs) {
|
|
297
|
+
recordEvalRun(testProjectPath, run);
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
const phase = getPhase(testProjectPath, "test-eval");
|
|
301
|
+
expect(phase).toBe("bootstrap");
|
|
302
|
+
});
|
|
303
|
+
|
|
304
|
+
test("returns stabilization phase for 10-50 runs", () => {
|
|
305
|
+
const runs: EvalRunRecord[] = [];
|
|
306
|
+
for (let i = 1; i <= 25; i++) {
|
|
307
|
+
runs.push({
|
|
308
|
+
timestamp: new Date().toISOString(),
|
|
309
|
+
eval_name: "test-eval",
|
|
310
|
+
score: 0.80 + Math.random() * 0.1,
|
|
311
|
+
run_count: i,
|
|
312
|
+
});
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
for (const run of runs) {
|
|
316
|
+
recordEvalRun(testProjectPath, run);
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
const phase = getPhase(testProjectPath, "test-eval");
|
|
320
|
+
expect(phase).toBe("stabilization");
|
|
321
|
+
});
|
|
322
|
+
|
|
323
|
+
test("returns production phase for >50 runs with low variance", () => {
|
|
324
|
+
const runs: EvalRunRecord[] = [];
|
|
325
|
+
// Create 60 runs with very low variance (0.80 ± 0.01)
|
|
326
|
+
for (let i = 1; i <= 60; i++) {
|
|
327
|
+
runs.push({
|
|
328
|
+
timestamp: new Date().toISOString(),
|
|
329
|
+
eval_name: "test-eval",
|
|
330
|
+
score: 0.80 + (Math.random() * 0.02 - 0.01), // Variance ~0.00003
|
|
331
|
+
run_count: i,
|
|
332
|
+
});
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
for (const run of runs) {
|
|
336
|
+
recordEvalRun(testProjectPath, run);
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
const phase = getPhase(testProjectPath, "test-eval");
|
|
340
|
+
expect(phase).toBe("production");
|
|
341
|
+
});
|
|
342
|
+
|
|
343
|
+
test("returns stabilization phase for >50 runs with high variance", () => {
|
|
344
|
+
const runs: EvalRunRecord[] = [];
|
|
345
|
+
// Create 60 runs with high variance
|
|
346
|
+
// Variance = 0.1225 for alternating 0.1 and 0.8
|
|
347
|
+
for (let i = 1; i <= 60; i++) {
|
|
348
|
+
runs.push({
|
|
349
|
+
timestamp: new Date().toISOString(),
|
|
350
|
+
eval_name: "test-eval",
|
|
351
|
+
score: i % 2 === 0 ? 0.1 : 0.8,
|
|
352
|
+
run_count: i,
|
|
353
|
+
});
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
for (const run of runs) {
|
|
357
|
+
recordEvalRun(testProjectPath, run);
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
const phase = getPhase(testProjectPath, "test-eval");
|
|
361
|
+
expect(phase).toBe("stabilization");
|
|
362
|
+
});
|
|
363
|
+
|
|
364
|
+
test("returns bootstrap phase when no history exists", () => {
|
|
365
|
+
const phase = getPhase(testProjectPath, "nonexistent-eval");
|
|
366
|
+
expect(phase).toBe("bootstrap");
|
|
367
|
+
});
|
|
368
|
+
|
|
369
|
+
test("phase transitions at exactly 10 runs", () => {
|
|
370
|
+
const runs: EvalRunRecord[] = [];
|
|
371
|
+
for (let i = 1; i <= 10; i++) {
|
|
372
|
+
runs.push({
|
|
373
|
+
timestamp: new Date().toISOString(),
|
|
374
|
+
eval_name: "test-eval",
|
|
375
|
+
score: 0.80,
|
|
376
|
+
run_count: i,
|
|
377
|
+
});
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
// Record 9 runs - should be bootstrap
|
|
381
|
+
for (let i = 0; i < 9; i++) {
|
|
382
|
+
recordEvalRun(testProjectPath, runs[i]);
|
|
383
|
+
}
|
|
384
|
+
expect(getPhase(testProjectPath, "test-eval")).toBe("bootstrap");
|
|
385
|
+
|
|
386
|
+
// Add 10th run - should be stabilization
|
|
387
|
+
recordEvalRun(testProjectPath, runs[9]);
|
|
388
|
+
expect(getPhase(testProjectPath, "test-eval")).toBe("stabilization");
|
|
389
|
+
});
|
|
390
|
+
|
|
391
|
+
test("phase transitions at exactly 50 runs with low variance", () => {
|
|
392
|
+
const runs: EvalRunRecord[] = [];
|
|
393
|
+
for (let i = 1; i <= 51; i++) {
|
|
394
|
+
runs.push({
|
|
395
|
+
timestamp: new Date().toISOString(),
|
|
396
|
+
eval_name: "test-eval",
|
|
397
|
+
score: 0.80 + (Math.random() * 0.02 - 0.01),
|
|
398
|
+
run_count: i,
|
|
399
|
+
});
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
// Record 50 runs - should be stabilization
|
|
403
|
+
for (let i = 0; i < 50; i++) {
|
|
404
|
+
recordEvalRun(testProjectPath, runs[i]);
|
|
405
|
+
}
|
|
406
|
+
expect(getPhase(testProjectPath, "test-eval")).toBe("stabilization");
|
|
407
|
+
|
|
408
|
+
// Add 51st run - should be production (if variance is low)
|
|
409
|
+
recordEvalRun(testProjectPath, runs[50]);
|
|
410
|
+
const phase = getPhase(testProjectPath, "test-eval");
|
|
411
|
+
expect(phase).toBe("production");
|
|
412
|
+
});
|
|
413
|
+
|
|
414
|
+
test("variance threshold is 0.1", () => {
|
|
415
|
+
const runs: EvalRunRecord[] = [];
|
|
416
|
+
|
|
417
|
+
// Create 60 runs whose variance is well below the 0.1 threshold
|
|
418
|
+
// Mean = 0.80, stdev = 0.15, variance = 0.0225
|
|
419
|
+
for (let i = 1; i <= 60; i++) {
|
|
420
|
+
runs.push({
|
|
421
|
+
timestamp: new Date().toISOString(),
|
|
422
|
+
eval_name: "test-eval",
|
|
423
|
+
score: 0.80 + (i % 2 === 0 ? 0.15 : -0.15), // Produces variance ~0.0225
|
|
424
|
+
run_count: i,
|
|
425
|
+
});
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
for (const run of runs) {
|
|
429
|
+
recordEvalRun(testProjectPath, run);
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
const phase = getPhase(testProjectPath, "test-eval");
|
|
433
|
+
expect(phase).toBe("production");
|
|
434
|
+
});
|
|
435
|
+
});
|
|
436
|
+
|
|
437
|
+
describe("phase progression integration", () => {
|
|
438
|
+
test("complete lifecycle: bootstrap -> stabilization -> production", () => {
|
|
439
|
+
const evalName = "lifecycle-test";
|
|
440
|
+
|
|
441
|
+
// Phase 1: Bootstrap (0-9 runs)
|
|
442
|
+
for (let i = 1; i <= 5; i++) {
|
|
443
|
+
recordEvalRun(testProjectPath, {
|
|
444
|
+
timestamp: new Date().toISOString(),
|
|
445
|
+
eval_name: evalName,
|
|
446
|
+
score: 0.75 + Math.random() * 0.2,
|
|
447
|
+
run_count: i,
|
|
448
|
+
});
|
|
449
|
+
}
|
|
450
|
+
expect(getPhase(testProjectPath, evalName)).toBe("bootstrap");
|
|
451
|
+
|
|
452
|
+
// Phase 2: Stabilization (10-50 runs)
|
|
453
|
+
for (let i = 6; i <= 30; i++) {
|
|
454
|
+
recordEvalRun(testProjectPath, {
|
|
455
|
+
timestamp: new Date().toISOString(),
|
|
456
|
+
eval_name: evalName,
|
|
457
|
+
score: 0.78 + Math.random() * 0.1,
|
|
458
|
+
run_count: i,
|
|
459
|
+
});
|
|
460
|
+
}
|
|
461
|
+
expect(getPhase(testProjectPath, evalName)).toBe("stabilization");
|
|
462
|
+
|
|
463
|
+
// Phase 3: Production (>50 runs, low variance)
|
|
464
|
+
for (let i = 31; i <= 60; i++) {
|
|
465
|
+
recordEvalRun(testProjectPath, {
|
|
466
|
+
timestamp: new Date().toISOString(),
|
|
467
|
+
eval_name: evalName,
|
|
468
|
+
score: 0.82 + (Math.random() * 0.02 - 0.01), // Very stable
|
|
469
|
+
run_count: i,
|
|
470
|
+
});
|
|
471
|
+
}
|
|
472
|
+
expect(getPhase(testProjectPath, evalName)).toBe("production");
|
|
473
|
+
|
|
474
|
+
// Verify history
|
|
475
|
+
const history = getScoreHistory(testProjectPath, evalName);
|
|
476
|
+
expect(history).toHaveLength(60);
|
|
477
|
+
});
|
|
478
|
+
|
|
479
|
+
test("regression in production keeps phase as stabilization if variance increases", () => {
|
|
480
|
+
const evalName = "regression-test";
|
|
481
|
+
|
|
482
|
+
// Build stable production phase
|
|
483
|
+
for (let i = 1; i <= 60; i++) {
|
|
484
|
+
recordEvalRun(testProjectPath, {
|
|
485
|
+
timestamp: new Date().toISOString(),
|
|
486
|
+
eval_name: evalName,
|
|
487
|
+
score: 0.85 + (Math.random() * 0.01 - 0.005),
|
|
488
|
+
run_count: i,
|
|
489
|
+
});
|
|
490
|
+
}
|
|
491
|
+
expect(getPhase(testProjectPath, evalName)).toBe("production");
|
|
492
|
+
|
|
493
|
+
// Introduce regression (high variance) - need 50 wild runs to push variance > 0.1
|
|
494
|
+
// 60 stable @ 0.85 + 50 wild @ 0.1/0.9 = variance ~0.103
|
|
495
|
+
for (let i = 61; i <= 110; i++) {
|
|
496
|
+
recordEvalRun(testProjectPath, {
|
|
497
|
+
timestamp: new Date().toISOString(),
|
|
498
|
+
eval_name: evalName,
|
|
499
|
+
score: i % 2 === 0 ? 0.1 : 0.9,
|
|
500
|
+
run_count: i,
|
|
501
|
+
});
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
// Should drop back to stabilization due to high variance
|
|
505
|
+
expect(getPhase(testProjectPath, evalName)).toBe("stabilization");
|
|
506
|
+
});
|
|
507
|
+
});
|
|
508
|
+
});
|