opencode-swarm-plugin 0.38.0 → 0.39.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +11 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +130 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +475 -0
  9. package/bin/swarm.ts +383 -0
  10. package/dist/compaction-hook.d.ts +1 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-prompt-scoring.d.ts +124 -0
  13. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  14. package/dist/eval-capture.d.ts +81 -1
  15. package/dist/eval-capture.d.ts.map +1 -1
  16. package/dist/eval-gates.d.ts +84 -0
  17. package/dist/eval-gates.d.ts.map +1 -0
  18. package/dist/eval-history.d.ts +117 -0
  19. package/dist/eval-history.d.ts.map +1 -0
  20. package/dist/eval-learning.d.ts +216 -0
  21. package/dist/eval-learning.d.ts.map +1 -0
  22. package/dist/index.d.ts +44 -0
  23. package/dist/index.d.ts.map +1 -1
  24. package/dist/index.js +370 -13
  25. package/dist/plugin.js +203 -13
  26. package/dist/post-compaction-tracker.d.ts +133 -0
  27. package/dist/post-compaction-tracker.d.ts.map +1 -0
  28. package/dist/swarm-orchestrate.d.ts +23 -0
  29. package/dist/swarm-orchestrate.d.ts.map +1 -1
  30. package/dist/swarm-prompts.d.ts +25 -1
  31. package/dist/swarm-prompts.d.ts.map +1 -1
  32. package/dist/swarm.d.ts +4 -0
  33. package/dist/swarm.d.ts.map +1 -1
  34. package/evals/README.md +589 -105
  35. package/evals/compaction-prompt.eval.ts +149 -0
  36. package/evals/coordinator-behavior.eval.ts +8 -8
  37. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  38. package/evals/lib/compaction-loader.test.ts +248 -0
  39. package/evals/lib/compaction-loader.ts +320 -0
  40. package/evals/lib/data-loader.test.ts +345 -0
  41. package/evals/lib/data-loader.ts +107 -6
  42. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  43. package/evals/scorers/compaction-scorers.ts +13 -13
  44. package/evals/scorers/coordinator-discipline.evalite-test.ts +3 -2
  45. package/evals/scorers/coordinator-discipline.ts +13 -13
  46. package/examples/plugin-wrapper-template.ts +117 -0
  47. package/package.json +7 -5
  48. package/scripts/migrate-unknown-sessions.ts +349 -0
  49. package/src/compaction-capture.integration.test.ts +257 -0
  50. package/src/compaction-hook.test.ts +42 -0
  51. package/src/compaction-hook.ts +81 -0
  52. package/src/compaction-prompt-scorers.test.ts +299 -0
  53. package/src/compaction-prompt-scoring.ts +298 -0
  54. package/src/eval-capture.test.ts +422 -0
  55. package/src/eval-capture.ts +94 -2
  56. package/src/eval-gates.test.ts +306 -0
  57. package/src/eval-gates.ts +218 -0
  58. package/src/eval-history.test.ts +508 -0
  59. package/src/eval-history.ts +214 -0
  60. package/src/eval-learning.test.ts +378 -0
  61. package/src/eval-learning.ts +360 -0
  62. package/src/index.ts +61 -1
  63. package/src/post-compaction-tracker.test.ts +251 -0
  64. package/src/post-compaction-tracker.ts +237 -0
  65. package/src/swarm-decompose.ts +2 -2
  66. package/src/swarm-orchestrate.ts +2 -2
  67. package/src/swarm-prompts.ts +2 -2
  68. package/src/swarm-review.ts +3 -3
  69. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
@@ -0,0 +1,508 @@
1
+ /**
2
+ * Tests for eval-history - tracks eval run scores and calculates progressive phases
3
+ *
4
+ * TDD: RED phase - all tests should fail initially
5
+ */
6
+ import { afterEach, beforeEach, describe, expect, test } from "bun:test";
7
+ import * as fs from "node:fs";
8
+ import * as path from "node:path";
9
+ import {
10
+ type EvalRunRecord,
11
+ type Phase,
12
+ calculateVariance,
13
+ getPhase,
14
+ getScoreHistory,
15
+ recordEvalRun,
16
+ } from "./eval-history.js";
17
+
18
+ describe("eval-history", () => {
19
+ const testDir = path.join(import.meta.dir, ".test-eval-history");
20
+ const testProjectPath = path.join(testDir, "test-project");
21
+
22
+ beforeEach(() => {
23
+ // Clean slate for each test
24
+ if (fs.existsSync(testDir)) {
25
+ fs.rmSync(testDir, { recursive: true });
26
+ }
27
+ fs.mkdirSync(testProjectPath, { recursive: true });
28
+ });
29
+
30
+ afterEach(() => {
31
+ // Cleanup
32
+ if (fs.existsSync(testDir)) {
33
+ fs.rmSync(testDir, { recursive: true });
34
+ }
35
+ });
36
+
37
+ describe("recordEvalRun", () => {
38
+ test("appends eval run to JSONL file", () => {
39
+ const run: EvalRunRecord = {
40
+ timestamp: new Date().toISOString(),
41
+ eval_name: "swarm-decomposition",
42
+ score: 0.85,
43
+ run_count: 1,
44
+ };
45
+
46
+ recordEvalRun(testProjectPath, run);
47
+
48
+ const historyPath = path.join(testProjectPath, ".opencode/eval-history.jsonl");
49
+ expect(fs.existsSync(historyPath)).toBe(true);
50
+
51
+ const content = fs.readFileSync(historyPath, "utf-8");
52
+ const lines = content.trim().split("\n");
53
+ expect(lines).toHaveLength(1);
54
+
55
+ const parsed = JSON.parse(lines[0]);
56
+ expect(parsed.eval_name).toBe("swarm-decomposition");
57
+ expect(parsed.score).toBe(0.85);
58
+ expect(parsed.run_count).toBe(1);
59
+ });
60
+
61
+ test("appends multiple runs sequentially", () => {
62
+ const run1: EvalRunRecord = {
63
+ timestamp: new Date().toISOString(),
64
+ eval_name: "swarm-decomposition",
65
+ score: 0.80,
66
+ run_count: 1,
67
+ };
68
+ const run2: EvalRunRecord = {
69
+ timestamp: new Date().toISOString(),
70
+ eval_name: "swarm-decomposition",
71
+ score: 0.85,
72
+ run_count: 2,
73
+ };
74
+
75
+ recordEvalRun(testProjectPath, run1);
76
+ recordEvalRun(testProjectPath, run2);
77
+
78
+ const historyPath = path.join(testProjectPath, ".opencode/eval-history.jsonl");
79
+ const content = fs.readFileSync(historyPath, "utf-8");
80
+ const lines = content.trim().split("\n");
81
+ expect(lines).toHaveLength(2);
82
+
83
+ const parsed1 = JSON.parse(lines[0]);
84
+ const parsed2 = JSON.parse(lines[1]);
85
+ expect(parsed1.score).toBe(0.80);
86
+ expect(parsed2.score).toBe(0.85);
87
+ });
88
+
89
+ test("creates directory if it doesn't exist", () => {
90
+ const run: EvalRunRecord = {
91
+ timestamp: new Date().toISOString(),
92
+ eval_name: "test-eval",
93
+ score: 0.90,
94
+ run_count: 1,
95
+ };
96
+
97
+ // Directory doesn't exist yet
98
+ const opencodePath = path.join(testProjectPath, ".opencode");
99
+ expect(fs.existsSync(opencodePath)).toBe(false);
100
+
101
+ recordEvalRun(testProjectPath, run);
102
+
103
+ // Directory should be created
104
+ expect(fs.existsSync(opencodePath)).toBe(true);
105
+ });
106
+
107
+ test("supports different eval names in same history", () => {
108
+ const run1: EvalRunRecord = {
109
+ timestamp: new Date().toISOString(),
110
+ eval_name: "swarm-decomposition",
111
+ score: 0.85,
112
+ run_count: 1,
113
+ };
114
+ const run2: EvalRunRecord = {
115
+ timestamp: new Date().toISOString(),
116
+ eval_name: "coordinator-session",
117
+ score: 0.75,
118
+ run_count: 1,
119
+ };
120
+
121
+ recordEvalRun(testProjectPath, run1);
122
+ recordEvalRun(testProjectPath, run2);
123
+
124
+ const historyPath = path.join(testProjectPath, ".opencode/eval-history.jsonl");
125
+ const content = fs.readFileSync(historyPath, "utf-8");
126
+ const lines = content.trim().split("\n");
127
+ expect(lines).toHaveLength(2);
128
+
129
+ const parsed1 = JSON.parse(lines[0]);
130
+ const parsed2 = JSON.parse(lines[1]);
131
+ expect(parsed1.eval_name).toBe("swarm-decomposition");
132
+ expect(parsed2.eval_name).toBe("coordinator-session");
133
+ });
134
+ });
135
+
136
+ describe("getScoreHistory", () => {
137
+ test("returns empty array when no history exists", () => {
138
+ const history = getScoreHistory(testProjectPath, "swarm-decomposition");
139
+ expect(history).toEqual([]);
140
+ });
141
+
142
+ test("returns all runs for a specific eval", () => {
143
+ const runs: EvalRunRecord[] = [
144
+ {
145
+ timestamp: new Date().toISOString(),
146
+ eval_name: "swarm-decomposition",
147
+ score: 0.80,
148
+ run_count: 1,
149
+ },
150
+ {
151
+ timestamp: new Date().toISOString(),
152
+ eval_name: "swarm-decomposition",
153
+ score: 0.85,
154
+ run_count: 2,
155
+ },
156
+ {
157
+ timestamp: new Date().toISOString(),
158
+ eval_name: "coordinator-session",
159
+ score: 0.70,
160
+ run_count: 1,
161
+ },
162
+ ];
163
+
164
+ for (const run of runs) {
165
+ recordEvalRun(testProjectPath, run);
166
+ }
167
+
168
+ const history = getScoreHistory(testProjectPath, "swarm-decomposition");
169
+ expect(history).toHaveLength(2);
170
+ expect(history[0].score).toBe(0.80);
171
+ expect(history[1].score).toBe(0.85);
172
+ });
173
+
174
+ test("filters by eval_name correctly", () => {
175
+ const runs: EvalRunRecord[] = [
176
+ {
177
+ timestamp: new Date().toISOString(),
178
+ eval_name: "swarm-decomposition",
179
+ score: 0.80,
180
+ run_count: 1,
181
+ },
182
+ {
183
+ timestamp: new Date().toISOString(),
184
+ eval_name: "coordinator-session",
185
+ score: 0.70,
186
+ run_count: 1,
187
+ },
188
+ {
189
+ timestamp: new Date().toISOString(),
190
+ eval_name: "swarm-decomposition",
191
+ score: 0.85,
192
+ run_count: 2,
193
+ },
194
+ ];
195
+
196
+ for (const run of runs) {
197
+ recordEvalRun(testProjectPath, run);
198
+ }
199
+
200
+ const decompositionHistory = getScoreHistory(testProjectPath, "swarm-decomposition");
201
+ const coordinatorHistory = getScoreHistory(testProjectPath, "coordinator-session");
202
+
203
+ expect(decompositionHistory).toHaveLength(2);
204
+ expect(coordinatorHistory).toHaveLength(1);
205
+ expect(coordinatorHistory[0].score).toBe(0.70);
206
+ });
207
+
208
+ test("returns runs in chronological order", () => {
209
+ const baseTime = Date.now();
210
+ const runs: EvalRunRecord[] = [
211
+ {
212
+ timestamp: new Date(baseTime).toISOString(),
213
+ eval_name: "test-eval",
214
+ score: 0.80,
215
+ run_count: 1,
216
+ },
217
+ {
218
+ timestamp: new Date(baseTime + 1000).toISOString(),
219
+ eval_name: "test-eval",
220
+ score: 0.85,
221
+ run_count: 2,
222
+ },
223
+ {
224
+ timestamp: new Date(baseTime + 2000).toISOString(),
225
+ eval_name: "test-eval",
226
+ score: 0.90,
227
+ run_count: 3,
228
+ },
229
+ ];
230
+
231
+ for (const run of runs) {
232
+ recordEvalRun(testProjectPath, run);
233
+ }
234
+
235
+ const history = getScoreHistory(testProjectPath, "test-eval");
236
+ expect(history).toHaveLength(3);
237
+ expect(history[0].score).toBe(0.80);
238
+ expect(history[1].score).toBe(0.85);
239
+ expect(history[2].score).toBe(0.90);
240
+ });
241
+ });
242
+
243
+ describe("calculateVariance", () => {
244
+ test("returns 0 for single score", () => {
245
+ const variance = calculateVariance([0.85]);
246
+ expect(variance).toBe(0);
247
+ });
248
+
249
+ test("returns 0 for identical scores", () => {
250
+ const variance = calculateVariance([0.80, 0.80, 0.80, 0.80]);
251
+ expect(variance).toBe(0);
252
+ });
253
+
254
+ test("calculates variance for varying scores", () => {
255
+ // Scores: 0.70, 0.80, 0.90
256
+ // Mean: 0.80
257
+ // Deviations: -0.10, 0, 0.10
258
+ // Squared deviations: 0.01, 0, 0.01
259
+ // Population variance (divide by N, not N - 1): 0.02 / 3 = 0.00666...
260
+ const variance = calculateVariance([0.70, 0.80, 0.90]);
261
+ expect(variance).toBeCloseTo(0.00667, 5);
262
+ });
263
+
264
+ test("calculates variance for larger dataset", () => {
265
+ // 10 scores with controlled variance
266
+ const scores = [0.75, 0.76, 0.77, 0.78, 0.79, 0.80, 0.81, 0.82, 0.83, 0.84];
267
+ const variance = calculateVariance(scores);
268
+ expect(variance).toBeGreaterThan(0);
269
+ expect(variance).toBeLessThan(0.01); // Should be small but not zero
270
+ });
271
+
272
+ test("handles empty array", () => {
273
+ const variance = calculateVariance([]);
274
+ expect(variance).toBe(0);
275
+ });
276
+
277
+ test("handles high variance scores", () => {
278
+ const scores = [0.10, 0.50, 0.90];
279
+ const variance = calculateVariance(scores);
280
+ expect(variance).toBeGreaterThan(0.05);
281
+ });
282
+ });
283
+
284
+ describe("getPhase", () => {
285
+ test("returns bootstrap phase for <10 runs", () => {
286
+ const runs: EvalRunRecord[] = [];
287
+ for (let i = 1; i <= 9; i++) {
288
+ runs.push({
289
+ timestamp: new Date().toISOString(),
290
+ eval_name: "test-eval",
291
+ score: 0.80 + Math.random() * 0.1,
292
+ run_count: i,
293
+ });
294
+ }
295
+
296
+ for (const run of runs) {
297
+ recordEvalRun(testProjectPath, run);
298
+ }
299
+
300
+ const phase = getPhase(testProjectPath, "test-eval");
301
+ expect(phase).toBe("bootstrap");
302
+ });
303
+
304
+ test("returns stabilization phase for 10-50 runs", () => {
305
+ const runs: EvalRunRecord[] = [];
306
+ for (let i = 1; i <= 25; i++) {
307
+ runs.push({
308
+ timestamp: new Date().toISOString(),
309
+ eval_name: "test-eval",
310
+ score: 0.80 + Math.random() * 0.1,
311
+ run_count: i,
312
+ });
313
+ }
314
+
315
+ for (const run of runs) {
316
+ recordEvalRun(testProjectPath, run);
317
+ }
318
+
319
+ const phase = getPhase(testProjectPath, "test-eval");
320
+ expect(phase).toBe("stabilization");
321
+ });
322
+
323
+ test("returns production phase for >50 runs with low variance", () => {
324
+ const runs: EvalRunRecord[] = [];
325
+ // Create 60 runs with very low variance (0.80 ± 0.01)
326
+ for (let i = 1; i <= 60; i++) {
327
+ runs.push({
328
+ timestamp: new Date().toISOString(),
329
+ eval_name: "test-eval",
330
+ score: 0.80 + (Math.random() * 0.02 - 0.01), // Variance ~0.00003
331
+ run_count: i,
332
+ });
333
+ }
334
+
335
+ for (const run of runs) {
336
+ recordEvalRun(testProjectPath, run);
337
+ }
338
+
339
+ const phase = getPhase(testProjectPath, "test-eval");
340
+ expect(phase).toBe("production");
341
+ });
342
+
343
+ test("returns stabilization phase for >50 runs with high variance", () => {
344
+ const runs: EvalRunRecord[] = [];
345
+ // Create 60 runs with high variance
346
+ // Variance = 0.1225 for alternating 0.1 and 0.8
347
+ for (let i = 1; i <= 60; i++) {
348
+ runs.push({
349
+ timestamp: new Date().toISOString(),
350
+ eval_name: "test-eval",
351
+ score: i % 2 === 0 ? 0.1 : 0.8,
352
+ run_count: i,
353
+ });
354
+ }
355
+
356
+ for (const run of runs) {
357
+ recordEvalRun(testProjectPath, run);
358
+ }
359
+
360
+ const phase = getPhase(testProjectPath, "test-eval");
361
+ expect(phase).toBe("stabilization");
362
+ });
363
+
364
+ test("returns bootstrap phase when no history exists", () => {
365
+ const phase = getPhase(testProjectPath, "nonexistent-eval");
366
+ expect(phase).toBe("bootstrap");
367
+ });
368
+
369
+ test("phase transitions at exactly 10 runs", () => {
370
+ const runs: EvalRunRecord[] = [];
371
+ for (let i = 1; i <= 10; i++) {
372
+ runs.push({
373
+ timestamp: new Date().toISOString(),
374
+ eval_name: "test-eval",
375
+ score: 0.80,
376
+ run_count: i,
377
+ });
378
+ }
379
+
380
+ // Record 9 runs - should be bootstrap
381
+ for (let i = 0; i < 9; i++) {
382
+ recordEvalRun(testProjectPath, runs[i]);
383
+ }
384
+ expect(getPhase(testProjectPath, "test-eval")).toBe("bootstrap");
385
+
386
+ // Add 10th run - should be stabilization
387
+ recordEvalRun(testProjectPath, runs[9]);
388
+ expect(getPhase(testProjectPath, "test-eval")).toBe("stabilization");
389
+ });
390
+
391
+ test("phase transitions at exactly 50 runs with low variance", () => {
392
+ const runs: EvalRunRecord[] = [];
393
+ for (let i = 1; i <= 51; i++) {
394
+ runs.push({
395
+ timestamp: new Date().toISOString(),
396
+ eval_name: "test-eval",
397
+ score: 0.80 + (Math.random() * 0.02 - 0.01),
398
+ run_count: i,
399
+ });
400
+ }
401
+
402
+ // Record 50 runs - should be stabilization
403
+ for (let i = 0; i < 50; i++) {
404
+ recordEvalRun(testProjectPath, runs[i]);
405
+ }
406
+ expect(getPhase(testProjectPath, "test-eval")).toBe("stabilization");
407
+
408
+ // Add 51st run - should be production (if variance is low)
409
+ recordEvalRun(testProjectPath, runs[50]);
410
+ const phase = getPhase(testProjectPath, "test-eval");
411
+ expect(phase).toBe("production");
412
+ });
413
+
414
+ test("variance threshold is 0.1", () => {
415
+ const runs: EvalRunRecord[] = [];
416
+
417
+ // Create 60 runs with variance well below the 0.1 threshold
418
+ // Scores alternate 0.65 / 0.95: mean = 0.80, stdev = 0.15, variance = 0.0225
419
+ for (let i = 1; i <= 60; i++) {
420
+ runs.push({
421
+ timestamp: new Date().toISOString(),
422
+ eval_name: "test-eval",
423
+ score: 0.80 + (i % 2 === 0 ? 0.15 : -0.15), // Produces variance ~0.0225
424
+ run_count: i,
425
+ });
426
+ }
427
+
428
+ for (const run of runs) {
429
+ recordEvalRun(testProjectPath, run);
430
+ }
431
+
432
+ const phase = getPhase(testProjectPath, "test-eval");
433
+ expect(phase).toBe("production");
434
+ });
435
+ });
436
+
437
+ describe("phase progression integration", () => {
438
+ test("complete lifecycle: bootstrap -> stabilization -> production", () => {
439
+ const evalName = "lifecycle-test";
440
+
441
+ // Phase 1: Bootstrap (0-9 runs)
442
+ for (let i = 1; i <= 5; i++) {
443
+ recordEvalRun(testProjectPath, {
444
+ timestamp: new Date().toISOString(),
445
+ eval_name: evalName,
446
+ score: 0.75 + Math.random() * 0.2,
447
+ run_count: i,
448
+ });
449
+ }
450
+ expect(getPhase(testProjectPath, evalName)).toBe("bootstrap");
451
+
452
+ // Phase 2: Stabilization (10-50 runs)
453
+ for (let i = 6; i <= 30; i++) {
454
+ recordEvalRun(testProjectPath, {
455
+ timestamp: new Date().toISOString(),
456
+ eval_name: evalName,
457
+ score: 0.78 + Math.random() * 0.1,
458
+ run_count: i,
459
+ });
460
+ }
461
+ expect(getPhase(testProjectPath, evalName)).toBe("stabilization");
462
+
463
+ // Phase 3: Production (>50 runs, low variance)
464
+ for (let i = 31; i <= 60; i++) {
465
+ recordEvalRun(testProjectPath, {
466
+ timestamp: new Date().toISOString(),
467
+ eval_name: evalName,
468
+ score: 0.82 + (Math.random() * 0.02 - 0.01), // Very stable
469
+ run_count: i,
470
+ });
471
+ }
472
+ expect(getPhase(testProjectPath, evalName)).toBe("production");
473
+
474
+ // Verify history
475
+ const history = getScoreHistory(testProjectPath, evalName);
476
+ expect(history).toHaveLength(60);
477
+ });
478
+
479
+ test("regression in production keeps phase as stabilization if variance increases", () => {
480
+ const evalName = "regression-test";
481
+
482
+ // Build stable production phase
483
+ for (let i = 1; i <= 60; i++) {
484
+ recordEvalRun(testProjectPath, {
485
+ timestamp: new Date().toISOString(),
486
+ eval_name: evalName,
487
+ score: 0.85 + (Math.random() * 0.01 - 0.005),
488
+ run_count: i,
489
+ });
490
+ }
491
+ expect(getPhase(testProjectPath, evalName)).toBe("production");
492
+
493
+ // Introduce regression (high variance) - need 50 wild runs to push variance > 0.1
494
+ // 60 stable @ 0.85 + 50 wild @ 0.1/0.9 = variance ~0.103
495
+ for (let i = 61; i <= 110; i++) {
496
+ recordEvalRun(testProjectPath, {
497
+ timestamp: new Date().toISOString(),
498
+ eval_name: evalName,
499
+ score: i % 2 === 0 ? 0.1 : 0.9,
500
+ run_count: i,
501
+ });
502
+ }
503
+
504
+ // Should drop back to stabilization due to high variance
505
+ expect(getPhase(testProjectPath, evalName)).toBe("stabilization");
506
+ });
507
+ });
508
+ });