@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87):
  1. package/package.json +2 -2
  2. package/research-output/training-runs/training-run-1773726941205.json +38 -0
  3. package/scripts/rank_trajectories.ts +0 -1
  4. package/scripts/run_task_benchmark.ts +4 -11
  5. package/src/adapter.ts +96 -49
  6. package/src/archetypes/ArchetypeConfigService.ts +188 -185
  7. package/src/archetypes/derive-archetype.ts +47 -47
  8. package/src/archetypes/index.ts +2 -2
  9. package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
  10. package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
  11. package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
  12. package/src/benchmark/BenchmarkDataViewer.ts +32 -30
  13. package/src/benchmark/BenchmarkHistoryService.ts +13 -12
  14. package/src/benchmark/BenchmarkRunner.ts +87 -83
  15. package/src/benchmark/BenchmarkValidator.ts +48 -46
  16. package/src/benchmark/FastEvalRunner.ts +17 -16
  17. package/src/benchmark/MetricsValidator.ts +20 -21
  18. package/src/benchmark/MetricsVisualizer.ts +92 -85
  19. package/src/benchmark/ModelBenchmarkService.ts +90 -82
  20. package/src/benchmark/ModelRegistry.ts +44 -44
  21. package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
  22. package/src/benchmark/SimulationA2AInterface.ts +118 -118
  23. package/src/benchmark/SimulationEngine.ts +51 -51
  24. package/src/benchmark/TaskRunner.ts +87 -79
  25. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
  26. package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
  27. package/src/benchmark/index.ts +27 -27
  28. package/src/benchmark/parseSimulationMetrics.ts +32 -32
  29. package/src/benchmark/simulation-types.ts +10 -10
  30. package/src/dependencies.ts +34 -34
  31. package/src/generation/TrajectoryGenerator.ts +39 -37
  32. package/src/generation/index.ts +1 -1
  33. package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
  34. package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
  35. package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
  36. package/src/huggingface/index.ts +6 -6
  37. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
  38. package/src/index.ts +27 -27
  39. package/src/init-training.ts +6 -6
  40. package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
  41. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
  42. package/src/metrics/index.ts +2 -2
  43. package/src/rubrics/__tests__/index.test.ts +73 -73
  44. package/src/rubrics/ass-kisser.ts +6 -6
  45. package/src/rubrics/degen.ts +6 -6
  46. package/src/rubrics/goody-twoshoes.ts +6 -6
  47. package/src/rubrics/index.ts +50 -50
  48. package/src/rubrics/information-trader.ts +6 -6
  49. package/src/rubrics/infosec.ts +6 -6
  50. package/src/rubrics/liar.ts +6 -6
  51. package/src/rubrics/perps-trader.ts +6 -6
  52. package/src/rubrics/researcher.ts +6 -6
  53. package/src/rubrics/scammer.ts +6 -6
  54. package/src/rubrics/social-butterfly.ts +7 -7
  55. package/src/rubrics/super-predictor.ts +6 -6
  56. package/src/rubrics/trader.ts +5 -5
  57. package/src/scoring/ArchetypeScoringService.ts +56 -54
  58. package/src/scoring/JudgePromptBuilder.ts +96 -96
  59. package/src/scoring/LLMJudgeCache.ts +26 -23
  60. package/src/scoring/index.ts +3 -3
  61. package/src/training/AutomationPipeline.ts +149 -140
  62. package/src/training/BenchmarkService.ts +49 -45
  63. package/src/training/ConfigValidator.ts +38 -32
  64. package/src/training/MarketOutcomesTracker.ts +22 -12
  65. package/src/training/ModelDeployer.ts +15 -15
  66. package/src/training/ModelFetcher.ts +7 -7
  67. package/src/training/ModelSelectionService.ts +32 -32
  68. package/src/training/ModelUsageVerifier.ts +31 -24
  69. package/src/training/MultiModelOrchestrator.ts +44 -44
  70. package/src/training/RLModelConfig.ts +57 -57
  71. package/src/training/RewardBackpropagationService.ts +18 -17
  72. package/src/training/RulerScoringService.ts +73 -72
  73. package/src/training/TrainingMonitor.ts +29 -29
  74. package/src/training/TrajectoryRecorder.ts +25 -27
  75. package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
  76. package/src/training/index.ts +36 -36
  77. package/src/training/logRLConfig.ts +7 -7
  78. package/src/training/pipeline.ts +13 -16
  79. package/src/training/storage/ModelStorageService.ts +32 -32
  80. package/src/training/storage/TrainingDataArchiver.ts +21 -21
  81. package/src/training/storage/index.ts +2 -2
  82. package/src/training/types.ts +6 -6
  83. package/src/training/window-utils.ts +14 -14
  84. package/src/utils/index.ts +7 -7
  85. package/src/utils/logger.ts +5 -5
  86. package/src/utils/snowflake.ts +1 -1
  87. package/src/utils/synthetic-detector.ts +7 -7
@@ -5,19 +5,19 @@
5
5
  * Tests actual classes and functions, not inline mock implementations.
6
6
  */
7
7
 
8
- import { describe, expect, test } from 'bun:test';
8
+ import { describe, expect, test } from "bun:test";
9
9
  import {
10
10
  type BenchmarkConfig,
11
11
  BenchmarkDataGenerator,
12
12
  SeededRandom,
13
- } from '../BenchmarkDataGenerator';
13
+ } from "../BenchmarkDataGenerator";
14
14
 
15
15
  // =============================================================================
16
16
  // SeededRandom Tests - Real Class
17
17
  // =============================================================================
18
18
 
19
- describe('SeededRandom - Deterministic RNG', () => {
20
- test('same seed produces same sequence', () => {
19
+ describe("SeededRandom - Deterministic RNG", () => {
20
+ test("same seed produces same sequence", () => {
21
21
  const rng1 = new SeededRandom(12345);
22
22
  const rng2 = new SeededRandom(12345);
23
23
 
@@ -39,7 +39,7 @@ describe('SeededRandom - Deterministic RNG', () => {
39
39
  expect(seq1).toEqual(seq2);
40
40
  });
41
41
 
42
- test('different seeds produce different sequences', () => {
42
+ test("different seeds produce different sequences", () => {
43
43
  const rng1 = new SeededRandom(12345);
44
44
  const rng2 = new SeededRandom(54321);
45
45
 
@@ -49,7 +49,7 @@ describe('SeededRandom - Deterministic RNG', () => {
49
49
  expect(val1).not.toBe(val2);
50
50
  });
51
51
 
52
- test('next() produces values in [0, 1) range', () => {
52
+ test("next() produces values in [0, 1) range", () => {
53
53
  const rng = new SeededRandom(42);
54
54
 
55
55
  for (let i = 0; i < 1000; i++) {
@@ -59,7 +59,7 @@ describe('SeededRandom - Deterministic RNG', () => {
59
59
  }
60
60
  });
61
61
 
62
- test('nextInt() produces values in specified range', () => {
62
+ test("nextInt() produces values in specified range", () => {
63
63
  const rng = new SeededRandom(42);
64
64
 
65
65
  for (let i = 0; i < 100; i++) {
@@ -70,7 +70,7 @@ describe('SeededRandom - Deterministic RNG', () => {
70
70
  }
71
71
  });
72
72
 
73
- test('nextInt() handles single value range', () => {
73
+ test("nextInt() handles single value range", () => {
74
74
  const rng = new SeededRandom(42);
75
75
 
76
76
  for (let i = 0; i < 10; i++) {
@@ -79,9 +79,9 @@ describe('SeededRandom - Deterministic RNG', () => {
79
79
  }
80
80
  });
81
81
 
82
- test('pick() selects from array', () => {
82
+ test("pick() selects from array", () => {
83
83
  const rng = new SeededRandom(42);
84
- const options = ['a', 'b', 'c', 'd', 'e'];
84
+ const options = ["a", "b", "c", "d", "e"];
85
85
 
86
86
  const selections = new Set<string>();
87
87
  for (let i = 0; i < 100; i++) {
@@ -94,11 +94,11 @@ describe('SeededRandom - Deterministic RNG', () => {
94
94
  expect(selections.size).toBeGreaterThan(3);
95
95
  });
96
96
 
97
- test('pick() is deterministic with same seed', () => {
97
+ test("pick() is deterministic with same seed", () => {
98
98
  const rng1 = new SeededRandom(42);
99
99
  const rng2 = new SeededRandom(42);
100
100
 
101
- const options = ['a', 'b', 'c', 'd', 'e'];
101
+ const options = ["a", "b", "c", "d", "e"];
102
102
 
103
103
  const picks1 = [rng1.pick(options), rng1.pick(options), rng1.pick(options)];
104
104
  const picks2 = [rng2.pick(options), rng2.pick(options), rng2.pick(options)];
@@ -106,7 +106,7 @@ describe('SeededRandom - Deterministic RNG', () => {
106
106
  expect(picks1).toEqual(picks2);
107
107
  });
108
108
 
109
- test('nextFloat() produces values in specified range', () => {
109
+ test("nextFloat() produces values in specified range", () => {
110
110
  const rng = new SeededRandom(42);
111
111
 
112
112
  for (let i = 0; i < 100; i++) {
@@ -121,7 +121,7 @@ describe('SeededRandom - Deterministic RNG', () => {
121
121
  // BenchmarkDataGenerator Tests - Real Class
122
122
  // =============================================================================
123
123
 
124
- describe('BenchmarkDataGenerator - Data Generation', () => {
124
+ describe("BenchmarkDataGenerator - Data Generation", () => {
125
125
  const baseConfig: BenchmarkConfig = {
126
126
  durationMinutes: 60, // 1 hour
127
127
  tickInterval: 3600, // 1 hour ticks
@@ -131,7 +131,7 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
131
131
  seed: 12345,
132
132
  };
133
133
 
134
- test('generates deterministic data with same seed', async () => {
134
+ test("generates deterministic data with same seed", async () => {
135
135
  const generator1 = new BenchmarkDataGenerator(baseConfig);
136
136
  const generator2 = new BenchmarkDataGenerator(baseConfig);
137
137
 
@@ -140,25 +140,25 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
140
140
 
141
141
  // Same structure
142
142
  expect(snapshot1.initialState.predictionMarkets.length).toBe(
143
- snapshot2.initialState.predictionMarkets.length
143
+ snapshot2.initialState.predictionMarkets.length,
144
144
  );
145
145
  expect(snapshot1.initialState.perpetualMarkets.length).toBe(
146
- snapshot2.initialState.perpetualMarkets.length
146
+ snapshot2.initialState.perpetualMarkets.length,
147
147
  );
148
148
  expect(snapshot1.initialState.agents.length).toBe(
149
- snapshot2.initialState.agents.length
149
+ snapshot2.initialState.agents.length,
150
150
  );
151
151
 
152
152
  // Same content (deterministic)
153
153
  expect(snapshot1.initialState.perpetualMarkets[0]?.ticker).toBe(
154
- snapshot2.initialState.perpetualMarkets[0]?.ticker
154
+ snapshot2.initialState.perpetualMarkets[0]?.ticker,
155
155
  );
156
156
  expect(snapshot1.initialState.perpetualMarkets[0]?.price).toBe(
157
- snapshot2.initialState.perpetualMarkets[0]?.price
157
+ snapshot2.initialState.perpetualMarkets[0]?.price,
158
158
  );
159
159
  });
160
160
 
161
- test('generates correct number of markets', async () => {
161
+ test("generates correct number of markets", async () => {
162
162
  const generator = new BenchmarkDataGenerator(baseConfig);
163
163
  const snapshot = await generator.generate();
164
164
 
@@ -167,7 +167,7 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
167
167
  expect(snapshot.initialState.agents.length).toBe(5);
168
168
  });
169
169
 
170
- test('generates valid prediction market structure', async () => {
170
+ test("generates valid prediction market structure", async () => {
171
171
  const generator = new BenchmarkDataGenerator(baseConfig);
172
172
  const snapshot = await generator.generate();
173
173
 
@@ -184,32 +184,32 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
184
184
  }
185
185
  });
186
186
 
187
- test('generates valid perpetual market structure', async () => {
187
+ test("generates valid perpetual market structure", async () => {
188
188
  const generator = new BenchmarkDataGenerator(baseConfig);
189
189
  const snapshot = await generator.generate();
190
190
 
191
191
  for (const market of snapshot.initialState.perpetualMarkets) {
192
192
  expect(market.ticker).toBeDefined();
193
193
  expect(market.price).toBeGreaterThan(0);
194
- expect(typeof market.priceChange24h).toBe('number');
194
+ expect(typeof market.priceChange24h).toBe("number");
195
195
  expect(market.volume24h).toBeGreaterThanOrEqual(0);
196
- expect(typeof market.fundingRate).toBe('number');
196
+ expect(typeof market.fundingRate).toBe("number");
197
197
  }
198
198
  });
199
199
 
200
- test('generates valid agent structure', async () => {
200
+ test("generates valid agent structure", async () => {
201
201
  const generator = new BenchmarkDataGenerator(baseConfig);
202
202
  const snapshot = await generator.generate();
203
203
 
204
204
  for (const agent of snapshot.initialState.agents) {
205
205
  expect(agent.id).toBeDefined();
206
206
  expect(agent.name).toBeDefined();
207
- expect(typeof agent.reputation).toBe('number');
208
- expect(typeof agent.totalPnl).toBe('number');
207
+ expect(typeof agent.reputation).toBe("number");
208
+ expect(typeof agent.totalPnl).toBe("number");
209
209
  }
210
210
  });
211
211
 
212
- test('generates ticks for duration', async () => {
212
+ test("generates ticks for duration", async () => {
213
213
  const generator = new BenchmarkDataGenerator({
214
214
  ...baseConfig,
215
215
  durationMinutes: 180, // 3 hours
@@ -221,7 +221,7 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
221
221
  expect(snapshot.ticks.length).toBe(3);
222
222
  });
223
223
 
224
- test('different seeds produce different data', async () => {
224
+ test("different seeds produce different data", async () => {
225
225
  const generator1 = new BenchmarkDataGenerator({ ...baseConfig, seed: 111 });
226
226
  const generator2 = new BenchmarkDataGenerator({ ...baseConfig, seed: 222 });
227
227
 
@@ -240,7 +240,7 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
240
240
  // BenchmarkDataGenerator - Causal Simulation Mode
241
241
  // =============================================================================
242
242
 
243
- describe('BenchmarkDataGenerator - Causal Simulation', () => {
243
+ describe("BenchmarkDataGenerator - Causal Simulation", () => {
244
244
  const causalConfig: BenchmarkConfig = {
245
245
  durationMinutes: 24 * 60, // 1 day
246
246
  tickInterval: 3600, // Hourly (required for causal)
@@ -251,18 +251,18 @@ describe('BenchmarkDataGenerator - Causal Simulation', () => {
251
251
  useCausalSimulation: true,
252
252
  };
253
253
 
254
- test('causal mode generates hidden narrative facts', async () => {
254
+ test("causal mode generates hidden narrative facts", async () => {
255
255
  const generator = new BenchmarkDataGenerator(causalConfig);
256
256
  const snapshot = await generator.generate();
257
257
 
258
258
  expect(snapshot.groundTruth).toBeDefined();
259
259
  expect(snapshot.groundTruth.hiddenNarrativeFacts).toBeDefined();
260
- expect(snapshot.groundTruth.hiddenNarrativeFacts!.length).toBeGreaterThan(
261
- 0
260
+ expect(snapshot.groundTruth.hiddenNarrativeFacts?.length).toBeGreaterThan(
261
+ 0,
262
262
  );
263
263
  });
264
264
 
265
- test('hidden narrative facts have valid structure', async () => {
265
+ test("hidden narrative facts have valid structure", async () => {
266
266
  const generator = new BenchmarkDataGenerator(causalConfig);
267
267
  const snapshot = await generator.generate();
268
268
 
@@ -271,39 +271,39 @@ describe('BenchmarkDataGenerator - Causal Simulation', () => {
271
271
  expect(fact.fact).toBeDefined();
272
272
  expect(fact.affectsTickers).toBeDefined();
273
273
  expect(fact.affectsTickers.length).toBeGreaterThan(0);
274
- expect(['positive', 'negative']).toContain(fact.sentiment);
274
+ expect(["positive", "negative"]).toContain(fact.sentiment);
275
275
  expect(fact.eventSchedule).toBeDefined();
276
276
  expect(fact.eventSchedule.length).toBeGreaterThan(0);
277
277
  }
278
278
  });
279
279
 
280
- test('causal events are scheduled correctly', async () => {
280
+ test("causal events are scheduled correctly", async () => {
281
281
  const generator = new BenchmarkDataGenerator(causalConfig);
282
282
  const snapshot = await generator.generate();
283
283
 
284
284
  expect(snapshot.groundTruth.causalEvents).toBeDefined();
285
- expect(snapshot.groundTruth.causalEvents!.length).toBeGreaterThan(0);
285
+ expect(snapshot.groundTruth.causalEvents?.length).toBeGreaterThan(0);
286
286
 
287
287
  // Verify each causal event has required fields
288
288
  for (const event of snapshot.groundTruth.causalEvents!) {
289
289
  expect(event.tick).toBeDefined();
290
290
  expect(event.eventType).toBeDefined();
291
291
  expect(event.affectedTickers.length).toBeGreaterThan(0);
292
- expect(['low', 'medium', 'high']).toContain(event.volatilityBucket);
292
+ expect(["low", "medium", "high"]).toContain(event.volatilityBucket);
293
293
  }
294
294
  });
295
295
 
296
- test('causal mode generates market outcomes', async () => {
296
+ test("causal mode generates market outcomes", async () => {
297
297
  const generator = new BenchmarkDataGenerator(causalConfig);
298
298
  const snapshot = await generator.generate();
299
299
 
300
300
  expect(snapshot.groundTruth.marketOutcomes).toBeDefined();
301
301
  expect(
302
- Object.keys(snapshot.groundTruth.marketOutcomes).length
302
+ Object.keys(snapshot.groundTruth.marketOutcomes).length,
303
303
  ).toBeGreaterThan(0);
304
304
  });
305
305
 
306
- test('ground truth includes price history', async () => {
306
+ test("ground truth includes price history", async () => {
307
307
  const generator = new BenchmarkDataGenerator(causalConfig);
308
308
  const snapshot = await generator.generate();
309
309
 
@@ -322,8 +322,8 @@ describe('BenchmarkDataGenerator - Causal Simulation', () => {
322
322
  // BenchmarkConfig Validation Tests
323
323
  // =============================================================================
324
324
 
325
- describe('BenchmarkConfig - Validation', () => {
326
- test('valid config creates generator without error', () => {
325
+ describe("BenchmarkConfig - Validation", () => {
326
+ test("valid config creates generator without error", () => {
327
327
  const config: BenchmarkConfig = {
328
328
  durationMinutes: 30 * 24 * 60,
329
329
  tickInterval: 3600,
@@ -336,7 +336,7 @@ describe('BenchmarkConfig - Validation', () => {
336
336
  expect(() => new BenchmarkDataGenerator(config)).not.toThrow();
337
337
  });
338
338
 
339
- test('config with zero markets is valid (edge case)', async () => {
339
+ test("config with zero markets is valid (edge case)", async () => {
340
340
  const config: BenchmarkConfig = {
341
341
  durationMinutes: 60,
342
342
  tickInterval: 3600,
@@ -353,7 +353,7 @@ describe('BenchmarkConfig - Validation', () => {
353
353
  expect(snapshot.initialState.perpetualMarkets.length).toBe(1);
354
354
  });
355
355
 
356
- test('calculates total ticks correctly', async () => {
356
+ test("calculates total ticks correctly", async () => {
357
357
  const config: BenchmarkConfig = {
358
358
  durationMinutes: 24 * 60, // 1 day
359
359
  tickInterval: 3600, // 1 hour
@@ -370,7 +370,7 @@ describe('BenchmarkConfig - Validation', () => {
370
370
  expect(snapshot.ticks.length).toBe(expectedTicks);
371
371
  });
372
372
 
373
- test('short duration with fast ticks', async () => {
373
+ test("short duration with fast ticks", async () => {
374
374
  const config: BenchmarkConfig = {
375
375
  durationMinutes: 10, // 10 minutes
376
376
  tickInterval: 60, // 1 minute
@@ -391,7 +391,7 @@ describe('BenchmarkConfig - Validation', () => {
391
391
  // Comparison Logic Tests - Using Real Types
392
392
  // =============================================================================
393
393
 
394
- describe('Benchmark Comparison Logic', () => {
394
+ describe("Benchmark Comparison Logic", () => {
395
395
  // Test the comparison calculation logic that would be used in runMultiple
396
396
  interface RunResult {
397
397
  id: string;
@@ -406,8 +406,8 @@ describe('Benchmark Comparison Logic', () => {
406
406
  avgPnl: 0,
407
407
  avgAccuracy: 0,
408
408
  avgOptimality: 0,
409
- bestRun: '',
410
- worstRun: '',
409
+ bestRun: "",
410
+ worstRun: "",
411
411
  };
412
412
  }
413
413
 
@@ -418,17 +418,17 @@ describe('Benchmark Comparison Logic', () => {
418
418
  runs.reduce((sum, r) => sum + r.optimality, 0) / runs.length;
419
419
  const bestRun = runs.reduce((best, r) => (r.pnl > best.pnl ? r : best)).id;
420
420
  const worstRun = runs.reduce((worst, r) =>
421
- r.pnl < worst.pnl ? r : worst
421
+ r.pnl < worst.pnl ? r : worst,
422
422
  ).id;
423
423
 
424
424
  return { avgPnl, avgAccuracy, avgOptimality, bestRun, worstRun };
425
425
  }
426
426
 
427
- test('calculates average metrics across runs', () => {
427
+ test("calculates average metrics across runs", () => {
428
428
  const runs: RunResult[] = [
429
- { id: 'run-1', pnl: 100, accuracy: 0.6, optimality: 0.7 },
430
- { id: 'run-2', pnl: 200, accuracy: 0.8, optimality: 0.8 },
431
- { id: 'run-3', pnl: 150, accuracy: 0.7, optimality: 0.75 },
429
+ { id: "run-1", pnl: 100, accuracy: 0.6, optimality: 0.7 },
430
+ { id: "run-2", pnl: 200, accuracy: 0.8, optimality: 0.8 },
431
+ { id: "run-3", pnl: 150, accuracy: 0.7, optimality: 0.75 },
432
432
  ];
433
433
 
434
434
  const comparison = calculateComparison(runs);
@@ -438,51 +438,51 @@ describe('Benchmark Comparison Logic', () => {
438
438
  expect(comparison.avgOptimality).toBe(0.75);
439
439
  });
440
440
 
441
- test('identifies best and worst runs', () => {
441
+ test("identifies best and worst runs", () => {
442
442
  const runs: RunResult[] = [
443
- { id: 'run-1', pnl: 100, accuracy: 0.6, optimality: 0.7 },
444
- { id: 'run-2', pnl: 200, accuracy: 0.8, optimality: 0.8 },
445
- { id: 'run-3', pnl: 50, accuracy: 0.5, optimality: 0.6 },
443
+ { id: "run-1", pnl: 100, accuracy: 0.6, optimality: 0.7 },
444
+ { id: "run-2", pnl: 200, accuracy: 0.8, optimality: 0.8 },
445
+ { id: "run-3", pnl: 50, accuracy: 0.5, optimality: 0.6 },
446
446
  ];
447
447
 
448
448
  const comparison = calculateComparison(runs);
449
449
 
450
- expect(comparison.bestRun).toBe('run-2');
451
- expect(comparison.worstRun).toBe('run-3');
450
+ expect(comparison.bestRun).toBe("run-2");
451
+ expect(comparison.worstRun).toBe("run-3");
452
452
  });
453
453
 
454
- test('handles negative PnL values', () => {
454
+ test("handles negative PnL values", () => {
455
455
  const runs: RunResult[] = [
456
- { id: 'run-1', pnl: -50, accuracy: 0.4, optimality: 0.3 },
457
- { id: 'run-2', pnl: 50, accuracy: 0.6, optimality: 0.6 },
458
- { id: 'run-3', pnl: -100, accuracy: 0.3, optimality: 0.2 },
456
+ { id: "run-1", pnl: -50, accuracy: 0.4, optimality: 0.3 },
457
+ { id: "run-2", pnl: 50, accuracy: 0.6, optimality: 0.6 },
458
+ { id: "run-3", pnl: -100, accuracy: 0.3, optimality: 0.2 },
459
459
  ];
460
460
 
461
461
  const comparison = calculateComparison(runs);
462
462
 
463
- expect(comparison.bestRun).toBe('run-2');
464
- expect(comparison.worstRun).toBe('run-3');
463
+ expect(comparison.bestRun).toBe("run-2");
464
+ expect(comparison.worstRun).toBe("run-3");
465
465
  expect(comparison.avgPnl).toBeCloseTo(-33.33, 1);
466
466
  });
467
467
 
468
- test('handles single run', () => {
468
+ test("handles single run", () => {
469
469
  const runs: RunResult[] = [
470
- { id: 'run-1', pnl: 100, accuracy: 0.7, optimality: 0.8 },
470
+ { id: "run-1", pnl: 100, accuracy: 0.7, optimality: 0.8 },
471
471
  ];
472
472
 
473
473
  const comparison = calculateComparison(runs);
474
474
 
475
475
  expect(comparison.avgPnl).toBe(100);
476
- expect(comparison.bestRun).toBe('run-1');
477
- expect(comparison.worstRun).toBe('run-1');
476
+ expect(comparison.bestRun).toBe("run-1");
477
+ expect(comparison.worstRun).toBe("run-1");
478
478
  });
479
479
 
480
- test('handles empty runs array', () => {
480
+ test("handles empty runs array", () => {
481
481
  const comparison = calculateComparison([]);
482
482
 
483
483
  expect(comparison.avgPnl).toBe(0);
484
- expect(comparison.bestRun).toBe('');
485
- expect(comparison.worstRun).toBe('');
484
+ expect(comparison.bestRun).toBe("");
485
+ expect(comparison.worstRun).toBe("");
486
486
  });
487
487
  });
488
488
 
@@ -490,7 +490,7 @@ describe('Benchmark Comparison Logic', () => {
490
490
  // Alpha Calculation (Excess Return)
491
491
  // =============================================================================
492
492
 
493
- describe('Alpha Calculation', () => {
493
+ describe("Alpha Calculation", () => {
494
494
  function calculateAlpha(baselinePnl: number, challengerPnl: number) {
495
495
  const alpha = challengerPnl - baselinePnl;
496
496
  const alphaPercent =
@@ -502,31 +502,31 @@ describe('Alpha Calculation', () => {
502
502
  return { alpha, alphaPercent };
503
503
  }
504
504
 
505
- test('positive alpha when outperforming', () => {
505
+ test("positive alpha when outperforming", () => {
506
506
  const result = calculateAlpha(100, 150);
507
507
  expect(result.alpha).toBe(50);
508
508
  expect(result.alphaPercent).toBe(50);
509
509
  });
510
510
 
511
- test('negative alpha when underperforming', () => {
511
+ test("negative alpha when underperforming", () => {
512
512
  const result = calculateAlpha(150, 100);
513
513
  expect(result.alpha).toBe(-50);
514
514
  expect(result.alphaPercent).toBeCloseTo(-33.33, 1);
515
515
  });
516
516
 
517
- test('zero alpha when equal performance', () => {
517
+ test("zero alpha when equal performance", () => {
518
518
  const result = calculateAlpha(100, 100);
519
519
  expect(result.alpha).toBe(0);
520
520
  expect(result.alphaPercent).toBe(0);
521
521
  });
522
522
 
523
- test('handles baseline of zero', () => {
523
+ test("handles baseline of zero", () => {
524
524
  const result = calculateAlpha(0, 100);
525
525
  expect(result.alpha).toBe(100);
526
526
  expect(result.alphaPercent).toBe(Infinity);
527
527
  });
528
528
 
529
- test('handles both zero', () => {
529
+ test("handles both zero", () => {
530
530
  const result = calculateAlpha(0, 0);
531
531
  expect(result.alpha).toBe(0);
532
532
  expect(result.alphaPercent).toBe(0);
@@ -1,14 +1,14 @@
1
- import { describe, expect, it } from 'bun:test';
2
- import type { BenchmarkGameSnapshot } from '../BenchmarkDataGenerator';
3
- import { MetricsVisualizer } from '../MetricsVisualizer';
4
- import { SimulationEngine, type SimulationResult } from '../SimulationEngine';
1
+ import { describe, expect, it } from "bun:test";
2
+ import type { BenchmarkGameSnapshot } from "../BenchmarkDataGenerator";
3
+ import { MetricsVisualizer } from "../MetricsVisualizer";
4
+ import { SimulationEngine, type SimulationResult } from "../SimulationEngine";
5
5
 
6
- describe('Head-to-Head Benchmark Infrastructure', () => {
6
+ describe("Head-to-Head Benchmark Infrastructure", () => {
7
7
  // 1. Test Simulation Engine PnL History Tracking
8
- describe('SimulationEngine PnL History', () => {
9
- it('should initialize with empty pnlHistory and return it after run()', async () => {
8
+ describe("SimulationEngine PnL History", () => {
9
+ it("should initialize with empty pnlHistory and return it after run()", async () => {
10
10
  const mockSnapshot = {
11
- id: 'test',
11
+ id: "test",
12
12
  ticks: [],
13
13
  initialState: {
14
14
  predictionMarkets: [],
@@ -24,7 +24,7 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
24
24
 
25
25
  const engine = new SimulationEngine({
26
26
  snapshot: mockSnapshot,
27
- agentId: 'test-agent',
27
+ agentId: "test-agent",
28
28
  fastForward: true,
29
29
  });
30
30
 
@@ -36,16 +36,16 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
36
36
  });
37
37
 
38
38
  // 2. Test MetricsVisualizer Logic
39
- describe('MetricsVisualizer Comparison Logic', () => {
39
+ describe("MetricsVisualizer Comparison Logic", () => {
40
40
  // Mock Result Helper
41
41
  const createMockResult = (
42
42
  id: string,
43
43
  pnl: number,
44
- history: number[]
44
+ history: number[],
45
45
  ): SimulationResult => ({
46
46
  id,
47
47
  agentId: id,
48
- benchmarkId: 'bench-1',
48
+ benchmarkId: "bench-1",
49
49
  startTime: 0,
50
50
  endTime: 1000,
51
51
  ticksProcessed: history.length,
@@ -75,13 +75,13 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
75
75
  timing: { totalDuration: 0, avgResponseTime: 0, maxResponseTime: 0 },
76
76
  optimalityScore: 50,
77
77
  },
78
- trajectory: { states: [], actions: [], rewards: [], windowId: '' },
78
+ trajectory: { states: [], actions: [], rewards: [], windowId: "" },
79
79
  pnlHistory: history.map((val, idx) => ({ tick: idx, pnl: val })),
80
80
  });
81
81
 
82
- it('should correctly merge PnL histories of equal length', () => {
83
- const baseline = createMockResult('baseline', 100, [10, 50, 100]);
84
- const challenger = createMockResult('challenger', 200, [20, 100, 200]);
82
+ it("should correctly merge PnL histories of equal length", () => {
83
+ const baseline = createMockResult("baseline", 100, [10, 50, 100]);
84
+ const challenger = createMockResult("challenger", 200, [20, 100, 200]);
85
85
 
86
86
  // Use public static method
87
87
  const history = MetricsVisualizer.mergePnlHistory(baseline, challenger);
@@ -90,11 +90,11 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
90
90
  expect(history[2]).toEqual({ tick: 2, baseline: 100, challenger: 200 });
91
91
  });
92
92
 
93
- it('should handle unequal history lengths (fill with final value)', () => {
93
+ it("should handle unequal history lengths (fill with final value)", () => {
94
94
  // Baseline died early (e.g., bankruptcy or crash)
95
- const baseline = createMockResult('baseline', -50, [10, -50]);
95
+ const baseline = createMockResult("baseline", -50, [10, -50]);
96
96
  // Challenger kept going
97
- const challenger = createMockResult('challenger', 100, [20, 60, 80, 100]);
97
+ const challenger = createMockResult("challenger", 100, [20, 60, 80, 100]);
98
98
 
99
99
  const history = MetricsVisualizer.mergePnlHistory(baseline, challenger);
100
100
 
@@ -109,18 +109,18 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
109
109
  expect(history[3]).toEqual({ tick: 3, baseline: -50, challenger: 100 });
110
110
  });
111
111
 
112
- it('should generate ASCII chart string', () => {
113
- const baseline = createMockResult('baseline', 100, [10, 100]);
114
- const challenger = createMockResult('challenger', 200, [20, 200]);
112
+ it("should generate ASCII chart string", () => {
113
+ const baseline = createMockResult("baseline", 100, [10, 100]);
114
+ const challenger = createMockResult("challenger", 200, [20, 200]);
115
115
 
116
116
  const chart = MetricsVisualizer.generateAsciiComparison(
117
117
  baseline,
118
- challenger
118
+ challenger,
119
119
  );
120
120
 
121
- expect(chart).toContain('HEAD-TO-HEAD RESULTS');
122
- expect(chart).toContain('WINNER: Challenger');
123
- expect(chart).toContain('Alpha Generated: +$100.00');
121
+ expect(chart).toContain("HEAD-TO-HEAD RESULTS");
122
+ expect(chart).toContain("WINNER: Challenger");
123
+ expect(chart).toContain("Alpha Generated: +$100.00");
124
124
  });
125
125
  });
126
126
  });