@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/.turbo/turbo-lint.log +2 -0
  2. package/.turbo/turbo-typecheck.log +1 -0
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/adapter.js +59 -0
  5. package/dist/archetypes/ArchetypeConfigService.js +510 -0
  6. package/dist/archetypes/derive-archetype.js +196 -0
  7. package/dist/archetypes/index.js +7 -0
  8. package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
  9. package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
  10. package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
  11. package/dist/benchmark/BenchmarkDataViewer.js +197 -0
  12. package/dist/benchmark/BenchmarkHistoryService.js +135 -0
  13. package/dist/benchmark/BenchmarkRunner.js +483 -0
  14. package/dist/benchmark/BenchmarkValidator.js +158 -0
  15. package/dist/benchmark/FastEvalRunner.js +133 -0
  16. package/dist/benchmark/MetricsValidator.js +104 -0
  17. package/dist/benchmark/MetricsVisualizer.js +775 -0
  18. package/dist/benchmark/ModelBenchmarkService.js +433 -0
  19. package/dist/benchmark/ModelRegistry.js +122 -0
  20. package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
  21. package/dist/benchmark/SimulationA2AInterface.js +683 -0
  22. package/dist/benchmark/SimulationEngine.js +522 -0
  23. package/dist/benchmark/TaskRunner.js +60 -0
  24. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
  25. package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
  26. package/dist/benchmark/index.js +23 -0
  27. package/dist/benchmark/parseSimulationMetrics.js +86 -0
  28. package/dist/benchmark/simulation-types.js +1 -0
  29. package/dist/dependencies.js +197 -0
  30. package/dist/generation/TrajectoryGenerator.js +244 -0
  31. package/dist/generation/index.js +6 -0
  32. package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
  33. package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
  34. package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
  35. package/dist/huggingface/index.js +9 -0
  36. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
  37. package/dist/index.js +41 -0
  38. package/dist/init-training.js +43 -0
  39. package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
  40. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
  41. package/dist/metrics/index.js +7 -0
  42. package/dist/metrics/types.js +21 -0
  43. package/dist/rubrics/__tests__/index.test.js +150 -0
  44. package/dist/rubrics/ass-kisser.js +83 -0
  45. package/dist/rubrics/degen.js +78 -0
  46. package/dist/rubrics/goody-twoshoes.js +82 -0
  47. package/dist/rubrics/index.js +184 -0
  48. package/dist/rubrics/information-trader.js +82 -0
  49. package/dist/rubrics/infosec.js +99 -0
  50. package/dist/rubrics/liar.js +102 -0
  51. package/dist/rubrics/perps-trader.js +85 -0
  52. package/dist/rubrics/researcher.js +79 -0
  53. package/dist/rubrics/scammer.js +80 -0
  54. package/dist/rubrics/social-butterfly.js +71 -0
  55. package/dist/rubrics/super-predictor.js +95 -0
  56. package/dist/rubrics/trader.js +65 -0
  57. package/dist/scoring/ArchetypeScoringService.js +301 -0
  58. package/dist/scoring/JudgePromptBuilder.js +401 -0
  59. package/dist/scoring/LLMJudgeCache.js +263 -0
  60. package/dist/scoring/index.js +8 -0
  61. package/dist/training/AutomationPipeline.js +714 -0
  62. package/dist/training/BenchmarkService.js +370 -0
  63. package/dist/training/ConfigValidator.js +153 -0
  64. package/dist/training/MarketOutcomesTracker.js +142 -0
  65. package/dist/training/ModelDeployer.js +128 -0
  66. package/dist/training/ModelFetcher.js +48 -0
  67. package/dist/training/ModelSelectionService.js +248 -0
  68. package/dist/training/ModelUsageVerifier.js +106 -0
  69. package/dist/training/MultiModelOrchestrator.js +349 -0
  70. package/dist/training/RLModelConfig.js +295 -0
  71. package/dist/training/RewardBackpropagationService.js +117 -0
  72. package/dist/training/RulerScoringService.js +450 -0
  73. package/dist/training/TrainingMonitor.js +108 -0
  74. package/dist/training/TrajectoryRecorder.js +281 -0
  75. package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
  76. package/dist/training/index.js +30 -0
  77. package/dist/training/logRLConfig.js +29 -0
  78. package/dist/training/pipeline.js +80 -0
  79. package/dist/training/storage/ModelStorageService.js +190 -0
  80. package/dist/training/storage/TrainingDataArchiver.js +136 -0
  81. package/dist/training/storage/index.js +7 -0
  82. package/dist/training/types.js +6 -0
  83. package/dist/training/window-utils.js +100 -0
  84. package/dist/utils/index.js +73 -0
  85. package/dist/utils/logger.js +55 -0
  86. package/dist/utils/snowflake.js +15 -0
  87. package/dist/utils/synthetic-detector.js +67 -0
  88. package/package.json +2 -2
  89. package/research-output/training-runs/training-run-1773742857616.json +38 -0
  90. package/research-output/training-runs/training-run-1773742946977.json +38 -0
  91. package/research-output/training-runs/training-run-1773743278891.json +38 -0
  92. package/research-output/training-runs/training-run-1773743409754.json +38 -0
  93. package/research-output/training-runs/training-run-1773743651086.json +38 -0
  94. package/research-output/training-runs/training-run-1773743782883.json +38 -0
@@ -0,0 +1,409 @@
1
+ /**
2
+ * BenchmarkRunner Tests
3
+ *
4
+ * Tests the BenchmarkRunner and related benchmark infrastructure.
5
+ * Tests actual classes and functions, not inline mock implementations.
6
+ */
7
+ import { describe, expect, test } from "bun:test";
8
+ import { BenchmarkDataGenerator, SeededRandom, } from "../BenchmarkDataGenerator";
9
+ // =============================================================================
10
+ // SeededRandom Tests - Real Class
11
+ // =============================================================================
12
+ describe("SeededRandom - Deterministic RNG", () => {
13
+ test("same seed produces same sequence", () => {
14
+ const rng1 = new SeededRandom(12345);
15
+ const rng2 = new SeededRandom(12345);
16
+ const seq1 = [
17
+ rng1.next(),
18
+ rng1.next(),
19
+ rng1.next(),
20
+ rng1.next(),
21
+ rng1.next(),
22
+ ];
23
+ const seq2 = [
24
+ rng2.next(),
25
+ rng2.next(),
26
+ rng2.next(),
27
+ rng2.next(),
28
+ rng2.next(),
29
+ ];
30
+ expect(seq1).toEqual(seq2);
31
+ });
32
+ test("different seeds produce different sequences", () => {
33
+ const rng1 = new SeededRandom(12345);
34
+ const rng2 = new SeededRandom(54321);
35
+ const val1 = rng1.next();
36
+ const val2 = rng2.next();
37
+ expect(val1).not.toBe(val2);
38
+ });
39
+ test("next() produces values in [0, 1) range", () => {
40
+ const rng = new SeededRandom(42);
41
+ for (let i = 0; i < 1000; i++) {
42
+ const val = rng.next();
43
+ expect(val).toBeGreaterThanOrEqual(0);
44
+ expect(val).toBeLessThan(1);
45
+ }
46
+ });
47
+ test("nextInt() produces values in specified range", () => {
48
+ const rng = new SeededRandom(42);
49
+ for (let i = 0; i < 100; i++) {
50
+ const val = rng.nextInt(10, 20);
51
+ expect(val).toBeGreaterThanOrEqual(10);
52
+ expect(val).toBeLessThanOrEqual(20);
53
+ expect(Number.isInteger(val)).toBe(true);
54
+ }
55
+ });
56
+ test("nextInt() handles single value range", () => {
57
+ const rng = new SeededRandom(42);
58
+ for (let i = 0; i < 10; i++) {
59
+ const val = rng.nextInt(5, 5);
60
+ expect(val).toBe(5);
61
+ }
62
+ });
63
+ test("pick() selects from array", () => {
64
+ const rng = new SeededRandom(42);
65
+ const options = ["a", "b", "c", "d", "e"];
66
+ const selections = new Set();
67
+ for (let i = 0; i < 100; i++) {
68
+ const val = rng.pick(options);
69
+ expect(options).toContain(val);
70
+ selections.add(val);
71
+ }
72
+ // With 100 attempts, we should hit most options
73
+ expect(selections.size).toBeGreaterThan(3);
74
+ });
75
+ test("pick() is deterministic with same seed", () => {
76
+ const rng1 = new SeededRandom(42);
77
+ const rng2 = new SeededRandom(42);
78
+ const options = ["a", "b", "c", "d", "e"];
79
+ const picks1 = [rng1.pick(options), rng1.pick(options), rng1.pick(options)];
80
+ const picks2 = [rng2.pick(options), rng2.pick(options), rng2.pick(options)];
81
+ expect(picks1).toEqual(picks2);
82
+ });
83
+ test("nextFloat() produces values in specified range", () => {
84
+ const rng = new SeededRandom(42);
85
+ for (let i = 0; i < 100; i++) {
86
+ const val = rng.nextFloat(5.5, 10.5);
87
+ expect(val).toBeGreaterThanOrEqual(5.5);
88
+ expect(val).toBeLessThanOrEqual(10.5);
89
+ }
90
+ });
91
+ });
92
+ // =============================================================================
93
+ // BenchmarkDataGenerator Tests - Real Class
94
+ // =============================================================================
95
+ describe("BenchmarkDataGenerator - Data Generation", () => {
96
+ const baseConfig = {
97
+ durationMinutes: 60, // 1 hour
98
+ tickInterval: 3600, // 1 hour ticks
99
+ numPredictionMarkets: 2,
100
+ numPerpetualMarkets: 3,
101
+ numAgents: 5,
102
+ seed: 12345,
103
+ };
104
+ test("generates deterministic data with same seed", async () => {
105
+ const generator1 = new BenchmarkDataGenerator(baseConfig);
106
+ const generator2 = new BenchmarkDataGenerator(baseConfig);
107
+ const snapshot1 = await generator1.generate();
108
+ const snapshot2 = await generator2.generate();
109
+ // Same structure
110
+ expect(snapshot1.initialState.predictionMarkets.length).toBe(snapshot2.initialState.predictionMarkets.length);
111
+ expect(snapshot1.initialState.perpetualMarkets.length).toBe(snapshot2.initialState.perpetualMarkets.length);
112
+ expect(snapshot1.initialState.agents.length).toBe(snapshot2.initialState.agents.length);
113
+ // Same content (deterministic)
114
+ expect(snapshot1.initialState.perpetualMarkets[0]?.ticker).toBe(snapshot2.initialState.perpetualMarkets[0]?.ticker);
115
+ expect(snapshot1.initialState.perpetualMarkets[0]?.price).toBe(snapshot2.initialState.perpetualMarkets[0]?.price);
116
+ });
117
+ test("generates correct number of markets", async () => {
118
+ const generator = new BenchmarkDataGenerator(baseConfig);
119
+ const snapshot = await generator.generate();
120
+ expect(snapshot.initialState.predictionMarkets.length).toBe(2);
121
+ expect(snapshot.initialState.perpetualMarkets.length).toBe(3);
122
+ expect(snapshot.initialState.agents.length).toBe(5);
123
+ });
124
+ test("generates valid prediction market structure", async () => {
125
+ const generator = new BenchmarkDataGenerator(baseConfig);
126
+ const snapshot = await generator.generate();
127
+ for (const market of snapshot.initialState.predictionMarkets) {
128
+ expect(market.id).toBeDefined();
129
+ expect(market.question).toBeDefined();
130
+ expect(market.yesPrice).toBeGreaterThanOrEqual(0);
131
+ expect(market.yesPrice).toBeLessThanOrEqual(1);
132
+ expect(market.noPrice).toBeGreaterThanOrEqual(0);
133
+ expect(market.noPrice).toBeLessThanOrEqual(1);
134
+ expect(market.yesPrice + market.noPrice).toBeCloseTo(1, 1);
135
+ expect(market.resolved).toBe(false);
136
+ expect(market.liquidity).toBeGreaterThan(0);
137
+ }
138
+ });
139
+ test("generates valid perpetual market structure", async () => {
140
+ const generator = new BenchmarkDataGenerator(baseConfig);
141
+ const snapshot = await generator.generate();
142
+ for (const market of snapshot.initialState.perpetualMarkets) {
143
+ expect(market.ticker).toBeDefined();
144
+ expect(market.price).toBeGreaterThan(0);
145
+ expect(typeof market.priceChange24h).toBe("number");
146
+ expect(market.volume24h).toBeGreaterThanOrEqual(0);
147
+ expect(typeof market.fundingRate).toBe("number");
148
+ }
149
+ });
150
+ test("generates valid agent structure", async () => {
151
+ const generator = new BenchmarkDataGenerator(baseConfig);
152
+ const snapshot = await generator.generate();
153
+ for (const agent of snapshot.initialState.agents) {
154
+ expect(agent.id).toBeDefined();
155
+ expect(agent.name).toBeDefined();
156
+ expect(typeof agent.reputation).toBe("number");
157
+ expect(typeof agent.totalPnl).toBe("number");
158
+ }
159
+ });
160
+ test("generates ticks for duration", async () => {
161
+ const generator = new BenchmarkDataGenerator({
162
+ ...baseConfig,
163
+ durationMinutes: 180, // 3 hours
164
+ tickInterval: 3600, // 1 hour
165
+ });
166
+ const snapshot = await generator.generate();
167
+ // 3 hours / 1 hour per tick = 3 ticks
168
+ expect(snapshot.ticks.length).toBe(3);
169
+ });
170
+ test("different seeds produce different data", async () => {
171
+ const generator1 = new BenchmarkDataGenerator({ ...baseConfig, seed: 111 });
172
+ const generator2 = new BenchmarkDataGenerator({ ...baseConfig, seed: 222 });
173
+ const snapshot1 = await generator1.generate();
174
+ const snapshot2 = await generator2.generate();
175
+ // Prices should differ with different seeds
176
+ const price1 = snapshot1.initialState.perpetualMarkets[0]?.price;
177
+ const price2 = snapshot2.initialState.perpetualMarkets[0]?.price;
178
+ expect(price1).not.toBe(price2);
179
+ });
180
+ });
181
+ // =============================================================================
182
+ // BenchmarkDataGenerator - Causal Simulation Mode
183
+ // =============================================================================
184
+ describe("BenchmarkDataGenerator - Causal Simulation", () => {
185
+ const causalConfig = {
186
+ durationMinutes: 24 * 60, // 1 day
187
+ tickInterval: 3600, // Hourly (required for causal)
188
+ numPredictionMarkets: 2,
189
+ numPerpetualMarkets: 3,
190
+ numAgents: 5,
191
+ seed: 12345,
192
+ useCausalSimulation: true,
193
+ };
194
+ test("causal mode generates hidden narrative facts", async () => {
195
+ const generator = new BenchmarkDataGenerator(causalConfig);
196
+ const snapshot = await generator.generate();
197
+ expect(snapshot.groundTruth).toBeDefined();
198
+ expect(snapshot.groundTruth.hiddenNarrativeFacts).toBeDefined();
199
+ expect(snapshot.groundTruth.hiddenNarrativeFacts?.length).toBeGreaterThan(0);
200
+ });
201
+ test("hidden narrative facts have valid structure", async () => {
202
+ const generator = new BenchmarkDataGenerator(causalConfig);
203
+ const snapshot = await generator.generate();
204
+ for (const fact of snapshot.groundTruth.hiddenNarrativeFacts ?? []) {
205
+ expect(fact.id).toBeDefined();
206
+ expect(fact.fact).toBeDefined();
207
+ expect(fact.affectsTickers).toBeDefined();
208
+ expect(fact.affectsTickers.length).toBeGreaterThan(0);
209
+ expect(["positive", "negative"]).toContain(fact.sentiment);
210
+ expect(fact.eventSchedule).toBeDefined();
211
+ expect(fact.eventSchedule.length).toBeGreaterThan(0);
212
+ }
213
+ });
214
+ test("causal events are scheduled correctly", async () => {
215
+ const generator = new BenchmarkDataGenerator(causalConfig);
216
+ const snapshot = await generator.generate();
217
+ expect(snapshot.groundTruth.causalEvents).toBeDefined();
218
+ expect(snapshot.groundTruth.causalEvents?.length).toBeGreaterThan(0);
219
+ // Verify each causal event has required fields
220
+ for (const event of snapshot.groundTruth.causalEvents ?? []) {
221
+ expect(event.tick).toBeDefined();
222
+ expect(event.eventType).toBeDefined();
223
+ expect(event.affectedTickers.length).toBeGreaterThan(0);
224
+ expect(["low", "medium", "high"]).toContain(event.volatilityBucket);
225
+ }
226
+ });
227
+ test("causal mode generates market outcomes", async () => {
228
+ const generator = new BenchmarkDataGenerator(causalConfig);
229
+ const snapshot = await generator.generate();
230
+ expect(snapshot.groundTruth.marketOutcomes).toBeDefined();
231
+ expect(Object.keys(snapshot.groundTruth.marketOutcomes).length).toBeGreaterThan(0);
232
+ });
233
+ test("ground truth includes price history", async () => {
234
+ const generator = new BenchmarkDataGenerator(causalConfig);
235
+ const snapshot = await generator.generate();
236
+ expect(snapshot.groundTruth.priceHistory).toBeDefined();
237
+ // Each perpetual market should have price history
238
+ for (const market of snapshot.initialState.perpetualMarkets) {
239
+ const history = snapshot.groundTruth.priceHistory[market.ticker];
240
+ expect(history).toBeDefined();
241
+ expect(history.length).toBeGreaterThan(0);
242
+ }
243
+ });
244
+ });
245
+ // =============================================================================
246
+ // BenchmarkConfig Validation Tests
247
+ // =============================================================================
248
+ describe("BenchmarkConfig - Validation", () => {
249
+ test("valid config creates generator without error", () => {
250
+ const config = {
251
+ durationMinutes: 30 * 24 * 60,
252
+ tickInterval: 3600,
253
+ numPredictionMarkets: 5,
254
+ numPerpetualMarkets: 5,
255
+ numAgents: 10,
256
+ seed: 12345,
257
+ };
258
+ expect(() => new BenchmarkDataGenerator(config)).not.toThrow();
259
+ });
260
+ test("config with zero markets is valid (edge case)", async () => {
261
+ const config = {
262
+ durationMinutes: 60,
263
+ tickInterval: 3600,
264
+ numPredictionMarkets: 0,
265
+ numPerpetualMarkets: 1,
266
+ numAgents: 1,
267
+ seed: 42,
268
+ };
269
+ const generator = new BenchmarkDataGenerator(config);
270
+ const snapshot = await generator.generate();
271
+ expect(snapshot.initialState.predictionMarkets.length).toBe(0);
272
+ expect(snapshot.initialState.perpetualMarkets.length).toBe(1);
273
+ });
274
+ test("calculates total ticks correctly", async () => {
275
+ const config = {
276
+ durationMinutes: 24 * 60, // 1 day
277
+ tickInterval: 3600, // 1 hour
278
+ numPredictionMarkets: 2,
279
+ numPerpetualMarkets: 3,
280
+ numAgents: 5,
281
+ seed: 12345,
282
+ };
283
+ const generator = new BenchmarkDataGenerator(config);
284
+ const snapshot = await generator.generate();
285
+ const expectedTicks = Math.floor((24 * 60 * 60) / 3600); // 24 hours
286
+ expect(snapshot.ticks.length).toBe(expectedTicks);
287
+ });
288
+ test("short duration with fast ticks", async () => {
289
+ const config = {
290
+ durationMinutes: 10, // 10 minutes
291
+ tickInterval: 60, // 1 minute
292
+ numPredictionMarkets: 1,
293
+ numPerpetualMarkets: 1,
294
+ numAgents: 2,
295
+ seed: 42,
296
+ };
297
+ const generator = new BenchmarkDataGenerator(config);
298
+ const snapshot = await generator.generate();
299
+ expect(snapshot.ticks.length).toBe(10);
300
+ });
301
+ });
302
+ // =============================================================================
303
+ // Comparison Logic Tests - Using Real Types
304
+ // =============================================================================
305
+ describe("Benchmark Comparison Logic", () => {
306
/**
 * Summarize a list of benchmark runs.
 *
 * @param {Array<{id: string, pnl: number, accuracy: number, optimality: number}>} runs
 *   Runs to aggregate.
 * @returns {{avgPnl: number, avgAccuracy: number, avgOptimality: number, bestRun: string, worstRun: string}}
 *   Arithmetic means of pnl/accuracy/optimality plus the ids of the runs with
 *   the highest and lowest pnl. For an empty list every average is 0 and both
 *   ids are "".
 */
function calculateComparison(runs) {
    if (runs.length === 0) {
        return {
            avgPnl: 0,
            avgAccuracy: 0,
            avgOptimality: 0,
            bestRun: "",
            worstRun: "",
        };
    }
    let pnlSum = 0;
    let accuracySum = 0;
    let optimalitySum = 0;
    // Seed both extremes with the first run; strict comparisons below keep the
    // earliest run when several share the same pnl.
    let best = runs[0];
    let worst = runs[0];
    for (const run of runs) {
        pnlSum += run.pnl;
        accuracySum += run.accuracy;
        optimalitySum += run.optimality;
        if (run.pnl > best.pnl) {
            best = run;
        }
        if (run.pnl < worst.pnl) {
            worst = run;
        }
    }
    const count = runs.length;
    return {
        avgPnl: pnlSum / count,
        avgAccuracy: accuracySum / count,
        avgOptimality: optimalitySum / count,
        bestRun: best.id,
        worstRun: worst.id,
    };
}
323
+ test("calculates average metrics across runs", () => {
324
+ const runs = [
325
+ { id: "run-1", pnl: 100, accuracy: 0.6, optimality: 0.7 },
326
+ { id: "run-2", pnl: 200, accuracy: 0.8, optimality: 0.8 },
327
+ { id: "run-3", pnl: 150, accuracy: 0.7, optimality: 0.75 },
328
+ ];
329
+ const comparison = calculateComparison(runs);
330
+ expect(comparison.avgPnl).toBe(150);
331
+ expect(comparison.avgAccuracy).toBeCloseTo(0.7, 5);
332
+ expect(comparison.avgOptimality).toBe(0.75);
333
+ });
334
+ test("identifies best and worst runs", () => {
335
+ const runs = [
336
+ { id: "run-1", pnl: 100, accuracy: 0.6, optimality: 0.7 },
337
+ { id: "run-2", pnl: 200, accuracy: 0.8, optimality: 0.8 },
338
+ { id: "run-3", pnl: 50, accuracy: 0.5, optimality: 0.6 },
339
+ ];
340
+ const comparison = calculateComparison(runs);
341
+ expect(comparison.bestRun).toBe("run-2");
342
+ expect(comparison.worstRun).toBe("run-3");
343
+ });
344
+ test("handles negative PnL values", () => {
345
+ const runs = [
346
+ { id: "run-1", pnl: -50, accuracy: 0.4, optimality: 0.3 },
347
+ { id: "run-2", pnl: 50, accuracy: 0.6, optimality: 0.6 },
348
+ { id: "run-3", pnl: -100, accuracy: 0.3, optimality: 0.2 },
349
+ ];
350
+ const comparison = calculateComparison(runs);
351
+ expect(comparison.bestRun).toBe("run-2");
352
+ expect(comparison.worstRun).toBe("run-3");
353
+ expect(comparison.avgPnl).toBeCloseTo(-33.33, 1);
354
+ });
355
+ test("handles single run", () => {
356
+ const runs = [
357
+ { id: "run-1", pnl: 100, accuracy: 0.7, optimality: 0.8 },
358
+ ];
359
+ const comparison = calculateComparison(runs);
360
+ expect(comparison.avgPnl).toBe(100);
361
+ expect(comparison.bestRun).toBe("run-1");
362
+ expect(comparison.worstRun).toBe("run-1");
363
+ });
364
+ test("handles empty runs array", () => {
365
+ const comparison = calculateComparison([]);
366
+ expect(comparison.avgPnl).toBe(0);
367
+ expect(comparison.bestRun).toBe("");
368
+ expect(comparison.worstRun).toBe("");
369
+ });
370
+ });
371
+ // =============================================================================
372
+ // Alpha Calculation (Excess Return)
373
+ // =============================================================================
374
+ describe("Alpha Calculation", () => {
375
/**
 * Compute the challenger's excess return over the baseline.
 *
 * @param {number} baselinePnl - PnL of the baseline run.
 * @param {number} challengerPnl - PnL of the challenger run.
 * @returns {{alpha: number, alphaPercent: number}}
 *   `alpha` is `challengerPnl - baselinePnl`. `alphaPercent` is alpha as a
 *   percentage of `|baselinePnl|`. When the baseline is exactly 0 the ratio is
 *   undefined, so it is reported as 0 for no difference, `Infinity` for a gain
 *   and `-Infinity` for a loss.
 */
function calculateAlpha(baselinePnl, challengerPnl) {
    const alpha = challengerPnl - baselinePnl;
    if (baselinePnl !== 0) {
        return { alpha, alphaPercent: (alpha / Math.abs(baselinePnl)) * 100 };
    }
    // Zero baseline: the percentage is unbounded, but its sign must follow
    // alpha so an underperforming challenger is not reported as +Infinity.
    let alphaPercent;
    if (alpha === 0) {
        alphaPercent = 0;
    }
    else if (alpha < 0) {
        alphaPercent = -Infinity;
    }
    else {
        alphaPercent = Infinity;
    }
    return { alpha, alphaPercent };
}
384
+ test("positive alpha when outperforming", () => {
385
+ const result = calculateAlpha(100, 150);
386
+ expect(result.alpha).toBe(50);
387
+ expect(result.alphaPercent).toBe(50);
388
+ });
389
+ test("negative alpha when underperforming", () => {
390
+ const result = calculateAlpha(150, 100);
391
+ expect(result.alpha).toBe(-50);
392
+ expect(result.alphaPercent).toBeCloseTo(-33.33, 1);
393
+ });
394
+ test("zero alpha when equal performance", () => {
395
+ const result = calculateAlpha(100, 100);
396
+ expect(result.alpha).toBe(0);
397
+ expect(result.alphaPercent).toBe(0);
398
+ });
399
+ test("handles baseline of zero", () => {
400
+ const result = calculateAlpha(0, 100);
401
+ expect(result.alpha).toBe(100);
402
+ expect(result.alphaPercent).toBe(Infinity);
403
+ });
404
+ test("handles both zero", () => {
405
+ const result = calculateAlpha(0, 0);
406
+ expect(result.alpha).toBe(0);
407
+ expect(result.alphaPercent).toBe(0);
408
+ });
409
+ });
@@ -0,0 +1,105 @@
1
+ import { describe, expect, it } from "bun:test";
2
+ import { MetricsVisualizer } from "../MetricsVisualizer";
3
+ import { SimulationEngine } from "../SimulationEngine";
4
+ describe("Head-to-Head Benchmark Infrastructure", () => {
5
+ // 1. Test Simulation Engine PnL History Tracking
6
+ describe("SimulationEngine PnL History", () => {
7
+ it("should initialize with empty pnlHistory and return it after run()", async () => {
8
+ const mockSnapshot = {
9
+ id: "test",
10
+ ticks: [],
11
+ initialState: {
12
+ predictionMarkets: [],
13
+ perpetualMarkets: [],
14
+ agents: [],
15
+ },
16
+ groundTruth: {
17
+ marketOutcomes: {},
18
+ priceHistory: {},
19
+ optimalActions: [],
20
+ },
21
+ };
22
+ const engine = new SimulationEngine({
23
+ snapshot: mockSnapshot,
24
+ agentId: "test-agent",
25
+ fastForward: true,
26
+ });
27
+ engine.initialize();
28
+ // Use public API - run() returns pnlHistory
29
+ const result = await engine.run();
30
+ expect(result.pnlHistory).toEqual([]);
31
+ });
32
+ });
33
+ // 2. Test MetricsVisualizer Logic
34
+ describe("MetricsVisualizer Comparison Logic", () => {
35
+ // Mock Result Helper
36
/**
 * Build a minimal benchmark-result fixture for comparison tests.
 *
 * @param {string} id - Used as both the result id and the agent id.
 * @param {number} pnl - Value stored as `metrics.totalPnl`.
 * @param {number[]} [history=[]] - Per-tick PnL values; entry `i` becomes
 *   `{ tick: i, pnl: history[i] }` in `pnlHistory`, and its length is reported
 *   as `ticksProcessed`. Defaults to an empty history so callers that only
 *   care about the headline metrics can omit it.
 * @returns {object} Result-shaped object with neutral placeholder metrics.
 */
const createMockResult = (id, pnl, history = []) => ({
    id,
    agentId: id,
    benchmarkId: "bench-1",
    startTime: 0,
    endTime: 1000,
    ticksProcessed: history.length,
    actions: [],
    metrics: {
        totalPnl: pnl,
        predictionMetrics: {
            accuracy: 0.5,
            totalPositions: 0,
            correctPredictions: 0,
            incorrectPredictions: 0,
            avgPnlPerPosition: 0,
        },
        perpMetrics: {
            winRate: 0.5,
            totalTrades: 0,
            profitableTrades: 0,
            avgPnlPerTrade: 0,
            maxDrawdown: 0,
        },
        socialMetrics: {
            postsCreated: 0,
            groupsJoined: 0,
            messagesReceived: 0,
            reputationGained: 0,
        },
        timing: { totalDuration: 0, avgResponseTime: 0, maxResponseTime: 0 },
        optimalityScore: 50,
    },
    trajectory: { states: [], actions: [], rewards: [], windowId: "" },
    pnlHistory: history.map((val, idx) => ({ tick: idx, pnl: val })),
});
72
+ it("should correctly merge PnL histories of equal length", () => {
73
+ const baseline = createMockResult("baseline", 100, [10, 50, 100]);
74
+ const challenger = createMockResult("challenger", 200, [20, 100, 200]);
75
+ // Use public static method
76
+ const history = MetricsVisualizer.mergePnlHistory(baseline, challenger);
77
+ expect(history).toHaveLength(3);
78
+ expect(history[2]).toEqual({ tick: 2, baseline: 100, challenger: 200 });
79
+ });
80
+ it("should handle unequal history lengths (fill with final value)", () => {
81
+ // Baseline died early (e.g., bankruptcy or crash)
82
+ const baseline = createMockResult("baseline", -50, [10, -50]);
83
+ // Challenger kept going
84
+ const challenger = createMockResult("challenger", 100, [20, 60, 80, 100]);
85
+ const history = MetricsVisualizer.mergePnlHistory(baseline, challenger);
86
+ expect(history).toHaveLength(4); // Should match longest
87
+ // Tick 0
88
+ expect(history[0]).toEqual({ tick: 0, baseline: 10, challenger: 20 });
89
+ // Tick 1
90
+ expect(history[1]).toEqual({ tick: 1, baseline: -50, challenger: 60 });
91
+ // Tick 2 (Baseline stopped, should carry over -50)
92
+ expect(history[2]).toEqual({ tick: 2, baseline: -50, challenger: 80 });
93
+ // Tick 3
94
+ expect(history[3]).toEqual({ tick: 3, baseline: -50, challenger: 100 });
95
+ });
96
+ it("should generate ASCII chart string", () => {
97
+ const baseline = createMockResult("baseline", 100, [10, 100]);
98
+ const challenger = createMockResult("challenger", 200, [20, 200]);
99
+ const chart = MetricsVisualizer.generateAsciiComparison(baseline, challenger);
100
+ expect(chart).toContain("HEAD-TO-HEAD RESULTS");
101
+ expect(chart).toContain("WINNER: Challenger");
102
+ expect(chart).toContain("Alpha Generated: +$100.00");
103
+ });
104
+ });
105
+ });
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Benchmark Module
3
+ *
4
+ * Tools for evaluating agent performance through simulation.
5
+ */
6
+ export { ArchetypeMatchupBenchmark, runQuickMatchupBenchmark, } from "./ArchetypeMatchupBenchmark";
7
+ export { BenchmarkChartGenerator } from "./BenchmarkChartGenerator";
8
+ export { BenchmarkDataGenerator, SeededRandom } from "./BenchmarkDataGenerator";
9
+ export { BenchmarkDataViewer } from "./BenchmarkDataViewer";
10
+ export { BenchmarkHistoryService } from "./BenchmarkHistoryService";
11
+ export { BenchmarkRunner } from "./BenchmarkRunner";
12
+ export * as BenchmarkValidator from "./BenchmarkValidator";
13
+ export { FastEvalRunner } from "./FastEvalRunner";
14
+ export { MetricsValidator } from "./MetricsValidator";
15
+ export { MetricsVisualizer } from "./MetricsVisualizer";
16
+ export { ModelBenchmarkService } from "./ModelBenchmarkService";
17
+ export { getBaselineModels, getModelById, getModelByModelId, getModelDisplayName, getModelsByProvider, getModelsByTier, MODEL_REGISTRY, validateModelId, } from "./ModelRegistry";
18
+ // Shared utilities
19
+ export { parseSimulationMetrics, } from "./parseSimulationMetrics";
20
+ export { createRulerContext, extractMarketOutcomesFromBenchmark, getHiddenEventsForTick, getHiddenFactsForTick, getTrueFacts, scoreActionAgainstGroundTruth, wasDecisionOptimal, } from "./RulerBenchmarkIntegration";
21
+ export { SimulationA2AInterface } from "./SimulationA2AInterface";
22
+ export { SimulationEngine } from "./SimulationEngine";
23
+ export { TaskRunner } from "./TaskRunner";
@@ -0,0 +1,86 @@
1
+ /**
2
+ * Simulation Metrics Parser
3
+ *
4
+ * Shared utility for validating and parsing SimulationMetrics from JSON data.
5
+ * Used by ModelBenchmarkService and HuggingFaceModelUploader.
6
+ */
7
/**
 * Parse and validate SimulationMetrics from JSON data.
 *
 * `totalPnl` and `optimalityScore` must be numbers. `predictionMetrics`,
 * `perpMetrics` and `timing` must be non-array objects; each numeric field
 * read from them falls back to 0 when it is missing or not a number.
 * `socialMetrics` is optional: when it is absent or not a non-array object,
 * all of its counters are 0.
 *
 * @param {unknown} data - Raw JSON data to parse
 * @returns {object} Validated SimulationMetrics object
 * @throws {Error} if data is invalid or missing required fields
 */
export function parseSimulationMetrics(data) {
    // `typeof [] === "object"`, so arrays have to be excluded explicitly;
    // otherwise an array would be accepted and parsed into all-zero metrics.
    const isRecord = (value) => typeof value === "object" && value !== null && !Array.isArray(value);
    // Read a numeric field, substituting 0 for anything that is not a number.
    const getNumber = (obj, key) => {
        const val = obj[key];
        return typeof val === "number" ? val : 0;
    };

    if (!isRecord(data)) {
        throw new Error("Invalid SimulationMetrics: expected object");
    }
    const metrics = data;

    // Validate required fields
    if (typeof metrics.totalPnl !== "number") {
        throw new Error("Invalid SimulationMetrics: totalPnl must be a number");
    }
    if (!isRecord(metrics.predictionMetrics)) {
        throw new Error("Invalid SimulationMetrics: predictionMetrics must be an object");
    }
    if (!isRecord(metrics.perpMetrics)) {
        throw new Error("Invalid SimulationMetrics: perpMetrics must be an object");
    }
    if (typeof metrics.optimalityScore !== "number") {
        throw new Error("Invalid SimulationMetrics: optimalityScore must be a number");
    }
    if (!isRecord(metrics.timing)) {
        throw new Error("Invalid SimulationMetrics: timing must be an object");
    }

    const { predictionMetrics, perpMetrics, timing } = metrics;
    // socialMetrics is optional; anything that is not a record is treated as absent.
    const socialMetrics = isRecord(metrics.socialMetrics) ? metrics.socialMetrics : {};

    return {
        totalPnl: metrics.totalPnl,
        predictionMetrics: {
            totalPositions: getNumber(predictionMetrics, "totalPositions"),
            correctPredictions: getNumber(predictionMetrics, "correctPredictions"),
            incorrectPredictions: getNumber(predictionMetrics, "incorrectPredictions"),
            accuracy: getNumber(predictionMetrics, "accuracy"),
            avgPnlPerPosition: getNumber(predictionMetrics, "avgPnlPerPosition"),
        },
        perpMetrics: {
            totalTrades: getNumber(perpMetrics, "totalTrades"),
            profitableTrades: getNumber(perpMetrics, "profitableTrades"),
            winRate: getNumber(perpMetrics, "winRate"),
            avgPnlPerTrade: getNumber(perpMetrics, "avgPnlPerTrade"),
            maxDrawdown: getNumber(perpMetrics, "maxDrawdown"),
        },
        socialMetrics: {
            postsCreated: getNumber(socialMetrics, "postsCreated"),
            groupsJoined: getNumber(socialMetrics, "groupsJoined"),
            messagesReceived: getNumber(socialMetrics, "messagesReceived"),
            reputationGained: getNumber(socialMetrics, "reputationGained"),
        },
        timing: {
            avgResponseTime: getNumber(timing, "avgResponseTime"),
            maxResponseTime: getNumber(timing, "maxResponseTime"),
            totalDuration: getNumber(timing, "totalDuration"),
        },
        optimalityScore: metrics.optimalityScore,
    };
}
@@ -0,0 +1 @@
1
+ export {};