@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
|
@@ -5,19 +5,19 @@
|
|
|
5
5
|
* Tests actual classes and functions, not inline mock implementations.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import { describe, expect, test } from
|
|
8
|
+
import { describe, expect, test } from "bun:test";
|
|
9
9
|
import {
|
|
10
10
|
type BenchmarkConfig,
|
|
11
11
|
BenchmarkDataGenerator,
|
|
12
12
|
SeededRandom,
|
|
13
|
-
} from
|
|
13
|
+
} from "../BenchmarkDataGenerator";
|
|
14
14
|
|
|
15
15
|
// =============================================================================
|
|
16
16
|
// SeededRandom Tests - Real Class
|
|
17
17
|
// =============================================================================
|
|
18
18
|
|
|
19
|
-
describe(
|
|
20
|
-
test(
|
|
19
|
+
describe("SeededRandom - Deterministic RNG", () => {
|
|
20
|
+
test("same seed produces same sequence", () => {
|
|
21
21
|
const rng1 = new SeededRandom(12345);
|
|
22
22
|
const rng2 = new SeededRandom(12345);
|
|
23
23
|
|
|
@@ -39,7 +39,7 @@ describe('SeededRandom - Deterministic RNG', () => {
|
|
|
39
39
|
expect(seq1).toEqual(seq2);
|
|
40
40
|
});
|
|
41
41
|
|
|
42
|
-
test(
|
|
42
|
+
test("different seeds produce different sequences", () => {
|
|
43
43
|
const rng1 = new SeededRandom(12345);
|
|
44
44
|
const rng2 = new SeededRandom(54321);
|
|
45
45
|
|
|
@@ -49,7 +49,7 @@ describe('SeededRandom - Deterministic RNG', () => {
|
|
|
49
49
|
expect(val1).not.toBe(val2);
|
|
50
50
|
});
|
|
51
51
|
|
|
52
|
-
test(
|
|
52
|
+
test("next() produces values in [0, 1) range", () => {
|
|
53
53
|
const rng = new SeededRandom(42);
|
|
54
54
|
|
|
55
55
|
for (let i = 0; i < 1000; i++) {
|
|
@@ -59,7 +59,7 @@ describe('SeededRandom - Deterministic RNG', () => {
|
|
|
59
59
|
}
|
|
60
60
|
});
|
|
61
61
|
|
|
62
|
-
test(
|
|
62
|
+
test("nextInt() produces values in specified range", () => {
|
|
63
63
|
const rng = new SeededRandom(42);
|
|
64
64
|
|
|
65
65
|
for (let i = 0; i < 100; i++) {
|
|
@@ -70,7 +70,7 @@ describe('SeededRandom - Deterministic RNG', () => {
|
|
|
70
70
|
}
|
|
71
71
|
});
|
|
72
72
|
|
|
73
|
-
test(
|
|
73
|
+
test("nextInt() handles single value range", () => {
|
|
74
74
|
const rng = new SeededRandom(42);
|
|
75
75
|
|
|
76
76
|
for (let i = 0; i < 10; i++) {
|
|
@@ -79,9 +79,9 @@ describe('SeededRandom - Deterministic RNG', () => {
|
|
|
79
79
|
}
|
|
80
80
|
});
|
|
81
81
|
|
|
82
|
-
test(
|
|
82
|
+
test("pick() selects from array", () => {
|
|
83
83
|
const rng = new SeededRandom(42);
|
|
84
|
-
const options = [
|
|
84
|
+
const options = ["a", "b", "c", "d", "e"];
|
|
85
85
|
|
|
86
86
|
const selections = new Set<string>();
|
|
87
87
|
for (let i = 0; i < 100; i++) {
|
|
@@ -94,11 +94,11 @@ describe('SeededRandom - Deterministic RNG', () => {
|
|
|
94
94
|
expect(selections.size).toBeGreaterThan(3);
|
|
95
95
|
});
|
|
96
96
|
|
|
97
|
-
test(
|
|
97
|
+
test("pick() is deterministic with same seed", () => {
|
|
98
98
|
const rng1 = new SeededRandom(42);
|
|
99
99
|
const rng2 = new SeededRandom(42);
|
|
100
100
|
|
|
101
|
-
const options = [
|
|
101
|
+
const options = ["a", "b", "c", "d", "e"];
|
|
102
102
|
|
|
103
103
|
const picks1 = [rng1.pick(options), rng1.pick(options), rng1.pick(options)];
|
|
104
104
|
const picks2 = [rng2.pick(options), rng2.pick(options), rng2.pick(options)];
|
|
@@ -106,7 +106,7 @@ describe('SeededRandom - Deterministic RNG', () => {
|
|
|
106
106
|
expect(picks1).toEqual(picks2);
|
|
107
107
|
});
|
|
108
108
|
|
|
109
|
-
test(
|
|
109
|
+
test("nextFloat() produces values in specified range", () => {
|
|
110
110
|
const rng = new SeededRandom(42);
|
|
111
111
|
|
|
112
112
|
for (let i = 0; i < 100; i++) {
|
|
@@ -121,7 +121,7 @@ describe('SeededRandom - Deterministic RNG', () => {
|
|
|
121
121
|
// BenchmarkDataGenerator Tests - Real Class
|
|
122
122
|
// =============================================================================
|
|
123
123
|
|
|
124
|
-
describe(
|
|
124
|
+
describe("BenchmarkDataGenerator - Data Generation", () => {
|
|
125
125
|
const baseConfig: BenchmarkConfig = {
|
|
126
126
|
durationMinutes: 60, // 1 hour
|
|
127
127
|
tickInterval: 3600, // 1 hour ticks
|
|
@@ -131,7 +131,7 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
|
|
|
131
131
|
seed: 12345,
|
|
132
132
|
};
|
|
133
133
|
|
|
134
|
-
test(
|
|
134
|
+
test("generates deterministic data with same seed", async () => {
|
|
135
135
|
const generator1 = new BenchmarkDataGenerator(baseConfig);
|
|
136
136
|
const generator2 = new BenchmarkDataGenerator(baseConfig);
|
|
137
137
|
|
|
@@ -140,25 +140,25 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
|
|
|
140
140
|
|
|
141
141
|
// Same structure
|
|
142
142
|
expect(snapshot1.initialState.predictionMarkets.length).toBe(
|
|
143
|
-
snapshot2.initialState.predictionMarkets.length
|
|
143
|
+
snapshot2.initialState.predictionMarkets.length,
|
|
144
144
|
);
|
|
145
145
|
expect(snapshot1.initialState.perpetualMarkets.length).toBe(
|
|
146
|
-
snapshot2.initialState.perpetualMarkets.length
|
|
146
|
+
snapshot2.initialState.perpetualMarkets.length,
|
|
147
147
|
);
|
|
148
148
|
expect(snapshot1.initialState.agents.length).toBe(
|
|
149
|
-
snapshot2.initialState.agents.length
|
|
149
|
+
snapshot2.initialState.agents.length,
|
|
150
150
|
);
|
|
151
151
|
|
|
152
152
|
// Same content (deterministic)
|
|
153
153
|
expect(snapshot1.initialState.perpetualMarkets[0]?.ticker).toBe(
|
|
154
|
-
snapshot2.initialState.perpetualMarkets[0]?.ticker
|
|
154
|
+
snapshot2.initialState.perpetualMarkets[0]?.ticker,
|
|
155
155
|
);
|
|
156
156
|
expect(snapshot1.initialState.perpetualMarkets[0]?.price).toBe(
|
|
157
|
-
snapshot2.initialState.perpetualMarkets[0]?.price
|
|
157
|
+
snapshot2.initialState.perpetualMarkets[0]?.price,
|
|
158
158
|
);
|
|
159
159
|
});
|
|
160
160
|
|
|
161
|
-
test(
|
|
161
|
+
test("generates correct number of markets", async () => {
|
|
162
162
|
const generator = new BenchmarkDataGenerator(baseConfig);
|
|
163
163
|
const snapshot = await generator.generate();
|
|
164
164
|
|
|
@@ -167,7 +167,7 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
|
|
|
167
167
|
expect(snapshot.initialState.agents.length).toBe(5);
|
|
168
168
|
});
|
|
169
169
|
|
|
170
|
-
test(
|
|
170
|
+
test("generates valid prediction market structure", async () => {
|
|
171
171
|
const generator = new BenchmarkDataGenerator(baseConfig);
|
|
172
172
|
const snapshot = await generator.generate();
|
|
173
173
|
|
|
@@ -184,32 +184,32 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
|
|
|
184
184
|
}
|
|
185
185
|
});
|
|
186
186
|
|
|
187
|
-
test(
|
|
187
|
+
test("generates valid perpetual market structure", async () => {
|
|
188
188
|
const generator = new BenchmarkDataGenerator(baseConfig);
|
|
189
189
|
const snapshot = await generator.generate();
|
|
190
190
|
|
|
191
191
|
for (const market of snapshot.initialState.perpetualMarkets) {
|
|
192
192
|
expect(market.ticker).toBeDefined();
|
|
193
193
|
expect(market.price).toBeGreaterThan(0);
|
|
194
|
-
expect(typeof market.priceChange24h).toBe(
|
|
194
|
+
expect(typeof market.priceChange24h).toBe("number");
|
|
195
195
|
expect(market.volume24h).toBeGreaterThanOrEqual(0);
|
|
196
|
-
expect(typeof market.fundingRate).toBe(
|
|
196
|
+
expect(typeof market.fundingRate).toBe("number");
|
|
197
197
|
}
|
|
198
198
|
});
|
|
199
199
|
|
|
200
|
-
test(
|
|
200
|
+
test("generates valid agent structure", async () => {
|
|
201
201
|
const generator = new BenchmarkDataGenerator(baseConfig);
|
|
202
202
|
const snapshot = await generator.generate();
|
|
203
203
|
|
|
204
204
|
for (const agent of snapshot.initialState.agents) {
|
|
205
205
|
expect(agent.id).toBeDefined();
|
|
206
206
|
expect(agent.name).toBeDefined();
|
|
207
|
-
expect(typeof agent.reputation).toBe(
|
|
208
|
-
expect(typeof agent.totalPnl).toBe(
|
|
207
|
+
expect(typeof agent.reputation).toBe("number");
|
|
208
|
+
expect(typeof agent.totalPnl).toBe("number");
|
|
209
209
|
}
|
|
210
210
|
});
|
|
211
211
|
|
|
212
|
-
test(
|
|
212
|
+
test("generates ticks for duration", async () => {
|
|
213
213
|
const generator = new BenchmarkDataGenerator({
|
|
214
214
|
...baseConfig,
|
|
215
215
|
durationMinutes: 180, // 3 hours
|
|
@@ -221,7 +221,7 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
|
|
|
221
221
|
expect(snapshot.ticks.length).toBe(3);
|
|
222
222
|
});
|
|
223
223
|
|
|
224
|
-
test(
|
|
224
|
+
test("different seeds produce different data", async () => {
|
|
225
225
|
const generator1 = new BenchmarkDataGenerator({ ...baseConfig, seed: 111 });
|
|
226
226
|
const generator2 = new BenchmarkDataGenerator({ ...baseConfig, seed: 222 });
|
|
227
227
|
|
|
@@ -240,7 +240,7 @@ describe('BenchmarkDataGenerator - Data Generation', () => {
|
|
|
240
240
|
// BenchmarkDataGenerator - Causal Simulation Mode
|
|
241
241
|
// =============================================================================
|
|
242
242
|
|
|
243
|
-
describe(
|
|
243
|
+
describe("BenchmarkDataGenerator - Causal Simulation", () => {
|
|
244
244
|
const causalConfig: BenchmarkConfig = {
|
|
245
245
|
durationMinutes: 24 * 60, // 1 day
|
|
246
246
|
tickInterval: 3600, // Hourly (required for causal)
|
|
@@ -251,18 +251,18 @@ describe('BenchmarkDataGenerator - Causal Simulation', () => {
|
|
|
251
251
|
useCausalSimulation: true,
|
|
252
252
|
};
|
|
253
253
|
|
|
254
|
-
test(
|
|
254
|
+
test("causal mode generates hidden narrative facts", async () => {
|
|
255
255
|
const generator = new BenchmarkDataGenerator(causalConfig);
|
|
256
256
|
const snapshot = await generator.generate();
|
|
257
257
|
|
|
258
258
|
expect(snapshot.groundTruth).toBeDefined();
|
|
259
259
|
expect(snapshot.groundTruth.hiddenNarrativeFacts).toBeDefined();
|
|
260
|
-
expect(snapshot.groundTruth.hiddenNarrativeFacts
|
|
261
|
-
0
|
|
260
|
+
expect(snapshot.groundTruth.hiddenNarrativeFacts?.length).toBeGreaterThan(
|
|
261
|
+
0,
|
|
262
262
|
);
|
|
263
263
|
});
|
|
264
264
|
|
|
265
|
-
test(
|
|
265
|
+
test("hidden narrative facts have valid structure", async () => {
|
|
266
266
|
const generator = new BenchmarkDataGenerator(causalConfig);
|
|
267
267
|
const snapshot = await generator.generate();
|
|
268
268
|
|
|
@@ -271,39 +271,39 @@ describe('BenchmarkDataGenerator - Causal Simulation', () => {
|
|
|
271
271
|
expect(fact.fact).toBeDefined();
|
|
272
272
|
expect(fact.affectsTickers).toBeDefined();
|
|
273
273
|
expect(fact.affectsTickers.length).toBeGreaterThan(0);
|
|
274
|
-
expect([
|
|
274
|
+
expect(["positive", "negative"]).toContain(fact.sentiment);
|
|
275
275
|
expect(fact.eventSchedule).toBeDefined();
|
|
276
276
|
expect(fact.eventSchedule.length).toBeGreaterThan(0);
|
|
277
277
|
}
|
|
278
278
|
});
|
|
279
279
|
|
|
280
|
-
test(
|
|
280
|
+
test("causal events are scheduled correctly", async () => {
|
|
281
281
|
const generator = new BenchmarkDataGenerator(causalConfig);
|
|
282
282
|
const snapshot = await generator.generate();
|
|
283
283
|
|
|
284
284
|
expect(snapshot.groundTruth.causalEvents).toBeDefined();
|
|
285
|
-
expect(snapshot.groundTruth.causalEvents
|
|
285
|
+
expect(snapshot.groundTruth.causalEvents?.length).toBeGreaterThan(0);
|
|
286
286
|
|
|
287
287
|
// Verify each causal event has required fields
|
|
288
288
|
for (const event of snapshot.groundTruth.causalEvents!) {
|
|
289
289
|
expect(event.tick).toBeDefined();
|
|
290
290
|
expect(event.eventType).toBeDefined();
|
|
291
291
|
expect(event.affectedTickers.length).toBeGreaterThan(0);
|
|
292
|
-
expect([
|
|
292
|
+
expect(["low", "medium", "high"]).toContain(event.volatilityBucket);
|
|
293
293
|
}
|
|
294
294
|
});
|
|
295
295
|
|
|
296
|
-
test(
|
|
296
|
+
test("causal mode generates market outcomes", async () => {
|
|
297
297
|
const generator = new BenchmarkDataGenerator(causalConfig);
|
|
298
298
|
const snapshot = await generator.generate();
|
|
299
299
|
|
|
300
300
|
expect(snapshot.groundTruth.marketOutcomes).toBeDefined();
|
|
301
301
|
expect(
|
|
302
|
-
Object.keys(snapshot.groundTruth.marketOutcomes).length
|
|
302
|
+
Object.keys(snapshot.groundTruth.marketOutcomes).length,
|
|
303
303
|
).toBeGreaterThan(0);
|
|
304
304
|
});
|
|
305
305
|
|
|
306
|
-
test(
|
|
306
|
+
test("ground truth includes price history", async () => {
|
|
307
307
|
const generator = new BenchmarkDataGenerator(causalConfig);
|
|
308
308
|
const snapshot = await generator.generate();
|
|
309
309
|
|
|
@@ -322,8 +322,8 @@ describe('BenchmarkDataGenerator - Causal Simulation', () => {
|
|
|
322
322
|
// BenchmarkConfig Validation Tests
|
|
323
323
|
// =============================================================================
|
|
324
324
|
|
|
325
|
-
describe(
|
|
326
|
-
test(
|
|
325
|
+
describe("BenchmarkConfig - Validation", () => {
|
|
326
|
+
test("valid config creates generator without error", () => {
|
|
327
327
|
const config: BenchmarkConfig = {
|
|
328
328
|
durationMinutes: 30 * 24 * 60,
|
|
329
329
|
tickInterval: 3600,
|
|
@@ -336,7 +336,7 @@ describe('BenchmarkConfig - Validation', () => {
|
|
|
336
336
|
expect(() => new BenchmarkDataGenerator(config)).not.toThrow();
|
|
337
337
|
});
|
|
338
338
|
|
|
339
|
-
test(
|
|
339
|
+
test("config with zero markets is valid (edge case)", async () => {
|
|
340
340
|
const config: BenchmarkConfig = {
|
|
341
341
|
durationMinutes: 60,
|
|
342
342
|
tickInterval: 3600,
|
|
@@ -353,7 +353,7 @@ describe('BenchmarkConfig - Validation', () => {
|
|
|
353
353
|
expect(snapshot.initialState.perpetualMarkets.length).toBe(1);
|
|
354
354
|
});
|
|
355
355
|
|
|
356
|
-
test(
|
|
356
|
+
test("calculates total ticks correctly", async () => {
|
|
357
357
|
const config: BenchmarkConfig = {
|
|
358
358
|
durationMinutes: 24 * 60, // 1 day
|
|
359
359
|
tickInterval: 3600, // 1 hour
|
|
@@ -370,7 +370,7 @@ describe('BenchmarkConfig - Validation', () => {
|
|
|
370
370
|
expect(snapshot.ticks.length).toBe(expectedTicks);
|
|
371
371
|
});
|
|
372
372
|
|
|
373
|
-
test(
|
|
373
|
+
test("short duration with fast ticks", async () => {
|
|
374
374
|
const config: BenchmarkConfig = {
|
|
375
375
|
durationMinutes: 10, // 10 minutes
|
|
376
376
|
tickInterval: 60, // 1 minute
|
|
@@ -391,7 +391,7 @@ describe('BenchmarkConfig - Validation', () => {
|
|
|
391
391
|
// Comparison Logic Tests - Using Real Types
|
|
392
392
|
// =============================================================================
|
|
393
393
|
|
|
394
|
-
describe(
|
|
394
|
+
describe("Benchmark Comparison Logic", () => {
|
|
395
395
|
// Test the comparison calculation logic that would be used in runMultiple
|
|
396
396
|
interface RunResult {
|
|
397
397
|
id: string;
|
|
@@ -406,8 +406,8 @@ describe('Benchmark Comparison Logic', () => {
|
|
|
406
406
|
avgPnl: 0,
|
|
407
407
|
avgAccuracy: 0,
|
|
408
408
|
avgOptimality: 0,
|
|
409
|
-
bestRun:
|
|
410
|
-
worstRun:
|
|
409
|
+
bestRun: "",
|
|
410
|
+
worstRun: "",
|
|
411
411
|
};
|
|
412
412
|
}
|
|
413
413
|
|
|
@@ -418,17 +418,17 @@ describe('Benchmark Comparison Logic', () => {
|
|
|
418
418
|
runs.reduce((sum, r) => sum + r.optimality, 0) / runs.length;
|
|
419
419
|
const bestRun = runs.reduce((best, r) => (r.pnl > best.pnl ? r : best)).id;
|
|
420
420
|
const worstRun = runs.reduce((worst, r) =>
|
|
421
|
-
r.pnl < worst.pnl ? r : worst
|
|
421
|
+
r.pnl < worst.pnl ? r : worst,
|
|
422
422
|
).id;
|
|
423
423
|
|
|
424
424
|
return { avgPnl, avgAccuracy, avgOptimality, bestRun, worstRun };
|
|
425
425
|
}
|
|
426
426
|
|
|
427
|
-
test(
|
|
427
|
+
test("calculates average metrics across runs", () => {
|
|
428
428
|
const runs: RunResult[] = [
|
|
429
|
-
{ id:
|
|
430
|
-
{ id:
|
|
431
|
-
{ id:
|
|
429
|
+
{ id: "run-1", pnl: 100, accuracy: 0.6, optimality: 0.7 },
|
|
430
|
+
{ id: "run-2", pnl: 200, accuracy: 0.8, optimality: 0.8 },
|
|
431
|
+
{ id: "run-3", pnl: 150, accuracy: 0.7, optimality: 0.75 },
|
|
432
432
|
];
|
|
433
433
|
|
|
434
434
|
const comparison = calculateComparison(runs);
|
|
@@ -438,51 +438,51 @@ describe('Benchmark Comparison Logic', () => {
|
|
|
438
438
|
expect(comparison.avgOptimality).toBe(0.75);
|
|
439
439
|
});
|
|
440
440
|
|
|
441
|
-
test(
|
|
441
|
+
test("identifies best and worst runs", () => {
|
|
442
442
|
const runs: RunResult[] = [
|
|
443
|
-
{ id:
|
|
444
|
-
{ id:
|
|
445
|
-
{ id:
|
|
443
|
+
{ id: "run-1", pnl: 100, accuracy: 0.6, optimality: 0.7 },
|
|
444
|
+
{ id: "run-2", pnl: 200, accuracy: 0.8, optimality: 0.8 },
|
|
445
|
+
{ id: "run-3", pnl: 50, accuracy: 0.5, optimality: 0.6 },
|
|
446
446
|
];
|
|
447
447
|
|
|
448
448
|
const comparison = calculateComparison(runs);
|
|
449
449
|
|
|
450
|
-
expect(comparison.bestRun).toBe(
|
|
451
|
-
expect(comparison.worstRun).toBe(
|
|
450
|
+
expect(comparison.bestRun).toBe("run-2");
|
|
451
|
+
expect(comparison.worstRun).toBe("run-3");
|
|
452
452
|
});
|
|
453
453
|
|
|
454
|
-
test(
|
|
454
|
+
test("handles negative PnL values", () => {
|
|
455
455
|
const runs: RunResult[] = [
|
|
456
|
-
{ id:
|
|
457
|
-
{ id:
|
|
458
|
-
{ id:
|
|
456
|
+
{ id: "run-1", pnl: -50, accuracy: 0.4, optimality: 0.3 },
|
|
457
|
+
{ id: "run-2", pnl: 50, accuracy: 0.6, optimality: 0.6 },
|
|
458
|
+
{ id: "run-3", pnl: -100, accuracy: 0.3, optimality: 0.2 },
|
|
459
459
|
];
|
|
460
460
|
|
|
461
461
|
const comparison = calculateComparison(runs);
|
|
462
462
|
|
|
463
|
-
expect(comparison.bestRun).toBe(
|
|
464
|
-
expect(comparison.worstRun).toBe(
|
|
463
|
+
expect(comparison.bestRun).toBe("run-2");
|
|
464
|
+
expect(comparison.worstRun).toBe("run-3");
|
|
465
465
|
expect(comparison.avgPnl).toBeCloseTo(-33.33, 1);
|
|
466
466
|
});
|
|
467
467
|
|
|
468
|
-
test(
|
|
468
|
+
test("handles single run", () => {
|
|
469
469
|
const runs: RunResult[] = [
|
|
470
|
-
{ id:
|
|
470
|
+
{ id: "run-1", pnl: 100, accuracy: 0.7, optimality: 0.8 },
|
|
471
471
|
];
|
|
472
472
|
|
|
473
473
|
const comparison = calculateComparison(runs);
|
|
474
474
|
|
|
475
475
|
expect(comparison.avgPnl).toBe(100);
|
|
476
|
-
expect(comparison.bestRun).toBe(
|
|
477
|
-
expect(comparison.worstRun).toBe(
|
|
476
|
+
expect(comparison.bestRun).toBe("run-1");
|
|
477
|
+
expect(comparison.worstRun).toBe("run-1");
|
|
478
478
|
});
|
|
479
479
|
|
|
480
|
-
test(
|
|
480
|
+
test("handles empty runs array", () => {
|
|
481
481
|
const comparison = calculateComparison([]);
|
|
482
482
|
|
|
483
483
|
expect(comparison.avgPnl).toBe(0);
|
|
484
|
-
expect(comparison.bestRun).toBe(
|
|
485
|
-
expect(comparison.worstRun).toBe(
|
|
484
|
+
expect(comparison.bestRun).toBe("");
|
|
485
|
+
expect(comparison.worstRun).toBe("");
|
|
486
486
|
});
|
|
487
487
|
});
|
|
488
488
|
|
|
@@ -490,7 +490,7 @@ describe('Benchmark Comparison Logic', () => {
|
|
|
490
490
|
// Alpha Calculation (Excess Return)
|
|
491
491
|
// =============================================================================
|
|
492
492
|
|
|
493
|
-
describe(
|
|
493
|
+
describe("Alpha Calculation", () => {
|
|
494
494
|
function calculateAlpha(baselinePnl: number, challengerPnl: number) {
|
|
495
495
|
const alpha = challengerPnl - baselinePnl;
|
|
496
496
|
const alphaPercent =
|
|
@@ -502,31 +502,31 @@ describe('Alpha Calculation', () => {
|
|
|
502
502
|
return { alpha, alphaPercent };
|
|
503
503
|
}
|
|
504
504
|
|
|
505
|
-
test(
|
|
505
|
+
test("positive alpha when outperforming", () => {
|
|
506
506
|
const result = calculateAlpha(100, 150);
|
|
507
507
|
expect(result.alpha).toBe(50);
|
|
508
508
|
expect(result.alphaPercent).toBe(50);
|
|
509
509
|
});
|
|
510
510
|
|
|
511
|
-
test(
|
|
511
|
+
test("negative alpha when underperforming", () => {
|
|
512
512
|
const result = calculateAlpha(150, 100);
|
|
513
513
|
expect(result.alpha).toBe(-50);
|
|
514
514
|
expect(result.alphaPercent).toBeCloseTo(-33.33, 1);
|
|
515
515
|
});
|
|
516
516
|
|
|
517
|
-
test(
|
|
517
|
+
test("zero alpha when equal performance", () => {
|
|
518
518
|
const result = calculateAlpha(100, 100);
|
|
519
519
|
expect(result.alpha).toBe(0);
|
|
520
520
|
expect(result.alphaPercent).toBe(0);
|
|
521
521
|
});
|
|
522
522
|
|
|
523
|
-
test(
|
|
523
|
+
test("handles baseline of zero", () => {
|
|
524
524
|
const result = calculateAlpha(0, 100);
|
|
525
525
|
expect(result.alpha).toBe(100);
|
|
526
526
|
expect(result.alphaPercent).toBe(Infinity);
|
|
527
527
|
});
|
|
528
528
|
|
|
529
|
-
test(
|
|
529
|
+
test("handles both zero", () => {
|
|
530
530
|
const result = calculateAlpha(0, 0);
|
|
531
531
|
expect(result.alpha).toBe(0);
|
|
532
532
|
expect(result.alphaPercent).toBe(0);
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
import { describe, expect, it } from
|
|
2
|
-
import type { BenchmarkGameSnapshot } from
|
|
3
|
-
import { MetricsVisualizer } from
|
|
4
|
-
import { SimulationEngine, type SimulationResult } from
|
|
1
|
+
import { describe, expect, it } from "bun:test";
|
|
2
|
+
import type { BenchmarkGameSnapshot } from "../BenchmarkDataGenerator";
|
|
3
|
+
import { MetricsVisualizer } from "../MetricsVisualizer";
|
|
4
|
+
import { SimulationEngine, type SimulationResult } from "../SimulationEngine";
|
|
5
5
|
|
|
6
|
-
describe(
|
|
6
|
+
describe("Head-to-Head Benchmark Infrastructure", () => {
|
|
7
7
|
// 1. Test Simulation Engine PnL History Tracking
|
|
8
|
-
describe(
|
|
9
|
-
it(
|
|
8
|
+
describe("SimulationEngine PnL History", () => {
|
|
9
|
+
it("should initialize with empty pnlHistory and return it after run()", async () => {
|
|
10
10
|
const mockSnapshot = {
|
|
11
|
-
id:
|
|
11
|
+
id: "test",
|
|
12
12
|
ticks: [],
|
|
13
13
|
initialState: {
|
|
14
14
|
predictionMarkets: [],
|
|
@@ -24,7 +24,7 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
|
|
|
24
24
|
|
|
25
25
|
const engine = new SimulationEngine({
|
|
26
26
|
snapshot: mockSnapshot,
|
|
27
|
-
agentId:
|
|
27
|
+
agentId: "test-agent",
|
|
28
28
|
fastForward: true,
|
|
29
29
|
});
|
|
30
30
|
|
|
@@ -36,16 +36,16 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
|
|
|
36
36
|
});
|
|
37
37
|
|
|
38
38
|
// 2. Test MetricsVisualizer Logic
|
|
39
|
-
describe(
|
|
39
|
+
describe("MetricsVisualizer Comparison Logic", () => {
|
|
40
40
|
// Mock Result Helper
|
|
41
41
|
const createMockResult = (
|
|
42
42
|
id: string,
|
|
43
43
|
pnl: number,
|
|
44
|
-
history: number[]
|
|
44
|
+
history: number[],
|
|
45
45
|
): SimulationResult => ({
|
|
46
46
|
id,
|
|
47
47
|
agentId: id,
|
|
48
|
-
benchmarkId:
|
|
48
|
+
benchmarkId: "bench-1",
|
|
49
49
|
startTime: 0,
|
|
50
50
|
endTime: 1000,
|
|
51
51
|
ticksProcessed: history.length,
|
|
@@ -75,13 +75,13 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
|
|
|
75
75
|
timing: { totalDuration: 0, avgResponseTime: 0, maxResponseTime: 0 },
|
|
76
76
|
optimalityScore: 50,
|
|
77
77
|
},
|
|
78
|
-
trajectory: { states: [], actions: [], rewards: [], windowId:
|
|
78
|
+
trajectory: { states: [], actions: [], rewards: [], windowId: "" },
|
|
79
79
|
pnlHistory: history.map((val, idx) => ({ tick: idx, pnl: val })),
|
|
80
80
|
});
|
|
81
81
|
|
|
82
|
-
it(
|
|
83
|
-
const baseline = createMockResult(
|
|
84
|
-
const challenger = createMockResult(
|
|
82
|
+
it("should correctly merge PnL histories of equal length", () => {
|
|
83
|
+
const baseline = createMockResult("baseline", 100, [10, 50, 100]);
|
|
84
|
+
const challenger = createMockResult("challenger", 200, [20, 100, 200]);
|
|
85
85
|
|
|
86
86
|
// Use public static method
|
|
87
87
|
const history = MetricsVisualizer.mergePnlHistory(baseline, challenger);
|
|
@@ -90,11 +90,11 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
|
|
|
90
90
|
expect(history[2]).toEqual({ tick: 2, baseline: 100, challenger: 200 });
|
|
91
91
|
});
|
|
92
92
|
|
|
93
|
-
it(
|
|
93
|
+
it("should handle unequal history lengths (fill with final value)", () => {
|
|
94
94
|
// Baseline died early (e.g., bankruptcy or crash)
|
|
95
|
-
const baseline = createMockResult(
|
|
95
|
+
const baseline = createMockResult("baseline", -50, [10, -50]);
|
|
96
96
|
// Challenger kept going
|
|
97
|
-
const challenger = createMockResult(
|
|
97
|
+
const challenger = createMockResult("challenger", 100, [20, 60, 80, 100]);
|
|
98
98
|
|
|
99
99
|
const history = MetricsVisualizer.mergePnlHistory(baseline, challenger);
|
|
100
100
|
|
|
@@ -109,18 +109,18 @@ describe('Head-to-Head Benchmark Infrastructure', () => {
|
|
|
109
109
|
expect(history[3]).toEqual({ tick: 3, baseline: -50, challenger: 100 });
|
|
110
110
|
});
|
|
111
111
|
|
|
112
|
-
it(
|
|
113
|
-
const baseline = createMockResult(
|
|
114
|
-
const challenger = createMockResult(
|
|
112
|
+
it("should generate ASCII chart string", () => {
|
|
113
|
+
const baseline = createMockResult("baseline", 100, [10, 100]);
|
|
114
|
+
const challenger = createMockResult("challenger", 200, [20, 200]);
|
|
115
115
|
|
|
116
116
|
const chart = MetricsVisualizer.generateAsciiComparison(
|
|
117
117
|
baseline,
|
|
118
|
-
challenger
|
|
118
|
+
challenger,
|
|
119
119
|
);
|
|
120
120
|
|
|
121
|
-
expect(chart).toContain(
|
|
122
|
-
expect(chart).toContain(
|
|
123
|
-
expect(chart).toContain(
|
|
121
|
+
expect(chart).toContain("HEAD-TO-HEAD RESULTS");
|
|
122
|
+
expect(chart).toContain("WINNER: Challenger");
|
|
123
|
+
expect(chart).toContain("Alpha Generated: +$100.00");
|
|
124
124
|
});
|
|
125
125
|
});
|
|
126
126
|
});
|