@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-lint.log +2 -0
- package/.turbo/turbo-typecheck.log +1 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/adapter.js +59 -0
- package/dist/archetypes/ArchetypeConfigService.js +510 -0
- package/dist/archetypes/derive-archetype.js +196 -0
- package/dist/archetypes/index.js +7 -0
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
- package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
- package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
- package/dist/benchmark/BenchmarkDataViewer.js +197 -0
- package/dist/benchmark/BenchmarkHistoryService.js +135 -0
- package/dist/benchmark/BenchmarkRunner.js +483 -0
- package/dist/benchmark/BenchmarkValidator.js +158 -0
- package/dist/benchmark/FastEvalRunner.js +133 -0
- package/dist/benchmark/MetricsValidator.js +104 -0
- package/dist/benchmark/MetricsVisualizer.js +775 -0
- package/dist/benchmark/ModelBenchmarkService.js +433 -0
- package/dist/benchmark/ModelRegistry.js +122 -0
- package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
- package/dist/benchmark/SimulationA2AInterface.js +683 -0
- package/dist/benchmark/SimulationEngine.js +522 -0
- package/dist/benchmark/TaskRunner.js +60 -0
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
- package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
- package/dist/benchmark/index.js +23 -0
- package/dist/benchmark/parseSimulationMetrics.js +86 -0
- package/dist/benchmark/simulation-types.js +1 -0
- package/dist/dependencies.js +197 -0
- package/dist/generation/TrajectoryGenerator.js +244 -0
- package/dist/generation/index.js +6 -0
- package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
- package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
- package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
- package/dist/huggingface/index.js +9 -0
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
- package/dist/index.js +41 -0
- package/dist/init-training.js +43 -0
- package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/types.js +21 -0
- package/dist/rubrics/__tests__/index.test.js +150 -0
- package/dist/rubrics/ass-kisser.js +83 -0
- package/dist/rubrics/degen.js +78 -0
- package/dist/rubrics/goody-twoshoes.js +82 -0
- package/dist/rubrics/index.js +184 -0
- package/dist/rubrics/information-trader.js +82 -0
- package/dist/rubrics/infosec.js +99 -0
- package/dist/rubrics/liar.js +102 -0
- package/dist/rubrics/perps-trader.js +85 -0
- package/dist/rubrics/researcher.js +79 -0
- package/dist/rubrics/scammer.js +80 -0
- package/dist/rubrics/social-butterfly.js +71 -0
- package/dist/rubrics/super-predictor.js +95 -0
- package/dist/rubrics/trader.js +65 -0
- package/dist/scoring/ArchetypeScoringService.js +301 -0
- package/dist/scoring/JudgePromptBuilder.js +401 -0
- package/dist/scoring/LLMJudgeCache.js +263 -0
- package/dist/scoring/index.js +8 -0
- package/dist/training/AutomationPipeline.js +714 -0
- package/dist/training/BenchmarkService.js +370 -0
- package/dist/training/ConfigValidator.js +153 -0
- package/dist/training/MarketOutcomesTracker.js +142 -0
- package/dist/training/ModelDeployer.js +128 -0
- package/dist/training/ModelFetcher.js +48 -0
- package/dist/training/ModelSelectionService.js +248 -0
- package/dist/training/ModelUsageVerifier.js +106 -0
- package/dist/training/MultiModelOrchestrator.js +349 -0
- package/dist/training/RLModelConfig.js +295 -0
- package/dist/training/RewardBackpropagationService.js +117 -0
- package/dist/training/RulerScoringService.js +450 -0
- package/dist/training/TrainingMonitor.js +108 -0
- package/dist/training/TrajectoryRecorder.js +281 -0
- package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
- package/dist/training/index.js +30 -0
- package/dist/training/logRLConfig.js +29 -0
- package/dist/training/pipeline.js +80 -0
- package/dist/training/storage/ModelStorageService.js +190 -0
- package/dist/training/storage/TrainingDataArchiver.js +136 -0
- package/dist/training/storage/index.js +7 -0
- package/dist/training/types.js +6 -0
- package/dist/training/window-utils.js +100 -0
- package/dist/utils/index.js +73 -0
- package/dist/utils/logger.js +55 -0
- package/dist/utils/snowflake.js +15 -0
- package/dist/utils/synthetic-detector.js +67 -0
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773742857616.json +38 -0
- package/research-output/training-runs/training-run-1773742946977.json +38 -0
- package/research-output/training-runs/training-run-1773743278891.json +38 -0
- package/research-output/training-runs/training-run-1773743409754.json +38 -0
- package/research-output/training-runs/training-run-1773743651086.json +38 -0
- package/research-output/training-runs/training-run-1773743782883.json +38 -0
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BenchmarkRunner Tests
|
|
3
|
+
*
|
|
4
|
+
* Tests the BenchmarkRunner and related benchmark infrastructure.
|
|
5
|
+
* Tests actual classes and functions, not inline mock implementations.
|
|
6
|
+
*/
|
|
7
|
+
import { describe, expect, test } from "bun:test";
|
|
8
|
+
import { BenchmarkDataGenerator, SeededRandom, } from "../BenchmarkDataGenerator";
|
|
9
|
+
// =============================================================================
|
|
10
|
+
// SeededRandom Tests - Real Class
|
|
11
|
+
// =============================================================================
|
|
12
|
+
/**
 * SeededRandom suite: checks that a fixed seed reproduces the same draws and
 * that each draw method exercised below stays inside the bounds asserted here.
 */
describe("SeededRandom - Deterministic RNG", () => {
  // Invokes `body` `times` times, in order.
  const repeat = (times, body) => {
    for (let round = 0; round < times; round += 1) {
      body();
    }
  };

  // Collects `count` consecutive results of `draw()`.
  const collect = (count, draw) => Array.from({ length: count }, () => draw());

  test("same seed produces same sequence", () => {
    const first = new SeededRandom(12345);
    const second = new SeededRandom(12345);

    const firstSequence = collect(5, () => first.next());
    const secondSequence = collect(5, () => second.next());

    expect(firstSequence).toEqual(secondSequence);
  });

  test("different seeds produce different sequences", () => {
    const first = new SeededRandom(12345);
    const second = new SeededRandom(54321);

    const fromFirst = first.next();
    const fromSecond = second.next();

    expect(fromFirst).not.toBe(fromSecond);
  });

  test("next() produces values in [0, 1) range", () => {
    const rng = new SeededRandom(42);

    repeat(1000, () => {
      const sample = rng.next();
      expect(sample).toBeGreaterThanOrEqual(0);
      expect(sample).toBeLessThan(1);
    });
  });

  test("nextInt() produces values in specified range", () => {
    const rng = new SeededRandom(42);

    repeat(100, () => {
      const sample = rng.nextInt(10, 20);
      expect(sample).toBeGreaterThanOrEqual(10);
      expect(sample).toBeLessThanOrEqual(20);
      expect(Number.isInteger(sample)).toBe(true);
    });
  });

  test("nextInt() handles single value range", () => {
    const rng = new SeededRandom(42);

    repeat(10, () => {
      const sample = rng.nextInt(5, 5);
      expect(sample).toBe(5);
    });
  });

  test("pick() selects from array", () => {
    const rng = new SeededRandom(42);
    const options = ["a", "b", "c", "d", "e"];
    const seen = new Set();

    repeat(100, () => {
      const choice = rng.pick(options);
      expect(options).toContain(choice);
      seen.add(choice);
    });

    // With 100 attempts, we should hit most options
    expect(seen.size).toBeGreaterThan(3);
  });

  test("pick() is deterministic with same seed", () => {
    const first = new SeededRandom(42);
    const second = new SeededRandom(42);
    const options = ["a", "b", "c", "d", "e"];

    const firstPicks = collect(3, () => first.pick(options));
    const secondPicks = collect(3, () => second.pick(options));

    expect(firstPicks).toEqual(secondPicks);
  });

  test("nextFloat() produces values in specified range", () => {
    const rng = new SeededRandom(42);

    repeat(100, () => {
      const sample = rng.nextFloat(5.5, 10.5);
      expect(sample).toBeGreaterThanOrEqual(5.5);
      expect(sample).toBeLessThanOrEqual(10.5);
    });
  });
});
|
|
92
|
+
// =============================================================================
|
|
93
|
+
// BenchmarkDataGenerator Tests - Real Class
|
|
94
|
+
// =============================================================================
|
|
95
|
+
/**
 * BenchmarkDataGenerator suite: checks the size and field shape of the
 * generated snapshot, the tick count, and that output depends on the seed.
 */
describe("BenchmarkDataGenerator - Data Generation", () => {
  const baseConfig = {
    durationMinutes: 60, // 1 hour
    tickInterval: 3600, // 1 hour ticks
    numPredictionMarkets: 2,
    numPerpetualMarkets: 3,
    numAgents: 5,
    seed: 12345,
  };

  // Builds a generator for `config` and resolves to the snapshot it generates.
  const snapshotFor = (config) => new BenchmarkDataGenerator(config).generate();

  test("generates deterministic data with same seed", async () => {
    const firstGenerator = new BenchmarkDataGenerator(baseConfig);
    const secondGenerator = new BenchmarkDataGenerator(baseConfig);
    const first = await firstGenerator.generate();
    const second = await secondGenerator.generate();

    // Same structure
    expect(first.initialState.predictionMarkets.length).toBe(
      second.initialState.predictionMarkets.length,
    );
    expect(first.initialState.perpetualMarkets.length).toBe(
      second.initialState.perpetualMarkets.length,
    );
    expect(first.initialState.agents.length).toBe(
      second.initialState.agents.length,
    );

    // Same content (deterministic)
    expect(first.initialState.perpetualMarkets[0]?.ticker).toBe(
      second.initialState.perpetualMarkets[0]?.ticker,
    );
    expect(first.initialState.perpetualMarkets[0]?.price).toBe(
      second.initialState.perpetualMarkets[0]?.price,
    );
  });

  test("generates correct number of markets", async () => {
    const snapshot = await snapshotFor(baseConfig);

    expect(snapshot.initialState.predictionMarkets.length).toBe(2);
    expect(snapshot.initialState.perpetualMarkets.length).toBe(3);
    expect(snapshot.initialState.agents.length).toBe(5);
  });

  test("generates valid prediction market structure", async () => {
    const snapshot = await snapshotFor(baseConfig);

    for (const predictionMarket of snapshot.initialState.predictionMarkets) {
      expect(predictionMarket.id).toBeDefined();
      expect(predictionMarket.question).toBeDefined();
      expect(predictionMarket.yesPrice).toBeGreaterThanOrEqual(0);
      expect(predictionMarket.yesPrice).toBeLessThanOrEqual(1);
      expect(predictionMarket.noPrice).toBeGreaterThanOrEqual(0);
      expect(predictionMarket.noPrice).toBeLessThanOrEqual(1);
      expect(predictionMarket.yesPrice + predictionMarket.noPrice).toBeCloseTo(1, 1);
      expect(predictionMarket.resolved).toBe(false);
      expect(predictionMarket.liquidity).toBeGreaterThan(0);
    }
  });

  test("generates valid perpetual market structure", async () => {
    const snapshot = await snapshotFor(baseConfig);

    for (const perpMarket of snapshot.initialState.perpetualMarkets) {
      expect(perpMarket.ticker).toBeDefined();
      expect(perpMarket.price).toBeGreaterThan(0);
      expect(typeof perpMarket.priceChange24h).toBe("number");
      expect(perpMarket.volume24h).toBeGreaterThanOrEqual(0);
      expect(typeof perpMarket.fundingRate).toBe("number");
    }
  });

  test("generates valid agent structure", async () => {
    const snapshot = await snapshotFor(baseConfig);

    for (const participant of snapshot.initialState.agents) {
      expect(participant.id).toBeDefined();
      expect(participant.name).toBeDefined();
      expect(typeof participant.reputation).toBe("number");
      expect(typeof participant.totalPnl).toBe("number");
    }
  });

  test("generates ticks for duration", async () => {
    const threeHourConfig = {
      ...baseConfig,
      durationMinutes: 180, // 3 hours
      tickInterval: 3600, // 1 hour
    };
    const snapshot = await snapshotFor(threeHourConfig);

    // 3 hours / 1 hour per tick = 3 ticks
    expect(snapshot.ticks.length).toBe(3);
  });

  test("different seeds produce different data", async () => {
    const firstGenerator = new BenchmarkDataGenerator({ ...baseConfig, seed: 111 });
    const secondGenerator = new BenchmarkDataGenerator({ ...baseConfig, seed: 222 });
    const first = await firstGenerator.generate();
    const second = await secondGenerator.generate();

    // Prices should differ with different seeds
    const firstPrice = first.initialState.perpetualMarkets[0]?.price;
    const secondPrice = second.initialState.perpetualMarkets[0]?.price;
    expect(firstPrice).not.toBe(secondPrice);
  });
});
|
|
181
|
+
// =============================================================================
|
|
182
|
+
// BenchmarkDataGenerator - Causal Simulation Mode
|
|
183
|
+
// =============================================================================
|
|
184
|
+
/**
 * Causal-simulation suite: with `useCausalSimulation` enabled, checks that the
 * snapshot's ground truth carries hidden narrative facts, causal events,
 * market outcomes and per-ticker price history.
 */
describe("BenchmarkDataGenerator - Causal Simulation", () => {
  const causalConfig = {
    durationMinutes: 24 * 60, // 1 day
    tickInterval: 3600, // Hourly (required for causal)
    numPredictionMarkets: 2,
    numPerpetualMarkets: 3,
    numAgents: 5,
    seed: 12345,
    useCausalSimulation: true,
  };

  // Generates a fresh snapshot from `causalConfig` for each test.
  const generateCausalSnapshot = () =>
    new BenchmarkDataGenerator(causalConfig).generate();

  test("causal mode generates hidden narrative facts", async () => {
    const snapshot = await generateCausalSnapshot();

    expect(snapshot.groundTruth).toBeDefined();
    expect(snapshot.groundTruth.hiddenNarrativeFacts).toBeDefined();
    expect(snapshot.groundTruth.hiddenNarrativeFacts?.length).toBeGreaterThan(0);
  });

  test("hidden narrative facts have valid structure", async () => {
    const snapshot = await generateCausalSnapshot();
    const narrativeFacts = snapshot.groundTruth.hiddenNarrativeFacts ?? [];

    for (const narrativeFact of narrativeFacts) {
      expect(narrativeFact.id).toBeDefined();
      expect(narrativeFact.fact).toBeDefined();
      expect(narrativeFact.affectsTickers).toBeDefined();
      expect(narrativeFact.affectsTickers.length).toBeGreaterThan(0);
      expect(["positive", "negative"]).toContain(narrativeFact.sentiment);
      expect(narrativeFact.eventSchedule).toBeDefined();
      expect(narrativeFact.eventSchedule.length).toBeGreaterThan(0);
    }
  });

  test("causal events are scheduled correctly", async () => {
    const snapshot = await generateCausalSnapshot();

    expect(snapshot.groundTruth.causalEvents).toBeDefined();
    expect(snapshot.groundTruth.causalEvents?.length).toBeGreaterThan(0);

    // Verify each causal event has required fields
    const scheduledEvents = snapshot.groundTruth.causalEvents ?? [];
    for (const scheduled of scheduledEvents) {
      expect(scheduled.tick).toBeDefined();
      expect(scheduled.eventType).toBeDefined();
      expect(scheduled.affectedTickers.length).toBeGreaterThan(0);
      expect(["low", "medium", "high"]).toContain(scheduled.volatilityBucket);
    }
  });

  test("causal mode generates market outcomes", async () => {
    const snapshot = await generateCausalSnapshot();

    expect(snapshot.groundTruth.marketOutcomes).toBeDefined();
    const outcomeKeys = Object.keys(snapshot.groundTruth.marketOutcomes);
    expect(outcomeKeys.length).toBeGreaterThan(0);
  });

  test("ground truth includes price history", async () => {
    const snapshot = await generateCausalSnapshot();

    expect(snapshot.groundTruth.priceHistory).toBeDefined();

    // Each perpetual market should have price history
    for (const perpMarket of snapshot.initialState.perpetualMarkets) {
      const tickerHistory = snapshot.groundTruth.priceHistory[perpMarket.ticker];
      expect(tickerHistory).toBeDefined();
      expect(tickerHistory.length).toBeGreaterThan(0);
    }
  });
});
|
|
245
|
+
// =============================================================================
|
|
246
|
+
// BenchmarkConfig Validation Tests
|
|
247
|
+
// =============================================================================
|
|
248
|
+
/**
 * Config suite: checks that a typical config constructs without throwing, that
 * zero prediction markets is accepted, and that the tick count equals
 * duration divided by tick interval for the configs used here.
 */
describe("BenchmarkConfig - Validation", () => {
  test("valid config creates generator without error", () => {
    const monthLongConfig = {
      durationMinutes: 30 * 24 * 60,
      tickInterval: 3600,
      numPredictionMarkets: 5,
      numPerpetualMarkets: 5,
      numAgents: 10,
      seed: 12345,
    };

    const construct = () => new BenchmarkDataGenerator(monthLongConfig);
    expect(construct).not.toThrow();
  });

  test("config with zero markets is valid (edge case)", async () => {
    const noPredictionMarketsConfig = {
      durationMinutes: 60,
      tickInterval: 3600,
      numPredictionMarkets: 0,
      numPerpetualMarkets: 1,
      numAgents: 1,
      seed: 42,
    };

    const snapshot = await new BenchmarkDataGenerator(
      noPredictionMarketsConfig,
    ).generate();

    expect(snapshot.initialState.predictionMarkets.length).toBe(0);
    expect(snapshot.initialState.perpetualMarkets.length).toBe(1);
  });

  test("calculates total ticks correctly", async () => {
    const oneDayConfig = {
      durationMinutes: 24 * 60, // 1 day
      tickInterval: 3600, // 1 hour
      numPredictionMarkets: 2,
      numPerpetualMarkets: 3,
      numAgents: 5,
      seed: 12345,
    };

    const snapshot = await new BenchmarkDataGenerator(oneDayConfig).generate();

    const expectedTicks = Math.floor((24 * 60 * 60) / 3600); // 24 hours
    expect(snapshot.ticks.length).toBe(expectedTicks);
  });

  test("short duration with fast ticks", async () => {
    const tenMinuteConfig = {
      durationMinutes: 10, // 10 minutes
      tickInterval: 60, // 1 minute
      numPredictionMarkets: 1,
      numPerpetualMarkets: 1,
      numAgents: 2,
      seed: 42,
    };

    const snapshot = await new BenchmarkDataGenerator(tenMinuteConfig).generate();

    expect(snapshot.ticks.length).toBe(10);
  });
});
|
|
302
|
+
// =============================================================================
|
|
303
|
+
// Comparison Logic Tests - Local Reference Implementation
|
|
304
|
+
// =============================================================================
|
|
305
|
+
describe("Benchmark Comparison Logic", () => {
|
|
306
|
+
/**
 * Summarises a list of benchmark runs.
 *
 * @param {Array<{id: string, pnl: number, accuracy: number, optimality: number}>} runs
 *   Runs to aggregate.
 * @returns {{avgPnl: number, avgAccuracy: number, avgOptimality: number, bestRun: string, worstRun: string}}
 *   Arithmetic means of the three metrics plus the ids of the runs with the
 *   highest and lowest `pnl`. An empty list yields zeros and empty-string ids.
 */
function calculateComparison(runs) {
  const runCount = runs.length;
  if (runCount === 0) {
    return { avgPnl: 0, avgAccuracy: 0, avgOptimality: 0, bestRun: "", worstRun: "" };
  }

  // Mean of one numeric field across all runs.
  const meanOf = (field) =>
    runs.reduce((total, run) => total + run[field], 0) / runCount;

  // Strict comparisons keep the earliest run when several share the same pnl.
  let top = runs[0];
  let bottom = runs[0];
  for (let index = 1; index < runCount; index += 1) {
    const candidate = runs[index];
    if (candidate.pnl > top.pnl) {
      top = candidate;
    }
    if (candidate.pnl < bottom.pnl) {
      bottom = candidate;
    }
  }

  return {
    avgPnl: meanOf("pnl"),
    avgAccuracy: meanOf("accuracy"),
    avgOptimality: meanOf("optimality"),
    bestRun: top.id,
    worstRun: bottom.id,
  };
}
|
|
323
|
+
// Averages of pnl / accuracy / optimality over three runs.
test("calculates average metrics across runs", () => {
  const { avgPnl, avgAccuracy, avgOptimality } = calculateComparison([
    { id: "run-1", pnl: 100, accuracy: 0.6, optimality: 0.7 },
    { id: "run-2", pnl: 200, accuracy: 0.8, optimality: 0.8 },
    { id: "run-3", pnl: 150, accuracy: 0.7, optimality: 0.75 },
  ]);

  expect(avgPnl).toBe(150);
  expect(avgAccuracy).toBeCloseTo(0.7, 5);
  expect(avgOptimality).toBe(0.75);
});

// Best / worst are chosen by pnl.
test("identifies best and worst runs", () => {
  const { bestRun, worstRun } = calculateComparison([
    { id: "run-1", pnl: 100, accuracy: 0.6, optimality: 0.7 },
    { id: "run-2", pnl: 200, accuracy: 0.8, optimality: 0.8 },
    { id: "run-3", pnl: 50, accuracy: 0.5, optimality: 0.6 },
  ]);

  expect(bestRun).toBe("run-2");
  expect(worstRun).toBe("run-3");
});

test("handles negative PnL values", () => {
  const { bestRun, worstRun, avgPnl } = calculateComparison([
    { id: "run-1", pnl: -50, accuracy: 0.4, optimality: 0.3 },
    { id: "run-2", pnl: 50, accuracy: 0.6, optimality: 0.6 },
    { id: "run-3", pnl: -100, accuracy: 0.3, optimality: 0.2 },
  ]);

  expect(bestRun).toBe("run-2");
  expect(worstRun).toBe("run-3");
  expect(avgPnl).toBeCloseTo(-33.33, 1);
});

// A lone run is both the best and the worst.
test("handles single run", () => {
  const onlyRun = { id: "run-1", pnl: 100, accuracy: 0.7, optimality: 0.8 };
  const { avgPnl, bestRun, worstRun } = calculateComparison([onlyRun]);

  expect(avgPnl).toBe(100);
  expect(bestRun).toBe("run-1");
  expect(worstRun).toBe("run-1");
});

test("handles empty runs array", () => {
  const { avgPnl, bestRun, worstRun } = calculateComparison([]);

  expect(avgPnl).toBe(0);
  expect(bestRun).toBe("");
  expect(worstRun).toBe("");
});
|
|
370
|
+
});
|
|
371
|
+
// =============================================================================
|
|
372
|
+
// Alpha Calculation (Excess Return)
|
|
373
|
+
// =============================================================================
|
|
374
|
+
describe("Alpha Calculation", () => {
|
|
375
|
+
/**
 * Computes the challenger's excess return (alpha) over a baseline.
 *
 * @param {number} baselinePnl - PnL of the baseline run.
 * @param {number} challengerPnl - PnL of the challenger run.
 * @returns {{alpha: number, alphaPercent: number}}
 *   `alpha` is `challengerPnl - baselinePnl`. `alphaPercent` is alpha relative
 *   to the magnitude of the baseline, in percent. When the baseline is zero
 *   the ratio is undefined, so the result is `0` if the challenger is also
 *   zero, otherwise an infinity whose sign matches the sign of alpha.
 */
function calculateAlpha(baselinePnl, challengerPnl) {
  const alpha = challengerPnl - baselinePnl;

  if (baselinePnl !== 0) {
    // Dividing by |baseline| keeps the sign of the percentage equal to the
    // sign of alpha even when the baseline itself is negative.
    return { alpha, alphaPercent: (alpha / Math.abs(baselinePnl)) * 100 };
  }

  if (challengerPnl === 0) {
    return { alpha, alphaPercent: 0 };
  }

  // Zero baseline: report an unbounded change in the direction of the move.
  // A challenger loss must not be reported as +Infinity.
  return { alpha, alphaPercent: alpha < 0 ? -Infinity : Infinity };
}
|
|
384
|
+
// Challenger ahead of baseline.
test("positive alpha when outperforming", () => {
  const { alpha, alphaPercent } = calculateAlpha(100, 150);

  expect(alpha).toBe(50);
  expect(alphaPercent).toBe(50);
});

// Challenger behind baseline.
test("negative alpha when underperforming", () => {
  const { alpha, alphaPercent } = calculateAlpha(150, 100);

  expect(alpha).toBe(-50);
  expect(alphaPercent).toBeCloseTo(-33.33, 1);
});

test("zero alpha when equal performance", () => {
  const { alpha, alphaPercent } = calculateAlpha(100, 100);

  expect(alpha).toBe(0);
  expect(alphaPercent).toBe(0);
});

// A zero baseline has no meaningful percentage; a gain is reported as Infinity.
test("handles baseline of zero", () => {
  const { alpha, alphaPercent } = calculateAlpha(0, 100);

  expect(alpha).toBe(100);
  expect(alphaPercent).toBe(Infinity);
});

test("handles both zero", () => {
  const { alpha, alphaPercent } = calculateAlpha(0, 0);

  expect(alpha).toBe(0);
  expect(alphaPercent).toBe(0);
});
|
|
409
|
+
});
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { describe, expect, it } from "bun:test";
|
|
2
|
+
import { MetricsVisualizer } from "../MetricsVisualizer";
|
|
3
|
+
import { SimulationEngine } from "../SimulationEngine";
|
|
4
|
+
describe("Head-to-Head Benchmark Infrastructure", () => {
|
|
5
|
+
// 1. Test Simulation Engine PnL History Tracking
|
|
6
|
+
/**
 * Runs a SimulationEngine over a snapshot with no ticks and checks that the
 * result returned by `run()` exposes an empty `pnlHistory`.
 */
describe("SimulationEngine PnL History", () => {
  it("should initialize with empty pnlHistory and return it after run()", async () => {
    const emptyInitialState = {
      predictionMarkets: [],
      perpetualMarkets: [],
      agents: [],
    };
    const emptyGroundTruth = {
      marketOutcomes: {},
      priceHistory: {},
      optimalActions: [],
    };
    const mockSnapshot = {
      id: "test",
      ticks: [],
      initialState: emptyInitialState,
      groundTruth: emptyGroundTruth,
    };

    const engine = new SimulationEngine({
      snapshot: mockSnapshot,
      agentId: "test-agent",
      fastForward: true,
    });
    engine.initialize();

    // Use public API - run() returns pnlHistory
    const { pnlHistory } = await engine.run();
    expect(pnlHistory).toEqual([]);
  });
});
|
|
33
|
+
// 2. Test MetricsVisualizer Logic
|
|
34
|
+
describe("MetricsVisualizer Comparison Logic", () => {
|
|
35
|
+
// Mock Result Helper
|
|
36
|
+
/**
 * Builds a benchmark-result fixture for the comparison tests.
 *
 * @param {string} id - Used for both `id` and `agentId`.
 * @param {number} pnl - Stored as `metrics.totalPnl`.
 * @param {number[]} history - PnL value per tick; element `i` becomes
 *   `{ tick: i, pnl: history[i] }` in `pnlHistory`, and the array length
 *   becomes `ticksProcessed`.
 * @returns {object} Result object with fixed placeholder values for every
 *   other field.
 */
const createMockResult = (id, pnl, history) => {
  const ticksProcessed = history.length;
  const pnlHistory = history.map((value, tick) => ({ tick, pnl: value }));

  const predictionMetrics = {
    accuracy: 0.5,
    totalPositions: 0,
    correctPredictions: 0,
    incorrectPredictions: 0,
    avgPnlPerPosition: 0,
  };
  const perpMetrics = {
    winRate: 0.5,
    totalTrades: 0,
    profitableTrades: 0,
    avgPnlPerTrade: 0,
    maxDrawdown: 0,
  };
  const socialMetrics = {
    postsCreated: 0,
    groupsJoined: 0,
    messagesReceived: 0,
    reputationGained: 0,
  };
  const timing = { totalDuration: 0, avgResponseTime: 0, maxResponseTime: 0 };

  return {
    id,
    agentId: id,
    benchmarkId: "bench-1",
    startTime: 0,
    endTime: 1000,
    ticksProcessed,
    actions: [],
    metrics: {
      totalPnl: pnl,
      predictionMetrics,
      perpMetrics,
      socialMetrics,
      timing,
      optimalityScore: 50,
    },
    trajectory: { states: [], actions: [], rewards: [], windowId: "" },
    pnlHistory,
  };
};
|
|
72
|
+
// Equal-length histories are merged tick by tick.
it("should correctly merge PnL histories of equal length", () => {
  const baselineResult = createMockResult("baseline", 100, [10, 50, 100]);
  const challengerResult = createMockResult("challenger", 200, [20, 100, 200]);

  // Use public static method
  const merged = MetricsVisualizer.mergePnlHistory(baselineResult, challengerResult);

  expect(merged).toHaveLength(3);
  expect(merged[2]).toEqual({ tick: 2, baseline: 100, challenger: 200 });
});

it("should handle unequal history lengths (fill with final value)", () => {
  // Baseline died early (e.g., bankruptcy or crash)
  const baselineResult = createMockResult("baseline", -50, [10, -50]);
  // Challenger kept going
  const challengerResult = createMockResult("challenger", 100, [20, 60, 80, 100]);

  const merged = MetricsVisualizer.mergePnlHistory(baselineResult, challengerResult);

  expect(merged).toHaveLength(4); // Should match longest

  // From tick 2 onward the baseline stopped, so its last value (-50) carries over.
  const expectedRows = [
    { tick: 0, baseline: 10, challenger: 20 },
    { tick: 1, baseline: -50, challenger: 60 },
    { tick: 2, baseline: -50, challenger: 80 },
    { tick: 3, baseline: -50, challenger: 100 },
  ];
  expectedRows.forEach((expectedRow, tick) => {
    expect(merged[tick]).toEqual(expectedRow);
  });
});

it("should generate ASCII chart string", () => {
  const baselineResult = createMockResult("baseline", 100, [10, 100]);
  const challengerResult = createMockResult("challenger", 200, [20, 200]);

  const chart = MetricsVisualizer.generateAsciiComparison(
    baselineResult,
    challengerResult,
  );

  expect(chart).toContain("HEAD-TO-HEAD RESULTS");
  expect(chart).toContain("WINNER: Challenger");
  expect(chart).toContain("Alpha Generated: +$100.00");
});
|
|
104
|
+
});
|
|
105
|
+
});
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark Module
|
|
3
|
+
*
|
|
4
|
+
* Tools for evaluating agent performance through simulation.
|
|
5
|
+
*/
|
|
6
|
+
export { ArchetypeMatchupBenchmark, runQuickMatchupBenchmark, } from "./ArchetypeMatchupBenchmark";
|
|
7
|
+
export { BenchmarkChartGenerator } from "./BenchmarkChartGenerator";
|
|
8
|
+
export { BenchmarkDataGenerator, SeededRandom } from "./BenchmarkDataGenerator";
|
|
9
|
+
export { BenchmarkDataViewer } from "./BenchmarkDataViewer";
|
|
10
|
+
export { BenchmarkHistoryService } from "./BenchmarkHistoryService";
|
|
11
|
+
export { BenchmarkRunner } from "./BenchmarkRunner";
|
|
12
|
+
export * as BenchmarkValidator from "./BenchmarkValidator";
|
|
13
|
+
export { FastEvalRunner } from "./FastEvalRunner";
|
|
14
|
+
export { MetricsValidator } from "./MetricsValidator";
|
|
15
|
+
export { MetricsVisualizer } from "./MetricsVisualizer";
|
|
16
|
+
export { ModelBenchmarkService } from "./ModelBenchmarkService";
|
|
17
|
+
export { getBaselineModels, getModelById, getModelByModelId, getModelDisplayName, getModelsByProvider, getModelsByTier, MODEL_REGISTRY, validateModelId, } from "./ModelRegistry";
|
|
18
|
+
// Shared utilities
|
|
19
|
+
export { parseSimulationMetrics, } from "./parseSimulationMetrics";
|
|
20
|
+
export { createRulerContext, extractMarketOutcomesFromBenchmark, getHiddenEventsForTick, getHiddenFactsForTick, getTrueFacts, scoreActionAgainstGroundTruth, wasDecisionOptimal, } from "./RulerBenchmarkIntegration";
|
|
21
|
+
export { SimulationA2AInterface } from "./SimulationA2AInterface";
|
|
22
|
+
export { SimulationEngine } from "./SimulationEngine";
|
|
23
|
+
export { TaskRunner } from "./TaskRunner";
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simulation Metrics Parser
|
|
3
|
+
*
|
|
4
|
+
* Shared utility for validating and parsing SimulationMetrics from JSON data.
|
|
5
|
+
* Used by ModelBenchmarkService and HuggingFaceModelUploader.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Parse and validate SimulationMetrics from JSON data.
 *
 * Required fields: `totalPnl` and `optimalityScore` must be numbers;
 * `predictionMetrics`, `perpMetrics` and `timing` must be non-null,
 * non-array objects. `socialMetrics` is optional and falls back to all zeros
 * when absent or not a plain object. Inside the nested objects, any field that
 * is missing or not a number is returned as 0.
 *
 * @param data - Raw JSON data to parse
 * @returns Validated SimulationMetrics object
 * @throws Error if data is invalid or missing required fields
 */
export function parseSimulationMetrics(data) {
  // `typeof [] === "object"`, so arrays are excluded explicitly; otherwise an
  // array would pass validation and silently parse to all-zero metrics.
  const isRecord = (value) =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  if (!isRecord(data)) {
    throw new Error("Invalid SimulationMetrics: expected object");
  }
  const metrics = data;

  // Validate required fields
  if (typeof metrics.totalPnl !== "number") {
    throw new Error("Invalid SimulationMetrics: totalPnl must be a number");
  }
  if (!isRecord(metrics.predictionMetrics)) {
    throw new Error("Invalid SimulationMetrics: predictionMetrics must be an object");
  }
  if (!isRecord(metrics.perpMetrics)) {
    throw new Error("Invalid SimulationMetrics: perpMetrics must be an object");
  }
  if (typeof metrics.optimalityScore !== "number") {
    throw new Error("Invalid SimulationMetrics: optimalityScore must be a number");
  }
  if (!isRecord(metrics.timing)) {
    throw new Error("Invalid SimulationMetrics: timing must be an object");
  }

  const { predictionMetrics, perpMetrics, timing } = metrics;

  // Helper to safely get number or default
  const getNumber = (obj, key) => {
    const val = obj[key];
    return typeof val === "number" ? val : 0;
  };

  // Parse socialMetrics if present; an empty record makes every field read as 0.
  const socialMetrics = isRecord(metrics.socialMetrics) ? metrics.socialMetrics : {};

  return {
    totalPnl: metrics.totalPnl,
    predictionMetrics: {
      totalPositions: getNumber(predictionMetrics, "totalPositions"),
      correctPredictions: getNumber(predictionMetrics, "correctPredictions"),
      incorrectPredictions: getNumber(predictionMetrics, "incorrectPredictions"),
      accuracy: getNumber(predictionMetrics, "accuracy"),
      avgPnlPerPosition: getNumber(predictionMetrics, "avgPnlPerPosition"),
    },
    perpMetrics: {
      totalTrades: getNumber(perpMetrics, "totalTrades"),
      profitableTrades: getNumber(perpMetrics, "profitableTrades"),
      winRate: getNumber(perpMetrics, "winRate"),
      avgPnlPerTrade: getNumber(perpMetrics, "avgPnlPerTrade"),
      maxDrawdown: getNumber(perpMetrics, "maxDrawdown"),
    },
    socialMetrics: {
      postsCreated: getNumber(socialMetrics, "postsCreated"),
      groupsJoined: getNumber(socialMetrics, "groupsJoined"),
      messagesReceived: getNumber(socialMetrics, "messagesReceived"),
      reputationGained: getNumber(socialMetrics, "reputationGained"),
    },
    timing: {
      avgResponseTime: getNumber(timing, "avgResponseTime"),
      maxResponseTime: getNumber(timing, "maxResponseTime"),
      totalDuration: getNumber(timing, "totalDuration"),
    },
    optimalityScore: metrics.optimalityScore,
  };
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|