@elizaos/training 2.0.0-alpha.76 → 2.0.0-alpha.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/.turbo/turbo-lint.log +0 -3
- package/.turbo/turbo-typecheck.log +0 -1
- package/dist/.tsbuildinfo +0 -1
- package/dist/adapter.js +0 -59
- package/dist/archetypes/ArchetypeConfigService.js +0 -510
- package/dist/archetypes/derive-archetype.js +0 -196
- package/dist/archetypes/index.js +0 -7
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
- package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
- package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
- package/dist/benchmark/BenchmarkDataViewer.js +0 -197
- package/dist/benchmark/BenchmarkHistoryService.js +0 -135
- package/dist/benchmark/BenchmarkRunner.js +0 -483
- package/dist/benchmark/BenchmarkValidator.js +0 -158
- package/dist/benchmark/FastEvalRunner.js +0 -133
- package/dist/benchmark/MetricsValidator.js +0 -104
- package/dist/benchmark/MetricsVisualizer.js +0 -775
- package/dist/benchmark/ModelBenchmarkService.js +0 -433
- package/dist/benchmark/ModelRegistry.js +0 -122
- package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
- package/dist/benchmark/SimulationA2AInterface.js +0 -683
- package/dist/benchmark/SimulationEngine.js +0 -522
- package/dist/benchmark/TaskRunner.js +0 -60
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
- package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
- package/dist/benchmark/index.js +0 -23
- package/dist/benchmark/parseSimulationMetrics.js +0 -86
- package/dist/benchmark/simulation-types.js +0 -1
- package/dist/dependencies.js +0 -197
- package/dist/generation/TrajectoryGenerator.js +0 -244
- package/dist/generation/index.js +0 -6
- package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
- package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
- package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
- package/dist/huggingface/index.js +0 -9
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
- package/dist/index.js +0 -41
- package/dist/init-training.js +0 -43
- package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
- package/dist/metrics/index.js +0 -7
- package/dist/metrics/types.js +0 -21
- package/dist/rubrics/__tests__/index.test.js +0 -150
- package/dist/rubrics/ass-kisser.js +0 -83
- package/dist/rubrics/degen.js +0 -78
- package/dist/rubrics/goody-twoshoes.js +0 -82
- package/dist/rubrics/index.js +0 -184
- package/dist/rubrics/information-trader.js +0 -82
- package/dist/rubrics/infosec.js +0 -99
- package/dist/rubrics/liar.js +0 -102
- package/dist/rubrics/perps-trader.js +0 -85
- package/dist/rubrics/researcher.js +0 -79
- package/dist/rubrics/scammer.js +0 -80
- package/dist/rubrics/social-butterfly.js +0 -71
- package/dist/rubrics/super-predictor.js +0 -95
- package/dist/rubrics/trader.js +0 -65
- package/dist/scoring/ArchetypeScoringService.js +0 -301
- package/dist/scoring/JudgePromptBuilder.js +0 -401
- package/dist/scoring/LLMJudgeCache.js +0 -263
- package/dist/scoring/index.js +0 -8
- package/dist/training/AutomationPipeline.js +0 -714
- package/dist/training/BenchmarkService.js +0 -370
- package/dist/training/ConfigValidator.js +0 -153
- package/dist/training/MarketOutcomesTracker.js +0 -142
- package/dist/training/ModelDeployer.js +0 -128
- package/dist/training/ModelFetcher.js +0 -48
- package/dist/training/ModelSelectionService.js +0 -248
- package/dist/training/ModelUsageVerifier.js +0 -106
- package/dist/training/MultiModelOrchestrator.js +0 -349
- package/dist/training/RLModelConfig.js +0 -295
- package/dist/training/RewardBackpropagationService.js +0 -117
- package/dist/training/RulerScoringService.js +0 -450
- package/dist/training/TrainingMonitor.js +0 -108
- package/dist/training/TrajectoryRecorder.js +0 -281
- package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
- package/dist/training/index.js +0 -30
- package/dist/training/logRLConfig.js +0 -29
- package/dist/training/pipeline.js +0 -80
- package/dist/training/storage/ModelStorageService.js +0 -190
- package/dist/training/storage/TrainingDataArchiver.js +0 -136
- package/dist/training/storage/index.js +0 -7
- package/dist/training/types.js +0 -6
- package/dist/training/window-utils.js +0 -100
- package/dist/utils/index.js +0 -73
- package/dist/utils/logger.js +0 -55
- package/dist/utils/snowflake.js +0 -15
- package/dist/utils/synthetic-detector.js +0 -67
- package/vitest.config.ts +0 -8
|
@@ -1,483 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Benchmark Runner
|
|
3
|
-
*
|
|
4
|
-
* Coordinates the complete benchmarking process:
|
|
5
|
-
* 1. Load or generate benchmark data
|
|
6
|
-
* 2. Initialize simulation engine
|
|
7
|
-
* 3. Run agent through simulation (Autonomous or Forced Strategy)
|
|
8
|
-
* 4. Collect metrics and trajectory data
|
|
9
|
-
* 5. Save results
|
|
10
|
-
*
|
|
11
|
-
* Can run multiple agents and compare their performance.
|
|
12
|
-
*/
|
|
13
|
-
import { promises as fs } from "node:fs";
|
|
14
|
-
import * as path from "node:path";
|
|
15
|
-
import { getAutonomousCoordinator } from "../dependencies";
|
|
16
|
-
import { TrajectoryRecorder } from "../training/TrajectoryRecorder";
|
|
17
|
-
import { logger } from "../utils/logger";
|
|
18
|
-
import { BenchmarkDataGenerator, SeededRandom, } from "./BenchmarkDataGenerator";
|
|
19
|
-
import { SimulationA2AInterface } from "./SimulationA2AInterface";
|
|
20
|
-
import { SimulationEngine, } from "./SimulationEngine";
|
|
21
|
-
export class BenchmarkRunner {
    /**
     * Run a single benchmark.
     *
     * Loads (or generates) a benchmark snapshot, drives a SimulationEngine
     * tick by tick with either the autonomous agent or a forced baseline
     * strategy, then persists the result to `config.outputDir`.
     *
     * @param config - Benchmark run configuration. Fields read here:
     *   `agentUserId`, `benchmarkPath` or `generatorConfig`, `outputDir`,
     *   `agentRuntime`, `forceStrategy`, `forceModel`, `saveTrajectory`.
     * @returns The result object produced by `engine.run()`
     * @throws Error if neither `benchmarkPath` nor `generatorConfig` is given,
     *   if the snapshot has no `ticks` array, if `outputDir` is missing, if an
     *   agent-driven run has no `agentRuntime` or no coordinator, or if the
     *   engine reports zero processed ticks
     *
     * @example
     * ```typescript
     * const result = await BenchmarkRunner.runSingle({
     *   benchmarkPath: './benchmarks/test.json',
     *   agentRuntime: runtime,
     *   agentUserId: 'agent-123',
     *   saveTrajectory: true,
     *   outputDir: './results'
     * });
     * console.log(`P&L: ${result.metrics.totalPnl}`);
     * ```
     */
    static async runSingle(config) {
        logger.info("Starting benchmark run", {
            agentUserId: config.agentUserId,
            benchmarkPath: config.benchmarkPath,
            strategy: config.forceStrategy || "agent-driven",
        });
        // 1. Load or generate benchmark
        let snapshot;
        if (config.benchmarkPath) {
            snapshot = await BenchmarkRunner.loadBenchmark(config.benchmarkPath);
        }
        else {
            if (config.generatorConfig == null) {
                throw new Error("generatorConfig required when benchmarkPath not provided");
            }
            snapshot = await BenchmarkRunner.generateBenchmark(config.generatorConfig);
        }
        // Fail fast on problems that would otherwise only surface after the
        // whole simulation has run (and its results would be lost).
        if (!Array.isArray(snapshot?.ticks)) {
            throw new Error("Benchmark snapshot is invalid: ticks must be an array");
        }
        if (!config.outputDir) {
            throw new Error("outputDir is required to save benchmark results");
        }
        const agentDriven = !config.forceStrategy;
        if (agentDriven && !config.agentRuntime) {
            throw new Error("agentRuntime is required for agent-driven benchmark");
        }
        // 2. Create simulation engine
        const engine = new SimulationEngine({
            snapshot,
            agentId: config.agentUserId,
            fastForward: true,
            responseTimeout: 30000,
        });
        // 3. Set up A2A interface for agent
        const a2aInterface = new SimulationA2AInterface(engine, config.agentUserId);
        // Inject A2A interface into agent runtime (only when a real agent drives the run).
        // NOTE: this mutates the caller's runtime and is not restored afterwards.
        if (agentDriven) {
            config.agentRuntime.a2aClient = a2aInterface;
        }
        // Force model if specified (for baseline testing)
        if (config.forceModel) {
            logger.info("Forcing model for benchmark", {
                agentUserId: config.agentUserId,
                forcedModel: config.forceModel,
            });
            // Baseline runs may come without a runtime, so guard every access.
            const runtime = config.agentRuntime;
            if (runtime?.character?.settings) {
                runtime.character.settings.GROQ_LARGE_MODEL = config.forceModel;
                runtime.character.settings.GROQ_SMALL_MODEL = config.forceModel;
            }
            if (runtime?.setSetting) {
                runtime.setSetting("GROQ_LARGE_MODEL", config.forceModel);
                runtime.setSetting("GROQ_SMALL_MODEL", config.forceModel);
            }
        }
        // 4. Set up trajectory recording if enabled
        let trajectoryRecorder;
        let trajectoryId;
        if (config.saveTrajectory) {
            // Fail fast - trajectory recording setup errors should crash
            trajectoryRecorder = new TrajectoryRecorder();
            trajectoryId = await trajectoryRecorder.startTrajectory({
                agentId: config.agentUserId,
                scenarioId: `benchmark-${snapshot.id}`,
            });
            logger.info("Trajectory recording started", { trajectoryId });
        }
        // 5. Initialize simulation
        engine.initialize();
        // 6. Run simulation loop
        logger.info("Starting simulation loop", {
            agentUserId: config.agentUserId,
            totalTicks: snapshot.ticks.length,
        });
        // Only get coordinator if we are using an autonomous agent (not forced strategy).
        // This prevents errors when running baseline tests without full dependency injection.
        const coordinator = agentDriven ? getAutonomousCoordinator() : undefined;
        // Seeded RNG for baseline strategies: the seed is the sum of the
        // snapshot id's UTF-16 code units, so the same snapshot replays identically.
        // String() keeps non-string ids (e.g. numbers) from crashing the run.
        const baselineRng = config.forceStrategy
            ? new SeededRandom(String(snapshot.id)
                .split("")
                .reduce((acc, c) => acc + c.charCodeAt(0), 0))
            : undefined;
        let ticksCompleted = 0;
        while (!engine.isComplete()) {
            const currentTick = engine.getCurrentTickNumber();
            if (currentTick % 100 === 0 || currentTick < 5) {
                logger.info(`Benchmark progress: ${currentTick}/${snapshot.ticks.length} ticks`, {
                    agentUserId: config.agentUserId,
                });
            }
            if (config.forceStrategy && baselineRng) {
                // Execute baseline strategy directly on engine (bypassing LLM)
                await BenchmarkRunner.executeBaselineStrategy(config.forceStrategy, engine, baselineRng);
            }
            else {
                if (!coordinator) {
                    throw new Error("AutonomousCoordinator required for agent-driven benchmark but not configured.");
                }
                // Execute autonomous tick (agent makes decisions via A2A).
                // Fail fast - don't catch errors, let them propagate.
                const tickResult = await coordinator.executeAutonomousTick(config.agentUserId, config.agentRuntime);
                if (tickResult.success && tickResult.actionsExecuted) {
                    const executed = tickResult.actionsExecuted;
                    const totalActions = executed.trades +
                        executed.posts +
                        executed.comments +
                        executed.messages +
                        executed.groupMessages +
                        executed.engagements;
                    if (totalActions > 0) {
                        logger.debug("Agent took actions", {
                            tick: currentTick,
                            actions: executed,
                        });
                    }
                }
            }
            // Advance simulation tick
            engine.advanceTick();
            ticksCompleted++;
            // Small delay to avoid overwhelming the system
            await new Promise((resolve) => setTimeout(resolve, 5));
        }
        logger.info("Simulation loop complete", {
            agentUserId: config.agentUserId,
            ticksCompleted,
            totalTicks: snapshot.ticks.length,
        });
        // 7. Calculate final results
        const result = await engine.run();
        // 8. Validate results - ensure agent actually did something
        if (result.ticksProcessed === 0) {
            throw new Error("Benchmark failed: No ticks were processed");
        }
        if (result.actions.length === 0) {
            logger.warn("Benchmark completed but agent took no actions", {
                agentUserId: config.agentUserId,
                ticksProcessed: result.ticksProcessed,
            });
        }
        // 9. Save trajectory if enabled
        if (trajectoryRecorder && trajectoryId) {
            await trajectoryRecorder.endTrajectory(trajectoryId, {
                finalPnL: result.metrics.totalPnl,
                finalBalance: undefined, // Let recorder calculate from state
            });
            logger.info("Trajectory recording saved", { trajectoryId });
        }
        // 10. Save results
        await BenchmarkRunner.saveResult(result, config.outputDir);
        logger.info("Benchmark run completed", {
            agentUserId: config.agentUserId,
            totalPnl: result.metrics.totalPnl,
            accuracy: result.metrics.predictionMetrics.accuracy,
            optimalityScore: result.metrics.optimalityScore,
        });
        return result;
    }
    /**
     * Execute one tick of a baseline strategy ("random" or "momentum").
     *
     * Runs directly against the engine, bypassing the LLM agent, and draws
     * every random decision from the supplied RNG so runs are reproducible.
     * Any other strategy name performs no action.
     *
     * @param strategy - Baseline strategy name
     * @param engine - Engine exposing `getGameState()` and `performAction()`
     * @param rng - Random source exposing `next()`; the thresholds below
     *   assume it yields values in [0, 1)
     */
    static async executeBaselineStrategy(strategy, engine, rng) {
        const state = engine.getGameState();
        // Rate limiting: only act in ~10% of ticks to simulate realistic frequency
        if (rng.next() > 0.1) {
            return;
        }
        if (strategy === "random") {
            // Random strategy: buy prediction shares or open perps randomly.
            // When "prediction" is drawn but no prediction market exists, the
            // perp branch is used instead.
            const actionType = rng.next() > 0.5 ? "prediction" : "perp";
            if (actionType === "prediction" && state.predictionMarkets.length > 0) {
                const marketIndex = Math.floor(rng.next() * state.predictionMarkets.length);
                const market = state.predictionMarkets[marketIndex];
                if (market) {
                    const outcome = rng.next() > 0.5 ? "YES" : "NO";
                    // Random amount between 10 and 100
                    const amount = 10 + rng.next() * 90;
                    await engine.performAction("buy_prediction", {
                        marketId: market.id,
                        outcome,
                        amount,
                    });
                }
            }
            else if (state.perpetualMarkets.length > 0) {
                const perpIndex = Math.floor(rng.next() * state.perpetualMarkets.length);
                const perp = state.perpetualMarkets[perpIndex];
                if (perp) {
                    const side = rng.next() > 0.5 ? "LONG" : "SHORT";
                    await engine.performAction("open_perp", {
                        ticker: perp.ticker,
                        side,
                        size: 10,
                        leverage: 1,
                    });
                }
            }
            return;
        }
        if (strategy === "momentum") {
            // Momentum strategy: follow price trends on one randomly chosen perp
            if (state.perpetualMarkets.length === 0) {
                return;
            }
            const perpIndex = Math.floor(rng.next() * state.perpetualMarkets.length);
            const perp = state.perpetualMarkets[perpIndex];
            if (!perp) {
                return;
            }
            // priceChange24h above 0.5 -> LONG, below -0.5 -> SHORT,
            // anything in between (or missing) -> hold.
            let side;
            if (perp.priceChange24h > 0.5) {
                side = "LONG";
            }
            else if (perp.priceChange24h < -0.5) {
                side = "SHORT";
            }
            if (side) {
                await engine.performAction("open_perp", {
                    ticker: perp.ticker,
                    side,
                    size: 20,
                    leverage: 2,
                });
            }
        }
    }
    /**
     * Run multiple benchmarks and compare.
     *
     * Executes `numRuns` sequential runs with the same configuration, each
     * writing to `<outputDir>/run-<n>`, then aggregates and saves a comparison.
     *
     * @param config - Benchmark run configuration
     * @param numRuns - Number of iterations to run (must be greater than 0)
     * @returns Object with `runs`, `comparison` (avgPnl, avgAccuracy,
     *   avgOptimality, bestRun, worstRun) and `trajectories`
     * @throws Error if `numRuns` is not a positive number
     *
     * @example
     * ```typescript
     * const comparison = await BenchmarkRunner.runMultiple(config, 5);
     * console.log(`Average P&L: ${comparison.comparison.avgPnl}`);
     * console.log(`Best run: ${comparison.comparison.bestRun}`);
     * ```
     */
    static async runMultiple(config, numRuns) {
        // Without at least one run the aggregates below are undefined
        // (division by zero, reduce over an empty array).
        if (!(numRuns > 0)) {
            throw new Error(`numRuns must be a positive number, received: ${numRuns}`);
        }
        logger.info(`Running ${numRuns} benchmark iterations`, {
            agentUserId: config.agentUserId,
        });
        const runs = [];
        const trajectoryPaths = [];
        for (let i = 0; i < numRuns; i++) {
            logger.info(`Starting run ${i + 1}/${numRuns}`);
            const runDir = path.join(config.outputDir, `run-${i + 1}`);
            const result = await BenchmarkRunner.runSingle({
                ...config,
                outputDir: runDir,
            });
            runs.push(result);
            if (config.saveTrajectory) {
                trajectoryPaths.push(path.join(runDir, "trajectory.json"));
            }
            // Small delay between runs (not needed after the last one)
            if (i + 1 < numRuns) {
                await new Promise((resolve) => setTimeout(resolve, 1000));
            }
        }
        // Calculate comparison metrics
        const average = (pick) => runs.reduce((sum, run) => sum + pick(run), 0) / runs.length;
        const bestRun = runs.reduce((best, current) => current.metrics.totalPnl > best.metrics.totalPnl ? current : best, runs[0]);
        const worstRun = runs.reduce((worst, current) => current.metrics.totalPnl < worst.metrics.totalPnl ? current : worst, runs[0]);
        const comparison = {
            avgPnl: average((run) => run.metrics.totalPnl),
            avgAccuracy: average((run) => run.metrics.predictionMetrics.accuracy),
            avgOptimality: average((run) => run.metrics.optimalityScore),
            bestRun: bestRun.id,
            worstRun: worstRun.id,
        };
        const trajectories = config.saveTrajectory ? trajectoryPaths : undefined;
        // Save comparison report
        await BenchmarkRunner.saveComparison({ runs, comparison, trajectories }, config.outputDir);
        logger.info("Multiple benchmarks completed", comparison);
        return { runs, comparison, trajectories };
    }
    /**
     * Compare two agents on the same benchmark.
     *
     * Starts both runs concurrently against the same benchmark file and
     * reports the per-metric difference (agent 1 minus agent 2).
     *
     * @param agent1Config - Configuration for first agent
     * @param agent2Config - Configuration for second agent
     * @param benchmarkPath - Path to benchmark snapshot (same for both agents)
     * @returns `{ agent1, agent2, winner, delta }`
     *
     * @remarks
     * - The winner is decided on total P&L only; on an exact tie agent 2 is
     *   reported as the winner.
     * - Each run saves to its own config's `outputDir`; give the two configs
     *   distinct directories or the saved files will overwrite each other.
     *
     * @example
     * ```typescript
     * const comparison = await BenchmarkRunner.compareAgents(
     *   agent1Config,
     *   agent2Config,
     *   './benchmarks/test.json'
     * );
     * console.log(`Winner: ${comparison.winner}`);
     * console.log(`P&L Delta: ${comparison.delta.pnl}`);
     * ```
     */
    static async compareAgents(agent1Config, agent2Config, benchmarkPath) {
        logger.info("Comparing two agents", {
            agent1: agent1Config.agentUserId,
            agent2: agent2Config.agentUserId,
            benchmark: benchmarkPath,
        });
        // Run both agents on same benchmark (concurrently)
        const [result1, result2] = await Promise.all([
            BenchmarkRunner.runSingle({ ...agent1Config, benchmarkPath }),
            BenchmarkRunner.runSingle({ ...agent2Config, benchmarkPath }),
        ]);
        const metrics1 = result1.metrics;
        const metrics2 = result2.metrics;
        const winner = metrics1.totalPnl > metrics2.totalPnl
            ? agent1Config.agentUserId
            : agent2Config.agentUserId;
        const delta = {
            pnl: metrics1.totalPnl - metrics2.totalPnl,
            accuracy: metrics1.predictionMetrics.accuracy -
                metrics2.predictionMetrics.accuracy,
            optimality: metrics1.optimalityScore - metrics2.optimalityScore,
        };
        logger.info("Agent comparison completed", {
            winner,
            delta,
        });
        return {
            agent1: result1,
            agent2: result2,
            winner,
            delta,
        };
    }
    /**
     * Load benchmark from file.
     *
     * @param benchmarkPath - Path to benchmark JSON file
     * @returns Parsed benchmark snapshot
     * @throws Error if the file is missing, is not valid JSON, is not a JSON
     *   object, or lacks `id`, `initialState` or `groundTruth`; other read
     *   errors are rethrown unchanged
     */
    static async loadBenchmark(benchmarkPath) {
        let data;
        try {
            data = await fs.readFile(benchmarkPath, "utf-8");
        }
        catch (error) {
            if (error?.code === "ENOENT") {
                throw new Error(`Benchmark file not found: ${benchmarkPath}`, { cause: error });
            }
            throw error;
        }
        let parsed;
        try {
            parsed = JSON.parse(data);
        }
        catch (error) {
            if (error instanceof SyntaxError) {
                throw new Error(`Failed to parse benchmark JSON file: ${error.message}`, { cause: error });
            }
            throw error;
        }
        // Valid JSON can still be `null` or a primitive; reject it with a
        // clear message instead of failing on a property access.
        if (parsed === null || typeof parsed !== "object") {
            throw new Error("Invalid benchmark file: expected a JSON object");
        }
        // Validate basic structure
        if (!parsed.id || !parsed.initialState || !parsed.groundTruth) {
            throw new Error(`Invalid benchmark file: missing required fields (id, initialState, or groundTruth)`);
        }
        return parsed;
    }
    /**
     * Generate new benchmark.
     *
     * Creates a snapshot with BenchmarkDataGenerator and writes it to
     * `<cwd>/benchmarks/benchmark-<id>.json` for future reuse.
     *
     * @param config - Benchmark generation configuration
     * @returns Generated benchmark snapshot
     * @throws Error if generation or writing the file fails
     */
    static async generateBenchmark(config) {
        logger.info("Generating new benchmark", config);
        const generator = new BenchmarkDataGenerator(config);
        const snapshot = await generator.generate();
        // Save for reuse
        const outputPath = path.join(process.cwd(), "benchmarks", `benchmark-${snapshot.id}.json`);
        await fs.mkdir(path.dirname(outputPath), { recursive: true });
        await fs.writeFile(outputPath, JSON.stringify(snapshot, null, 2));
        logger.info("Benchmark generated and saved", { path: outputPath });
        return snapshot;
    }
    /**
     * Save simulation result.
     *
     * Writes three files into `outputDir` (created if needed):
     * `result.json` (full result), `metrics.json` (`result.metrics`) and
     * `trajectory.json` (`result.trajectory`).
     *
     * @param result - Simulation result to save
     * @param outputDir - Directory to save results in
     */
    static async saveResult(result, outputDir) {
        await fs.mkdir(outputDir, { recursive: true });
        const toJson = (value) => JSON.stringify(value, null, 2);
        // The three files are independent, so write them concurrently.
        await Promise.all([
            fs.writeFile(path.join(outputDir, "result.json"), toJson(result)),
            fs.writeFile(path.join(outputDir, "metrics.json"), toJson(result.metrics)),
            fs.writeFile(path.join(outputDir, "trajectory.json"), toJson(result.trajectory)),
        ]);
        logger.debug("Results saved", { outputDir });
    }
    /**
     * Save comparison report.
     *
     * Writes the comparison to `comparison.json` in `outputDir` (created if needed).
     *
     * @param comparison - Comparison result to save
     * @param outputDir - Directory to save comparison in
     */
    static async saveComparison(comparison, outputDir) {
        await fs.mkdir(outputDir, { recursive: true });
        const comparisonPath = path.join(outputDir, "comparison.json");
        await fs.writeFile(comparisonPath, JSON.stringify(comparison, null, 2));
        logger.debug("Comparison saved", { outputDir });
    }
}
|
|
@@ -1,158 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Benchmark Data Validator
|
|
3
|
-
*
|
|
4
|
-
* Validates benchmark snapshot data to ensure it's properly formatted
|
|
5
|
-
* and contains all required fields.
|
|
6
|
-
*/
|
|
7
|
-
import { logger } from "../utils/logger";
|
|
8
|
-
/**
|
|
9
|
-
* Validate a benchmark snapshot
|
|
10
|
-
*/
|
|
11
|
-
/**
 * Validate a benchmark snapshot.
 *
 * Collects every problem found instead of stopping at the first one and
 * never throws on malformed input.
 *
 * @param snapshot - Candidate snapshot (any value)
 * @returns `{ valid, errors, warnings }` where `valid` is true when
 *   `errors` is empty; warnings do not affect validity
 */
export function validate(snapshot) {
    const errors = [];
    const warnings = [];
    const isObject = (value) => value !== null && typeof value === "object";
    // 1. Check required top-level fields
    if (!snapshot || typeof snapshot !== "object") {
        errors.push("Snapshot is null, undefined, or not an object");
        return { valid: false, errors, warnings };
    }
    const snap = snapshot;
    if (!snap.id) {
        errors.push("Missing required field: id");
    }
    if (!snap.version) {
        errors.push("Missing required field: version");
    }
    if (typeof snap.duration !== "number") {
        errors.push("Missing or invalid field: duration");
    }
    if (typeof snap.tickInterval !== "number") {
        errors.push("Missing or invalid field: tickInterval");
    }
    if (!snap.initialState) {
        errors.push("Missing required field: initialState");
    }
    else if (!isObject(snap.initialState)) {
        // A truthy primitive (e.g. a string) would otherwise pass unnoticed.
        errors.push("initialState must be an object");
    }
    if (!Array.isArray(snap.ticks)) {
        errors.push("Missing or invalid field: ticks (must be array)");
    }
    if (!snap.groundTruth) {
        errors.push("Missing required field: groundTruth");
    }
    else if (!isObject(snap.groundTruth)) {
        errors.push("groundTruth must be an object");
    }
    // 2. Validate initial state
    if (isObject(snap.initialState)) {
        const state = snap.initialState;
        if (typeof state.tick !== "number") {
            errors.push("initialState.tick must be a number");
        }
        if (state.tick !== 0) {
            warnings.push("initialState.tick should be 0");
        }
        if (!Array.isArray(state.predictionMarkets)) {
            errors.push("initialState.predictionMarkets must be an array");
        }
        if (!Array.isArray(state.perpetualMarkets)) {
            errors.push("initialState.perpetualMarkets must be an array");
        }
        if (!Array.isArray(state.agents)) {
            errors.push("initialState.agents must be an array");
        }
    }
    // 3. Validate ticks
    if (Array.isArray(snap.ticks)) {
        if (snap.ticks.length === 0) {
            warnings.push("Ticks array is empty");
        }
        snap.ticks.forEach((tick, index) => {
            if (!tick || typeof tick !== "object") {
                errors.push(`Tick ${index}: invalid tick object`);
                return;
            }
            if (typeof tick.number !== "number") {
                errors.push(`Tick ${index}: missing or invalid 'number' field`);
            }
            if (!Array.isArray(tick.events)) {
                errors.push(`Tick ${index}: events must be an array`);
            }
            if (!tick.state) {
                errors.push(`Tick ${index}: missing state`);
            }
        });
        // Check tick numbering is sequential (tick.number should equal its index)
        snap.ticks.forEach((tick, index) => {
            if (tick && typeof tick.number === "number" && tick.number !== index) {
                warnings.push(`Tick ${index}: number ${tick.number} doesn't match index`);
            }
        });
    }
    // 4. Validate ground truth
    if (isObject(snap.groundTruth)) {
        const gt = snap.groundTruth;
        if (!gt.marketOutcomes || typeof gt.marketOutcomes !== "object") {
            errors.push("groundTruth.marketOutcomes must be an object");
        }
        if (!gt.priceHistory || typeof gt.priceHistory !== "object") {
            errors.push("groundTruth.priceHistory must be an object");
        }
        if (!Array.isArray(gt.optimalActions)) {
            errors.push("groundTruth.optimalActions must be an array");
        }
        if (!Array.isArray(gt.socialOpportunities)) {
            errors.push("groundTruth.socialOpportunities must be an array");
        }
        if (!Array.isArray(gt.hiddenFacts)) {
            errors.push("groundTruth.hiddenFacts must be an array");
        }
        if (!Array.isArray(gt.hiddenEvents)) {
            errors.push("groundTruth.hiddenEvents must be an array");
        }
        if (!gt.trueFacts || typeof gt.trueFacts !== "object") {
            errors.push("groundTruth.trueFacts must be an object");
        }
    }
    // 5. Cross-validate: markets in initialState should have outcomes in groundTruth
    if (isObject(snap.initialState) && isObject(snap.groundTruth)) {
        const markets = Array.isArray(snap.initialState.predictionMarkets)
            ? snap.initialState.predictionMarkets
            : [];
        const outcomes = isObject(snap.groundTruth.marketOutcomes)
            ? snap.groundTruth.marketOutcomes
            : {};
        markets.forEach((market, index) => {
            if (!isObject(market)) {
                // Malformed entry: report it rather than crash on `market.id`.
                warnings.push(`Market at index ${index}: invalid market object`);
                return;
            }
            // Own-property check: `in` would also match inherited names such
            // as "constructor" or "toString".
            if (market.id &&
                typeof market.id === "string" &&
                !Object.prototype.hasOwnProperty.call(outcomes, market.id)) {
                warnings.push(`Market ${market.id} in initialState but no outcome in groundTruth`);
            }
        });
    }
    const valid = errors.length === 0;
    logger.info("Benchmark validation complete", {
        valid,
        errors: errors.length,
        warnings: warnings.length,
    });
    return {
        valid,
        errors,
        warnings,
    };
}
|
|
138
|
-
/**
|
|
139
|
-
* Quick sanity check (fast, minimal validation)
|
|
140
|
-
*/
|
|
141
|
-
/**
 * Quick sanity check (fast, minimal validation).
 *
 * @param snapshot - Candidate snapshot (any value)
 * @returns true only when the value is an object with a truthy `id`,
 *   `initialState` and `groundTruth` and an array `ticks`
 */
export function sanityCheck(snapshot) {
    const looksLikeObject = Boolean(snapshot) && typeof snapshot === "object";
    if (!looksLikeObject) {
        return false;
    }
    // Guard clauses, checked in the same order as the required fields are listed.
    if (!snapshot.id) {
        return false;
    }
    if (!snapshot.initialState) {
        return false;
    }
    if (!Array.isArray(snapshot.ticks)) {
        return false;
    }
    return Boolean(snapshot.groundTruth);
}
|
|
150
|
-
/**
|
|
151
|
-
* Validate and throw if invalid
|
|
152
|
-
*/
|
|
153
|
-
/**
 * Validate and throw if invalid.
 *
 * @param snapshot - Candidate snapshot, passed to `validate`
 * @throws Error listing every validation error, joined with ", "
 */
export function validateOrThrow(snapshot) {
    const outcome = validate(snapshot);
    if (outcome.valid) {
        return;
    }
    const details = outcome.errors.join(", ");
    throw new Error(`Invalid benchmark data: ${details}`);
}
|