@elizaos/training 2.0.0-alpha.76 → 2.0.0-alpha.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/package.json +2 -2
  2. package/.turbo/turbo-lint.log +0 -3
  3. package/.turbo/turbo-typecheck.log +0 -1
  4. package/dist/.tsbuildinfo +0 -1
  5. package/dist/adapter.js +0 -59
  6. package/dist/archetypes/ArchetypeConfigService.js +0 -510
  7. package/dist/archetypes/derive-archetype.js +0 -196
  8. package/dist/archetypes/index.js +0 -7
  9. package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
  10. package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
  11. package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
  12. package/dist/benchmark/BenchmarkDataViewer.js +0 -197
  13. package/dist/benchmark/BenchmarkHistoryService.js +0 -135
  14. package/dist/benchmark/BenchmarkRunner.js +0 -483
  15. package/dist/benchmark/BenchmarkValidator.js +0 -158
  16. package/dist/benchmark/FastEvalRunner.js +0 -133
  17. package/dist/benchmark/MetricsValidator.js +0 -104
  18. package/dist/benchmark/MetricsVisualizer.js +0 -775
  19. package/dist/benchmark/ModelBenchmarkService.js +0 -433
  20. package/dist/benchmark/ModelRegistry.js +0 -122
  21. package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
  22. package/dist/benchmark/SimulationA2AInterface.js +0 -683
  23. package/dist/benchmark/SimulationEngine.js +0 -522
  24. package/dist/benchmark/TaskRunner.js +0 -60
  25. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
  26. package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
  27. package/dist/benchmark/index.js +0 -23
  28. package/dist/benchmark/parseSimulationMetrics.js +0 -86
  29. package/dist/benchmark/simulation-types.js +0 -1
  30. package/dist/dependencies.js +0 -197
  31. package/dist/generation/TrajectoryGenerator.js +0 -244
  32. package/dist/generation/index.js +0 -6
  33. package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
  34. package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
  35. package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
  36. package/dist/huggingface/index.js +0 -9
  37. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
  38. package/dist/index.js +0 -41
  39. package/dist/init-training.js +0 -43
  40. package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
  41. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
  42. package/dist/metrics/index.js +0 -7
  43. package/dist/metrics/types.js +0 -21
  44. package/dist/rubrics/__tests__/index.test.js +0 -150
  45. package/dist/rubrics/ass-kisser.js +0 -83
  46. package/dist/rubrics/degen.js +0 -78
  47. package/dist/rubrics/goody-twoshoes.js +0 -82
  48. package/dist/rubrics/index.js +0 -184
  49. package/dist/rubrics/information-trader.js +0 -82
  50. package/dist/rubrics/infosec.js +0 -99
  51. package/dist/rubrics/liar.js +0 -102
  52. package/dist/rubrics/perps-trader.js +0 -85
  53. package/dist/rubrics/researcher.js +0 -79
  54. package/dist/rubrics/scammer.js +0 -80
  55. package/dist/rubrics/social-butterfly.js +0 -71
  56. package/dist/rubrics/super-predictor.js +0 -95
  57. package/dist/rubrics/trader.js +0 -65
  58. package/dist/scoring/ArchetypeScoringService.js +0 -301
  59. package/dist/scoring/JudgePromptBuilder.js +0 -401
  60. package/dist/scoring/LLMJudgeCache.js +0 -263
  61. package/dist/scoring/index.js +0 -8
  62. package/dist/training/AutomationPipeline.js +0 -714
  63. package/dist/training/BenchmarkService.js +0 -370
  64. package/dist/training/ConfigValidator.js +0 -153
  65. package/dist/training/MarketOutcomesTracker.js +0 -142
  66. package/dist/training/ModelDeployer.js +0 -128
  67. package/dist/training/ModelFetcher.js +0 -48
  68. package/dist/training/ModelSelectionService.js +0 -248
  69. package/dist/training/ModelUsageVerifier.js +0 -106
  70. package/dist/training/MultiModelOrchestrator.js +0 -349
  71. package/dist/training/RLModelConfig.js +0 -295
  72. package/dist/training/RewardBackpropagationService.js +0 -117
  73. package/dist/training/RulerScoringService.js +0 -450
  74. package/dist/training/TrainingMonitor.js +0 -108
  75. package/dist/training/TrajectoryRecorder.js +0 -281
  76. package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
  77. package/dist/training/index.js +0 -30
  78. package/dist/training/logRLConfig.js +0 -29
  79. package/dist/training/pipeline.js +0 -80
  80. package/dist/training/storage/ModelStorageService.js +0 -190
  81. package/dist/training/storage/TrainingDataArchiver.js +0 -136
  82. package/dist/training/storage/index.js +0 -7
  83. package/dist/training/types.js +0 -6
  84. package/dist/training/window-utils.js +0 -100
  85. package/dist/utils/index.js +0 -73
  86. package/dist/utils/logger.js +0 -55
  87. package/dist/utils/snowflake.js +0 -15
  88. package/dist/utils/synthetic-detector.js +0 -67
  89. package/vitest.config.ts +0 -8
@@ -1,483 +0,0 @@
1
- /**
2
- * Benchmark Runner
3
- *
4
- * Coordinates the complete benchmarking process:
5
- * 1. Load or generate benchmark data
6
- * 2. Initialize simulation engine
7
- * 3. Run agent through simulation (Autonomous or Forced Strategy)
8
- * 4. Collect metrics and trajectory data
9
- * 5. Save results
10
- *
11
- * Can run multiple agents and compare their performance.
12
- */
13
- import { promises as fs } from "node:fs";
14
- import * as path from "node:path";
15
- import { getAutonomousCoordinator } from "../dependencies";
16
- import { TrajectoryRecorder } from "../training/TrajectoryRecorder";
17
- import { logger } from "../utils/logger";
18
- import { BenchmarkDataGenerator, SeededRandom, } from "./BenchmarkDataGenerator";
19
- import { SimulationA2AInterface } from "./SimulationA2AInterface";
20
- import { SimulationEngine, } from "./SimulationEngine";
21
export class BenchmarkRunner {
    /**
     * Run a single benchmark.
     *
     * Loads (or generates) a benchmark snapshot, builds a SimulationEngine and
     * SimulationA2AInterface around it, drives the engine tick by tick using
     * either the autonomous coordinator or a forced baseline strategy, then
     * collects the engine's result and writes it to `config.outputDir`.
     *
     * @param config - Benchmark run configuration. Fields read here:
     *   `agentUserId`, `agentRuntime`, `benchmarkPath`, `generatorConfig`,
     *   `forceStrategy`, `forceModel`, `saveTrajectory`, `outputDir`.
     * @returns The result object produced by `engine.run()`
     * @throws Error if neither `benchmarkPath` nor `generatorConfig` is given,
     *   if the benchmark cannot be loaded, if no coordinator is configured for
     *   an agent-driven run, or if the engine reports zero processed ticks.
     *   Errors raised while executing a tick are not caught.
     *
     * @remarks
     * - Mutates `config.agentRuntime` (sets `a2aClient`, and the GROQ model
     *   settings when `forceModel` is set).
     * - A run in which the agent took no actions only logs a warning.
     *
     * @example
     * ```typescript
     * const result = await BenchmarkRunner.runSingle({
     *   benchmarkPath: './benchmarks/test.json',
     *   agentRuntime: runtime,
     *   agentUserId: 'agent-123',
     *   saveTrajectory: true,
     *   outputDir: './results'
     * });
     * console.log(`P&L: ${result.metrics.totalPnl}`);
     * ```
     */
    static async runSingle(config) {
        logger.info("Starting benchmark run", {
            agentUserId: config.agentUserId,
            benchmarkPath: config.benchmarkPath,
            strategy: config.forceStrategy || "agent-driven",
        });
        // 1. Load or generate benchmark
        let snapshot;
        if (config.benchmarkPath) {
            snapshot = await BenchmarkRunner.loadBenchmark(config.benchmarkPath);
        }
        else {
            if (config.generatorConfig == null) {
                throw new Error("generatorConfig required when benchmarkPath not provided");
            }
            snapshot = await BenchmarkRunner.generateBenchmark(config.generatorConfig);
        }
        // 2. Create simulation engine
        const simConfig = {
            snapshot,
            agentId: config.agentUserId,
            fastForward: true,
            responseTimeout: 30000,
        };
        const engine = new SimulationEngine(simConfig);
        // 3. Set up A2A interface for agent
        const a2aInterface = new SimulationA2AInterface(engine, config.agentUserId);
        // Inject A2A interface into agent runtime (if using real agent and not forcing strategy)
        if (!config.forceStrategy) {
            config.agentRuntime.a2aClient = a2aInterface;
        }
        // Force model if specified (for baseline testing)
        if (config.forceModel) {
            logger.info("Forcing model for benchmark", {
                agentUserId: config.agentUserId,
                forcedModel: config.forceModel,
            });
            // Set the model both on the character settings and through the
            // runtime setter, whichever of the two the runtime exposes.
            const runtime = config.agentRuntime;
            if (runtime.character?.settings) {
                runtime.character.settings.GROQ_LARGE_MODEL = config.forceModel;
                runtime.character.settings.GROQ_SMALL_MODEL = config.forceModel;
            }
            if (runtime.setSetting) {
                runtime.setSetting("GROQ_LARGE_MODEL", config.forceModel);
                runtime.setSetting("GROQ_SMALL_MODEL", config.forceModel);
            }
        }
        // 4. Set up trajectory recording if enabled
        let trajectoryRecorder;
        let trajectoryId;
        if (config.saveTrajectory) {
            // Fail fast - trajectory recording setup errors should crash
            trajectoryRecorder = new TrajectoryRecorder();
            trajectoryId = await trajectoryRecorder.startTrajectory({
                agentId: config.agentUserId,
                scenarioId: `benchmark-${snapshot.id}`,
            });
            logger.info("Trajectory recording started", { trajectoryId });
        }
        // 5. Initialize simulation
        engine.initialize();
        // 6. Run simulation loop
        logger.info("Starting simulation loop", {
            agentUserId: config.agentUserId,
            totalTicks: snapshot.ticks.length,
        });
        // Only get coordinator if we are using an autonomous agent (not forced strategy)
        // This prevents errors when running baseline tests without full dependency injection
        const coordinator = !config.forceStrategy
            ? getAutonomousCoordinator()
            : undefined;
        // Create seeded RNG for baseline strategies (reproducibility).
        // The seed is the sum of the snapshot id's UTF-16 code units, so the
        // same snapshot id always yields the same baseline behavior.
        const baselineSeed = config.forceStrategy
            ? snapshot.id.split("").reduce((acc, c) => acc + c.charCodeAt(0), 0)
            : 0;
        const baselineRng = config.forceStrategy
            ? new SeededRandom(baselineSeed)
            : undefined;
        let ticksCompleted = 0;
        while (!engine.isComplete()) {
            const currentTick = engine.getCurrentTickNumber();
            // Log the first few ticks and then every 100th to keep output bounded
            if (currentTick % 100 === 0 || currentTick < 5) {
                logger.info(`Benchmark progress: ${currentTick}/${snapshot.ticks.length} ticks`, {
                    agentUserId: config.agentUserId,
                });
            }
            if (config.forceStrategy && baselineRng) {
                // Execute baseline strategy directly on engine (bypassing LLM)
                await BenchmarkRunner.executeBaselineStrategy(config.forceStrategy, engine, baselineRng);
            }
            else {
                if (!coordinator) {
                    throw new Error("AutonomousCoordinator required for agent-driven benchmark but not configured.");
                }
                // Execute autonomous tick (agent makes decisions via A2A)
                // Fail fast - don't catch errors, let them propagate
                const tickResult = await coordinator.executeAutonomousTick(config.agentUserId, config.agentRuntime);
                if (tickResult.success && tickResult.actionsExecuted) {
                    const totalActions = tickResult.actionsExecuted.trades +
                        tickResult.actionsExecuted.posts +
                        tickResult.actionsExecuted.comments +
                        tickResult.actionsExecuted.messages +
                        tickResult.actionsExecuted.groupMessages +
                        tickResult.actionsExecuted.engagements;
                    if (totalActions > 0) {
                        logger.debug("Agent took actions", {
                            tick: currentTick,
                            actions: tickResult.actionsExecuted,
                        });
                    }
                }
            }
            // Advance simulation tick
            engine.advanceTick();
            ticksCompleted++;
            // Small delay to avoid overwhelming the system
            await new Promise((resolve) => setTimeout(resolve, 5));
        }
        logger.info("Simulation loop complete", {
            agentUserId: config.agentUserId,
            ticksCompleted,
            totalTicks: snapshot.ticks.length,
        });
        // 7. Calculate final results
        const result = await engine.run();
        // 8. Validate results - ensure agent actually did something
        if (result.ticksProcessed === 0) {
            throw new Error("Benchmark failed: No ticks were processed");
        }
        if (result.actions.length === 0) {
            logger.warn("Benchmark completed but agent took no actions", {
                agentUserId: config.agentUserId,
                ticksProcessed: result.ticksProcessed,
            });
        }
        // 9. Save trajectory if enabled
        if (trajectoryRecorder && trajectoryId) {
            await trajectoryRecorder.endTrajectory(trajectoryId, {
                finalPnL: result.metrics.totalPnl,
                finalBalance: undefined, // Let recorder calculate from state
            });
            logger.info("Trajectory recording saved", { trajectoryId });
        }
        // 10. Save results
        await BenchmarkRunner.saveResult(result, config.outputDir);
        logger.info("Benchmark run completed", {
            agentUserId: config.agentUserId,
            totalPnl: result.metrics.totalPnl,
            accuracy: result.metrics.predictionMetrics.accuracy,
            optimalityScore: result.metrics.optimalityScore,
        });
        return result;
    }
    /**
     * Execute one tick of a baseline strategy ("random" or "momentum").
     *
     * Runs directly against the engine, bypassing the LLM agent, and draws all
     * randomness from `rng` so runs are reproducible for a given seed.
     *
     * @param strategy - "random" or "momentum"; any other value performs no action
     * @param engine - Object exposing `getGameState()` and `performAction(type, params)`
     * @param rng - Object exposing `next()`; values are compared against 0.1 and 0.5
     *   and used as a fraction of array lengths (presumably uniform in [0, 1))
     */
    static async executeBaselineStrategy(strategy, engine, rng) {
        const state = engine.getGameState();
        // Rate limiting: Only trade in ~10% of ticks to simulate realistic frequency
        if (rng.next() > 0.1)
            return;
        if (strategy === "random") {
            // Random strategy: Buy prediction shares or open perps randomly
            const actionType = rng.next() > 0.5 ? "prediction" : "perp";
            if (actionType === "prediction" && state.predictionMarkets.length > 0) {
                const marketIndex = Math.floor(rng.next() * state.predictionMarkets.length);
                const market = state.predictionMarkets[marketIndex];
                if (market) {
                    const outcome = rng.next() > 0.5 ? "YES" : "NO";
                    // Random amount between 10 and 100
                    const amount = 10 + rng.next() * 90;
                    await engine.performAction("buy_prediction", {
                        marketId: market.id,
                        outcome,
                        amount,
                    });
                }
            }
            else if (state.perpetualMarkets.length > 0) {
                // Also reached when "prediction" was drawn but no prediction markets exist
                const perpIndex = Math.floor(rng.next() * state.perpetualMarkets.length);
                const perp = state.perpetualMarkets[perpIndex];
                if (perp) {
                    const side = rng.next() > 0.5 ? "LONG" : "SHORT";
                    await engine.performAction("open_perp", {
                        ticker: perp.ticker,
                        side,
                        size: 10,
                        leverage: 1,
                    });
                }
            }
        }
        else if (strategy === "momentum") {
            // Momentum strategy: Follow price trends
            if (state.perpetualMarkets.length > 0) {
                const perpIndex = Math.floor(rng.next() * state.perpetualMarkets.length);
                const perp = state.perpetualMarkets[perpIndex];
                if (perp) {
                    // If priceChange24h is above 0.5, go LONG; below -0.5, go SHORT.
                    // If relatively flat, do nothing (hold).
                    if (perp.priceChange24h > 0.5) {
                        await engine.performAction("open_perp", {
                            ticker: perp.ticker,
                            side: "LONG",
                            size: 20,
                            leverage: 2,
                        });
                    }
                    else if (perp.priceChange24h < -0.5) {
                        await engine.performAction("open_perp", {
                            ticker: perp.ticker,
                            side: "SHORT",
                            size: 20,
                            leverage: 2,
                        });
                    }
                }
            }
        }
    }
    /**
     * Run the same benchmark configuration several times and compare the runs.
     *
     * Runs are executed sequentially; run `i` writes into
     * `<config.outputDir>/run-<i>`. After all runs, averages of P&L, accuracy
     * and optimality are computed, the best and worst runs by total P&L are
     * identified, and a comparison report is written to `config.outputDir`.
     *
     * @param config - Benchmark run configuration (see `runSingle`)
     * @param numRuns - Number of iterations to run; must be greater than 0
     * @returns Object with `runs`, `comparison` and, when `config.saveTrajectory`
     *   is set, `trajectories` (paths to each run's trajectory.json)
     * @throws Error if `numRuns` would result in zero runs, or if any run fails
     *
     * @example
     * ```typescript
     * const comparison = await BenchmarkRunner.runMultiple(config, 5);
     * console.log(`Average P&L: ${comparison.comparison.avgPnl}`);
     * console.log(`Best run: ${comparison.comparison.bestRun}`);
     * ```
     */
    static async runMultiple(config, numRuns) {
        // Without at least one run the averages are NaN and the best/worst
        // reduction below has nothing to reduce, so reject up front.
        if (!(numRuns > 0)) {
            throw new Error(`numRuns must be greater than 0 (received: ${numRuns})`);
        }
        logger.info(`Running ${numRuns} benchmark iterations`, {
            agentUserId: config.agentUserId,
        });
        const runs = [];
        const trajectoryPaths = [];
        for (let i = 0; i < numRuns; i++) {
            logger.info(`Starting run ${i + 1}/${numRuns}`);
            const runDir = path.join(config.outputDir, `run-${i + 1}`);
            const result = await BenchmarkRunner.runSingle({
                ...config,
                outputDir: runDir,
            });
            runs.push(result);
            if (config.saveTrajectory) {
                trajectoryPaths.push(path.join(runDir, "trajectory.json"));
            }
            // Small delay between runs
            await new Promise((resolve) => setTimeout(resolve, 1000));
        }
        // Calculate comparison metrics
        const avgPnl = runs.reduce((sum, r) => sum + r.metrics.totalPnl, 0) / runs.length;
        const avgAccuracy = runs.reduce((sum, r) => sum + r.metrics.predictionMetrics.accuracy, 0) /
            runs.length;
        const avgOptimality = runs.reduce((sum, r) => sum + r.metrics.optimalityScore, 0) / runs.length;
        // Ties keep the earliest run, because only strict comparisons replace it
        const bestRun = runs.reduce((best, current) => current.metrics.totalPnl > best.metrics.totalPnl ? current : best);
        const worstRun = runs.reduce((worst, current) => current.metrics.totalPnl < worst.metrics.totalPnl ? current : worst);
        const comparison = {
            avgPnl,
            avgAccuracy,
            avgOptimality,
            bestRun: bestRun.id,
            worstRun: worstRun.id,
        };
        const trajectories = config.saveTrajectory ? trajectoryPaths : undefined;
        // Save comparison report
        await BenchmarkRunner.saveComparison({
            runs,
            comparison,
            trajectories,
        }, config.outputDir);
        logger.info("Multiple benchmarks completed", comparison);
        return {
            runs,
            comparison,
            trajectories,
        };
    }
    /**
     * Compare two agents on the same benchmark.
     *
     * Both agents are run concurrently via `runSingle`, each with
     * `benchmarkPath` overriding whatever path its own config carried.
     *
     * @param agent1Config - Configuration for first agent
     * @param agent2Config - Configuration for second agent
     * @param benchmarkPath - Path to benchmark snapshot (same for both agents)
     * @returns `{ agent1, agent2, winner, delta }` where `delta` is agent 1's
     *   value minus agent 2's for P&L, accuracy and optimality
     *
     * @remarks
     * - The winner is decided on total P&L only; on an exact tie agent 2's
     *   `agentUserId` is reported.
     *
     * @example
     * ```typescript
     * const comparison = await BenchmarkRunner.compareAgents(
     *   agent1Config,
     *   agent2Config,
     *   './benchmarks/test.json'
     * );
     * console.log(`Winner: ${comparison.winner}`);
     * console.log(`P&L Delta: ${comparison.delta.pnl}`);
     * ```
     */
    static async compareAgents(agent1Config, agent2Config, benchmarkPath) {
        logger.info("Comparing two agents", {
            agent1: agent1Config.agentUserId,
            agent2: agent2Config.agentUserId,
            benchmark: benchmarkPath,
        });
        // Run both agents on same benchmark (concurrently)
        const [result1, result2] = await Promise.all([
            BenchmarkRunner.runSingle({ ...agent1Config, benchmarkPath }),
            BenchmarkRunner.runSingle({ ...agent2Config, benchmarkPath }),
        ]);
        const winner = result1.metrics.totalPnl > result2.metrics.totalPnl
            ? agent1Config.agentUserId
            : agent2Config.agentUserId;
        const delta = {
            pnl: result1.metrics.totalPnl - result2.metrics.totalPnl,
            accuracy: result1.metrics.predictionMetrics.accuracy -
                result2.metrics.predictionMetrics.accuracy,
            optimality: result1.metrics.optimalityScore - result2.metrics.optimalityScore,
        };
        logger.info("Agent comparison completed", {
            winner,
            delta,
        });
        return {
            agent1: result1,
            agent2: result2,
            winner,
            delta,
        };
    }
    /**
     * Load a benchmark snapshot from a JSON file.
     *
     * @param benchmarkPath - Path to benchmark JSON file
     * @returns Parsed benchmark snapshot
     * @throws Error if the file does not exist, is not valid JSON, or does not
     *   contain an object with `id`, `initialState` and `groundTruth`. Wrapped
     *   errors carry the underlying error as `cause`; other I/O errors are
     *   rethrown unchanged.
     */
    static async loadBenchmark(benchmarkPath) {
        let data;
        try {
            data = await fs.readFile(benchmarkPath, "utf-8");
        }
        catch (error) {
            if (error?.code === "ENOENT") {
                throw new Error(`Benchmark file not found: ${benchmarkPath}`, { cause: error });
            }
            throw error;
        }
        let parsed;
        try {
            parsed = JSON.parse(data);
        }
        catch (error) {
            if (error instanceof SyntaxError) {
                throw new Error(`Failed to parse benchmark JSON file: ${error.message}`, { cause: error });
            }
            throw error;
        }
        // Validate basic structure. The object check matters because valid JSON
        // such as `null` would otherwise fail on the property reads below.
        if (!parsed ||
            typeof parsed !== "object" ||
            !parsed.id ||
            !parsed.initialState ||
            !parsed.groundTruth) {
            throw new Error(`Invalid benchmark file: missing required fields (id, initialState, or groundTruth)`);
        }
        return parsed;
    }
    /**
     * Generate a new benchmark snapshot and save it for reuse.
     *
     * The snapshot is written to `<cwd>/benchmarks/benchmark-<id>.json`.
     *
     * @param config - Benchmark generation configuration, passed to BenchmarkDataGenerator
     * @returns Generated benchmark snapshot
     * @throws Error if generation or writing the file fails
     */
    static async generateBenchmark(config) {
        logger.info("Generating new benchmark", config);
        const generator = new BenchmarkDataGenerator(config);
        const snapshot = await generator.generate();
        // Save for reuse
        const outputPath = path.join(process.cwd(), "benchmarks", `benchmark-${snapshot.id}.json`);
        await fs.mkdir(path.dirname(outputPath), { recursive: true });
        await fs.writeFile(outputPath, JSON.stringify(snapshot, null, 2));
        logger.info("Benchmark generated and saved", { path: outputPath });
        return snapshot;
    }
    /**
     * Save a simulation result.
     *
     * Writes three files into `outputDir` (created if missing):
     * `result.json` (full result), `metrics.json` (`result.metrics`) and
     * `trajectory.json` (`result.trajectory`).
     *
     * @param result - Simulation result to save
     * @param outputDir - Directory to save results in
     */
    static async saveResult(result, outputDir) {
        await fs.mkdir(outputDir, { recursive: true });
        // Save full result
        const resultPath = path.join(outputDir, "result.json");
        await fs.writeFile(resultPath, JSON.stringify(result, null, 2));
        // Save metrics summary
        const metricsPath = path.join(outputDir, "metrics.json");
        await fs.writeFile(metricsPath, JSON.stringify(result.metrics, null, 2));
        // Save trajectory
        const trajectoryPath = path.join(outputDir, "trajectory.json");
        await fs.writeFile(trajectoryPath, JSON.stringify(result.trajectory, null, 2));
        logger.debug("Results saved", { outputDir });
    }
    /**
     * Save a comparison report.
     *
     * Writes the comparison as `comparison.json` in `outputDir` (created if missing).
     *
     * @param comparison - Comparison result to save
     * @param outputDir - Directory to save comparison in
     */
    static async saveComparison(comparison, outputDir) {
        await fs.mkdir(outputDir, { recursive: true });
        const comparisonPath = path.join(outputDir, "comparison.json");
        await fs.writeFile(comparisonPath, JSON.stringify(comparison, null, 2));
        logger.debug("Comparison saved", { outputDir });
    }
}
@@ -1,158 +0,0 @@
1
- /**
2
- * Benchmark Data Validator
3
- *
4
- * Validates benchmark snapshot data to ensure it's properly formatted
5
- * and contains all required fields.
6
- */
7
- import { logger } from "../utils/logger";
8
/**
 * Validate a benchmark snapshot.
 *
 * Checks the required top-level fields, the shape of `initialState`, every
 * entry of `ticks`, the shape of `groundTruth`, and finally cross-checks that
 * each prediction market in `initialState` has an entry in
 * `groundTruth.marketOutcomes`. Problems that make the snapshot unusable are
 * collected as errors; suspicious but tolerated conditions as warnings.
 *
 * @param snapshot - Candidate snapshot (any value is accepted)
 * @returns `{ valid, errors, warnings }`; `valid` is true when no errors were found
 */
export function validate(snapshot) {
    const errors = [];
    const warnings = [];
    // 1. Check required top-level fields
    if (!snapshot || typeof snapshot !== "object") {
        errors.push("Snapshot is null, undefined, or not an object");
        return { valid: false, errors, warnings };
    }
    const snap = snapshot;
    if (!snap.id)
        errors.push("Missing required field: id");
    if (!snap.version)
        errors.push("Missing required field: version");
    if (typeof snap.duration !== "number")
        errors.push("Missing or invalid field: duration");
    if (typeof snap.tickInterval !== "number")
        errors.push("Missing or invalid field: tickInterval");
    if (!snap.initialState)
        errors.push("Missing required field: initialState");
    if (!Array.isArray(snap.ticks))
        errors.push("Missing or invalid field: ticks (must be array)");
    if (!snap.groundTruth)
        errors.push("Missing required field: groundTruth");
    // 2. Validate initial state
    if (snap.initialState && typeof snap.initialState === "object") {
        const state = snap.initialState;
        if (typeof state.tick !== "number")
            errors.push("initialState.tick must be a number");
        if (state.tick !== 0)
            warnings.push("initialState.tick should be 0");
        if (!Array.isArray(state.predictionMarkets)) {
            errors.push("initialState.predictionMarkets must be an array");
        }
        if (!Array.isArray(state.perpetualMarkets)) {
            errors.push("initialState.perpetualMarkets must be an array");
        }
        if (!Array.isArray(state.agents)) {
            errors.push("initialState.agents must be an array");
        }
    }
    // 3. Validate ticks
    if (Array.isArray(snap.ticks)) {
        if (snap.ticks.length === 0) {
            warnings.push("Ticks array is empty");
        }
        snap.ticks.forEach((tick, index) => {
            if (!tick || typeof tick !== "object") {
                errors.push(`Tick ${index}: invalid tick object`);
                return;
            }
            if (typeof tick.number !== "number") {
                errors.push(`Tick ${index}: missing or invalid 'number' field`);
            }
            if (!Array.isArray(tick.events)) {
                errors.push(`Tick ${index}: events must be an array`);
            }
            if (!tick.state) {
                errors.push(`Tick ${index}: missing state`);
            }
        });
        // Check tick numbering is sequential (only a warning, not an error)
        for (let i = 0; i < snap.ticks.length; i++) {
            const tick = snap.ticks[i];
            if (tick && typeof tick.number === "number" && tick.number !== i) {
                warnings.push(`Tick ${i}: number ${tick.number} doesn't match index`);
            }
        }
    }
    // 4. Validate ground truth
    if (snap.groundTruth && typeof snap.groundTruth === "object") {
        const gt = snap.groundTruth;
        if (!gt.marketOutcomes || typeof gt.marketOutcomes !== "object") {
            errors.push("groundTruth.marketOutcomes must be an object");
        }
        if (!gt.priceHistory || typeof gt.priceHistory !== "object") {
            errors.push("groundTruth.priceHistory must be an object");
        }
        if (!Array.isArray(gt.optimalActions)) {
            errors.push("groundTruth.optimalActions must be an array");
        }
        if (!Array.isArray(gt.socialOpportunities)) {
            errors.push("groundTruth.socialOpportunities must be an array");
        }
        if (!Array.isArray(gt.hiddenFacts)) {
            errors.push("groundTruth.hiddenFacts must be an array");
        }
        if (!Array.isArray(gt.hiddenEvents)) {
            errors.push("groundTruth.hiddenEvents must be an array");
        }
        if (!gt.trueFacts || typeof gt.trueFacts !== "object") {
            errors.push("groundTruth.trueFacts must be an object");
        }
    }
    // 5. Cross-validate: markets in initialState should have outcomes in groundTruth
    if (snap.initialState &&
        typeof snap.initialState === "object" &&
        snap.groundTruth &&
        typeof snap.groundTruth === "object") {
        const initialState = snap.initialState;
        const groundTruth = snap.groundTruth;
        const markets = Array.isArray(initialState.predictionMarkets)
            ? initialState.predictionMarkets
            : [];
        const outcomes = groundTruth.marketOutcomes &&
            typeof groundTruth.marketOutcomes === "object"
            ? groundTruth.marketOutcomes
            : {};
        markets.forEach((market, index) => {
            // A null/undefined entry cannot be inspected; report it instead of
            // letting the property access below throw out of the validator.
            if (market == null) {
                errors.push(`initialState.predictionMarkets[${index}]: invalid market entry`);
                return;
            }
            // Own-property check: `in` would also match inherited names such as
            // "toString" or "constructor" and hide a genuinely missing outcome.
            if (market.id &&
                typeof market.id === "string" &&
                !Object.prototype.hasOwnProperty.call(outcomes, market.id)) {
                warnings.push(`Market ${market.id} in initialState but no outcome in groundTruth`);
            }
        });
    }
    logger.info("Benchmark validation complete", {
        valid: errors.length === 0,
        errors: errors.length,
        warnings: warnings.length,
    });
    return {
        valid: errors.length === 0,
        errors,
        warnings,
    };
}
138
/**
 * Fast structural pre-check of a benchmark snapshot.
 *
 * Only confirms that the value is an object with a truthy `id`, a truthy
 * `initialState`, an array `ticks` and a truthy `groundTruth`; nothing inside
 * those fields is inspected.
 *
 * @param snapshot - Candidate snapshot (any value is accepted)
 * @returns true when all four top-level checks pass, false otherwise
 */
export function sanityCheck(snapshot) {
    const isObject = typeof snapshot === "object" && Boolean(snapshot);
    if (!isObject) {
        return false;
    }
    if (!snapshot.id) {
        return false;
    }
    if (!snapshot.initialState) {
        return false;
    }
    if (!Array.isArray(snapshot.ticks)) {
        return false;
    }
    return Boolean(snapshot.groundTruth);
}
150
/**
 * Validate a benchmark snapshot and raise if it is invalid.
 *
 * Delegates to `validate`; when the result is not valid, all collected error
 * messages are joined with ", " into a single Error. Warnings are ignored.
 *
 * @param snapshot - Candidate snapshot (any value is accepted)
 * @throws Error listing every validation error when the snapshot is invalid
 */
export function validateOrThrow(snapshot) {
    const outcome = validate(snapshot);
    if (outcome.valid) {
        return;
    }
    const details = outcome.errors.join(", ");
    throw new Error(`Invalid benchmark data: ${details}`);
}