@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
|
@@ -11,24 +11,24 @@
|
|
|
11
11
|
* Can run multiple agents and compare their performance.
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
-
import
|
|
15
|
-
import
|
|
16
|
-
import
|
|
17
|
-
import { getAutonomousCoordinator } from
|
|
18
|
-
import { TrajectoryRecorder } from
|
|
19
|
-
import { logger } from
|
|
14
|
+
import { promises as fs } from "node:fs";
|
|
15
|
+
import * as path from "node:path";
|
|
16
|
+
import type { IAgentRuntimeLike } from "../dependencies";
|
|
17
|
+
import { getAutonomousCoordinator } from "../dependencies";
|
|
18
|
+
import { TrajectoryRecorder } from "../training/TrajectoryRecorder";
|
|
19
|
+
import { logger } from "../utils/logger";
|
|
20
20
|
import {
|
|
21
21
|
type BenchmarkConfig,
|
|
22
22
|
BenchmarkDataGenerator,
|
|
23
23
|
type BenchmarkGameSnapshot,
|
|
24
24
|
SeededRandom,
|
|
25
|
-
} from
|
|
26
|
-
import { SimulationA2AInterface } from
|
|
25
|
+
} from "./BenchmarkDataGenerator";
|
|
26
|
+
import { SimulationA2AInterface } from "./SimulationA2AInterface";
|
|
27
27
|
import {
|
|
28
28
|
type SimulationConfig,
|
|
29
29
|
SimulationEngine,
|
|
30
30
|
type SimulationResult,
|
|
31
|
-
} from
|
|
31
|
+
} from "./SimulationEngine";
|
|
32
32
|
|
|
33
33
|
export interface BenchmarkRunConfig {
|
|
34
34
|
/** Path to benchmark snapshot file (or will generate new one) */
|
|
@@ -53,7 +53,7 @@ export interface BenchmarkRunConfig {
|
|
|
53
53
|
forceModel?: string;
|
|
54
54
|
|
|
55
55
|
/** Force a baseline strategy (overrides agent behavior) */
|
|
56
|
-
forceStrategy?:
|
|
56
|
+
forceStrategy?: "random" | "momentum";
|
|
57
57
|
}
|
|
58
58
|
|
|
59
59
|
export interface BenchmarkComparisonResult {
|
|
@@ -104,18 +104,18 @@ export class BenchmarkRunner {
|
|
|
104
104
|
* ```
|
|
105
105
|
*/
|
|
106
106
|
static async runSingle(
|
|
107
|
-
config: BenchmarkRunConfig
|
|
107
|
+
config: BenchmarkRunConfig,
|
|
108
108
|
): Promise<SimulationResult> {
|
|
109
|
-
logger.info(
|
|
109
|
+
logger.info("Starting benchmark run", {
|
|
110
110
|
agentUserId: config.agentUserId,
|
|
111
111
|
benchmarkPath: config.benchmarkPath,
|
|
112
|
-
strategy: config.forceStrategy ||
|
|
112
|
+
strategy: config.forceStrategy || "agent-driven",
|
|
113
113
|
});
|
|
114
114
|
|
|
115
115
|
// 1. Load or generate benchmark
|
|
116
116
|
const snapshot = config.benchmarkPath
|
|
117
|
-
? await
|
|
118
|
-
: await
|
|
117
|
+
? await BenchmarkRunner.loadBenchmark(config.benchmarkPath)
|
|
118
|
+
: await BenchmarkRunner.generateBenchmark(config.generatorConfig!);
|
|
119
119
|
|
|
120
120
|
// 2. Create simulation engine
|
|
121
121
|
const simConfig: SimulationConfig = {
|
|
@@ -132,12 +132,16 @@ export class BenchmarkRunner {
|
|
|
132
132
|
|
|
133
133
|
// Inject A2A interface into agent runtime (if using real agent and not forcing strategy)
|
|
134
134
|
if (!config.forceStrategy) {
|
|
135
|
-
(
|
|
135
|
+
(
|
|
136
|
+
config.agentRuntime as IAgentRuntimeLike & {
|
|
137
|
+
a2aClient?: SimulationA2AInterface;
|
|
138
|
+
}
|
|
139
|
+
).a2aClient = a2aInterface;
|
|
136
140
|
}
|
|
137
141
|
|
|
138
142
|
// Force model if specified (for baseline testing)
|
|
139
143
|
if (config.forceModel) {
|
|
140
|
-
logger.info(
|
|
144
|
+
logger.info("Forcing model for benchmark", {
|
|
141
145
|
agentUserId: config.agentUserId,
|
|
142
146
|
forcedModel: config.forceModel,
|
|
143
147
|
});
|
|
@@ -155,8 +159,8 @@ export class BenchmarkRunner {
|
|
|
155
159
|
}
|
|
156
160
|
|
|
157
161
|
if (runtime.setSetting) {
|
|
158
|
-
runtime.setSetting(
|
|
159
|
-
runtime.setSetting(
|
|
162
|
+
runtime.setSetting("GROQ_LARGE_MODEL", config.forceModel);
|
|
163
|
+
runtime.setSetting("GROQ_SMALL_MODEL", config.forceModel);
|
|
160
164
|
}
|
|
161
165
|
}
|
|
162
166
|
|
|
@@ -170,14 +174,14 @@ export class BenchmarkRunner {
|
|
|
170
174
|
agentId: config.agentUserId,
|
|
171
175
|
scenarioId: `benchmark-${snapshot.id}`,
|
|
172
176
|
});
|
|
173
|
-
logger.info(
|
|
177
|
+
logger.info("Trajectory recording started", { trajectoryId });
|
|
174
178
|
}
|
|
175
179
|
|
|
176
180
|
// 5. Initialize simulation
|
|
177
181
|
engine.initialize();
|
|
178
182
|
|
|
179
183
|
// 6. Run simulation loop
|
|
180
|
-
logger.info(
|
|
184
|
+
logger.info("Starting simulation loop", {
|
|
181
185
|
agentUserId: config.agentUserId,
|
|
182
186
|
totalTicks: snapshot.ticks.length,
|
|
183
187
|
});
|
|
@@ -191,7 +195,7 @@ export class BenchmarkRunner {
|
|
|
191
195
|
// Create seeded RNG for baseline strategies (reproducibility)
|
|
192
196
|
// Use snapshot ID hash as seed for deterministic behavior across runs
|
|
193
197
|
const baselineSeed = config.forceStrategy
|
|
194
|
-
? snapshot.id.split(
|
|
198
|
+
? snapshot.id.split("").reduce((acc, c) => acc + c.charCodeAt(0), 0)
|
|
195
199
|
: 0;
|
|
196
200
|
const baselineRng = config.forceStrategy
|
|
197
201
|
? new SeededRandom(baselineSeed)
|
|
@@ -208,21 +212,21 @@ export class BenchmarkRunner {
|
|
|
208
212
|
`Benchmark progress: ${currentTick}/${snapshot.ticks.length} ticks`,
|
|
209
213
|
{
|
|
210
214
|
agentUserId: config.agentUserId,
|
|
211
|
-
}
|
|
215
|
+
},
|
|
212
216
|
);
|
|
213
217
|
}
|
|
214
218
|
|
|
215
219
|
if (config.forceStrategy && baselineRng) {
|
|
216
220
|
// Execute baseline strategy directly on engine (bypassing LLM)
|
|
217
|
-
await
|
|
221
|
+
await BenchmarkRunner.executeBaselineStrategy(
|
|
218
222
|
config.forceStrategy,
|
|
219
223
|
engine,
|
|
220
|
-
baselineRng
|
|
224
|
+
baselineRng,
|
|
221
225
|
);
|
|
222
226
|
} else {
|
|
223
227
|
if (!coordinator) {
|
|
224
228
|
throw new Error(
|
|
225
|
-
|
|
229
|
+
"AutonomousCoordinator required for agent-driven benchmark but not configured.",
|
|
226
230
|
);
|
|
227
231
|
}
|
|
228
232
|
|
|
@@ -230,7 +234,7 @@ export class BenchmarkRunner {
|
|
|
230
234
|
// Fail fast - don't catch errors, let them propagate
|
|
231
235
|
const tickResult = await coordinator.executeAutonomousTick(
|
|
232
236
|
config.agentUserId,
|
|
233
|
-
config.agentRuntime
|
|
237
|
+
config.agentRuntime,
|
|
234
238
|
);
|
|
235
239
|
|
|
236
240
|
if (tickResult.success && tickResult.actionsExecuted) {
|
|
@@ -243,7 +247,7 @@ export class BenchmarkRunner {
|
|
|
243
247
|
tickResult.actionsExecuted.engagements;
|
|
244
248
|
|
|
245
249
|
if (totalActions > 0) {
|
|
246
|
-
logger.debug(
|
|
250
|
+
logger.debug("Agent took actions", {
|
|
247
251
|
tick: currentTick,
|
|
248
252
|
actions: tickResult.actionsExecuted,
|
|
249
253
|
});
|
|
@@ -259,7 +263,7 @@ export class BenchmarkRunner {
|
|
|
259
263
|
await new Promise((resolve) => setTimeout(resolve, 5));
|
|
260
264
|
}
|
|
261
265
|
|
|
262
|
-
logger.info(
|
|
266
|
+
logger.info("Simulation loop complete", {
|
|
263
267
|
agentUserId: config.agentUserId,
|
|
264
268
|
ticksCompleted,
|
|
265
269
|
totalTicks: snapshot.ticks.length,
|
|
@@ -270,11 +274,11 @@ export class BenchmarkRunner {
|
|
|
270
274
|
|
|
271
275
|
// 8. Validate results - ensure agent actually did something
|
|
272
276
|
if (result.ticksProcessed === 0) {
|
|
273
|
-
throw new Error(
|
|
277
|
+
throw new Error("Benchmark failed: No ticks were processed");
|
|
274
278
|
}
|
|
275
279
|
|
|
276
280
|
if (result.actions.length === 0) {
|
|
277
|
-
logger.warn(
|
|
281
|
+
logger.warn("Benchmark completed but agent took no actions", {
|
|
278
282
|
agentUserId: config.agentUserId,
|
|
279
283
|
ticksProcessed: result.ticksProcessed,
|
|
280
284
|
});
|
|
@@ -286,13 +290,13 @@ export class BenchmarkRunner {
|
|
|
286
290
|
finalPnL: result.metrics.totalPnl,
|
|
287
291
|
finalBalance: undefined, // Let recorder calculate from state
|
|
288
292
|
});
|
|
289
|
-
logger.info(
|
|
293
|
+
logger.info("Trajectory recording saved", { trajectoryId });
|
|
290
294
|
}
|
|
291
295
|
|
|
292
296
|
// 10. Save results
|
|
293
|
-
await
|
|
297
|
+
await BenchmarkRunner.saveResult(result, config.outputDir);
|
|
294
298
|
|
|
295
|
-
logger.info(
|
|
299
|
+
logger.info("Benchmark run completed", {
|
|
296
300
|
agentUserId: config.agentUserId,
|
|
297
301
|
totalPnl: result.metrics.totalPnl,
|
|
298
302
|
accuracy: result.metrics.predictionMetrics.accuracy,
|
|
@@ -308,31 +312,31 @@ export class BenchmarkRunner {
|
|
|
308
312
|
* Uses seeded RNG for reproducibility across benchmark runs.
|
|
309
313
|
*/
|
|
310
314
|
private static async executeBaselineStrategy(
|
|
311
|
-
strategy:
|
|
315
|
+
strategy: "random" | "momentum",
|
|
312
316
|
engine: SimulationEngine,
|
|
313
|
-
rng: SeededRandom
|
|
317
|
+
rng: SeededRandom,
|
|
314
318
|
): Promise<void> {
|
|
315
319
|
const state = engine.getGameState();
|
|
316
320
|
|
|
317
321
|
// Rate limiting: Only trade in ~10% of ticks to simulate realistic frequency
|
|
318
322
|
if (rng.next() > 0.1) return;
|
|
319
323
|
|
|
320
|
-
if (strategy ===
|
|
324
|
+
if (strategy === "random") {
|
|
321
325
|
// Random strategy: Buy prediction shares or open perps randomly
|
|
322
|
-
const actionType = rng.next() > 0.5 ?
|
|
326
|
+
const actionType = rng.next() > 0.5 ? "prediction" : "perp";
|
|
323
327
|
|
|
324
|
-
if (actionType ===
|
|
328
|
+
if (actionType === "prediction" && state.predictionMarkets.length > 0) {
|
|
325
329
|
const marketIndex = Math.floor(
|
|
326
|
-
rng.next() * state.predictionMarkets.length
|
|
330
|
+
rng.next() * state.predictionMarkets.length,
|
|
327
331
|
);
|
|
328
332
|
const market = state.predictionMarkets[marketIndex];
|
|
329
333
|
|
|
330
334
|
if (market) {
|
|
331
|
-
const outcome = rng.next() > 0.5 ?
|
|
335
|
+
const outcome = rng.next() > 0.5 ? "YES" : "NO";
|
|
332
336
|
// Random amount between 10 and 100
|
|
333
337
|
const amount = 10 + rng.next() * 90;
|
|
334
338
|
|
|
335
|
-
await engine.performAction(
|
|
339
|
+
await engine.performAction("buy_prediction", {
|
|
336
340
|
marketId: market.id,
|
|
337
341
|
outcome,
|
|
338
342
|
amount,
|
|
@@ -340,13 +344,13 @@ export class BenchmarkRunner {
|
|
|
340
344
|
}
|
|
341
345
|
} else if (state.perpetualMarkets.length > 0) {
|
|
342
346
|
const perpIndex = Math.floor(
|
|
343
|
-
rng.next() * state.perpetualMarkets.length
|
|
347
|
+
rng.next() * state.perpetualMarkets.length,
|
|
344
348
|
);
|
|
345
349
|
const perp = state.perpetualMarkets[perpIndex];
|
|
346
350
|
|
|
347
351
|
if (perp) {
|
|
348
|
-
const side = rng.next() > 0.5 ?
|
|
349
|
-
await engine.performAction(
|
|
352
|
+
const side = rng.next() > 0.5 ? "LONG" : "SHORT";
|
|
353
|
+
await engine.performAction("open_perp", {
|
|
350
354
|
ticker: perp.ticker,
|
|
351
355
|
side,
|
|
352
356
|
size: 10,
|
|
@@ -354,11 +358,11 @@ export class BenchmarkRunner {
|
|
|
354
358
|
});
|
|
355
359
|
}
|
|
356
360
|
}
|
|
357
|
-
} else if (strategy ===
|
|
361
|
+
} else if (strategy === "momentum") {
|
|
358
362
|
// Momentum strategy: Follow price trends
|
|
359
363
|
if (state.perpetualMarkets.length > 0) {
|
|
360
364
|
const perpIndex = Math.floor(
|
|
361
|
-
rng.next() * state.perpetualMarkets.length
|
|
365
|
+
rng.next() * state.perpetualMarkets.length,
|
|
362
366
|
);
|
|
363
367
|
const perp = state.perpetualMarkets[perpIndex];
|
|
364
368
|
|
|
@@ -366,16 +370,16 @@ export class BenchmarkRunner {
|
|
|
366
370
|
// If price up > 0.5% in 24h, go LONG. If down > 0.5%, go SHORT.
|
|
367
371
|
// If relatively flat, do nothing (hold).
|
|
368
372
|
if (perp.priceChange24h > 0.5) {
|
|
369
|
-
await engine.performAction(
|
|
373
|
+
await engine.performAction("open_perp", {
|
|
370
374
|
ticker: perp.ticker,
|
|
371
|
-
side:
|
|
375
|
+
side: "LONG",
|
|
372
376
|
size: 20,
|
|
373
377
|
leverage: 2,
|
|
374
378
|
});
|
|
375
379
|
} else if (perp.priceChange24h < -0.5) {
|
|
376
|
-
await engine.performAction(
|
|
380
|
+
await engine.performAction("open_perp", {
|
|
377
381
|
ticker: perp.ticker,
|
|
378
|
-
side:
|
|
382
|
+
side: "SHORT",
|
|
379
383
|
size: 20,
|
|
380
384
|
leverage: 2,
|
|
381
385
|
});
|
|
@@ -410,7 +414,7 @@ export class BenchmarkRunner {
|
|
|
410
414
|
*/
|
|
411
415
|
static async runMultiple(
|
|
412
416
|
config: BenchmarkRunConfig,
|
|
413
|
-
numRuns: number
|
|
417
|
+
numRuns: number,
|
|
414
418
|
): Promise<BenchmarkComparisonResult> {
|
|
415
419
|
logger.info(`Running ${numRuns} benchmark iterations`, {
|
|
416
420
|
agentUserId: config.agentUserId,
|
|
@@ -422,7 +426,7 @@ export class BenchmarkRunner {
|
|
|
422
426
|
for (let i = 0; i < numRuns; i++) {
|
|
423
427
|
logger.info(`Starting run ${i + 1}/${numRuns}`);
|
|
424
428
|
|
|
425
|
-
const result = await
|
|
429
|
+
const result = await BenchmarkRunner.runSingle({
|
|
426
430
|
...config,
|
|
427
431
|
outputDir: path.join(config.outputDir, `run-${i + 1}`),
|
|
428
432
|
});
|
|
@@ -431,7 +435,7 @@ export class BenchmarkRunner {
|
|
|
431
435
|
|
|
432
436
|
if (config.saveTrajectory) {
|
|
433
437
|
trajectoryPaths.push(
|
|
434
|
-
path.join(config.outputDir, `run-${i + 1}`,
|
|
438
|
+
path.join(config.outputDir, `run-${i + 1}`, "trajectory.json"),
|
|
435
439
|
);
|
|
436
440
|
}
|
|
437
441
|
|
|
@@ -449,11 +453,11 @@ export class BenchmarkRunner {
|
|
|
449
453
|
runs.reduce((sum, r) => sum + r.metrics.optimalityScore, 0) / runs.length;
|
|
450
454
|
|
|
451
455
|
const bestRun = runs.reduce((best, current) =>
|
|
452
|
-
current.metrics.totalPnl > best.metrics.totalPnl ? current : best
|
|
456
|
+
current.metrics.totalPnl > best.metrics.totalPnl ? current : best,
|
|
453
457
|
);
|
|
454
458
|
|
|
455
459
|
const worstRun = runs.reduce((worst, current) =>
|
|
456
|
-
current.metrics.totalPnl < worst.metrics.totalPnl ? current : worst
|
|
460
|
+
current.metrics.totalPnl < worst.metrics.totalPnl ? current : worst,
|
|
457
461
|
);
|
|
458
462
|
|
|
459
463
|
const comparison = {
|
|
@@ -465,16 +469,16 @@ export class BenchmarkRunner {
|
|
|
465
469
|
};
|
|
466
470
|
|
|
467
471
|
// Save comparison report
|
|
468
|
-
await
|
|
472
|
+
await BenchmarkRunner.saveComparison(
|
|
469
473
|
{
|
|
470
474
|
runs,
|
|
471
475
|
comparison,
|
|
472
476
|
trajectories: config.saveTrajectory ? trajectoryPaths : undefined,
|
|
473
477
|
},
|
|
474
|
-
config.outputDir
|
|
478
|
+
config.outputDir,
|
|
475
479
|
);
|
|
476
480
|
|
|
477
|
-
logger.info(
|
|
481
|
+
logger.info("Multiple benchmarks completed", comparison);
|
|
478
482
|
|
|
479
483
|
return {
|
|
480
484
|
runs,
|
|
@@ -513,7 +517,7 @@ export class BenchmarkRunner {
|
|
|
513
517
|
static async compareAgents(
|
|
514
518
|
agent1Config: BenchmarkRunConfig,
|
|
515
519
|
agent2Config: BenchmarkRunConfig,
|
|
516
|
-
benchmarkPath: string
|
|
520
|
+
benchmarkPath: string,
|
|
517
521
|
): Promise<{
|
|
518
522
|
agent1: SimulationResult;
|
|
519
523
|
agent2: SimulationResult;
|
|
@@ -524,7 +528,7 @@ export class BenchmarkRunner {
|
|
|
524
528
|
optimality: number;
|
|
525
529
|
};
|
|
526
530
|
}> {
|
|
527
|
-
logger.info(
|
|
531
|
+
logger.info("Comparing two agents", {
|
|
528
532
|
agent1: agent1Config.agentUserId,
|
|
529
533
|
agent2: agent2Config.agentUserId,
|
|
530
534
|
benchmark: benchmarkPath,
|
|
@@ -532,8 +536,8 @@ export class BenchmarkRunner {
|
|
|
532
536
|
|
|
533
537
|
// Run both agents on same benchmark (concurrently)
|
|
534
538
|
const [result1, result2] = await Promise.all([
|
|
535
|
-
|
|
536
|
-
|
|
539
|
+
BenchmarkRunner.runSingle({ ...agent1Config, benchmarkPath }),
|
|
540
|
+
BenchmarkRunner.runSingle({ ...agent2Config, benchmarkPath }),
|
|
537
541
|
]);
|
|
538
542
|
|
|
539
543
|
const winner =
|
|
@@ -550,7 +554,7 @@ export class BenchmarkRunner {
|
|
|
550
554
|
result1.metrics.optimalityScore - result2.metrics.optimalityScore,
|
|
551
555
|
};
|
|
552
556
|
|
|
553
|
-
logger.info(
|
|
557
|
+
logger.info("Agent comparison completed", {
|
|
554
558
|
winner,
|
|
555
559
|
delta,
|
|
556
560
|
});
|
|
@@ -571,16 +575,16 @@ export class BenchmarkRunner {
|
|
|
571
575
|
* @throws Error if file cannot be read or parsed
|
|
572
576
|
*/
|
|
573
577
|
private static async loadBenchmark(
|
|
574
|
-
benchmarkPath: string
|
|
578
|
+
benchmarkPath: string,
|
|
575
579
|
): Promise<BenchmarkGameSnapshot> {
|
|
576
580
|
try {
|
|
577
|
-
const data = await fs.readFile(benchmarkPath,
|
|
581
|
+
const data = await fs.readFile(benchmarkPath, "utf-8");
|
|
578
582
|
const parsed = JSON.parse(data) as BenchmarkGameSnapshot;
|
|
579
583
|
|
|
580
584
|
// Validate basic structure
|
|
581
585
|
if (!parsed.id || !parsed.initialState || !parsed.groundTruth) {
|
|
582
586
|
throw new Error(
|
|
583
|
-
`Invalid benchmark file: missing required fields (id, initialState, or groundTruth)
|
|
587
|
+
`Invalid benchmark file: missing required fields (id, initialState, or groundTruth)`,
|
|
584
588
|
);
|
|
585
589
|
}
|
|
586
590
|
|
|
@@ -588,10 +592,10 @@ export class BenchmarkRunner {
|
|
|
588
592
|
} catch (error) {
|
|
589
593
|
if (error instanceof SyntaxError) {
|
|
590
594
|
throw new Error(
|
|
591
|
-
`Failed to parse benchmark JSON file: ${error.message}
|
|
595
|
+
`Failed to parse benchmark JSON file: ${error.message}`,
|
|
592
596
|
);
|
|
593
597
|
}
|
|
594
|
-
if ((error as { code?: string })?.code ===
|
|
598
|
+
if ((error as { code?: string })?.code === "ENOENT") {
|
|
595
599
|
throw new Error(`Benchmark file not found: ${benchmarkPath}`);
|
|
596
600
|
}
|
|
597
601
|
throw error;
|
|
@@ -609,9 +613,9 @@ export class BenchmarkRunner {
|
|
|
609
613
|
* @throws Error if generation fails
|
|
610
614
|
*/
|
|
611
615
|
private static async generateBenchmark(
|
|
612
|
-
config: BenchmarkConfig
|
|
616
|
+
config: BenchmarkConfig,
|
|
613
617
|
): Promise<BenchmarkGameSnapshot> {
|
|
614
|
-
logger.info(
|
|
618
|
+
logger.info("Generating new benchmark", config);
|
|
615
619
|
|
|
616
620
|
const generator = new BenchmarkDataGenerator(config);
|
|
617
621
|
const snapshot = await generator.generate();
|
|
@@ -619,13 +623,13 @@ export class BenchmarkRunner {
|
|
|
619
623
|
// Save for reuse
|
|
620
624
|
const outputPath = path.join(
|
|
621
625
|
process.cwd(),
|
|
622
|
-
|
|
623
|
-
`benchmark-${snapshot.id}.json
|
|
626
|
+
"benchmarks",
|
|
627
|
+
`benchmark-${snapshot.id}.json`,
|
|
624
628
|
);
|
|
625
629
|
await fs.mkdir(path.dirname(outputPath), { recursive: true });
|
|
626
630
|
await fs.writeFile(outputPath, JSON.stringify(snapshot, null, 2));
|
|
627
631
|
|
|
628
|
-
logger.info(
|
|
632
|
+
logger.info("Benchmark generated and saved", { path: outputPath });
|
|
629
633
|
|
|
630
634
|
return snapshot;
|
|
631
635
|
}
|
|
@@ -641,26 +645,26 @@ export class BenchmarkRunner {
|
|
|
641
645
|
*/
|
|
642
646
|
private static async saveResult(
|
|
643
647
|
result: SimulationResult,
|
|
644
|
-
outputDir: string
|
|
648
|
+
outputDir: string,
|
|
645
649
|
): Promise<void> {
|
|
646
650
|
await fs.mkdir(outputDir, { recursive: true });
|
|
647
651
|
|
|
648
652
|
// Save full result
|
|
649
|
-
const resultPath = path.join(outputDir,
|
|
653
|
+
const resultPath = path.join(outputDir, "result.json");
|
|
650
654
|
await fs.writeFile(resultPath, JSON.stringify(result, null, 2));
|
|
651
655
|
|
|
652
656
|
// Save metrics summary
|
|
653
|
-
const metricsPath = path.join(outputDir,
|
|
657
|
+
const metricsPath = path.join(outputDir, "metrics.json");
|
|
654
658
|
await fs.writeFile(metricsPath, JSON.stringify(result.metrics, null, 2));
|
|
655
659
|
|
|
656
660
|
// Save trajectory
|
|
657
|
-
const trajectoryPath = path.join(outputDir,
|
|
661
|
+
const trajectoryPath = path.join(outputDir, "trajectory.json");
|
|
658
662
|
await fs.writeFile(
|
|
659
663
|
trajectoryPath,
|
|
660
|
-
JSON.stringify(result.trajectory, null, 2)
|
|
664
|
+
JSON.stringify(result.trajectory, null, 2),
|
|
661
665
|
);
|
|
662
666
|
|
|
663
|
-
logger.debug(
|
|
667
|
+
logger.debug("Results saved", { outputDir });
|
|
664
668
|
}
|
|
665
669
|
|
|
666
670
|
/**
|
|
@@ -673,13 +677,13 @@ export class BenchmarkRunner {
|
|
|
673
677
|
*/
|
|
674
678
|
private static async saveComparison(
|
|
675
679
|
comparison: BenchmarkComparisonResult,
|
|
676
|
-
outputDir: string
|
|
680
|
+
outputDir: string,
|
|
677
681
|
): Promise<void> {
|
|
678
682
|
await fs.mkdir(outputDir, { recursive: true });
|
|
679
683
|
|
|
680
|
-
const comparisonPath = path.join(outputDir,
|
|
684
|
+
const comparisonPath = path.join(outputDir, "comparison.json");
|
|
681
685
|
await fs.writeFile(comparisonPath, JSON.stringify(comparison, null, 2));
|
|
682
686
|
|
|
683
|
-
logger.debug(
|
|
687
|
+
logger.debug("Comparison saved", { outputDir });
|
|
684
688
|
}
|
|
685
689
|
}
|