@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/.turbo/turbo-lint.log +2 -0
  2. package/.turbo/turbo-typecheck.log +1 -0
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/adapter.js +59 -0
  5. package/dist/archetypes/ArchetypeConfigService.js +510 -0
  6. package/dist/archetypes/derive-archetype.js +196 -0
  7. package/dist/archetypes/index.js +7 -0
  8. package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
  9. package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
  10. package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
  11. package/dist/benchmark/BenchmarkDataViewer.js +197 -0
  12. package/dist/benchmark/BenchmarkHistoryService.js +135 -0
  13. package/dist/benchmark/BenchmarkRunner.js +483 -0
  14. package/dist/benchmark/BenchmarkValidator.js +158 -0
  15. package/dist/benchmark/FastEvalRunner.js +133 -0
  16. package/dist/benchmark/MetricsValidator.js +104 -0
  17. package/dist/benchmark/MetricsVisualizer.js +775 -0
  18. package/dist/benchmark/ModelBenchmarkService.js +433 -0
  19. package/dist/benchmark/ModelRegistry.js +122 -0
  20. package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
  21. package/dist/benchmark/SimulationA2AInterface.js +683 -0
  22. package/dist/benchmark/SimulationEngine.js +522 -0
  23. package/dist/benchmark/TaskRunner.js +60 -0
  24. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
  25. package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
  26. package/dist/benchmark/index.js +23 -0
  27. package/dist/benchmark/parseSimulationMetrics.js +86 -0
  28. package/dist/benchmark/simulation-types.js +1 -0
  29. package/dist/dependencies.js +197 -0
  30. package/dist/generation/TrajectoryGenerator.js +244 -0
  31. package/dist/generation/index.js +6 -0
  32. package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
  33. package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
  34. package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
  35. package/dist/huggingface/index.js +9 -0
  36. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
  37. package/dist/index.js +41 -0
  38. package/dist/init-training.js +43 -0
  39. package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
  40. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
  41. package/dist/metrics/index.js +7 -0
  42. package/dist/metrics/types.js +21 -0
  43. package/dist/rubrics/__tests__/index.test.js +150 -0
  44. package/dist/rubrics/ass-kisser.js +83 -0
  45. package/dist/rubrics/degen.js +78 -0
  46. package/dist/rubrics/goody-twoshoes.js +82 -0
  47. package/dist/rubrics/index.js +184 -0
  48. package/dist/rubrics/information-trader.js +82 -0
  49. package/dist/rubrics/infosec.js +99 -0
  50. package/dist/rubrics/liar.js +102 -0
  51. package/dist/rubrics/perps-trader.js +85 -0
  52. package/dist/rubrics/researcher.js +79 -0
  53. package/dist/rubrics/scammer.js +80 -0
  54. package/dist/rubrics/social-butterfly.js +71 -0
  55. package/dist/rubrics/super-predictor.js +95 -0
  56. package/dist/rubrics/trader.js +65 -0
  57. package/dist/scoring/ArchetypeScoringService.js +301 -0
  58. package/dist/scoring/JudgePromptBuilder.js +401 -0
  59. package/dist/scoring/LLMJudgeCache.js +263 -0
  60. package/dist/scoring/index.js +8 -0
  61. package/dist/training/AutomationPipeline.js +714 -0
  62. package/dist/training/BenchmarkService.js +370 -0
  63. package/dist/training/ConfigValidator.js +153 -0
  64. package/dist/training/MarketOutcomesTracker.js +142 -0
  65. package/dist/training/ModelDeployer.js +128 -0
  66. package/dist/training/ModelFetcher.js +48 -0
  67. package/dist/training/ModelSelectionService.js +248 -0
  68. package/dist/training/ModelUsageVerifier.js +106 -0
  69. package/dist/training/MultiModelOrchestrator.js +349 -0
  70. package/dist/training/RLModelConfig.js +295 -0
  71. package/dist/training/RewardBackpropagationService.js +117 -0
  72. package/dist/training/RulerScoringService.js +450 -0
  73. package/dist/training/TrainingMonitor.js +108 -0
  74. package/dist/training/TrajectoryRecorder.js +281 -0
  75. package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
  76. package/dist/training/index.js +30 -0
  77. package/dist/training/logRLConfig.js +29 -0
  78. package/dist/training/pipeline.js +80 -0
  79. package/dist/training/storage/ModelStorageService.js +190 -0
  80. package/dist/training/storage/TrainingDataArchiver.js +136 -0
  81. package/dist/training/storage/index.js +7 -0
  82. package/dist/training/types.js +6 -0
  83. package/dist/training/window-utils.js +100 -0
  84. package/dist/utils/index.js +73 -0
  85. package/dist/utils/logger.js +55 -0
  86. package/dist/utils/snowflake.js +15 -0
  87. package/dist/utils/synthetic-detector.js +67 -0
  88. package/package.json +2 -2
  89. package/research-output/training-runs/training-run-1773742857616.json +38 -0
  90. package/research-output/training-runs/training-run-1773742946977.json +38 -0
  91. package/research-output/training-runs/training-run-1773743278891.json +38 -0
  92. package/research-output/training-runs/training-run-1773743409754.json +38 -0
  93. package/research-output/training-runs/training-run-1773743651086.json +38 -0
  94. package/research-output/training-runs/training-run-1773743782883.json +38 -0
@@ -0,0 +1,522 @@
1
+ /**
2
+ * Simulation Engine
3
+ *
4
+ * Replays pre-generated benchmark data to provide a deterministic game environment.
5
+ * Agents can query game state and make actions, but the underlying game progression
6
+ * is fixed based on the benchmark snapshot.
7
+ *
8
+ * Key features:
9
+ * - Tick-by-tick replay of pre-recorded game data
10
+ * - Fast-forward mode (ticks advance on agent response)
11
+ * - A2A-compatible interface
12
+ * - Tracks agent actions for performance evaluation
13
+ */
14
+ import { logger } from "../utils/logger";
15
+ import { MetricsValidator } from "./MetricsValidator";
16
/**
 * Deterministic replay environment for a pre-generated benchmark snapshot.
 *
 * The engine does not drive the agent itself: an external runner queries
 * state via {@link SimulationEngine#getGameState}, submits actions via
 * {@link SimulationEngine#performAction}, and moves time forward with
 * {@link SimulationEngine#advanceTick}. {@link SimulationEngine#run} then
 * summarizes whatever has been recorded so far.
 */
export class SimulationEngine {
    config;
    currentTick = 0;
    actions = [];
    startTime = 0;
    pnlHistory = [];
    /** Agent positions tracked for metrics calculation */
    predictionPositions = new Map();
    perpPositions = new Map();
    socialStats = {
        postsCreated: 0,
        groupsJoined: 0,
        messagesReceived: 0,
    };
    /**
     * Monotonic counter appended to generated IDs. `Date.now()` alone is not
     * unique: two positions opened within the same millisecond would share a
     * key and the second `Map.set` would silently overwrite the first.
     */
    idSequence = 0;
    /**
     * @param config Engine configuration. This class reads `config.agentId`
     *   and `config.snapshot` (`id`, `initialState`, `ticks`, `groundTruth`).
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Compute the final simulation result.
     *
     * This does not step through ticks. In fast-forward mode the simulation
     * must already have been driven externally:
     * 1. External runner calls agent logic
     * 2. Agent makes A2A calls which trigger actions
     * 3. External runner calls advanceTick() to move forward
     *
     * Validation failures and warnings are logged but do not abort the run.
     *
     * @returns {Promise<object>} Result with id, agent/benchmark ids, timing,
     *   recorded actions, metrics, trajectory and PnL history.
     */
    async run() {
        // If initialize() was never called, fall back to "now" as the start time.
        if (this.startTime === 0) {
            this.startTime = Date.now();
        }
        const endTime = Date.now();
        const metrics = this.calculateMetrics();
        const validation = MetricsValidator.validate(metrics, this.actions, this.config.snapshot.groundTruth);
        if (!validation.valid) {
            logger.error("Metrics validation failed", {
                errors: validation.errors,
                warnings: validation.warnings,
            });
        }
        if (validation.warnings.length > 0) {
            logger.warn("Metrics validation warnings", {
                warnings: validation.warnings,
            });
        }
        const trajectory = this.buildTrajectory();
        logger.info("Simulation completed", {
            duration: endTime - this.startTime,
            ticksProcessed: this.currentTick,
            totalPnl: metrics.totalPnl,
            actionsCount: this.actions.length,
            metricsValid: validation.valid,
        });
        return {
            id: `sim-${Date.now()}`,
            agentId: this.config.agentId,
            benchmarkId: this.config.snapshot.id,
            startTime: this.startTime,
            endTime,
            ticksProcessed: this.currentTick,
            actions: this.actions,
            metrics,
            trajectory,
            pnlHistory: this.pnlHistory,
        };
    }
    /**
     * Initialize simulation: stamps the start time and rewinds the tick
     * counter and PnL history.
     *
     * NOTE(review): recorded actions, positions and social stats are NOT
     * cleared here, so re-initializing a used engine keeps them. Confirm
     * whether callers rely on that before changing it.
     */
    initialize() {
        this.startTime = Date.now();
        this.currentTick = 0;
        this.pnlHistory = [];
        logger.info("Simulation initialized", {
            benchmarkId: this.config.snapshot.id,
            agentId: this.config.agentId,
            totalTicks: this.config.snapshot.ticks.length,
        });
    }
    /**
     * Check if simulation is complete.
     * @returns {boolean} true once every snapshot tick has been processed.
     */
    isComplete() {
        return this.currentTick >= this.config.snapshot.ticks.length;
    }
    /**
     * Get total number of ticks in this simulation.
     * @returns {number}
     */
    getTotalTicks() {
        return this.config.snapshot.ticks.length;
    }
    /**
     * Get current tick number.
     * @returns {number}
     */
    getCurrentTickNumber() {
        return this.currentTick;
    }
    /**
     * Apply tick updates (called after agent actions).
     * Re-values open positions against the current tick's market prices;
     * does nothing when the current tick is past the end of the snapshot.
     */
    applyTickUpdates() {
        const tick = this.config.snapshot.ticks[this.currentTick];
        if (tick) {
            this.updatePositionValues(tick);
        }
    }
    /**
     * Get current game state (called by agent via A2A).
     *
     * At tick 0 this is the snapshot's initial state; afterwards it is the
     * state of the most recently completed tick (`currentTick - 1`), falling
     * back to the initial state if that tick is missing.
     */
    getGameState() {
        if (this.currentTick === 0) {
            return this.config.snapshot.initialState;
        }
        const tick = this.config.snapshot.ticks[this.currentTick - 1];
        return tick ? tick.state : this.config.snapshot.initialState;
    }
    /**
     * Get specific historical tick (for backtesting/analysis).
     * @param {number} tickNumber Index into the snapshot's tick array.
     * @returns The tick's state, or null when the index is out of range.
     */
    getTickState(tickNumber) {
        const tick = this.config.snapshot.ticks[tickNumber];
        return tick ? tick.state : null;
    }
    /**
     * Agent performs an action.
     *
     * Supported types: "buy_prediction", "open_perp", "close_perp",
     * "join_group", "create_post". Successful actions are appended to
     * `this.actions` together with optional correctness metadata; unknown
     * types and handler errors are reported in the return value and are not
     * recorded.
     *
     * @param {string} type Action type.
     * @param {object} data Action payload; shape depends on `type`.
     * @returns {Promise<{success: boolean, result?: object, error?: string}>}
     */
    async performAction(type, data) {
        const actionStart = Date.now();
        let result;
        let correctness;
        try {
            switch (type) {
                case "buy_prediction": {
                    result = this.handleBuyPrediction(data);
                    const { marketId, outcome } = data;
                    // Track correctness for prediction markets (only when the
                    // ground truth actually knows the outcome).
                    const marketOutcome = this.config.snapshot.groundTruth.marketOutcomes[marketId];
                    if (marketOutcome !== undefined) {
                        const predictedOutcome = outcome === "YES";
                        const isCorrect = predictedOutcome === marketOutcome;
                        correctness = {
                            predictionCorrect: isCorrect,
                            actualOutcome: marketOutcome,
                            predictedOutcome,
                        };
                    }
                    break;
                }
                case "open_perp": {
                    result = this.handleOpenPerp(data);
                    const { ticker, side } = data;
                    // Track correctness for perp trades based on sentiment and price movement
                    const state = this.getGameState();
                    const market = state.perpetualMarkets.find((m) => m.ticker === ticker);
                    if (market) {
                        // Simplified sentiment: compare the current price with the
                        // recorded price 10 ticks ahead (clamped to the last entry).
                        const priceHistory = this.config.snapshot.groundTruth.priceHistory[ticker];
                        const currentPrice = market.price;
                        const futurePrice = priceHistory?.[Math.min(this.currentTick + 10, priceHistory.length - 1)]?.price;
                        if (futurePrice !== undefined) {
                            const priceChange = (futurePrice - currentPrice) / currentPrice;
                            const sentimentAtTrade = priceChange > 0 ? 0.5 : -0.5; // Simplified sentiment
                            // Negative sentiment + SHORT, or positive sentiment + LONG, is correct.
                            const expectedDirection = sentimentAtTrade < 0 ? "down" : "up";
                            const tradeDirection = side === "SHORT" ? "down" : "up";
                            const isCorrect = expectedDirection === tradeDirection;
                            correctness = {
                                perpCorrect: isCorrect,
                                sentimentAtTrade,
                                priceChange,
                                expectedDirection,
                            };
                        }
                    }
                    break;
                }
                case "close_perp":
                    result = this.handleClosePerp(data);
                    break;
                case "join_group":
                    result = this.handleJoinGroup(data);
                    break;
                case "create_post":
                    result = this.handleCreatePost(data);
                    break;
                default:
                    return { success: false, error: `Unknown action type: ${type}` };
            }
            // Record action with correctness metadata
            this.actions.push({
                tick: this.currentTick,
                timestamp: Date.now(),
                type,
                data,
                duration: Date.now() - actionStart,
                correctness,
            });
            return { success: true, result };
        }
        catch (error) {
            return {
                success: false,
                error: error instanceof Error ? error.message : String(error),
            };
        }
    }
    /**
     * Advance to next tick (called after agent finishes processing).
     * Re-values open positions, appends a PnL history point for the current
     * tick, then increments the tick counter.
     */
    advanceTick() {
        // Apply any price-based updates for current tick
        this.applyTickUpdates();
        // Record PnL history
        const currentPnl = this.calculateCurrentTotalPnl();
        this.pnlHistory.push({
            tick: this.currentTick,
            pnl: currentPnl,
        });
        // Move to next tick
        this.currentTick++;
    }
    /**
     * Calculate current perp PnL (realized + unrealized).
     *
     * Prediction market PnL is not included: in this simple engine it is only
     * realized on resolution (see calculateMetrics).
     * @returns {number}
     */
    calculateCurrentTotalPnl() {
        let totalPnl = 0;
        for (const pos of this.perpPositions.values()) {
            totalPnl += pos.realizedPnl || 0;
            totalPnl += pos.unrealizedPnl || 0;
        }
        return totalPnl;
    }
    /**
     * Stop simulation early.
     */
    stop() {
        // Stop simulation (currently not actively used but kept for API compatibility)
    }
    /**
     * Build an engine-unique identifier.
     * @param {string} prefix ID prefix such as "pos", "perp" or "post".
     * @returns {string} `<prefix>-<epoch ms>-<sequence>`.
     */
    generateId(prefix) {
        const id = `${prefix}-${Date.now()}-${this.idSequence}`;
        this.idSequence += 1;
        return id;
    }
    /**
     * Signed PnL of a perp position at a given market price.
     * LONG gains when the price rises above entry; any other side is treated
     * as short and gains when it falls.
     */
    computePerpPnl(position, price) {
        const priceChange = price - position.entryPrice;
        const signedChange = position.side === "LONG" ? priceChange : -priceChange;
        return signedChange * position.size * position.leverage;
    }
    /**
     * Whether a prediction bet matches the resolved market outcome.
     *
     * NOTE(review): an unresolved market (`marketOutcome === undefined`) makes
     * every "NO" bet count as correct. Kept as-is for compatibility; confirm
     * whether unresolved markets can occur in ground truth.
     */
    isPredictionCorrect(outcome, marketOutcome) {
        return Boolean((outcome === "YES" && marketOutcome) ||
            (outcome === "NO" && !marketOutcome));
    }
    /**
     * Handle buying prediction market shares.
     * @param {{marketId: string, outcome: string, amount: number}} data
     * @returns {{positionId: string, shares: number}}
     * @throws {Error} When the market is not present in the current state.
     */
    handleBuyPrediction(data) {
        const { marketId, outcome, amount } = data;
        const state = this.getGameState();
        const market = state.predictionMarkets.find((m) => m.id === marketId);
        if (!market) {
            throw new Error(`Market ${marketId} not found`);
        }
        // Calculate shares based on current price
        const price = outcome === "YES" ? market.yesPrice : market.noPrice;
        const shares = amount / price;
        // Record position
        const positionId = this.generateId("pos");
        this.predictionPositions.set(positionId, {
            marketId,
            outcome,
            shares,
            entryPrice: price,
            amount,
            openedAt: this.currentTick,
        });
        return { positionId, shares };
    }
    /**
     * Handle opening perpetual position.
     * @param {{ticker: string, side: string, size: number, leverage: number}} data
     * @returns {{positionId: string}}
     * @throws {Error} When the ticker is not present in the current state.
     */
    handleOpenPerp(data) {
        const { ticker, side, size, leverage } = data;
        const state = this.getGameState();
        const market = state.perpetualMarkets.find((m) => m.ticker === ticker);
        if (!market) {
            throw new Error(`Market ${ticker} not found`);
        }
        const positionId = this.generateId("perp");
        this.perpPositions.set(positionId, {
            ticker,
            side,
            size,
            leverage,
            entryPrice: market.price,
            openedAt: this.currentTick,
            unrealizedPnl: 0,
        });
        return { positionId };
    }
    /**
     * Handle closing perpetual position.
     * @param {{positionId: string}} data
     * @returns {{pnl: number}} Realized PnL at the current state's price.
     * @throws {Error} When the position or its market cannot be found.
     */
    handleClosePerp(data) {
        const { positionId } = data;
        const position = this.perpPositions.get(positionId);
        if (!position) {
            throw new Error(`Position ${positionId} not found`);
        }
        const state = this.getGameState();
        const market = state.perpetualMarkets.find((m) => m.ticker === position.ticker);
        if (!market) {
            throw new Error(`Market ${position.ticker} not found`);
        }
        // Calculate realized P&L
        const pnl = this.computePerpPnl(position, market.price);
        position.realizedPnl = pnl;
        position.unrealizedPnl = 0;
        position.closedAt = this.currentTick;
        return { pnl };
    }
    /**
     * Handle joining group chat (only counted, payload is ignored).
     */
    handleJoinGroup(_data) {
        this.socialStats.groupsJoined++;
        return { success: true };
    }
    /**
     * Handle creating post (only counted, payload is ignored).
     */
    handleCreatePost(_data) {
        this.socialStats.postsCreated++;
        return { postId: this.generateId("post") };
    }
    /**
     * Update open perp positions with unrealized P&L at the tick's prices.
     * @param tick Snapshot tick whose `state.perpetualMarkets` supplies prices.
     */
    updatePositionValues(tick) {
        for (const position of this.perpPositions.values()) {
            // Explicit undefined check: a position closed at tick 0 has
            // closedAt === 0, which a truthiness test would treat as "open"
            // and keep re-valuing, double-counting its PnL.
            if (position.closedAt !== undefined) {
                continue;
            }
            const market = tick.state.perpetualMarkets.find((m) => m.ticker === position.ticker);
            if (!market) {
                continue;
            }
            position.unrealizedPnl = this.computePerpPnl(position, market.price);
        }
    }
    /**
     * Calculate comprehensive metrics over everything recorded so far:
     * prediction accuracy/PnL, perp win rate/PnL/drawdown, social counters,
     * action timing and the optimality score.
     */
    calculateMetrics() {
        let totalPnl = 0;
        // Prediction markets
        let correctPredictions = 0;
        let incorrectPredictions = 0;
        let predictionPnl = 0;
        for (const position of this.predictionPositions.values()) {
            const marketOutcome = this.config.snapshot.groundTruth.marketOutcomes[position.marketId];
            if (this.isPredictionCorrect(position.outcome, marketOutcome)) {
                correctPredictions++;
                predictionPnl += position.amount; // Simplified: win amount = stake
            }
            else {
                incorrectPredictions++;
                predictionPnl -= position.amount;
            }
        }
        totalPnl += predictionPnl;
        // Perpetual trades
        let profitableTrades = 0;
        let perpPnl = 0;
        let maxDrawdown = 0;
        let runningPnl = 0;
        let peak = 0;
        for (const position of this.perpPositions.values()) {
            // Closed positions report realized PnL, open ones their last mark.
            const pnl = position.realizedPnl ?? position.unrealizedPnl;
            perpPnl += pnl;
            if (pnl > 0) {
                profitableTrades++;
            }
            // Track drawdown (in position-insertion order)
            runningPnl += pnl;
            if (runningPnl > peak) {
                peak = runningPnl;
            }
            const drawdown = peak - runningPnl;
            if (drawdown > maxDrawdown) {
                maxDrawdown = drawdown;
            }
        }
        totalPnl += perpPnl;
        // Timing metrics
        const responseTimes = this.actions.map((a) => a.duration);
        const hasActions = responseTimes.length > 0;
        const avgResponseTime = hasActions
            ? responseTimes.reduce((sum, t) => sum + t, 0) / responseTimes.length
            : 0;
        // reduce instead of Math.max(...arr): spreading a very large action log
        // into call arguments can exceed the engine's argument limit.
        const maxResponseTime = hasActions
            ? responseTimes.reduce((max, t) => Math.max(max, t), -Infinity)
            : 0;
        // Calculate optimality score (how well did agent follow optimal actions)
        const optimalityScore = this.calculateOptimalityScore();
        const predictionCount = this.predictionPositions.size;
        const perpCount = this.perpPositions.size;
        return {
            totalPnl,
            predictionMetrics: {
                totalPositions: predictionCount,
                correctPredictions,
                incorrectPredictions,
                accuracy: predictionCount > 0 ? correctPredictions / predictionCount : 0,
                avgPnlPerPosition: predictionCount > 0 ? predictionPnl / predictionCount : 0,
            },
            perpMetrics: {
                totalTrades: perpCount,
                profitableTrades,
                winRate: perpCount > 0 ? profitableTrades / perpCount : 0,
                avgPnlPerTrade: perpCount > 0 ? perpPnl / perpCount : 0,
                maxDrawdown,
            },
            socialMetrics: {
                postsCreated: this.socialStats.postsCreated,
                groupsJoined: this.socialStats.groupsJoined,
                messagesReceived: this.socialStats.messagesReceived,
                reputationGained: correctPredictions * 10 - incorrectPredictions * 5,
            },
            timing: {
                avgResponseTime,
                maxResponseTime,
                totalDuration: Date.now() - this.startTime,
            },
            optimalityScore,
        };
    }
    /**
     * Calculate how close agent came to optimal play.
     *
     * An optimal action counts as matched when the agent recorded an action
     * of the same type on the same target within +/- 2 ticks. Only
     * "buy_prediction" and "open_perp" are matchable.
     *
     * @returns {number} Percentage (0-100) of optimal actions matched; 0 when
     *   the ground truth lists none.
     */
    calculateOptimalityScore() {
        const optimalActions = this.config.snapshot.groundTruth.optimalActions;
        let matchedActions = 0;
        for (const optimalAction of optimalActions) {
            // Check if agent took this action within reasonable window
            const windowStart = optimalAction.tick - 2;
            const windowEnd = optimalAction.tick + 2;
            const matched = this.actions.some((a) => {
                if (a.tick < windowStart || a.tick > windowEnd) {
                    return false;
                }
                // Match action type and target
                if (optimalAction.type === "buy_prediction" &&
                    a.type === "buy_prediction") {
                    return a.data.marketId === optimalAction.target;
                }
                if (optimalAction.type === "open_perp" && a.type === "open_perp") {
                    return a.data.ticker === optimalAction.target;
                }
                return false;
            });
            if (matched) {
                matchedActions++;
            }
        }
        return optimalActions.length > 0
            ? (matchedActions / optimalActions.length) * 100
            : 0;
    }
    /**
     * Reward for a single recorded action: +1 / -1 for a prediction purchase
     * depending on the resolved outcome, 0 for every other action type.
     *
     * The reward is derived from the action's own payload rather than from
     * "some position opened on the same tick", so several purchases within
     * one tick each receive their own reward.
     */
    rewardForAction(action) {
        if (action.type !== "buy_prediction") {
            return 0;
        }
        const { marketId, outcome } = action.data;
        const marketOutcome = this.config.snapshot.groundTruth.marketOutcomes[marketId];
        return this.isPredictionCorrect(outcome, marketOutcome) ? 1.0 : -1.0;
    }
    /**
     * Build trajectory data for RL training.
     *
     * `states`, `actions` and `rewards` are parallel arrays: entry i of each
     * belongs to the same recorded action. Actions whose tick has no snapshot
     * entry (e.g. taken after the last tick) have no state and are left out
     * of the trajectory so the arrays stay aligned.
     */
    buildTrajectory() {
        const { ticks, id } = this.config.snapshot;
        const states = [];
        const actions = [];
        const rewards = [];
        for (const action of this.actions) {
            const tick = ticks[action.tick];
            if (!tick) {
                continue;
            }
            states.push(tick.state);
            actions.push(action);
            rewards.push(this.rewardForAction(action));
        }
        return {
            states,
            actions,
            rewards,
            windowId: `benchmark-${id}`,
        };
    }
}
@@ -0,0 +1,60 @@
1
+ import { getAgentRuntimeManager, getAgentService, getTaskInteractor, } from "../dependencies";
2
+ import { logger } from "../utils/logger";
3
/**
 * Benchmarks a single task prompt by running it repeatedly against one
 * freshly created agent and collecting a per-iteration outcome record.
 */
export class TaskRunner {
    config;
    /**
     * @param config Benchmark settings. This class reads `agentName`,
     *   `iterations` and `taskPrompt`.
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Create a temporary agent, obtain its runtime, and execute the task
     * prompt `config.iterations` times in sequence.
     *
     * A failing iteration is logged and recorded as an unsuccessful entry;
     * it does not stop the remaining iterations.
     *
     * @returns {Promise<object[]>} One entry per iteration with `iteration`,
     *   `success`, `response`, `error`, `duration` and, for completed calls,
     *   `trajectoryId`.
     * @throws {Error} When no runtime can be obtained for the created agent.
     */
    async run() {
        const logScope = "TaskRunner";
        logger.info("Starting Task Benchmark", { config: this.config }, logScope);
        const agents = getAgentService();
        const runtimes = getAgentRuntimeManager();
        const interactor = getTaskInteractor();
        // Step 1: a dedicated throwaway agent gives every benchmark run a clean state.
        const benchmarkAgent = await agents.createAgent({
            userId: "task-benchmark-manager", // placeholder manager id
            name: this.config.agentName,
            system: "You are a helpful assistant.", // baseline system prompt
            bio: ["Helpful", "Smart"],
            modelTier: "standard", // or whatever maps to config.model internally
        });
        const agentRuntime = await runtimes.getRuntime(benchmarkAgent.id);
        if (!agentRuntime) {
            throw new Error(`Failed to get runtime for agent ${benchmarkAgent.id}`);
        }
        // Step 2: execute the prompt once per iteration, strictly one after another.
        const outcomes = [];
        for (let index = 0; index < this.config.iterations; index += 1) {
            const iteration = index + 1;
            logger.info(`Running iteration ${iteration}/${this.config.iterations}...`, {}, logScope);
            const startedAt = Date.now();
            let entry;
            try {
                const taskResult = await interactor.executeTask(agentRuntime, this.config.taskPrompt);
                entry = {
                    iteration,
                    success: taskResult.success,
                    response: taskResult.response,
                    trajectoryId: taskResult.trajectoryId,
                    error: taskResult.error,
                    duration: Date.now() - startedAt,
                };
            }
            catch (failure) {
                logger.error("Iteration failed", { error: failure }, logScope);
                entry = {
                    iteration,
                    success: false,
                    response: "",
                    error: failure instanceof Error ? failure.message : String(failure),
                    duration: Date.now() - startedAt,
                };
            }
            outcomes.push(entry);
        }
        // Step 3: no cleanup yet. The temporary agent is left in place because
        // no delete operation is used here (dependencies offer `resetRuntime`
        // but not deleteAgent).
        return outcomes;
    }
}