@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-lint.log +2 -0
- package/.turbo/turbo-typecheck.log +1 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/adapter.js +59 -0
- package/dist/archetypes/ArchetypeConfigService.js +510 -0
- package/dist/archetypes/derive-archetype.js +196 -0
- package/dist/archetypes/index.js +7 -0
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
- package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
- package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
- package/dist/benchmark/BenchmarkDataViewer.js +197 -0
- package/dist/benchmark/BenchmarkHistoryService.js +135 -0
- package/dist/benchmark/BenchmarkRunner.js +483 -0
- package/dist/benchmark/BenchmarkValidator.js +158 -0
- package/dist/benchmark/FastEvalRunner.js +133 -0
- package/dist/benchmark/MetricsValidator.js +104 -0
- package/dist/benchmark/MetricsVisualizer.js +775 -0
- package/dist/benchmark/ModelBenchmarkService.js +433 -0
- package/dist/benchmark/ModelRegistry.js +122 -0
- package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
- package/dist/benchmark/SimulationA2AInterface.js +683 -0
- package/dist/benchmark/SimulationEngine.js +522 -0
- package/dist/benchmark/TaskRunner.js +60 -0
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
- package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
- package/dist/benchmark/index.js +23 -0
- package/dist/benchmark/parseSimulationMetrics.js +86 -0
- package/dist/benchmark/simulation-types.js +1 -0
- package/dist/dependencies.js +197 -0
- package/dist/generation/TrajectoryGenerator.js +244 -0
- package/dist/generation/index.js +6 -0
- package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
- package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
- package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
- package/dist/huggingface/index.js +9 -0
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
- package/dist/index.js +41 -0
- package/dist/init-training.js +43 -0
- package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/types.js +21 -0
- package/dist/rubrics/__tests__/index.test.js +150 -0
- package/dist/rubrics/ass-kisser.js +83 -0
- package/dist/rubrics/degen.js +78 -0
- package/dist/rubrics/goody-twoshoes.js +82 -0
- package/dist/rubrics/index.js +184 -0
- package/dist/rubrics/information-trader.js +82 -0
- package/dist/rubrics/infosec.js +99 -0
- package/dist/rubrics/liar.js +102 -0
- package/dist/rubrics/perps-trader.js +85 -0
- package/dist/rubrics/researcher.js +79 -0
- package/dist/rubrics/scammer.js +80 -0
- package/dist/rubrics/social-butterfly.js +71 -0
- package/dist/rubrics/super-predictor.js +95 -0
- package/dist/rubrics/trader.js +65 -0
- package/dist/scoring/ArchetypeScoringService.js +301 -0
- package/dist/scoring/JudgePromptBuilder.js +401 -0
- package/dist/scoring/LLMJudgeCache.js +263 -0
- package/dist/scoring/index.js +8 -0
- package/dist/training/AutomationPipeline.js +714 -0
- package/dist/training/BenchmarkService.js +370 -0
- package/dist/training/ConfigValidator.js +153 -0
- package/dist/training/MarketOutcomesTracker.js +142 -0
- package/dist/training/ModelDeployer.js +128 -0
- package/dist/training/ModelFetcher.js +48 -0
- package/dist/training/ModelSelectionService.js +248 -0
- package/dist/training/ModelUsageVerifier.js +106 -0
- package/dist/training/MultiModelOrchestrator.js +349 -0
- package/dist/training/RLModelConfig.js +295 -0
- package/dist/training/RewardBackpropagationService.js +117 -0
- package/dist/training/RulerScoringService.js +450 -0
- package/dist/training/TrainingMonitor.js +108 -0
- package/dist/training/TrajectoryRecorder.js +281 -0
- package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
- package/dist/training/index.js +30 -0
- package/dist/training/logRLConfig.js +29 -0
- package/dist/training/pipeline.js +80 -0
- package/dist/training/storage/ModelStorageService.js +190 -0
- package/dist/training/storage/TrainingDataArchiver.js +136 -0
- package/dist/training/storage/index.js +7 -0
- package/dist/training/types.js +6 -0
- package/dist/training/window-utils.js +100 -0
- package/dist/utils/index.js +73 -0
- package/dist/utils/logger.js +55 -0
- package/dist/utils/snowflake.js +15 -0
- package/dist/utils/synthetic-detector.js +67 -0
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773742857616.json +38 -0
- package/research-output/training-runs/training-run-1773742946977.json +38 -0
- package/research-output/training-runs/training-run-1773743278891.json +38 -0
- package/research-output/training-runs/training-run-1773743409754.json +38 -0
- package/research-output/training-runs/training-run-1773743651086.json +38 -0
- package/research-output/training-runs/training-run-1773743782883.json +38 -0
|
@@ -0,0 +1,522 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simulation Engine
|
|
3
|
+
*
|
|
4
|
+
* Replays pre-generated benchmark data to provide a deterministic game environment.
|
|
5
|
+
* Agents can query game state and make actions, but the underlying game progression
|
|
6
|
+
* is fixed based on the benchmark snapshot.
|
|
7
|
+
*
|
|
8
|
+
* Key features:
|
|
9
|
+
* - Tick-by-tick replay of pre-recorded game data
|
|
10
|
+
* - Fast-forward mode (ticks advance on agent response)
|
|
11
|
+
* - A2A-compatible interface
|
|
12
|
+
* - Tracks agent actions for performance evaluation
|
|
13
|
+
*/
|
|
14
|
+
import { logger } from "../utils/logger";
|
|
15
|
+
import { MetricsValidator } from "./MetricsValidator";
|
|
16
|
+
export class SimulationEngine {
    /** Number of ticks ahead at which the ground-truth price is sampled to grade a perp entry. */
    static #PERP_LOOKAHEAD_TICKS = 10;
    /** An optimal action counts as matched when the agent acted within this many ticks of it. */
    static #OPTIMAL_ACTION_WINDOW = 2;
    /** Engine configuration; this class reads `agentId` and `snapshot` from it. */
    config;
    /** Index of the next tick to apply; equals the number of ticks processed so far. */
    currentTick = 0;
    /** Every successfully executed agent action, in call order. */
    actions = [];
    /** Wall-clock start in ms since epoch; 0 means "not started yet". */
    startTime = 0;
    /** One `{ tick, pnl }` sample per processed tick. */
    pnlHistory = [];
    /** Agent positions tracked for metrics calculation */
    predictionPositions = new Map();
    perpPositions = new Map();
    socialStats = {
        postsCreated: 0,
        groupsJoined: 0,
        messagesReceived: 0,
    };
    /** Monotonic counter that keeps generated ids unique within a single millisecond. */
    #idSequence = 0;
    /**
     * @param {object} config - Must expose `agentId` and a `snapshot` with `id`,
     *   `initialState`, `ticks` and `groundTruth`.
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Run the complete simulation.
     *
     * Calculates final metrics after the simulation has been run.
     * In fast-forward mode, the actual simulation must be driven externally by:
     * 1. External runner calling agent logic
     * 2. Agent making A2A calls which trigger actions
     * 3. External runner calling advanceTick() to move forward
     *
     * This method then computes the final results.
     *
     * @returns {Promise<object>} Result record with ids, timing, actions, metrics,
     *   trajectory and PnL history.
     */
    async run() {
        if (this.startTime === 0) {
            this.startTime = Date.now();
        }
        const endTime = Date.now();
        const metrics = this.calculateMetrics();
        const validation = MetricsValidator.validate(metrics, this.actions, this.config.snapshot.groundTruth);
        if (!validation.valid) {
            logger.error("Metrics validation failed", {
                errors: validation.errors,
                warnings: validation.warnings,
            });
        }
        if (validation.warnings.length > 0) {
            logger.warn("Metrics validation warnings", {
                warnings: validation.warnings,
            });
        }
        const trajectory = this.buildTrajectory();
        logger.info("Simulation completed", {
            duration: endTime - this.startTime,
            ticksProcessed: this.currentTick,
            totalPnl: metrics.totalPnl,
            actionsCount: this.actions.length,
            metricsValid: validation.valid,
        });
        return {
            id: `sim-${Date.now()}`,
            agentId: this.config.agentId,
            benchmarkId: this.config.snapshot.id,
            startTime: this.startTime,
            endTime,
            ticksProcessed: this.currentTick,
            actions: this.actions,
            metrics,
            trajectory,
            pnlHistory: this.pnlHistory,
        };
    }
    /**
     * Initialize simulation.
     *
     * Resets every piece of per-run state, so re-initializing an engine does not
     * carry actions, positions or social counters over from an earlier run.
     */
    initialize() {
        this.startTime = Date.now();
        this.currentTick = 0;
        this.pnlHistory = [];
        // Fresh array (not `length = 0`) so a result returned by an earlier run()
        // keeps its own action log.
        this.actions = [];
        this.predictionPositions.clear();
        this.perpPositions.clear();
        this.socialStats = {
            postsCreated: 0,
            groupsJoined: 0,
            messagesReceived: 0,
        };
        logger.info("Simulation initialized", {
            benchmarkId: this.config.snapshot.id,
            agentId: this.config.agentId,
            totalTicks: this.config.snapshot.ticks.length,
        });
    }
    /**
     * Check if simulation is complete
     * Returns true if we've processed all ticks
     */
    isComplete() {
        return this.currentTick >= this.config.snapshot.ticks.length;
    }
    /**
     * Get total number of ticks in this simulation
     */
    getTotalTicks() {
        return this.config.snapshot.ticks.length;
    }
    /**
     * Get current tick number
     */
    getCurrentTickNumber() {
        return this.currentTick;
    }
    /**
     * Apply tick updates (called after agent actions)
     * Updates position values based on current market prices
     */
    applyTickUpdates() {
        const tick = this.config.snapshot.ticks[this.currentTick];
        if (tick) {
            this.updatePositionValues(tick);
        }
    }
    /**
     * Get current game state (called by agent via A2A)
     *
     * Before the first tick has been processed this is the snapshot's initial
     * state; afterwards it is the state of the most recently processed tick.
     */
    getGameState() {
        if (this.currentTick === 0) {
            return this.config.snapshot.initialState;
        }
        const tick = this.config.snapshot.ticks[this.currentTick - 1];
        return tick ? tick.state : this.config.snapshot.initialState;
    }
    /**
     * Get specific historical tick (for backtesting/analysis)
     *
     * @param {number} tickNumber - Index into the snapshot's tick list.
     * @returns {object|null} That tick's state, or null when no such tick exists.
     */
    getTickState(tickNumber) {
        const tick = this.config.snapshot.ticks[tickNumber];
        return tick ? tick.state : null;
    }
    /**
     * Agent performs an action
     *
     * Dispatches to the matching handler, attaches correctness metadata for
     * prediction and perp entries, and appends the action to the log. Handler
     * errors are not thrown to the caller; they are reported in the return value
     * and the failed action is not logged.
     *
     * @param {string} type - One of "buy_prediction", "open_perp", "close_perp",
     *   "join_group", "create_post".
     * @param {object} data - Handler-specific payload.
     * @returns {Promise<{success: boolean, result?: object, error?: string}>}
     */
    async performAction(type, data) {
        const actionStart = Date.now();
        let result;
        let correctness;
        try {
            switch (type) {
                case "buy_prediction":
                    result = this.handleBuyPrediction(data);
                    correctness = this.#gradePrediction(data);
                    break;
                case "open_perp":
                    result = this.handleOpenPerp(data);
                    correctness = this.#gradePerp(data);
                    break;
                case "close_perp":
                    result = this.handleClosePerp(data);
                    break;
                case "join_group":
                    result = this.handleJoinGroup(data);
                    break;
                case "create_post":
                    result = this.handleCreatePost(data);
                    break;
                default:
                    return { success: false, error: `Unknown action type: ${type}` };
            }
            // Record action with correctness metadata
            this.actions.push({
                tick: this.currentTick,
                timestamp: Date.now(),
                type,
                data,
                duration: Date.now() - actionStart,
                correctness,
            });
            return { success: true, result };
        }
        catch (error) {
            return {
                success: false,
                error: error instanceof Error ? error.message : String(error),
            };
        }
    }
    /**
     * Advance to next tick (called after agent finishes processing)
     *
     * Does nothing once every tick has been processed. Advancing further would
     * make getGameState() fall back to the initial state and would record PnL
     * samples for ticks that do not exist.
     */
    advanceTick() {
        if (this.isComplete()) {
            return;
        }
        // Apply any price-based updates for current tick
        this.applyTickUpdates();
        // Record PnL history
        const currentPnl = this.calculateCurrentTotalPnl();
        this.pnlHistory.push({
            tick: this.currentTick,
            pnl: currentPnl,
        });
        // Move to next tick
        this.currentTick++;
    }
    /**
     * Calculate current PnL (realized + unrealized)
     *
     * Only perp positions contribute here.
     */
    calculateCurrentTotalPnl() {
        let totalPnl = 0;
        for (const pos of this.perpPositions.values()) {
            totalPnl += pos.realizedPnl || 0;
            totalPnl += pos.unrealizedPnl || 0;
        }
        // Note: Prediction market PnL is only realized on resolution in this simple engine
        // We could add mark-to-market valuation here if needed
        return totalPnl;
    }
    /**
     * Stop simulation early
     */
    stop() {
        // Stop simulation (currently not actively used but kept for API compatibility)
    }
    /**
     * Handle buying prediction market shares
     *
     * @param {{marketId: string, outcome: "YES"|"NO", amount: number}} data
     * @returns {{positionId: string, shares: number}}
     * @throws {Error} When the market is unknown, or outcome/amount/price is not usable.
     */
    handleBuyPrediction(data) {
        const { marketId, outcome, amount } = data;
        const state = this.getGameState();
        const market = state.predictionMarkets.find((m) => m.id === marketId);
        if (!market) {
            throw new Error(`Market ${marketId} not found`);
        }
        // Anything other than YES/NO would be priced as NO here but can never be
        // scored as correct in calculateMetrics(), so reject it up front.
        if (outcome !== "YES" && outcome !== "NO") {
            throw new Error(`Invalid outcome: ${outcome}`);
        }
        if (!Number.isFinite(amount) || amount <= 0) {
            throw new Error(`Invalid amount: ${amount}`);
        }
        // Calculate shares based on current price
        const price = outcome === "YES" ? market.yesPrice : market.noPrice;
        if (!Number.isFinite(price) || price <= 0) {
            // Guards the division below against Infinity/NaN share counts.
            throw new Error(`Market ${marketId} has no valid ${outcome} price`);
        }
        const shares = amount / price;
        // Record position
        const positionId = this.#nextId("pos");
        this.predictionPositions.set(positionId, {
            marketId,
            outcome,
            shares,
            entryPrice: price,
            amount,
            openedAt: this.currentTick,
        });
        return { positionId, shares };
    }
    /**
     * Handle opening perpetual position
     *
     * @param {{ticker: string, side: "LONG"|"SHORT", size: number, leverage: number}} data
     * @returns {{positionId: string}}
     * @throws {Error} When the market is unknown or side/size/leverage is not usable.
     */
    handleOpenPerp(data) {
        const { ticker, side, size, leverage } = data;
        const state = this.getGameState();
        const market = state.perpetualMarkets.find((m) => m.ticker === ticker);
        if (!market) {
            throw new Error(`Market ${ticker} not found`);
        }
        // PnL treats every non-"LONG" side as short while correctness grading
        // treats every non-"SHORT" side as long; only the two real values agree.
        if (side !== "LONG" && side !== "SHORT") {
            throw new Error(`Invalid side: ${side}`);
        }
        if (!Number.isFinite(size) || size <= 0) {
            throw new Error(`Invalid size: ${size}`);
        }
        if (!Number.isFinite(leverage) || leverage <= 0) {
            throw new Error(`Invalid leverage: ${leverage}`);
        }
        const positionId = this.#nextId("perp");
        this.perpPositions.set(positionId, {
            ticker,
            side,
            size,
            leverage,
            entryPrice: market.price,
            openedAt: this.currentTick,
            unrealizedPnl: 0,
        });
        return { positionId };
    }
    /**
     * Handle closing perpetual position
     *
     * @param {{positionId: string}} data
     * @returns {{pnl: number}} Realized PnL at the current market price.
     * @throws {Error} When the position or its market is unknown, or the position
     *   was already closed.
     */
    handleClosePerp(data) {
        const { positionId } = data;
        const position = this.perpPositions.get(positionId);
        if (!position) {
            throw new Error(`Position ${positionId} not found`);
        }
        // Closing twice would overwrite the realized PnL with a later price.
        if (position.closedAt !== undefined) {
            throw new Error(`Position ${positionId} already closed`);
        }
        const state = this.getGameState();
        const market = state.perpetualMarkets.find((m) => m.ticker === position.ticker);
        if (!market) {
            throw new Error(`Market ${position.ticker} not found`);
        }
        // Calculate realized P&L
        const pnl = SimulationEngine.#perpPnl(position, market.price);
        position.realizedPnl = pnl;
        position.unrealizedPnl = 0;
        position.closedAt = this.currentTick;
        return { pnl };
    }
    /**
     * Handle joining group chat
     */
    handleJoinGroup(_data) {
        this.socialStats.groupsJoined++;
        return { success: true };
    }
    /**
     * Handle creating post
     */
    handleCreatePost(_data) {
        this.socialStats.postsCreated++;
        return { postId: this.#nextId("post") };
    }
    /**
     * Update position values based on current prices
     *
     * @param {object} tick - Snapshot tick whose `state.perpetualMarkets` supplies prices.
     */
    updatePositionValues(tick) {
        // Update perp positions with unrealized P&L
        for (const position of this.perpPositions.values()) {
            // `closedAt` is a tick number and may legitimately be 0, so compare
            // against undefined rather than relying on truthiness.
            if (position.closedAt !== undefined) {
                continue; // Skip closed positions
            }
            const market = tick.state.perpetualMarkets.find((m) => m.ticker === position.ticker);
            if (!market) {
                continue;
            }
            position.unrealizedPnl = SimulationEngine.#perpPnl(position, market.price);
        }
    }
    /**
     * Calculate comprehensive metrics
     *
     * @returns {object} `totalPnl`, `predictionMetrics`, `perpMetrics`,
     *   `socialMetrics`, `timing` and `optimalityScore`.
     */
    calculateMetrics() {
        // Prediction markets
        let correctPredictions = 0;
        let incorrectPredictions = 0;
        let predictionPnl = 0;
        for (const position of this.predictionPositions.values()) {
            if (this.#isPredictionCorrect(position)) {
                correctPredictions++;
                predictionPnl += position.amount; // Simplified: win amount = stake
            }
            else {
                incorrectPredictions++;
                predictionPnl -= position.amount;
            }
        }
        // Perpetual trades
        let profitableTrades = 0;
        let perpPnl = 0;
        let maxDrawdown = 0;
        let runningPnl = 0;
        let peak = 0;
        for (const position of this.perpPositions.values()) {
            const pnl = position.realizedPnl ?? position.unrealizedPnl;
            perpPnl += pnl;
            if (pnl > 0) {
                profitableTrades++;
            }
            // Track drawdown (positions are visited in the order they were opened)
            runningPnl += pnl;
            if (runningPnl > peak) {
                peak = runningPnl;
            }
            const drawdown = peak - runningPnl;
            if (drawdown > maxDrawdown) {
                maxDrawdown = drawdown;
            }
        }
        const totalPnl = predictionPnl + perpPnl;
        // Timing metrics
        const responseTimes = this.actions.map((a) => a.duration);
        const hasActions = responseTimes.length > 0;
        const avgResponseTime = hasActions
            ? responseTimes.reduce((sum, t) => sum + t, 0) / responseTimes.length
            : 0;
        // reduce instead of Math.max(...arr): spreading a very long action log
        // into call arguments can exceed the engine's argument limit.
        const maxResponseTime = hasActions
            ? responseTimes.reduce((max, t) => Math.max(max, t), -Infinity)
            : 0;
        // Calculate optimality score (how well did agent follow optimal actions)
        const optimalityScore = this.calculateOptimalityScore();
        const predictionCount = this.predictionPositions.size;
        const perpCount = this.perpPositions.size;
        return {
            totalPnl,
            predictionMetrics: {
                totalPositions: predictionCount,
                correctPredictions,
                incorrectPredictions,
                accuracy: predictionCount > 0 ? correctPredictions / predictionCount : 0,
                avgPnlPerPosition: predictionCount > 0 ? predictionPnl / predictionCount : 0,
            },
            perpMetrics: {
                totalTrades: perpCount,
                profitableTrades,
                winRate: perpCount > 0 ? profitableTrades / perpCount : 0,
                avgPnlPerTrade: perpCount > 0 ? perpPnl / perpCount : 0,
                maxDrawdown,
            },
            socialMetrics: {
                postsCreated: this.socialStats.postsCreated,
                groupsJoined: this.socialStats.groupsJoined,
                messagesReceived: this.socialStats.messagesReceived,
                reputationGained: correctPredictions * 10 - incorrectPredictions * 5,
            },
            timing: {
                avgResponseTime,
                maxResponseTime,
                totalDuration: Date.now() - this.startTime,
            },
            optimalityScore,
        };
    }
    /**
     * Calculate how close agent came to optimal play
     *
     * @returns {number} Percentage (0-100) of ground-truth optimal actions for which
     *   the agent took a same-type, same-target action within the tick window;
     *   0 when the snapshot lists no optimal actions.
     */
    calculateOptimalityScore() {
        const optimalActions = this.config.snapshot.groundTruth.optimalActions ?? [];
        if (optimalActions.length === 0) {
            return 0;
        }
        const window = SimulationEngine.#OPTIMAL_ACTION_WINDOW;
        let matchedActions = 0;
        for (const optimalAction of optimalActions) {
            // Check if agent took this action within reasonable window
            const windowStart = optimalAction.tick - window;
            const windowEnd = optimalAction.tick + window;
            const matched = this.actions.some((a) => {
                if (a.tick < windowStart || a.tick > windowEnd) {
                    return false;
                }
                // Match action type and target
                if (optimalAction.type === "buy_prediction" && a.type === "buy_prediction") {
                    return a.data.marketId === optimalAction.target;
                }
                if (optimalAction.type === "open_perp" && a.type === "open_perp") {
                    return a.data.ticker === optimalAction.target;
                }
                return false;
            });
            if (matched) {
                matchedActions++;
            }
        }
        return (matchedActions / optimalActions.length) * 100;
    }
    /**
     * Build trajectory data for RL training
     *
     * Emits exactly one state and one reward per logged action so the three
     * arrays stay index-aligned.
     *
     * @returns {{states: object[], actions: object[], rewards: number[], windowId: string}}
     */
    buildTrajectory() {
        const { ticks, initialState } = this.config.snapshot;
        const states = [];
        const rewards = [];
        for (const action of this.actions) {
            // Actions taken after the final tick have no tick of their own; fall
            // back to the last tick (or the initial state for an empty snapshot)
            // instead of dropping the entry and misaligning the arrays.
            const tick = ticks[action.tick] ?? ticks[ticks.length - 1];
            states.push(tick ? tick.state : initialState);
            rewards.push(this.#rewardFor(action));
        }
        return {
            states,
            actions: this.actions,
            rewards,
            windowId: `benchmark-${this.config.snapshot.id}`,
        };
    }
    /** Builds an id that stays unique even when several are created in the same millisecond. */
    #nextId(prefix) {
        this.#idSequence += 1;
        return `${prefix}-${Date.now()}-${this.#idSequence}`;
    }
    /** Signed PnL of a perp position marked at `price`; any side other than "LONG" is treated as short. */
    static #perpPnl(position, price) {
        const priceChange = price - position.entryPrice;
        return position.side === "LONG"
            ? priceChange * position.size * position.leverage
            : -priceChange * position.size * position.leverage;
    }
    /**
     * Whether a prediction position matches the ground-truth market outcome.
     * NOTE(review): a market with no recorded outcome is scored as "NO wins",
     * whereas performAction() leaves such trades ungraded — confirm which is intended.
     */
    #isPredictionCorrect(position) {
        const marketOutcome = this.config.snapshot.groundTruth.marketOutcomes[position.marketId];
        return ((position.outcome === "YES" && marketOutcome) ||
            (position.outcome === "NO" && !marketOutcome));
    }
    /** Correctness metadata for a prediction purchase; undefined when the market has no recorded outcome. */
    #gradePrediction({ marketId, outcome }) {
        const marketOutcome = this.config.snapshot.groundTruth.marketOutcomes[marketId];
        if (marketOutcome === undefined) {
            return undefined;
        }
        const predictedOutcome = outcome === "YES";
        return {
            predictionCorrect: predictedOutcome === marketOutcome,
            actualOutcome: marketOutcome,
            predictedOutcome,
        };
    }
    /** Correctness metadata for a perp entry, from the ground-truth price a few ticks ahead; undefined when unavailable. */
    #gradePerp({ ticker, side }) {
        const market = this.getGameState().perpetualMarkets.find((m) => m.ticker === ticker);
        if (!market) {
            return undefined;
        }
        const priceHistory = this.config.snapshot.groundTruth.priceHistory[ticker];
        // Look ahead a fixed number of ticks, clamped to the last recorded price.
        const futurePrice = priceHistory?.[Math.min(this.currentTick + SimulationEngine.#PERP_LOOKAHEAD_TICKS, priceHistory.length - 1)]?.price;
        if (futurePrice === undefined) {
            return undefined;
        }
        const priceChange = (futurePrice - market.price) / market.price;
        const sentimentAtTrade = priceChange > 0 ? 0.5 : -0.5; // Simplified sentiment
        // Negative sentiment expects a short, positive sentiment expects a long.
        const expectedDirection = sentimentAtTrade < 0 ? "down" : "up";
        const tradeDirection = side === "SHORT" ? "down" : "up";
        return {
            perpCorrect: expectedDirection === tradeDirection,
            sentimentAtTrade,
            priceChange,
            expectedDirection,
        };
    }
    /** Reward for one logged action: +1/-1 for a right/wrong prediction purchase, 0 for everything else. */
    #rewardFor(action) {
        if (action.type !== "buy_prediction") {
            return 0;
        }
        // Match on market and outcome as well as tick: several purchases can share
        // a tick, and positions on the same market/outcome all score the same.
        for (const position of this.predictionPositions.values()) {
            if (position.openedAt === action.tick &&
                position.marketId === action.data.marketId &&
                position.outcome === action.data.outcome) {
                return this.#isPredictionCorrect(position) ? 1.0 : -1.0;
            }
        }
        return 0;
    }
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import { getAgentRuntimeManager, getAgentService, getTaskInteractor, } from "../dependencies";
|
|
2
|
+
import { logger } from "../utils/logger";
|
|
3
|
+
export class TaskRunner {
    /** Benchmark settings; this class reads `agentName`, `iterations` and `taskPrompt`. */
    config;
    /**
     * @param {object} config - Benchmark settings kept for use by run().
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Creates a temporary agent, then executes the configured task prompt against
     * its runtime once per iteration, one after another.
     *
     * A failing iteration does not abort the benchmark: it is logged and recorded
     * as an unsuccessful entry with an empty response.
     *
     * @returns {Promise<object[]>} One record per iteration with `iteration`,
     *   `success`, `response`, `error`, `duration` and, for iterations that
     *   did not throw, `trajectoryId`.
     * @throws {Error} When no runtime can be obtained for the created agent.
     */
    async run() {
        const scope = "TaskRunner";
        logger.info("Starting Task Benchmark", { config: this.config }, scope);
        const agents = getAgentService();
        const runtimes = getAgentRuntimeManager();
        const interactor = getTaskInteractor();
        // Step 1: a throwaway agent gives every benchmark run a clean state.
        const agentSpec = {
            userId: "task-benchmark-manager", // Dummy manager ID
            name: this.config.agentName,
            system: "You are a helpful assistant.", // Base system prompt
            bio: ["Helpful", "Smart"],
            modelTier: "standard", // or whatever maps to config.model internally
        };
        const agent = await agents.createAgent(agentSpec);
        const runtime = await runtimes.getRuntime(agent.id);
        if (!runtime) {
            throw new Error(`Failed to get runtime for agent ${agent.id}`);
        }
        // Step 2: run the iterations sequentially, timing each one.
        const outcomes = [];
        for (let index = 0; index < this.config.iterations; index++) {
            const iteration = index + 1;
            logger.info(`Running iteration ${iteration}/${this.config.iterations}...`, {}, scope);
            const startedAt = Date.now();
            let record;
            try {
                const taskResult = await interactor.executeTask(runtime, this.config.taskPrompt);
                record = {
                    iteration,
                    success: taskResult.success,
                    response: taskResult.response,
                    trajectoryId: taskResult.trajectoryId,
                    error: taskResult.error,
                    duration: Date.now() - startedAt,
                };
            }
            catch (failure) {
                logger.error("Iteration failed", { error: failure }, scope);
                const message = failure instanceof Error ? failure.message : String(failure);
                record = {
                    iteration,
                    success: false,
                    response: "",
                    error: message,
                    duration: Date.now() - startedAt,
                };
            }
            outcomes.push(record);
        }
        // Step 3: no cleanup happens here; the temporary agent is left in place.
        // (Per the original author's note, the dependencies expose `resetRuntime`
        // but no agent-deletion method.)
        return outcomes;
    }
}
|