@elizaos/training 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. package/Dockerfile +75 -0
  2. package/Makefile +374 -0
  3. package/README.md +346 -0
  4. package/config/rubrics.json +137 -0
  5. package/data/.gitkeep +0 -0
  6. package/data/degen/.gitkeep +2 -0
  7. package/data/trader/.gitkeep +2 -0
  8. package/docker-compose.test.yml +57 -0
  9. package/package.json +58 -0
  10. package/python/config/babylon_atropos.yaml +90 -0
  11. package/python/config/profiles/12gb.json +11 -0
  12. package/python/config/profiles/16gb.json +10 -0
  13. package/python/config/profiles/24gb.json +10 -0
  14. package/python/config/profiles/48gb.json +10 -0
  15. package/python/config/profiles/cpu.json +11 -0
  16. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  17. package/python/config/profiles/l40-2gpu.json +22 -0
  18. package/python/config/profiles/l40-4gpu.json +21 -0
  19. package/python/config/profiles/l40.json +17 -0
  20. package/python/config/tinker_training.yaml +143 -0
  21. package/python/curriculum_state.json +165 -0
  22. package/python/env.template +86 -0
  23. package/python/env.training.template +46 -0
  24. package/python/pyproject.toml +41 -0
  25. package/python/requirements-ci.txt +31 -0
  26. package/python/requirements.txt +87 -0
  27. package/python/scripts/__init__.py +4 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/run_ab_test.py +143 -0
  36. package/python/scripts/run_full_pipeline.py +544 -0
  37. package/python/scripts/run_tinker_training.py +192 -0
  38. package/python/scripts/run_training.py +914 -0
  39. package/python/scripts/test_judge.py +155 -0
  40. package/python/scripts/test_pipeline.py +356 -0
  41. package/python/scripts/test_trained_model.py +380 -0
  42. package/python/scripts/train_local.py +528 -0
  43. package/python/setup.py +20 -0
  44. package/python/src/__init__.py +190 -0
  45. package/python/src/data_bridge/__init__.py +24 -0
  46. package/python/src/data_bridge/converter.py +435 -0
  47. package/python/src/data_bridge/reader.py +393 -0
  48. package/python/src/models.py +283 -0
  49. package/python/src/training/__init__.py +605 -0
  50. package/python/src/training/ab_testing.py +404 -0
  51. package/python/src/training/action_executor.py +621 -0
  52. package/python/src/training/archetype_trainer.py +347 -0
  53. package/python/src/training/atropos_trainer.py +980 -0
  54. package/python/src/training/babylon_env.py +1254 -0
  55. package/python/src/training/error_recovery.py +647 -0
  56. package/python/src/training/evaluation.py +856 -0
  57. package/python/src/training/fast_simulator.py +880 -0
  58. package/python/src/training/format_validator.py +584 -0
  59. package/python/src/training/hybrid_env.py +522 -0
  60. package/python/src/training/kl_controller.py +628 -0
  61. package/python/src/training/multi_prompt_dataset.py +883 -0
  62. package/python/src/training/multi_turn.py +656 -0
  63. package/python/src/training/online_env.py +1084 -0
  64. package/python/src/training/quality_scorer.py +391 -0
  65. package/python/src/training/quality_utils.py +633 -0
  66. package/python/src/training/rewards.py +1344 -0
  67. package/python/src/training/rlaif_env.py +17 -0
  68. package/python/src/training/rollout_generator.py +502 -0
  69. package/python/src/training/rubric_loader.py +198 -0
  70. package/python/src/training/scenario_pool.py +1072 -0
  71. package/python/src/training/schemas.py +481 -0
  72. package/python/src/training/service_manager.py +552 -0
  73. package/python/src/training/simulation_bridge.py +535 -0
  74. package/python/src/training/tick_reward_attribution.py +399 -0
  75. package/python/src/training/tinker_client.py +575 -0
  76. package/python/src/training/tinker_trainer.py +646 -0
  77. package/python/src/training/tokenization_utils.py +402 -0
  78. package/python/tests/e2e/__init__.py +13 -0
  79. package/python/tests/e2e/conftest.py +258 -0
  80. package/python/tests/e2e/test_full_pipeline.py +643 -0
  81. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  82. package/python/tests/integration/__init__.py +12 -0
  83. package/python/tests/integration/conftest.py +383 -0
  84. package/python/tests/integration/test_db_integration.py +649 -0
  85. package/python/tests/integration/test_json_mode_integration.py +554 -0
  86. package/python/tests/test_action_executor.py +594 -0
  87. package/python/tests/test_archetype_scoring.py +1027 -0
  88. package/python/tests/test_atropos_integration.py +360 -0
  89. package/python/tests/test_evaluation.py +727 -0
  90. package/python/tests/test_format_validator.py +486 -0
  91. package/python/tests/test_kl_controller.py +432 -0
  92. package/python/tests/test_lr_scheduler.py +579 -0
  93. package/python/tests/test_multi_turn.py +590 -0
  94. package/python/tests/test_online_env.py +519 -0
  95. package/python/tests/test_quality_scorer.py +474 -0
  96. package/python/tests/test_scenario_pool.py +735 -0
  97. package/python/tests/test_service_manager.py +585 -0
  98. package/python/tests/test_simulation_rollout.py +581 -0
  99. package/python/tests/test_tokenization_utils.py +501 -0
  100. package/python/tests/test_training_orchestrator.py +497 -0
  101. package/python/tests/test_training_output_structure.py +661 -0
  102. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  103. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  104. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  105. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  106. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  107. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  108. package/scripts/assess-training-data.ts +422 -0
  109. package/scripts/e2e-training-test.ts +550 -0
  110. package/scripts/export-rubrics.ts +64 -0
  111. package/scripts/generate-research-report.ts +1523 -0
  112. package/scripts/generate_dataset.sh +173 -0
  113. package/scripts/json-mode-benchmark.ts +399 -0
  114. package/scripts/real-archetype-benchmark.ts +210 -0
  115. package/scripts/run-baseline-comparison.ts +116 -0
  116. package/scripts/run-full-pipeline.ts +272 -0
  117. package/scripts/runpod_setup.sh +137 -0
  118. package/scripts/runpod_validate.sh +147 -0
  119. package/scripts/test-model-in-game.ts +955 -0
  120. package/scripts/test-scoring.ts +73 -0
  121. package/scripts/test-trained-model.ts +209 -0
  122. package/scripts/train-and-test.ts +824 -0
  123. package/scripts/verify-final.ts +118 -0
  124. package/src/adapter.ts +516 -0
  125. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  126. package/src/archetypes/derive-archetype.ts +249 -0
  127. package/src/archetypes/index.ts +22 -0
  128. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  129. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  130. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  131. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  132. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  133. package/src/benchmark/BenchmarkRunner.ts +685 -0
  134. package/src/benchmark/BenchmarkValidator.ts +206 -0
  135. package/src/benchmark/FastEvalRunner.ts +225 -0
  136. package/src/benchmark/MetricsValidator.ts +165 -0
  137. package/src/benchmark/MetricsVisualizer.ts +909 -0
  138. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  139. package/src/benchmark/ModelRegistry.ts +158 -0
  140. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  141. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  142. package/src/benchmark/SimulationEngine.ts +832 -0
  143. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  144. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  145. package/src/benchmark/index.ts +89 -0
  146. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  147. package/src/benchmark/simulation-types.ts +78 -0
  148. package/src/dependencies.ts +439 -0
  149. package/src/generation/TrajectoryGenerator.ts +387 -0
  150. package/src/generation/index.ts +12 -0
  151. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  152. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  153. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  154. package/src/huggingface/index.ts +27 -0
  155. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  156. package/src/index.ts +102 -0
  157. package/src/init-training.ts +53 -0
  158. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  159. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  160. package/src/metrics/index.ts +8 -0
  161. package/src/metrics/types.ts +200 -0
  162. package/src/rubrics/__tests__/index.test.ts +184 -0
  163. package/src/rubrics/ass-kisser.ts +85 -0
  164. package/src/rubrics/degen.ts +80 -0
  165. package/src/rubrics/goody-twoshoes.ts +84 -0
  166. package/src/rubrics/index.ts +236 -0
  167. package/src/rubrics/information-trader.ts +84 -0
  168. package/src/rubrics/infosec.ts +101 -0
  169. package/src/rubrics/liar.ts +104 -0
  170. package/src/rubrics/perps-trader.ts +87 -0
  171. package/src/rubrics/researcher.ts +81 -0
  172. package/src/rubrics/scammer.ts +82 -0
  173. package/src/rubrics/social-butterfly.ts +73 -0
  174. package/src/rubrics/super-predictor.ts +97 -0
  175. package/src/rubrics/trader.ts +67 -0
  176. package/src/scoring/ArchetypeScoringService.ts +486 -0
  177. package/src/scoring/JudgePromptBuilder.ts +556 -0
  178. package/src/scoring/LLMJudgeCache.ts +401 -0
  179. package/src/scoring/index.ts +9 -0
  180. package/src/training/AutomationPipeline.ts +916 -0
  181. package/src/training/BenchmarkService.ts +518 -0
  182. package/src/training/ConfigValidator.ts +220 -0
  183. package/src/training/MarketOutcomesTracker.ts +187 -0
  184. package/src/training/ModelDeployer.ts +186 -0
  185. package/src/training/ModelFetcher.ts +76 -0
  186. package/src/training/ModelSelectionService.ts +341 -0
  187. package/src/training/ModelUsageVerifier.ts +160 -0
  188. package/src/training/MultiModelOrchestrator.ts +580 -0
  189. package/src/training/RLModelConfig.ts +407 -0
  190. package/src/training/RewardBackpropagationService.ts +149 -0
  191. package/src/training/RulerScoringService.ts +666 -0
  192. package/src/training/TrainingMonitor.ts +166 -0
  193. package/src/training/TrajectoryRecorder.ts +399 -0
  194. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  195. package/src/training/index.ts +100 -0
  196. package/src/training/logRLConfig.ts +34 -0
  197. package/src/training/pipeline.ts +129 -0
  198. package/src/training/storage/ModelStorageService.ts +279 -0
  199. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  200. package/src/training/storage/index.ts +17 -0
  201. package/src/training/types.ts +207 -0
  202. package/src/training/window-utils.ts +138 -0
  203. package/src/utils/index.ts +101 -0
  204. package/src/utils/logger.ts +59 -0
  205. package/src/utils/snowflake.ts +17 -0
  206. package/src/utils/synthetic-detector.ts +111 -0
  207. package/tsconfig.json +20 -0
@@ -0,0 +1,825 @@
1
+ /**
2
+ * Archetype Matchup Benchmark
3
+ *
4
+ * Simulates multiple archetypes competing against each other to understand:
5
+ * - Which archetypes perform best in different market conditions
6
+ * - How archetypes interact (trader vs scammer, social-butterfly vs contrarian, etc.)
7
+ * - Relative strengths and weaknesses
8
+ *
9
+ * Uses the multi-model orchestrator to efficiently run multiple archetype models.
10
+ */
11
+
12
+ import {
13
+ type ArchetypeConfig,
14
+ ArchetypeConfigService,
15
+ } from '../archetypes/ArchetypeConfigService';
16
+ import {
17
+ createMultiModelOrchestrator,
18
+ type MultiModelOrchestrator,
19
+ } from '../training/MultiModelOrchestrator';
20
+ import { logger } from '../utils/logger';
21
+ import {
22
+ type BenchmarkConfig,
23
+ BenchmarkDataGenerator,
24
+ type BenchmarkGameSnapshot,
25
+ type Tick,
26
+ } from './BenchmarkDataGenerator';
27
+
28
/**
 * One participant in a matchup simulation.
 */
export interface MatchupAgent {
  /** Identifier of this agent within the matchup. */
  id: string;
  /** Name of the archetype this agent plays. */
  archetype: string;
  /** Archetype configuration attached to this agent. */
  config: ArchetypeConfig;
}
36
+
37
/**
 * Outcome recorded for one agent after a single matchup round.
 */
export interface MatchupAgentResult {
  /** Identifier of the agent this result belongs to. */
  agentId: string;
  /** Archetype the agent played. */
  archetype: string;
  /** Profit and loss accumulated over the round. */
  pnl: number;
  /** Trading-side statistics. */
  tradingMetrics: {
    totalTrades: number;
    winRate: number;
    avgPnlPerTrade: number;
  };
  /** Social-side statistics. */
  socialMetrics: {
    postsCreated: number;
    engagementReceived: number;
    reputationGained: number;
  };
  /** Total number of actions taken. */
  actions: number;
  /** Position within the matchup, counted from 1. */
  rank: number;
}
57
+
58
/**
 * Pairwise record of how two archetypes fared against each other.
 */
export interface ArchetypeVsResult {
  /** First archetype of the pair. */
  archetype1: string;
  /** Second archetype of the pair. */
  archetype2: string;
  /** Number of comparisons won by the first archetype. */
  archetype1Wins: number;
  /** Number of comparisons won by the second archetype. */
  archetype2Wins: number;
  /** Number of comparisons with no winner. */
  ties: number;
  /** Average winning margin of the first archetype. */
  archetype1AvgMargin: number;
  /** Average winning margin of the second archetype. */
  archetype2AvgMargin: number;
  /** Share of comparisons won by the first archetype. */
  winRate1: number;
  /** Share of comparisons won by the second archetype. */
  winRate2: number;
}
72
+
73
/**
 * Full result of one matchup benchmark.
 */
export interface MatchupBenchmarkResult {
  /** Identifier of this benchmark. */
  benchmarkId: string;
  /** Timestamp associated with the result. */
  timestamp: number;
  /** Duration of the benchmark. */
  duration: number;

  /** Results of every participating agent. */
  agents: MatchupAgentResult[];

  /** Per-archetype standings aggregated across all matchups. */
  archetypeRankings: {
    archetype: string;
    avgRank: number;
    avgPnl: number;
    totalWins: number;
    totalLosses: number;
    winRate: number;
  }[];

  /** Pairwise archetype comparisons. */
  headToHead: ArchetypeVsResult[];

  /** Market condition the benchmark was run under. */
  marketCondition: 'bull' | 'bear' | 'volatile' | 'stable';

  /** Human-readable observations derived from the matchup. */
  insights: string[];
}
103
+
104
/**
 * Settings that control a matchup benchmark.
 */
export interface MatchupBenchmarkConfig {
  /** Archetype names to include, or `'all'` to include every archetype. */
  archetypes: string[] | 'all';

  /** How many agents to create for each archetype. */
  agentsPerArchetype: number;

  /** How many simulation rounds to run. */
  rounds: number;

  /** How many ticks make up one round. */
  ticksPerRound: number;

  /** Market conditions to test. */
  marketConditions: ('bull' | 'bear' | 'volatile' | 'stable')[];

  /** VRAM available for model loading, in GB. */
  availableVramGb: number;
}
126
+
127
+ /**
128
+ * Runs multi-archetype benchmark simulations
129
+ */
130
+ export class ArchetypeMatchupBenchmark {
131
+ private config: MatchupBenchmarkConfig;
132
+ private orchestrator: MultiModelOrchestrator;
133
+
134
/**
 * Store the benchmark settings and create the model orchestrator.
 *
 * @param config - Benchmark settings; `availableVramGb` is passed to
 *   `createMultiModelOrchestrator`.
 */
constructor(config: MatchupBenchmarkConfig) {
  const { availableVramGb } = config;
  this.orchestrator = createMultiModelOrchestrator(availableVramGb);
  this.config = config;
}
138
+
139
/**
 * Resolve the archetype names this benchmark covers.
 *
 * @returns The configured list, or whatever
 *   `ArchetypeConfigService.getAvailableArchetypes()` returns when the
 *   configuration says `'all'`.
 */
private getArchetypes(): string[] {
  const { archetypes } = this.config;
  return archetypes === 'all'
    ? ArchetypeConfigService.getAvailableArchetypes()
    : archetypes;
}
148
+
149
/**
 * Build the roster of agents for the matchup.
 *
 * Each archetype contributes `agentsPerArchetype` agents that share one
 * configuration object and carry ids of the form `<archetype>-<n>`, with
 * `n` counted from 1.
 *
 * @returns Agents grouped by archetype, in archetype order.
 */
private createAgents(): MatchupAgent[] {
  return this.getArchetypes().flatMap((archetype) => {
    // Looked up once per archetype, even when no agents end up being created.
    const sharedConfig = ArchetypeConfigService.getConfig(archetype);
    const squad: MatchupAgent[] = [];

    let ordinal = 0;
    while (ordinal < this.config.agentsPerArchetype) {
      ordinal += 1;
      squad.push({
        id: `${archetype}-${ordinal}`,
        archetype,
        config: sharedConfig,
      });
    }

    return squad;
  });
}
170
+
171
/**
 * Produce a game snapshot for one market condition.
 *
 * The condition selects a base seed and, for `volatile`, a larger number of
 * markets. The final seed is the base plus `Date.now() % 1000`, so it varies
 * between runs, and two calls within the same millisecond get the same seed.
 *
 * @param condition - Market condition to generate data for.
 * @returns Whatever `BenchmarkDataGenerator.generate()` yields for the
 *   derived configuration.
 */
private async generateBenchmarkData(
  condition: 'bull' | 'bear' | 'volatile' | 'stable'
): Promise<BenchmarkGameSnapshot> {
  const seedByCondition: Record<string, number> = {
    bull: 1001,
    bear: 2002,
    volatile: 3003,
    stable: 4004,
  };
  const isVolatile = condition === 'volatile';

  const settings: BenchmarkConfig = {
    // Ticks are converted to minutes on the assumption of one tick per second.
    durationMinutes: Math.ceil(this.config.ticksPerRound / 60),
    tickInterval: 1,
    numPredictionMarkets: isVolatile ? 8 : 5,
    numPerpetualMarkets: isVolatile ? 5 : 3,
    numAgents: 10,
    seed: (seedByCondition[condition] || 1000) + (Date.now() % 1000),
  };

  return new BenchmarkDataGenerator(settings).generate();
}
202
+
203
/**
 * Play one round for every agent and rank the outcomes.
 *
 * When the `USE_REAL_INFERENCE` environment variable equals `'true'`, each
 * agent is run through the model orchestrator, one after another. Otherwise
 * its performance is simulated from its archetype configuration.
 *
 * @param agents - Agents taking part in the round.
 * @param snapshot - Game data the round is played against.
 * @param roundNumber - Round label used in the log message.
 * @returns Results sorted by descending PnL, with `rank` set from 1 upwards.
 */
private async simulateRound(
  agents: MatchupAgent[],
  snapshot: BenchmarkGameSnapshot,
  roundNumber: number
): Promise<MatchupAgentResult[]> {
  const distinctArchetypes = Array.from(
    new Set(agents.map((agent) => agent.archetype))
  );
  logger.info(
    `Simulating round ${roundNumber} with ${agents.length} agents`,
    { archetypes: distinctArchetypes },
    'ArchetypeMatchupBenchmark'
  );

  const realInference = process.env.USE_REAL_INFERENCE === 'true';
  const outcomes: MatchupAgentResult[] = [];

  for (const agent of agents) {
    outcomes.push(
      realInference
        ? await this.runAgentWithRealModel(agent, snapshot)
        : this.simulateAgentPerformance(agent, snapshot)
    );
  }

  // Highest PnL first, then number the positions starting at 1.
  outcomes.sort((left, right) => right.pnl - left.pnl);
  let position = 0;
  for (const outcome of outcomes) {
    position += 1;
    outcome.rank = position;
  }

  return outcomes;
}
244
+
245
/**
 * Drive one agent through a sample of the snapshot using real model inference.
 *
 * At most ten ticks are evaluated (every tenth tick of `snapshot.ticks`).
 * For each one the orchestrator is asked for a decision, which is scored
 * against `getMarketTrend`:
 * - a `trade` in the direction of the trend gains `|trend| * 100 * confidence`;
 * - any other trade, including one without a direction, loses
 *   `|trend| * 50 * confidence`;
 * - a `post` increments the post counter; other actions are ignored.
 *
 * Confidence defaults to 1 only when it is missing or not a finite number.
 * An explicit 0 is honoured, so a zero-confidence trade moves no PnL.
 *
 * @param agent - Agent whose archetype and system prompt are sent to the model.
 * @param snapshot - Game data supplying the ticks.
 * @returns The agent's result with `rank` left at 0 for the caller to assign.
 */
private async runAgentWithRealModel(
  agent: MatchupAgent,
  snapshot: BenchmarkGameSnapshot
): Promise<MatchupAgentResult> {
  let totalPnl = 0;
  let totalTrades = 0;
  let wins = 0;
  let postsCreated = 0;

  // Sample every 10th tick, capped at 10 ticks, to keep inference cost bounded.
  const sampledTicks = snapshot.ticks
    .filter((_, index) => index % 10 === 0)
    .slice(0, 10);

  for (const tick of sampledTicks) {
    const response = await this.orchestrator.inference({
      archetype: agent.archetype,
      prompt: this.buildDecisionPrompt(agent, tick),
      systemPrompt: agent.config.system,
      maxTokens: 256,
      temperature: 0.7,
    });
    const decision = this.parseAgentDecision(response.response);

    if (decision.action === 'post') {
      postsCreated++;
      continue;
    }
    if (decision.action !== 'trade') continue;

    totalTrades++;

    // `??` rather than `||`: only a missing confidence falls back to 1.
    // The finite check keeps a malformed value from turning the PnL into NaN,
    // which would make the later rank sort unreliable.
    const rawConfidence = Number(decision.confidence ?? 1);
    const confidence = Number.isFinite(rawConfidence) ? rawConfidence : 1;

    const marketTrend = this.getMarketTrend(tick);
    const isCorrectDirection =
      (decision.direction === 'long' && marketTrend > 0) ||
      (decision.direction === 'short' && marketTrend < 0);

    if (isCorrectDirection) {
      wins++;
      totalPnl += Math.abs(marketTrend) * 100 * confidence;
    } else {
      totalPnl -= Math.abs(marketTrend) * 50 * confidence;
    }
  }

  return {
    agentId: agent.id,
    archetype: agent.archetype,
    pnl: totalPnl,
    tradingMetrics: {
      totalTrades,
      winRate: totalTrades > 0 ? wins / totalTrades : 0,
      avgPnlPerTrade: totalTrades > 0 ? totalPnl / totalTrades : 0,
    },
    socialMetrics: {
      postsCreated,
      engagementReceived: postsCreated * 5,
      reputationGained: postsCreated * 10 + wins * 5,
    },
    actions: totalTrades + postsCreated,
    rank: 0, // Assigned by the caller once all agents are sorted.
  };
}
316
+
317
/**
 * Compose the prompt that asks the model for the agent's next action.
 *
 * The prompt lists the tick timestamp, the agent's balance, perpetual market
 * prices keyed by ticker, and the content of up to five of the latest posts.
 * The balance is 1000 plus the `totalPnl` of the matching entry in
 * `state.agents`, or 1000 when no entry has the agent's id.
 *
 * NOTE(review): whether the snapshot's agent ids ever match the ids created
 * for matchup agents cannot be confirmed from this file.
 *
 * @param agent - Agent the prompt is written for.
 * @param tick - Tick whose state is described.
 * @returns The prompt text.
 */
private buildDecisionPrompt(agent: MatchupAgent, tick: Tick): string {
  const { state } = tick;

  const ownEntry = state.agents.find((candidate) => candidate.id === agent.id);
  const agentBalance = 1000 + (ownEntry?.totalPnl ?? 0);

  const marketPrices = Object.fromEntries(
    state.perpetualMarkets.map((market) => [market.ticker, market.price])
  );

  // Recent posts stand in for "news".
  const recentNews = (state.posts ?? []).slice(-5).map((post) => post.content);

  return `
Current game state:
- Timestamp: ${tick.timestamp}
- Your balance: ${agentBalance}
- Market prices: ${JSON.stringify(marketPrices)}
- Recent news: ${JSON.stringify(recentNews)}

As a ${agent.archetype} agent, what action would you take?
Respond with a JSON object containing:
- action: "trade" | "post" | "observe"
- direction: "long" | "short" (if trading)
- confidence: 0.0 to 1.0
- reasoning: brief explanation
`;
}
350
+
351
/**
 * Turn a raw model response into a validated decision.
 *
 * The first `{ ... }` span in the response is parsed as JSON. Its fields are
 * validated rather than passed through, because the model is free to return
 * anything:
 * - `action` must be `trade`, `post` or `observe` (case-insensitive),
 *   otherwise it becomes `observe`;
 * - `direction` must be `long` or `short` (case-insensitive), otherwise it
 *   is left undefined;
 * - `confidence` must be a finite number (or numeric string) above 0 and is
 *   capped at 1. Anything else, including 0, becomes 0.5, which keeps the
 *   previous treatment of 0 as "unspecified".
 *
 * If no JSON can be parsed, the response text is scanned for keywords:
 * `trade`/`buy`/`sell` give a trade (short if `short` appears, else long)
 * with confidence 0.5, `post`/`share` give a post, and anything else is
 * `observe`.
 *
 * @param response - Raw text returned by the model.
 * @returns The decision; never throws on malformed input.
 */
private parseAgentDecision(response: string): {
  action: 'trade' | 'post' | 'observe';
  direction?: 'long' | 'short';
  confidence?: number;
} {
  const DEFAULT_CONFIDENCE = 0.5;

  const jsonMatch = response.match(/\{[\s\S]*\}/);
  if (jsonMatch) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(jsonMatch[0]);
    } catch {
      // Not valid JSON; fall through to the keyword heuristics below.
      parsed = undefined;
    }

    if (typeof parsed === 'object' && parsed !== null) {
      const fields = parsed as Record<string, unknown>;

      const actionText =
        typeof fields.action === 'string'
          ? fields.action.trim().toLowerCase()
          : '';
      const action =
        actionText === 'trade' || actionText === 'post'
          ? actionText
          : 'observe';

      const directionText =
        typeof fields.direction === 'string'
          ? fields.direction.trim().toLowerCase()
          : '';
      const direction =
        directionText === 'long' || directionText === 'short'
          ? directionText
          : undefined;

      const rawConfidence = fields.confidence;
      const numeric =
        typeof rawConfidence === 'number'
          ? rawConfidence
          : typeof rawConfidence === 'string'
            ? Number(rawConfidence)
            : Number.NaN;
      const confidence =
        Number.isFinite(numeric) && numeric > 0
          ? Math.min(numeric, 1)
          : DEFAULT_CONFIDENCE;

      return { action, direction, confidence };
    }
  }

  // Keyword fallback; lower-case the text once instead of per check.
  const text = response.toLowerCase();

  if (text.includes('trade') || text.includes('buy') || text.includes('sell')) {
    return {
      action: 'trade',
      direction: text.includes('short') ? 'short' : 'long',
      confidence: DEFAULT_CONFIDENCE,
    };
  }

  if (text.includes('post') || text.includes('share')) {
    return { action: 'post' };
  }

  return { action: 'observe' };
}
396
+
397
/**
 * Derive a trend figure from a tick's perpetual markets.
 *
 * Prices are keyed by ticker (a repeated ticker keeps its last price),
 * averaged, and expressed relative to a fixed baseline of 100 as
 * `(average - 100) / 100`. The value is not clamped, so an average above 200
 * yields a result above 1.
 *
 * @param tick - Tick whose perpetual market prices are read.
 * @returns The relative trend, or 0 when there are no perpetual markets.
 */
private getMarketTrend(tick: Tick): number {
  const markets = tick.state.perpetualMarkets;
  if (markets.length === 0) return 0;

  const priceByTicker = Object.fromEntries(
    markets.map((market) => [market.ticker, market.price])
  );
  const quotes = Object.values(priceByTicker);
  if (quotes.length === 0) return 0;

  let total = 0;
  for (const quote of quotes) {
    total += quote;
  }
  const mean = total / quotes.length;

  return (mean - 100) / 100;
}
418
+
419
/**
 * Produce a randomised result for an agent from its archetype configuration.
 *
 * Used when real model inference is not enabled. The figures are drawn from
 * `Math.random`, so they differ on every call:
 * - base PnL is uniform in ±500 scaled by `riskTolerance`;
 * - trade count is 10% of the tick count scaled by `actionWeights.trade`;
 * - win rate is 0.45, plus 0.15 when `riskTolerance` is below 0.5 or minus
 *   0.05 otherwise, plus a random amount below 0.1;
 * - post count is 5% of the tick count scaled by `actionWeights.post`;
 * - reported PnL is the base PnL plus a random bonus of up to 100 when the
 *   win rate exceeds 0.5, or a penalty of up to 100 otherwise.
 *
 * `avgPnlPerTrade` is derived from the reported PnL and is 0 when no trades
 * were made, so it always agrees with `pnl` and `totalTrades`.
 *
 * @param agent - Agent whose configuration drives the simulation.
 * @param snapshot - Game data; only the number of ticks is used.
 * @returns The simulated result with `rank` left at 0.
 */
private simulateAgentPerformance(
  agent: MatchupAgent,
  snapshot: BenchmarkGameSnapshot
): MatchupAgentResult {
  const config = agent.config;
  const tickCount = snapshot.ticks.length;

  // Higher risk tolerance widens the PnL spread.
  const riskFactor = config.riskTolerance;
  const basePnl = (Math.random() - 0.5) * 1000 * riskFactor;

  const totalTrades = Math.floor(tickCount * config.actionWeights.trade * 0.1);
  const winRate =
    0.45 + (config.riskTolerance < 0.5 ? 0.15 : -0.05) + Math.random() * 0.1;

  const postsCreated = Math.floor(tickCount * config.actionWeights.post * 0.05);

  // Computed once so the per-trade average is based on the PnL actually reported.
  const pnl = basePnl + (winRate > 0.5 ? 100 : -100) * Math.random();

  return {
    agentId: agent.id,
    archetype: agent.archetype,
    pnl,
    tradingMetrics: {
      totalTrades,
      winRate,
      avgPnlPerTrade: totalTrades > 0 ? pnl / totalTrades : 0,
    },
    socialMetrics: {
      postsCreated,
      engagementReceived: postsCreated * (2 + Math.random() * 5),
      reputationGained: postsCreated * 10,
    },
    actions: totalTrades + postsCreated,
    rank: 0, // Assigned by the caller after sorting.
  };
}
463
+
464
/**
 * Compare every pair of archetypes round by round.
 *
 * In each round the mean PnL of the two archetypes' agents is compared. The
 * higher mean wins the round and the difference is added to the winner's
 * margin; equal means count as a tie. Rounds in which either archetype has
 * no results are skipped.
 *
 * @param allResults - Agent results grouped by round.
 * @returns One entry per unordered pair of archetypes, in archetype order.
 */
private calculateHeadToHead(
  allResults: MatchupAgentResult[][]
): ArchetypeVsResult[] {
  const names = this.getArchetypes();
  const pairings: ArchetypeVsResult[] = [];

  const meanPnl = (rows: MatchupAgentResult[]): number =>
    rows.reduce((sum, row) => sum + row.pnl, 0) / rows.length;

  names.forEach((first, index) => {
    for (const second of names.slice(index + 1)) {
      const tally = { wins1: 0, wins2: 0, ties: 0, margin1: 0, margin2: 0 };

      for (const round of allResults) {
        const firstRows = round.filter((row) => row.archetype === first);
        const secondRows = round.filter((row) => row.archetype === second);
        if (firstRows.length === 0 || secondRows.length === 0) continue;

        const firstMean = meanPnl(firstRows);
        const secondMean = meanPnl(secondRows);

        if (firstMean > secondMean) {
          tally.wins1 += 1;
          tally.margin1 += firstMean - secondMean;
        } else if (secondMean > firstMean) {
          tally.wins2 += 1;
          tally.margin2 += secondMean - firstMean;
        } else {
          tally.ties += 1;
        }
      }

      const played = tally.wins1 + tally.wins2 + tally.ties;
      pairings.push({
        archetype1: first,
        archetype2: second,
        archetype1Wins: tally.wins1,
        archetype2Wins: tally.wins2,
        ties: tally.ties,
        archetype1AvgMargin: tally.wins1 > 0 ? tally.margin1 / tally.wins1 : 0,
        archetype2AvgMargin: tally.wins2 > 0 ? tally.margin2 / tally.wins2 : 0,
        winRate1: played > 0 ? tally.wins1 / played : 0,
        winRate2: played > 0 ? tally.wins2 / played : 0,
      });
    }
  });

  return pairings;
}
530
+
531
/**
 * Aggregate per-agent results into overall archetype standings.
 *
 * For every archetype returned by `getArchetypes`, ranks and PnL are summed
 * over all of its agent results. A result ranked 1 counts as a win. A result
 * ranked last counts as a loss, but only in rounds with more than one
 * participant, so a lone agent is not both winner and loser. Results whose
 * archetype is not in the list are ignored.
 *
 * @param allResults - Agent results grouped by round, with `rank` assigned.
 * @returns Standings sorted by ascending average rank. Archetypes with no
 *   results report zeros and are placed last, so they never appear as the
 *   top performer.
 */
private calculateRankings(
  allResults: MatchupAgentResult[][]
): MatchupBenchmarkResult['archetypeRankings'] {
  interface Totals {
    totalRank: number;
    totalPnl: number;
    wins: number;
    losses: number;
    count: number;
  }

  const totals = new Map<string, Totals>();
  for (const archetype of this.getArchetypes()) {
    totals.set(archetype, {
      totalRank: 0,
      totalPnl: 0,
      wins: 0,
      losses: 0,
      count: 0,
    });
  }

  for (const roundResults of allResults) {
    const lastRank = roundResults.length;

    for (const result of roundResults) {
      const stats = totals.get(result.archetype);
      if (!stats) continue;

      stats.totalRank += result.rank;
      stats.totalPnl += result.pnl;
      stats.count++;
      if (result.rank === 1) stats.wins++;
      if (lastRank > 1 && result.rank === lastRank) stats.losses++;
    }
  }

  return Array.from(totals, ([archetype, stats]) => ({
    played: stats.count > 0,
    row: {
      archetype,
      avgRank: stats.count > 0 ? stats.totalRank / stats.count : 0,
      avgPnl: stats.count > 0 ? stats.totalPnl / stats.count : 0,
      totalWins: stats.wins,
      totalLosses: stats.losses,
      winRate: stats.count > 0 ? stats.wins / stats.count : 0,
    },
  }))
    .sort((a, b) => {
      // An avgRank of 0 means "no data", not "best"; keep those entries last.
      if (a.played !== b.played) return a.played ? -1 : 1;
      return a.row.avgRank - b.row.avgRank;
    })
    .map((entry) => entry.row);
}
591
+
592
/**
 * Turn standings and pairwise results into readable observations.
 *
 * Produces, in order: a line naming the first entry of `rankings` (if any),
 * a line for every pairing in which one side has a win rate of at least 70%,
 * and the messages returned by `findCounterArchetypes`.
 *
 * @param rankings - Archetype standings; the first entry is treated as best.
 * @param headToHead - Pairwise archetype comparisons.
 * @param marketCondition - Condition name interpolated into the first line.
 * @returns The list of insight strings.
 */
private generateInsights(
  rankings: MatchupBenchmarkResult['archetypeRankings'],
  headToHead: ArchetypeVsResult[],
  marketCondition: string
): string[] {
  const notes: string[] = [];

  const [leader] = rankings;
  if (leader) {
    notes.push(
      `${leader.archetype} performed best in ${marketCondition} conditions with avg rank ${leader.avgRank.toFixed(2)}`
    );
  }

  for (const { archetype1, archetype2, winRate1, winRate2 } of headToHead) {
    if (winRate1 >= 0.7) {
      notes.push(
        `${archetype1} dominates ${archetype2} (${(winRate1 * 100).toFixed(0)}% win rate)`
      );
    } else if (winRate2 >= 0.7) {
      notes.push(
        `${archetype2} dominates ${archetype1} (${(winRate2 * 100).toFixed(0)}% win rate)`
      );
    }
  }

  // Rock-paper-scissors patterns go after the dominance lines.
  return notes.concat(this.findCounterArchetypes(headToHead));
}
631
+
632
/**
 * Find rock-paper-scissors cycles among archetypes (A beats B, B beats C,
 * C beats A).
 *
 * An archetype "beats" another when its head-to-head win rate is above 60%.
 * Each cycle is reported once, starting from the archetype whose name sorts
 * first, instead of once per rotation of the same three archetypes.
 *
 * @param headToHead - Pairwise archetype comparisons.
 * @returns One message per distinct cycle.
 */
private findCounterArchetypes(headToHead: ArchetypeVsResult[]): string[] {
  const insights: string[] = [];
  const beats = new Map<string, Set<string>>();

  const recordWin = (winner: string, loser: string): void => {
    const defeated = beats.get(winner) ?? new Set<string>();
    defeated.add(loser);
    beats.set(winner, defeated);
  };

  // Build the directed "beats" graph.
  for (const h2h of headToHead) {
    if (h2h.winRate1 > 0.6) recordWin(h2h.archetype1, h2h.archetype2);
    if (h2h.winRate2 > 0.6) recordWin(h2h.archetype2, h2h.archetype1);
  }

  for (const [a, aBeats] of beats) {
    for (const b of aBeats) {
      const bBeats = beats.get(b);
      if (!bBeats) continue;

      for (const c of bBeats) {
        // Only the rotation that starts at the smallest name is emitted, so
        // a→b→c, b→c→a and c→a→b collapse into a single insight.
        if (!(a < b && a < c)) continue;

        if (beats.get(c)?.has(a)) {
          insights.push(`Counter triangle found: ${a} → ${b} → ${c} → ${a}`);
        }
      }
    }
  }

  return insights;
}
672
+
673
/**
 * Run the complete matchup benchmark.
 *
 * For every configured market condition this generates fresh benchmark data
 * for each round, simulates the round with the same set of agents, and then
 * aggregates the rounds into head-to-head results, archetype rankings and
 * textual insights. Rounds are awaited one at a time.
 *
 * The orchestrator's `unloadAll()` cleanup runs in a `finally` block, so it is
 * invoked even when agent creation or a round throws; the error still
 * propagates to the caller.
 *
 * @returns One result per market condition, in configuration order. Each
 *   result's `duration` is the time elapsed since the start of the whole run,
 *   not the time spent on that condition alone.
 */
async run(): Promise<MatchupBenchmarkResult[]> {
  const startTime = Date.now();
  const results: MatchupBenchmarkResult[] = [];

  logger.info(
    'Starting Archetype Matchup Benchmark',
    {
      archetypes: this.getArchetypes(),
      agentsPerArchetype: this.config.agentsPerArchetype,
      rounds: this.config.rounds,
      conditions: this.config.marketConditions,
    },
    'ArchetypeMatchupBenchmark'
  );

  try {
    const agents = this.createAgents();

    for (const condition of this.config.marketConditions) {
      logger.info(
        `Testing in ${condition} market conditions`,
        {},
        'ArchetypeMatchupBenchmark'
      );

      const allRoundResults: MatchupAgentResult[][] = [];

      for (let round = 0; round < this.config.rounds; round++) {
        const snapshot = await this.generateBenchmarkData(condition);
        // Round numbers passed to the simulation are 1-based.
        const roundResults = await this.simulateRound(
          agents,
          snapshot,
          round + 1
        );
        allRoundResults.push(roundResults);
      }

      // Flatten agent results for this condition
      const flatAgentResults = allRoundResults.flat();

      // Calculate aggregated results
      const headToHead = this.calculateHeadToHead(allRoundResults);
      const rankings = this.calculateRankings(allRoundResults);
      const insights = this.generateInsights(rankings, headToHead, condition);

      // Single clock reading so the id, timestamp and duration agree.
      const completedAt = Date.now();

      results.push({
        benchmarkId: `matchup-${condition}-${completedAt}`,
        timestamp: completedAt,
        duration: completedAt - startTime,
        agents: flatAgentResults,
        archetypeRankings: rankings,
        headToHead,
        marketCondition: condition,
        insights,
      });

      logger.info(
        `Completed ${condition} market benchmark`,
        {
          topArchetype: rankings[0]?.archetype,
          avgPnl: rankings[0]?.avgPnl.toFixed(2),
        },
        'ArchetypeMatchupBenchmark'
      );
    }
  } finally {
    // Cleanup
    this.orchestrator.unloadAll();
  }

  const totalDuration = Date.now() - startTime;
  logger.info(
    'Archetype Matchup Benchmark complete',
    {
      totalDurationMs: totalDuration,
      conditionsTested: this.config.marketConditions.length,
      totalRounds: this.config.rounds * this.config.marketConditions.length,
    },
    'ArchetypeMatchupBenchmark'
  );

  return results;
}
757
+
758
/**
 * Generate a summary report of the matchup results.
 *
 * Produces Markdown with one section per result (market condition). Each
 * section contains a rankings table, a head-to-head table and, when present,
 * a bulleted list of insights.
 *
 * In the head-to-head table the winner is the archetype with the higher win
 * rate; when both win rates are exactly equal the winner column shows `Tie`
 * rather than crediting either side.
 *
 * @param results - Benchmark results, typically the array returned by `run()`.
 * @returns The report as a single newline-joined Markdown string.
 */
static generateReport(results: MatchupBenchmarkResult[]): string {
  const lines: string[] = [];
  lines.push('# Archetype Matchup Benchmark Report\n');

  for (const result of results) {
    lines.push(
      `## ${result.marketCondition.toUpperCase()} Market Conditions\n`
    );

    // Rankings table (the "Rank" column shows the archetype's average rank)
    lines.push('### Overall Rankings\n');
    lines.push('| Rank | Archetype | Avg PnL | Win Rate |');
    lines.push('|------|-----------|---------|----------|');
    for (const ranking of result.archetypeRankings) {
      lines.push(
        `| ${ranking.avgRank.toFixed(1)} | ${ranking.archetype} | ${ranking.avgPnl.toFixed(2)} | ${(ranking.winRate * 100).toFixed(1)}% |`
      );
    }
    lines.push('');

    // Head-to-head table
    lines.push('### Head-to-Head Results\n');
    lines.push('| Matchup | Winner | Win Rate |');
    lines.push('|---------|--------|----------|');
    for (const h2h of result.headToHead) {
      let winner: string;
      if (h2h.winRate1 === h2h.winRate2) {
        // Equal win rates: neither archetype won the matchup.
        winner = 'Tie';
      } else if (h2h.winRate1 > h2h.winRate2) {
        winner = h2h.archetype1;
      } else {
        winner = h2h.archetype2;
      }
      const winRate = Math.max(h2h.winRate1, h2h.winRate2);
      lines.push(
        `| ${h2h.archetype1} vs ${h2h.archetype2} | ${winner} | ${(winRate * 100).toFixed(1)}% |`
      );
    }
    lines.push('');

    // Insights
    if (result.insights.length > 0) {
      lines.push('### Key Insights\n');
      for (const insight of result.insights) {
        lines.push(`- ${insight}`);
      }
      lines.push('');
    }
  }

  return lines.join('\n');
}
807
+ }
808
+
809
/**
 * Convenience entry point: builds an `ArchetypeMatchupBenchmark` with a fixed
 * preset (all archetypes, 2 agents per archetype, 5 rounds of 100 ticks, the
 * bull/bear/volatile/stable market conditions, 16 GB of available VRAM) and
 * runs it.
 *
 * @returns The benchmark results produced by `run()`.
 */
export async function runQuickMatchupBenchmark(): Promise<
  MatchupBenchmarkResult[]
> {
  // The preset is kept as an inline literal so it is checked against the
  // constructor's parameter type.
  return new ArchetypeMatchupBenchmark({
    archetypes: 'all',
    agentsPerArchetype: 2,
    rounds: 5,
    ticksPerRound: 100,
    marketConditions: ['bull', 'bear', 'volatile', 'stable'],
    availableVramGb: 16,
  }).run();
}