@elizaos/training 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. package/Dockerfile +75 -0
  2. package/Makefile +374 -0
  3. package/README.md +346 -0
  4. package/config/rubrics.json +137 -0
  5. package/data/.gitkeep +0 -0
  6. package/data/degen/.gitkeep +2 -0
  7. package/data/trader/.gitkeep +2 -0
  8. package/docker-compose.test.yml +57 -0
  9. package/package.json +58 -0
  10. package/python/config/babylon_atropos.yaml +90 -0
  11. package/python/config/profiles/12gb.json +11 -0
  12. package/python/config/profiles/16gb.json +10 -0
  13. package/python/config/profiles/24gb.json +10 -0
  14. package/python/config/profiles/48gb.json +10 -0
  15. package/python/config/profiles/cpu.json +11 -0
  16. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  17. package/python/config/profiles/l40-2gpu.json +22 -0
  18. package/python/config/profiles/l40-4gpu.json +21 -0
  19. package/python/config/profiles/l40.json +17 -0
  20. package/python/config/tinker_training.yaml +143 -0
  21. package/python/curriculum_state.json +165 -0
  22. package/python/env.template +86 -0
  23. package/python/env.training.template +46 -0
  24. package/python/pyproject.toml +41 -0
  25. package/python/requirements-ci.txt +31 -0
  26. package/python/requirements.txt +87 -0
  27. package/python/scripts/__init__.py +4 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/run_ab_test.py +143 -0
  36. package/python/scripts/run_full_pipeline.py +544 -0
  37. package/python/scripts/run_tinker_training.py +192 -0
  38. package/python/scripts/run_training.py +914 -0
  39. package/python/scripts/test_judge.py +155 -0
  40. package/python/scripts/test_pipeline.py +356 -0
  41. package/python/scripts/test_trained_model.py +380 -0
  42. package/python/scripts/train_local.py +528 -0
  43. package/python/setup.py +20 -0
  44. package/python/src/__init__.py +190 -0
  45. package/python/src/data_bridge/__init__.py +24 -0
  46. package/python/src/data_bridge/converter.py +435 -0
  47. package/python/src/data_bridge/reader.py +393 -0
  48. package/python/src/models.py +283 -0
  49. package/python/src/training/__init__.py +605 -0
  50. package/python/src/training/ab_testing.py +404 -0
  51. package/python/src/training/action_executor.py +621 -0
  52. package/python/src/training/archetype_trainer.py +347 -0
  53. package/python/src/training/atropos_trainer.py +980 -0
  54. package/python/src/training/babylon_env.py +1254 -0
  55. package/python/src/training/error_recovery.py +647 -0
  56. package/python/src/training/evaluation.py +856 -0
  57. package/python/src/training/fast_simulator.py +880 -0
  58. package/python/src/training/format_validator.py +584 -0
  59. package/python/src/training/hybrid_env.py +522 -0
  60. package/python/src/training/kl_controller.py +628 -0
  61. package/python/src/training/multi_prompt_dataset.py +883 -0
  62. package/python/src/training/multi_turn.py +656 -0
  63. package/python/src/training/online_env.py +1084 -0
  64. package/python/src/training/quality_scorer.py +391 -0
  65. package/python/src/training/quality_utils.py +633 -0
  66. package/python/src/training/rewards.py +1344 -0
  67. package/python/src/training/rlaif_env.py +17 -0
  68. package/python/src/training/rollout_generator.py +502 -0
  69. package/python/src/training/rubric_loader.py +198 -0
  70. package/python/src/training/scenario_pool.py +1072 -0
  71. package/python/src/training/schemas.py +481 -0
  72. package/python/src/training/service_manager.py +552 -0
  73. package/python/src/training/simulation_bridge.py +535 -0
  74. package/python/src/training/tick_reward_attribution.py +399 -0
  75. package/python/src/training/tinker_client.py +575 -0
  76. package/python/src/training/tinker_trainer.py +646 -0
  77. package/python/src/training/tokenization_utils.py +402 -0
  78. package/python/tests/e2e/__init__.py +13 -0
  79. package/python/tests/e2e/conftest.py +258 -0
  80. package/python/tests/e2e/test_full_pipeline.py +643 -0
  81. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  82. package/python/tests/integration/__init__.py +12 -0
  83. package/python/tests/integration/conftest.py +383 -0
  84. package/python/tests/integration/test_db_integration.py +649 -0
  85. package/python/tests/integration/test_json_mode_integration.py +554 -0
  86. package/python/tests/test_action_executor.py +594 -0
  87. package/python/tests/test_archetype_scoring.py +1027 -0
  88. package/python/tests/test_atropos_integration.py +360 -0
  89. package/python/tests/test_evaluation.py +727 -0
  90. package/python/tests/test_format_validator.py +486 -0
  91. package/python/tests/test_kl_controller.py +432 -0
  92. package/python/tests/test_lr_scheduler.py +579 -0
  93. package/python/tests/test_multi_turn.py +590 -0
  94. package/python/tests/test_online_env.py +519 -0
  95. package/python/tests/test_quality_scorer.py +474 -0
  96. package/python/tests/test_scenario_pool.py +735 -0
  97. package/python/tests/test_service_manager.py +585 -0
  98. package/python/tests/test_simulation_rollout.py +581 -0
  99. package/python/tests/test_tokenization_utils.py +501 -0
  100. package/python/tests/test_training_orchestrator.py +497 -0
  101. package/python/tests/test_training_output_structure.py +661 -0
  102. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  103. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  104. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  105. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  106. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  107. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  108. package/scripts/assess-training-data.ts +422 -0
  109. package/scripts/e2e-training-test.ts +550 -0
  110. package/scripts/export-rubrics.ts +64 -0
  111. package/scripts/generate-research-report.ts +1523 -0
  112. package/scripts/generate_dataset.sh +173 -0
  113. package/scripts/json-mode-benchmark.ts +399 -0
  114. package/scripts/real-archetype-benchmark.ts +210 -0
  115. package/scripts/run-baseline-comparison.ts +116 -0
  116. package/scripts/run-full-pipeline.ts +272 -0
  117. package/scripts/runpod_setup.sh +137 -0
  118. package/scripts/runpod_validate.sh +147 -0
  119. package/scripts/test-model-in-game.ts +955 -0
  120. package/scripts/test-scoring.ts +73 -0
  121. package/scripts/test-trained-model.ts +209 -0
  122. package/scripts/train-and-test.ts +824 -0
  123. package/scripts/verify-final.ts +118 -0
  124. package/src/adapter.ts +516 -0
  125. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  126. package/src/archetypes/derive-archetype.ts +249 -0
  127. package/src/archetypes/index.ts +22 -0
  128. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  129. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  130. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  131. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  132. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  133. package/src/benchmark/BenchmarkRunner.ts +685 -0
  134. package/src/benchmark/BenchmarkValidator.ts +206 -0
  135. package/src/benchmark/FastEvalRunner.ts +225 -0
  136. package/src/benchmark/MetricsValidator.ts +165 -0
  137. package/src/benchmark/MetricsVisualizer.ts +909 -0
  138. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  139. package/src/benchmark/ModelRegistry.ts +158 -0
  140. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  141. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  142. package/src/benchmark/SimulationEngine.ts +832 -0
  143. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  144. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  145. package/src/benchmark/index.ts +89 -0
  146. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  147. package/src/benchmark/simulation-types.ts +78 -0
  148. package/src/dependencies.ts +439 -0
  149. package/src/generation/TrajectoryGenerator.ts +387 -0
  150. package/src/generation/index.ts +12 -0
  151. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  152. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  153. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  154. package/src/huggingface/index.ts +27 -0
  155. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  156. package/src/index.ts +102 -0
  157. package/src/init-training.ts +53 -0
  158. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  159. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  160. package/src/metrics/index.ts +8 -0
  161. package/src/metrics/types.ts +200 -0
  162. package/src/rubrics/__tests__/index.test.ts +184 -0
  163. package/src/rubrics/ass-kisser.ts +85 -0
  164. package/src/rubrics/degen.ts +80 -0
  165. package/src/rubrics/goody-twoshoes.ts +84 -0
  166. package/src/rubrics/index.ts +236 -0
  167. package/src/rubrics/information-trader.ts +84 -0
  168. package/src/rubrics/infosec.ts +101 -0
  169. package/src/rubrics/liar.ts +104 -0
  170. package/src/rubrics/perps-trader.ts +87 -0
  171. package/src/rubrics/researcher.ts +81 -0
  172. package/src/rubrics/scammer.ts +82 -0
  173. package/src/rubrics/social-butterfly.ts +73 -0
  174. package/src/rubrics/super-predictor.ts +97 -0
  175. package/src/rubrics/trader.ts +67 -0
  176. package/src/scoring/ArchetypeScoringService.ts +486 -0
  177. package/src/scoring/JudgePromptBuilder.ts +556 -0
  178. package/src/scoring/LLMJudgeCache.ts +401 -0
  179. package/src/scoring/index.ts +9 -0
  180. package/src/training/AutomationPipeline.ts +916 -0
  181. package/src/training/BenchmarkService.ts +518 -0
  182. package/src/training/ConfigValidator.ts +220 -0
  183. package/src/training/MarketOutcomesTracker.ts +187 -0
  184. package/src/training/ModelDeployer.ts +186 -0
  185. package/src/training/ModelFetcher.ts +76 -0
  186. package/src/training/ModelSelectionService.ts +341 -0
  187. package/src/training/ModelUsageVerifier.ts +160 -0
  188. package/src/training/MultiModelOrchestrator.ts +580 -0
  189. package/src/training/RLModelConfig.ts +407 -0
  190. package/src/training/RewardBackpropagationService.ts +149 -0
  191. package/src/training/RulerScoringService.ts +666 -0
  192. package/src/training/TrainingMonitor.ts +166 -0
  193. package/src/training/TrajectoryRecorder.ts +399 -0
  194. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  195. package/src/training/index.ts +100 -0
  196. package/src/training/logRLConfig.ts +34 -0
  197. package/src/training/pipeline.ts +129 -0
  198. package/src/training/storage/ModelStorageService.ts +279 -0
  199. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  200. package/src/training/storage/index.ts +17 -0
  201. package/src/training/types.ts +207 -0
  202. package/src/training/window-utils.ts +138 -0
  203. package/src/utils/index.ts +101 -0
  204. package/src/utils/logger.ts +59 -0
  205. package/src/utils/snowflake.ts +17 -0
  206. package/src/utils/synthetic-detector.ts +111 -0
  207. package/tsconfig.json +20 -0
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Real Archetype Benchmark
4
+ *
5
+ * Queries actual agent data from the Babylon database.
6
+ *
7
+ * Usage:
8
+ * bun run packages/training/scripts/real-archetype-benchmark.ts
9
+ */
10
+
11
+ import {
12
+ agentPerformanceMetrics,
13
+ and,
14
+ db,
15
+ desc,
16
+ eq,
17
+ isNull,
18
+ poolPositions,
19
+ users,
20
+ } from '@elizaos/db';
21
+ import { ArchetypeConfigService } from '../src/archetypes/ArchetypeConfigService';
22
+
23
/**
 * Archetype identifiers reported by the training package's archetype
 * configuration; used below to label agents by name.
 */
const ARCHETYPES = ArchetypeConfigService.getAvailableArchetypes();

/**
 * One row of per-agent benchmark data assembled from the database.
 */
interface RealAgentMetrics {
  /** Primary key of the agent's row in the `users` table. */
  agentId: string;
  /** Display name, falling back to username, then to 'Unknown'. */
  agentName: string;
  /** Archetype inferred from the agent's name, or 'default' when none matches. */
  archetype: string;
  /** Lifetime profit and loss read from the agent's `users` row (0 when not numeric). */
  lifetimePnL: number;
  /** Trade count from the agent's performance-metrics row (0 when the row is missing). */
  totalTrades: number;
  /**
   * Win rate from the agent's performance-metrics row (0 when the row is missing).
   * NOTE(review): it is multiplied by 100 for display, so presumably a 0..1 fraction — confirm.
   */
  winRate: number;
  /** Number of pool positions for this agent whose `closedAt` is null. */
  openPositions: number;
  /** Reputation points read from the agent's `users` row. */
  reputationPoints: number;
}
36
+
37
/** Aggregated benchmark figures for one archetype. */
type ArchetypeSummary = {
  archetype: string;
  count: number;
  avgPnL: number;
  avgWinRate: number;
  avgReputation: number;
};

/**
 * Infer an agent's archetype from its display name.
 *
 * Each archetype id is tried in three spellings: as-is ("perps-trader"),
 * with every hyphen removed ("perpstrader") and with every hyphen replaced by
 * a space ("perps trader"). Longer ids are tried first so that a specific id
 * such as "perps-trader" is not shadowed by a shorter one such as "trader".
 *
 * @param agentName - Name to inspect (matching is case-insensitive).
 * @param archetypes - Known archetype ids.
 * @returns The matching archetype id, or 'default' when nothing matches.
 */
function inferArchetype(
  agentName: string,
  archetypes: readonly string[]
): string {
  const lowerName = agentName.toLowerCase();
  // Array#sort is stable, so ids of equal length keep their configured order.
  const longestFirst = [...archetypes].sort((a, b) => b.length - a.length);

  for (const archetype of longestFirst) {
    const id = archetype.toLowerCase();
    if (id.length === 0) {
      // An empty id would match every name via includes('').
      continue;
    }
    const spellings = [id, id.replace(/-/g, ''), id.replace(/-/g, ' ')];
    if (spellings.some((spelling) => lowerName.includes(spelling))) {
      return archetype;
    }
  }
  return 'default';
}

/**
 * Arithmetic mean of a list of numbers.
 *
 * @param values - Numbers to average.
 * @returns The mean, or 0 for an empty list.
 */
function average(values: readonly number[]): number {
  if (values.length === 0) {
    return 0;
  }
  return values.reduce((sum, value) => sum + value, 0) / values.length;
}

/**
 * Benchmark entry point.
 *
 * Loads up to 100 agents (highest reputation first), collects their
 * performance metrics and open-position counts, groups them by inferred
 * archetype, prints a summary table sorted by average PnL and writes the same
 * table as a Markdown report under ./research-output/real-benchmarks.
 */
async function main() {
  const rule =
    '═══════════════════════════════════════════════════════════════';
  console.log(rule);
  console.log(' Babylon Real Archetype Benchmark');
  console.log(' Using ACTUAL data from the game engine');
  console.log(`${rule}\n`);

  console.log('Fetching real agent data from database...');

  // Get all agents - use select() without specifying columns
  const agents = await db
    .select()
    .from(users)
    .where(eq(users.isAgent, true))
    .orderBy(desc(users.reputationPoints))
    .limit(100);

  console.log(`Found ${agents.length} agents in database`);

  if (agents.length === 0) {
    console.log('\n⚠️ No agents found in database.');
    console.log(' To generate real benchmark data:');
    console.log(' 1. Run the game with agents: bun run dev');
    console.log(' 2. Create agents with archetypes');
    console.log(' 3. Let them trade for a while');
    console.log(' 4. Re-run this benchmark\n');
    return;
  }

  const results: RealAgentMetrics[] = [];

  for (const agent of agents) {
    // The two lookups are independent, so run them concurrently per agent.
    const [performanceMetricsList, openPositionsList] = await Promise.all([
      db
        .select()
        .from(agentPerformanceMetrics)
        .where(eq(agentPerformanceMetrics.userId, agent.id))
        .limit(1),
      db
        .select()
        .from(poolPositions)
        .where(
          and(eq(poolPositions.userId, agent.id), isNull(poolPositions.closedAt))
        ),
    ]);

    const performanceMetrics = performanceMetricsList[0];
    const agentName = agent.displayName || agent.username || 'Unknown';

    results.push({
      agentId: agent.id,
      agentName,
      archetype: inferArchetype(agentName, ARCHETYPES),
      // Number(...) || 0 also maps NaN (non-numeric value) to 0.
      lifetimePnL: Number(agent.lifetimePnL) || 0,
      totalTrades: performanceMetrics?.totalTrades ?? 0,
      winRate: performanceMetrics?.winRate ?? 0,
      openPositions: openPositionsList.length,
      reputationPoints: agent.reputationPoints ?? 0,
    });
  }

  console.log(`\nProcessed ${results.length} agents\n`);

  // Group by archetype
  const grouped = new Map<string, RealAgentMetrics[]>();
  for (const agent of results) {
    const bucket = grouped.get(agent.archetype);
    if (bucket) {
      bucket.push(agent);
    } else {
      grouped.set(agent.archetype, [agent]);
    }
  }

  const benchmarkResults: ArchetypeSummary[] = [];
  for (const [archetype, agentsList] of grouped) {
    benchmarkResults.push({
      archetype,
      count: agentsList.length,
      avgPnL: average(agentsList.map((a) => a.lifetimePnL)),
      avgWinRate: average(agentsList.map((a) => a.winRate)),
      avgReputation: average(agentsList.map((a) => a.reputationPoints)),
    });
  }
  benchmarkResults.sort((a, b) => b.avgPnL - a.avgPnL);

  // Print summary; header cells are padded to the same widths as the rows.
  console.log('Archetype Performance Summary:');
  console.log('─'.repeat(70));
  console.log(
    `${'Archetype'.padEnd(20)} | ` +
      `${'Agents'.padStart(6)} | ` +
      `${'Avg PnL'.padStart(12)} | ` +
      `${'Win Rate'.padStart(8)} | ` +
      `${'Reputation'.padStart(10)}`
  );
  console.log('─'.repeat(70));

  for (const r of benchmarkResults) {
    console.log(
      `${r.archetype.padEnd(20)} | ` +
        `${r.count.toString().padStart(6)} | ` +
        `$${r.avgPnL.toFixed(2).padStart(11)} | ` +
        `${(r.avgWinRate * 100).toFixed(1).padStart(7)}% | ` +
        `${r.avgReputation.toFixed(0).padStart(10)}`
    );
  }

  console.log('─'.repeat(70));

  // Save report
  const { mkdirSync, writeFileSync } = await import('fs');
  const outputDir = './research-output/real-benchmarks';
  mkdirSync(outputDir, { recursive: true });

  const tableRows = benchmarkResults
    .map(
      (r) =>
        `| ${r.archetype} | ${r.count} | $${r.avgPnL.toFixed(2)} | ${(r.avgWinRate * 100).toFixed(1)}% | ${r.avgReputation.toFixed(0)} |`
    )
    .join('\n');

  const report = `# Babylon Real Archetype Benchmark

Generated: ${new Date().toISOString()}

## Agents: ${results.length}

| Archetype | Count | Avg PnL | Win Rate | Reputation |
|-----------|-------|---------|----------|------------|
${tableRows}
`;

  const reportPath = `${outputDir}/benchmark-${Date.now()}.md`;
  writeFileSync(reportPath, report);

  console.log(`\n✓ Report saved to: ${reportPath}`);
  console.log(rule);
}
206
+
207
// Script entry point: run the benchmark and turn any unhandled failure into
// a logged error plus a non-zero exit code.
main().catch((failure) => {
  console.error('Benchmark failed:', failure);
  process.exit(1);
});
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * Head-to-Head Benchmark Script
5
+ *
6
+ * Runs two parallel simulations on the exact same market conditions ("Fixed Seed").
7
+ * Compares a "Baseline" (forced 'random' strategy) against a "Challenger" (forced 'momentum' strategy, standing in for a smart LLM agent).
8
+ *
9
+ * Usage:
10
+ * bun packages/training/scripts/run-baseline-comparison.ts
11
+ */
12
+
13
+ // Import initializeJsonMode to enable file-based DB for trajectory recording
14
+ // This prevents "Database not initialized" errors when saveTrajectory is true
15
+ import { initializeJsonMode } from '@elizaos/db';
16
+ import type { IAgentRuntime } from '@elizaos/core';
17
+ import { mkdirSync } from 'fs';
18
+ import * as path from 'path';
19
+ import { type BenchmarkConfig } from '../src/benchmark/BenchmarkDataGenerator';
20
+ import { BenchmarkRunner } from '../src/benchmark/BenchmarkRunner';
21
+ import { MetricsVisualizer } from '../src/benchmark/MetricsVisualizer';
22
+ import { logger } from '../src/utils/logger';
23
+
24
// Minimal stand-in for an agent runtime. Only the character's model setting
// is populated; the object is cast so it can be handed to the benchmark
// runner, which is always given a forced strategy in this script.
const mockCharacter = {
  settings: { model: 'gpt-4-turbo' },
};

const mockRuntime = { character: mockCharacter } as unknown as IAgentRuntime;
32
+
33
/**
 * Run the head-to-head benchmark.
 *
 * Creates a timestamped output directory, switches the database layer to
 * JSON-file mode inside it, runs a forced 'random' baseline and a forced
 * 'momentum' challenger with the same fixed-seed generator config, writes a
 * comparison report and exits the process with code 0.
 */
async function main() {
  const rule =
    '═══════════════════════════════════════════════════════════════';
  console.log(rule);
  console.log(' 🥊 HEAD-TO-HEAD BENCHMARK: Random vs LLM Agent');
  console.log(`${rule}\n`);

  const runFolder = `h2h-${Date.now()}`;
  const outputDir = path.join(process.cwd(), 'benchmark-results', runFolder);
  mkdirSync(outputDir, { recursive: true });

  // 0. File-backed database so trajectory recording has somewhere to write
  //    when no Postgres connection is available.
  const dbPath = path.join(outputDir, 'db_storage');
  mkdirSync(dbPath, { recursive: true });
  try {
    initializeJsonMode(dbPath);
    logger.info(`Initialized JSON DB at ${dbPath}`);
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    logger.warn(
      'Could not initialize JSON DB mode. Trajectory recording might fail if no Postgres connection.',
      { error: reason }
    );
  }

  // 1. One generator config shared by both runs; the fixed seed is what
  //    makes the two runs see the same market conditions.
  const generatorConfig: BenchmarkConfig = {
    durationMinutes: 10,
    tickInterval: 1,
    numPredictionMarkets: 5,
    numPerpetualMarkets: 3,
    numAgents: 5,
    seed: 12345,
  };

  logger.info('Generating fixed benchmark snapshot...');

  // Options common to both contestants.
  const sharedOptions = { generatorConfig, agentRuntime: mockRuntime };

  // 2. Run A: baseline with the 'random' strategy forced; no trajectory is
  //    recorded for it.
  logger.info('>>> STARTING RUN A: BASELINE (RANDOM) <<<');
  const baselineResult = await BenchmarkRunner.runSingle({
    ...sharedOptions,
    agentUserId: 'baseline-agent',
    saveTrajectory: false,
    outputDir: path.join(outputDir, 'baseline'),
    forceStrategy: 'random',
  });

  // 3. Run B: challenger. 'momentum' is forced here as a stand-in for a
  //    "smart" agent; its trajectory is recorded for later analysis.
  logger.info('>>> STARTING RUN B: CHALLENGER (MOMENTUM/LLM) <<<');
  const challengerResult = await BenchmarkRunner.runSingle({
    ...sharedOptions,
    agentUserId: 'challenger-agent',
    saveTrajectory: true,
    outputDir: path.join(outputDir, 'challenger'),
    forceStrategy: 'momentum',
  });

  // 4. Side-by-side report for the two runs.
  await MetricsVisualizer.generateComparisonReport(
    baselineResult,
    challengerResult,
    outputDir
  );

  console.log(`\n✅ Benchmark complete. Results saved to: ${outputDir}`);

  process.exit(0);
}
112
+
113
// Script entry point: any rejection from main() is logged and mapped to a
// non-zero exit code.
main().catch((failure) => {
  console.error('Benchmark failed:', failure);
  process.exit(1);
});
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * Full Training Pipeline Test
5
+ *
6
+ * This script runs the complete training pipeline end-to-end:
7
+ * 1. Check the database connection and initialize the training package
8
+ * 2. Generate real trajectories (or use existing)
9
+ * 3. Score trajectories with LLM-as-judge
10
+ * 4. Run archetype matchup benchmark
11
+ * 5. Check that scored trajectories are ready for export
12
+ *
13
+ * Usage:
14
+ * bun run packages/training/scripts/run-full-pipeline.ts
15
+ *
16
+ * Options:
17
+ * --skip-generation Skip trajectory generation (use existing data)
18
+ * --skip-scoring Skip LLM scoring
19
+ * --archetypes Comma-separated archetypes (default: trader,researcher)
20
+ * --agents Agents per archetype (default: 2)
21
+ * --ticks Ticks per agent (default: 10)
22
+ */
23
+
24
+ import { count, db, eq, isNotNull, trajectories } from '@elizaos/db';
25
+ import { parseArgs } from 'util';
26
+
27
// Parse command line arguments
const { values } = parseArgs({
  args: process.argv.slice(2),
  options: {
    'skip-generation': { type: 'boolean', default: false },
    'skip-scoring': { type: 'boolean', default: false },
    archetypes: { type: 'string', default: 'trader,researcher' },
    agents: { type: 'string', default: '2' },
    ticks: { type: 'string', default: '10' },
    help: { type: 'boolean', short: 'h' },
  },
});

if (values.help) {
  console.log(`
Full Training Pipeline Test

Usage:
bun run packages/training/scripts/run-full-pipeline.ts [options]

Options:
--skip-generation Skip trajectory generation (use existing data)
--skip-scoring Skip LLM scoring
--archetypes Comma-separated archetypes (default: trader,researcher)
--agents Agents per archetype (default: 2)
--ticks Ticks per agent (default: 10)
-h, --help Show this help message
`);
  process.exit(0);
}

/**
 * Parse a CLI option value that must be a positive integer.
 *
 * Unlike a bare parseInt, this rejects values with trailing garbage ("10abc")
 * and values that would otherwise become NaN ("foo") or zero.
 *
 * @param raw - Raw option value from the command line.
 * @param flag - Option name without the leading dashes, used in the error text.
 * @returns The parsed integer (>= 1).
 * @throws Error when `raw` is not a positive integer.
 */
function parsePositiveInt(raw: string, flag: string): number {
  const trimmed = raw.trim();
  const parsed = /^\d+$/.test(trimmed) ? Number.parseInt(trimmed, 10) : NaN;
  if (!Number.isSafeInteger(parsed) || parsed < 1) {
    throw new Error(
      `Invalid value for --${flag}: "${raw}" (expected a positive integer)`
    );
  }
  return parsed;
}

/**
 * Split a comma-separated archetype list, trimming whitespace around each
 * entry and dropping empty entries.
 *
 * @param raw - Raw --archetypes value.
 * @returns The non-empty archetype names, in the order given.
 * @throws Error when no archetype name remains.
 */
function parseArchetypeList(raw: string): string[] {
  const names = raw
    .split(',')
    .map((name) => name.trim())
    .filter((name) => name.length > 0);
  if (names.length === 0) {
    throw new Error(
      `Invalid value for --archetypes: "${raw}" (expected at least one archetype)`
    );
  }
  return names;
}

// Resolved pipeline settings. Invalid option values stop the script here with
// a readable message instead of reaching later steps as NaN or blank names.
const config = (() => {
  try {
    return {
      skipGeneration: values['skip-generation'] as boolean,
      skipScoring: values['skip-scoring'] as boolean,
      archetypes: parseArchetypeList(values.archetypes as string),
      agentsPerArchetype: parsePositiveInt(values.agents as string, 'agents'),
      ticksPerAgent: parsePositiveInt(values.ticks as string, 'ticks'),
    };
  } catch (error) {
    console.error(error instanceof Error ? error.message : String(error));
    process.exit(1);
  }
})();

console.log('═══════════════════════════════════════════════════════════════');
console.log(' Babylon Full Training Pipeline');
console.log('═══════════════════════════════════════════════════════════════');
console.log(` Archetypes: ${config.archetypes.join(', ')}`);
console.log(` Agents per archetype: ${config.agentsPerArchetype}`);
console.log(` Ticks per agent: ${config.ticksPerAgent}`);
console.log(` Skip generation: ${config.skipGeneration}`);
console.log(` Skip scoring: ${config.skipScoring}`);
console.log(
  '═══════════════════════════════════════════════════════════════\n'
);
77
+
78
/**
 * Run the pipeline steps in order and print progress for each one.
 *
 * Step 1 (database check) is fatal: the process exits with code 1 when the
 * query fails. Every later step is best-effort: a failure is logged and the
 * pipeline moves on to the next step.
 */
async function runPipeline() {
  const startTime = Date.now();
  const rule =
    '═══════════════════════════════════════════════════════════════';

  // Step 1: Check database connection
  console.log('Step 1: Checking database connection...');
  try {
    const totals = await db.select({ count: count() }).from(trajectories);
    console.log(
      ` ✅ Database connected. ${totals[0]?.count ?? 0} existing trajectories.\n`
    );
  } catch (error) {
    console.log(` ❌ Database connection failed: ${error}`);
    console.log(' Make sure DATABASE_URL is set correctly.\n');
    process.exit(1);
  }

  // Step 2: Initialize training package
  console.log('Step 2: Initializing training package...');
  try {
    const { initializeTrainingPackage } = await import('../src/init-training');
    await initializeTrainingPackage();
    console.log(' ✅ Training package initialized.\n');
  } catch (error) {
    console.log(` ⚠️ Training package initialization failed: ${error}`);
    console.log(' Will continue with limited functionality.\n');
  }

  // Step 3: Generate trajectories
  if (!config.skipGeneration) {
    console.log('Step 3: Generating real trajectories...');
    try {
      const { TrajectoryGenerator } = await import(
        '../src/generation/TrajectoryGenerator'
      );

      // Use the most recently created user as the manager of the test agents.
      const { users, desc } = await import('@elizaos/db');
      const managerResult = await db
        .select({ id: users.id })
        .from(users)
        .orderBy(desc(users.createdAt))
        .limit(1);

      const manager = managerResult[0];
      if (!manager) {
        console.log(' ⚠️ No users found in database. Skipping generation.');
        console.log(' Create a user first or use --skip-generation.\n');
      } else {
        const generator = new TrajectoryGenerator({
          archetypes: config.archetypes,
          agentsPerArchetype: config.agentsPerArchetype,
          ticksPerAgent: config.ticksPerAgent,
          parallelAgents: 3,
          recordTrajectories: true,
          managerId: manager.id,
        });

        try {
          const generation = await generator.generate();
          console.log(
            ` ✅ Generated ${generation.trajectoryIds.length} trajectories.`
          );
          console.log(` Agents created: ${generation.agentsCreated.length}`);
          console.log(` Duration: ${generation.duration}ms\n`);
        } finally {
          // Cleanup test agents even when generation throws part-way through,
          // so a failed run does not leave them behind.
          await generator.cleanup();
        }
      }
    } catch (error) {
      console.log(` ❌ Generation failed: ${error}`);
      console.log(
        ' Make sure the server is running or use --skip-generation.\n'
      );
    }
  } else {
    console.log(
      'Step 3: Skipping trajectory generation (--skip-generation).\n'
    );
  }

  // Step 4: Score trajectories
  if (!config.skipScoring) {
    console.log('Step 4: Scoring trajectories with LLM-as-judge...');
    try {
      const { archetypeScoringService } = await import(
        '../src/scoring/ArchetypeScoringService'
      );

      // The two counts are independent, so fetch them concurrently.
      const [trainingRows, scoredRows] = await Promise.all([
        db
          .select({ count: count() })
          .from(trajectories)
          .where(eq(trajectories.isTrainingData, true)),
        db
          .select({ count: count() })
          .from(trajectories)
          .where(isNotNull(trajectories.aiJudgeReward)),
      ]);

      console.log(` Training trajectories: ${trainingRows[0]?.count ?? 0}`);
      console.log(` Already scored: ${scoredRows[0]?.count ?? 0}`);

      // Score a batch of unscored trajectories
      const scoring = await archetypeScoringService.scoreUnscoredTrajectories(
        'default',
        10
      );
      console.log(
        ` ✅ Scored ${scoring.scored} trajectories (${scoring.errors} errors).\n`
      );
    } catch (error) {
      console.log(` ❌ Scoring failed: ${error}`);
      console.log(' Make sure GROQ_API_KEY is set.\n');
    }
  } else {
    console.log('Step 4: Skipping scoring (--skip-scoring).\n');
  }

  // Step 5: Run archetype matchup benchmark
  console.log('Step 5: Running archetype matchup benchmark...');
  try {
    const { ArchetypeMatchupBenchmark } = await import(
      '../src/benchmark/ArchetypeMatchupBenchmark'
    );

    // NOTE(review): agentsPerArchetype is fixed at 2 here rather than taken
    // from --agents; confirm that this is intentional.
    const benchmark = new ArchetypeMatchupBenchmark({
      archetypes: config.archetypes,
      agentsPerArchetype: 2,
      rounds: 3,
      ticksPerRound: 50,
      marketConditions: ['bull', 'bear'],
      availableVramGb: 16,
    });

    const matchups = await benchmark.run();

    console.log(` ✅ Benchmark complete.`);
    for (const matchup of matchups) {
      console.log(` ${matchup.marketCondition.toUpperCase()} market:`);
      // Only the first three ranking entries are printed per market condition.
      for (const r of matchup.archetypeRankings.slice(0, 3)) {
        console.log(
          ` ${r.avgRank.toFixed(1)}. ${r.archetype} (avg PnL: ${r.avgPnl.toFixed(2)})`
        );
      }
    }
    console.log('');
  } catch (error) {
    console.log(` ❌ Benchmark failed: ${error}\n`);
  }

  // Step 6: Report how many scored trajectories are available for export.
  // Nothing is exported here; the export itself is a separate command.
  console.log('Step 6: Checking training data export...');
  try {
    const scoredResult = await db
      .select({ count: count() })
      .from(trajectories)
      .where(isNotNull(trajectories.aiJudgeReward));

    const scored = scoredResult[0]?.count ?? 0;
    if (scored > 0) {
      console.log(` ✅ ${scored} trajectories ready for export.`);
      console.log(' Run "babylon train export" to export training data.\n');
    } else {
      console.log(' ⚠️ No scored trajectories available for export.');
      console.log(' Generate and score trajectories first.\n');
    }
  } catch (error) {
    console.log(` ❌ Export check failed: ${error}\n`);
  }

  // Summary
  const totalTime = Date.now() - startTime;
  console.log(rule);
  console.log(' Pipeline Complete');
  console.log(rule);
  console.log(` Total time: ${(totalTime / 1000).toFixed(1)}s`);
  console.log('');
  console.log(' Next steps:');
  console.log(' 1. Export data: babylon train export');
  console.log(' 2. Train models: python python/scripts/run_full_pipeline.py');
  console.log(' 3. Benchmark: babylon train benchmark');
  console.log(rule);
}
268
+
269
// Script entry point: anything that escapes runPipeline() is logged and
// mapped to a non-zero exit code.
runPipeline().catch((failure) => {
  console.error('Pipeline failed:', failure);
  process.exit(1);
});