@elizaos/training 2.0.0-alpha.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +75 -0
- package/Makefile +374 -0
- package/README.md +346 -0
- package/config/rubrics.json +137 -0
- package/data/.gitkeep +0 -0
- package/data/degen/.gitkeep +2 -0
- package/data/trader/.gitkeep +2 -0
- package/docker-compose.test.yml +57 -0
- package/package.json +58 -0
- package/python/config/babylon_atropos.yaml +90 -0
- package/python/config/profiles/12gb.json +11 -0
- package/python/config/profiles/16gb.json +10 -0
- package/python/config/profiles/24gb.json +10 -0
- package/python/config/profiles/48gb.json +10 -0
- package/python/config/profiles/cpu.json +11 -0
- package/python/config/profiles/l40-2gpu-safe.json +20 -0
- package/python/config/profiles/l40-2gpu.json +22 -0
- package/python/config/profiles/l40-4gpu.json +21 -0
- package/python/config/profiles/l40.json +17 -0
- package/python/config/tinker_training.yaml +143 -0
- package/python/curriculum_state.json +165 -0
- package/python/env.template +86 -0
- package/python/env.training.template +46 -0
- package/python/pyproject.toml +41 -0
- package/python/requirements-ci.txt +31 -0
- package/python/requirements.txt +87 -0
- package/python/scripts/__init__.py +4 -0
- package/python/scripts/import_json_trajectories.py +412 -0
- package/python/scripts/local-finetune/README.md +63 -0
- package/python/scripts/local-finetune/ingest_and_score.py +139 -0
- package/python/scripts/local-finetune/merge_model.py +32 -0
- package/python/scripts/local-finetune/test_adapter.py +91 -0
- package/python/scripts/local-finetune/train_from_csv.py +132 -0
- package/python/scripts/merge_trajectories.py +318 -0
- package/python/scripts/run_ab_test.py +143 -0
- package/python/scripts/run_full_pipeline.py +544 -0
- package/python/scripts/run_tinker_training.py +192 -0
- package/python/scripts/run_training.py +914 -0
- package/python/scripts/test_judge.py +155 -0
- package/python/scripts/test_pipeline.py +356 -0
- package/python/scripts/test_trained_model.py +380 -0
- package/python/scripts/train_local.py +528 -0
- package/python/setup.py +20 -0
- package/python/src/__init__.py +190 -0
- package/python/src/data_bridge/__init__.py +24 -0
- package/python/src/data_bridge/converter.py +435 -0
- package/python/src/data_bridge/reader.py +393 -0
- package/python/src/models.py +283 -0
- package/python/src/training/__init__.py +605 -0
- package/python/src/training/ab_testing.py +404 -0
- package/python/src/training/action_executor.py +621 -0
- package/python/src/training/archetype_trainer.py +347 -0
- package/python/src/training/atropos_trainer.py +980 -0
- package/python/src/training/babylon_env.py +1254 -0
- package/python/src/training/error_recovery.py +647 -0
- package/python/src/training/evaluation.py +856 -0
- package/python/src/training/fast_simulator.py +880 -0
- package/python/src/training/format_validator.py +584 -0
- package/python/src/training/hybrid_env.py +522 -0
- package/python/src/training/kl_controller.py +628 -0
- package/python/src/training/multi_prompt_dataset.py +883 -0
- package/python/src/training/multi_turn.py +656 -0
- package/python/src/training/online_env.py +1084 -0
- package/python/src/training/quality_scorer.py +391 -0
- package/python/src/training/quality_utils.py +633 -0
- package/python/src/training/rewards.py +1344 -0
- package/python/src/training/rlaif_env.py +17 -0
- package/python/src/training/rollout_generator.py +502 -0
- package/python/src/training/rubric_loader.py +198 -0
- package/python/src/training/scenario_pool.py +1072 -0
- package/python/src/training/schemas.py +481 -0
- package/python/src/training/service_manager.py +552 -0
- package/python/src/training/simulation_bridge.py +535 -0
- package/python/src/training/tick_reward_attribution.py +399 -0
- package/python/src/training/tinker_client.py +575 -0
- package/python/src/training/tinker_trainer.py +646 -0
- package/python/src/training/tokenization_utils.py +402 -0
- package/python/tests/e2e/__init__.py +13 -0
- package/python/tests/e2e/conftest.py +258 -0
- package/python/tests/e2e/test_full_pipeline.py +643 -0
- package/python/tests/e2e/test_online_training_e2e.py +365 -0
- package/python/tests/integration/__init__.py +12 -0
- package/python/tests/integration/conftest.py +383 -0
- package/python/tests/integration/test_db_integration.py +649 -0
- package/python/tests/integration/test_json_mode_integration.py +554 -0
- package/python/tests/test_action_executor.py +594 -0
- package/python/tests/test_archetype_scoring.py +1027 -0
- package/python/tests/test_atropos_integration.py +360 -0
- package/python/tests/test_evaluation.py +727 -0
- package/python/tests/test_format_validator.py +486 -0
- package/python/tests/test_kl_controller.py +432 -0
- package/python/tests/test_lr_scheduler.py +579 -0
- package/python/tests/test_multi_turn.py +590 -0
- package/python/tests/test_online_env.py +519 -0
- package/python/tests/test_quality_scorer.py +474 -0
- package/python/tests/test_scenario_pool.py +735 -0
- package/python/tests/test_service_manager.py +585 -0
- package/python/tests/test_simulation_rollout.py +581 -0
- package/python/tests/test_tokenization_utils.py +501 -0
- package/python/tests/test_training_orchestrator.py +497 -0
- package/python/tests/test_training_output_structure.py +661 -0
- package/research-output/training-runs/training-run-1770772042899.json +26 -0
- package/research-output/training-runs/training-run-1770930079670.json +32 -0
- package/research-output/training-runs/training-run-1770930143700.json +44 -0
- package/research-output/training-runs/training-run-1770930183638.json +38 -0
- package/research-output/training-runs/training-run-1770930442049.json +38 -0
- package/research-output/training-runs/training-run-1770930793243.json +38 -0
- package/scripts/assess-training-data.ts +422 -0
- package/scripts/e2e-training-test.ts +550 -0
- package/scripts/export-rubrics.ts +64 -0
- package/scripts/generate-research-report.ts +1523 -0
- package/scripts/generate_dataset.sh +173 -0
- package/scripts/json-mode-benchmark.ts +399 -0
- package/scripts/real-archetype-benchmark.ts +210 -0
- package/scripts/run-baseline-comparison.ts +116 -0
- package/scripts/run-full-pipeline.ts +272 -0
- package/scripts/runpod_setup.sh +137 -0
- package/scripts/runpod_validate.sh +147 -0
- package/scripts/test-model-in-game.ts +955 -0
- package/scripts/test-scoring.ts +73 -0
- package/scripts/test-trained-model.ts +209 -0
- package/scripts/train-and-test.ts +824 -0
- package/scripts/verify-final.ts +118 -0
- package/src/adapter.ts +516 -0
- package/src/archetypes/ArchetypeConfigService.ts +626 -0
- package/src/archetypes/derive-archetype.ts +249 -0
- package/src/archetypes/index.ts +22 -0
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
- package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
- package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
- package/src/benchmark/BenchmarkDataViewer.ts +324 -0
- package/src/benchmark/BenchmarkHistoryService.ts +221 -0
- package/src/benchmark/BenchmarkRunner.ts +685 -0
- package/src/benchmark/BenchmarkValidator.ts +206 -0
- package/src/benchmark/FastEvalRunner.ts +225 -0
- package/src/benchmark/MetricsValidator.ts +165 -0
- package/src/benchmark/MetricsVisualizer.ts +909 -0
- package/src/benchmark/ModelBenchmarkService.ts +611 -0
- package/src/benchmark/ModelRegistry.ts +158 -0
- package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
- package/src/benchmark/SimulationA2AInterface.ts +1169 -0
- package/src/benchmark/SimulationEngine.ts +832 -0
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
- package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
- package/src/benchmark/index.ts +89 -0
- package/src/benchmark/parseSimulationMetrics.ts +124 -0
- package/src/benchmark/simulation-types.ts +78 -0
- package/src/dependencies.ts +439 -0
- package/src/generation/TrajectoryGenerator.ts +387 -0
- package/src/generation/index.ts +12 -0
- package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
- package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
- package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
- package/src/huggingface/index.ts +27 -0
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
- package/src/index.ts +102 -0
- package/src/init-training.ts +53 -0
- package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
- package/src/metrics/index.ts +8 -0
- package/src/metrics/types.ts +200 -0
- package/src/rubrics/__tests__/index.test.ts +184 -0
- package/src/rubrics/ass-kisser.ts +85 -0
- package/src/rubrics/degen.ts +80 -0
- package/src/rubrics/goody-twoshoes.ts +84 -0
- package/src/rubrics/index.ts +236 -0
- package/src/rubrics/information-trader.ts +84 -0
- package/src/rubrics/infosec.ts +101 -0
- package/src/rubrics/liar.ts +104 -0
- package/src/rubrics/perps-trader.ts +87 -0
- package/src/rubrics/researcher.ts +81 -0
- package/src/rubrics/scammer.ts +82 -0
- package/src/rubrics/social-butterfly.ts +73 -0
- package/src/rubrics/super-predictor.ts +97 -0
- package/src/rubrics/trader.ts +67 -0
- package/src/scoring/ArchetypeScoringService.ts +486 -0
- package/src/scoring/JudgePromptBuilder.ts +556 -0
- package/src/scoring/LLMJudgeCache.ts +401 -0
- package/src/scoring/index.ts +9 -0
- package/src/training/AutomationPipeline.ts +916 -0
- package/src/training/BenchmarkService.ts +518 -0
- package/src/training/ConfigValidator.ts +220 -0
- package/src/training/MarketOutcomesTracker.ts +187 -0
- package/src/training/ModelDeployer.ts +186 -0
- package/src/training/ModelFetcher.ts +76 -0
- package/src/training/ModelSelectionService.ts +341 -0
- package/src/training/ModelUsageVerifier.ts +160 -0
- package/src/training/MultiModelOrchestrator.ts +580 -0
- package/src/training/RLModelConfig.ts +407 -0
- package/src/training/RewardBackpropagationService.ts +149 -0
- package/src/training/RulerScoringService.ts +666 -0
- package/src/training/TrainingMonitor.ts +166 -0
- package/src/training/TrajectoryRecorder.ts +399 -0
- package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
- package/src/training/index.ts +100 -0
- package/src/training/logRLConfig.ts +34 -0
- package/src/training/pipeline.ts +129 -0
- package/src/training/storage/ModelStorageService.ts +279 -0
- package/src/training/storage/TrainingDataArchiver.ts +197 -0
- package/src/training/storage/index.ts +17 -0
- package/src/training/types.ts +207 -0
- package/src/training/window-utils.ts +138 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +59 -0
- package/src/utils/snowflake.ts +17 -0
- package/src/utils/synthetic-detector.ts +111 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Real Archetype Benchmark
|
|
4
|
+
*
|
|
5
|
+
* Queries actual agent data from the Babylon database.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* bun run packages/training/scripts/real-archetype-benchmark.ts
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import {
|
|
12
|
+
agentPerformanceMetrics,
|
|
13
|
+
and,
|
|
14
|
+
db,
|
|
15
|
+
desc,
|
|
16
|
+
eq,
|
|
17
|
+
isNull,
|
|
18
|
+
poolPositions,
|
|
19
|
+
users,
|
|
20
|
+
} from '@elizaos/db';
|
|
21
|
+
import { ArchetypeConfigService } from '../src/archetypes/ArchetypeConfigService';
|
|
22
|
+
|
|
23
|
+
// Get the available archetypes from our actual config
|
|
24
|
+
// main() compares these identifiers against each agent's lower-cased name to bucket agents by archetype.
const ARCHETYPES = ArchetypeConfigService.getAvailableArchetypes();
|
|
25
|
+
|
|
26
|
+
/**
 * Per-agent figures collected by the benchmark before they are grouped by archetype.
 */
interface RealAgentMetrics {
  /** Identifier of the agent's user row. */
  agentId: string;
  /** Display name, falling back to the username and then to 'Unknown'. */
  agentName: string;
  /** Archetype inferred from the agent name, or 'default' when nothing matches. */
  archetype: string;
  /** Lifetime profit and loss, coerced to a number (0 when missing or not numeric). */
  lifetimePnL: number;
  /** Trade count from the agent's performance metrics row (0 when there is none). */
  totalTrades: number;
  /** Win rate from the performance metrics row; reported as a percentage by multiplying by 100. */
  winRate: number;
  /** Number of pool positions for the agent whose closedAt is null. */
  openPositions: number;
  /** Reputation points of the agent (0 when missing). */
  reputationPoints: number;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Benchmarks the agents stored in the database, grouped by archetype.
 *
 * Loads up to 100 agent users ordered by reputation, reads each agent's
 * performance-metrics row and open pool positions, infers an archetype from the
 * agent's name, prints per-archetype averages to the console and writes the same
 * table as a Markdown report under ./research-output/real-benchmarks.
 *
 * Archetype inference: an agent belongs to the first archetype whose name occurs
 * in the lower-cased agent name, trying the hyphenated spelling, the spelling
 * with hyphens removed and the spelling with hyphens replaced by spaces. Longer
 * archetype names are tried first so that a short name cannot shadow a longer
 * one that contains it (e.g. if both "trader" and "perps-trader" are configured).
 * Agents that match nothing are reported under 'default'.
 *
 * @returns Resolves once the report has been written, or early (without writing
 *   anything) when the database contains no agents.
 */
async function main() {
  const banner =
    '═══════════════════════════════════════════════════════════════';
  console.log(banner);
  console.log(' Babylon Real Archetype Benchmark');
  console.log(' Using ACTUAL data from the game engine');
  console.log(`${banner}\n`);

  console.log('Fetching real agent data from database...');

  // Top 100 agent accounts by reputation; select() without arguments returns every column.
  const agents = await db
    .select()
    .from(users)
    .where(eq(users.isAgent, true))
    .orderBy(desc(users.reputationPoints))
    .limit(100);

  console.log(`Found ${agents.length} agents in database`);

  if (agents.length === 0) {
    console.log('\n⚠️ No agents found in database.');
    console.log(' To generate real benchmark data:');
    console.log(' 1. Run the game with agents: bun run dev');
    console.log(' 2. Create agents with archetypes');
    console.log(' 3. Let them trade for a while');
    console.log(' 4. Re-run this benchmark\n');
    return;
  }

  // Pre-compute every spelling an archetype may take inside an agent name.
  // Sorting by length (longest first) keeps a short archetype from winning over
  // a longer archetype that contains it; ties keep their configured order.
  const archetypeMatchers = [...ARCHETYPES]
    .sort((a, b) => b.length - a.length)
    .map((archetype) => {
      const lower = archetype.toLowerCase();
      return {
        archetype,
        needles: [lower, lower.replace(/-/g, ''), lower.replace(/-/g, ' ')],
      };
    });

  const results: RealAgentMetrics[] = [];

  for (const agent of agents) {
    // The two lookups are independent of each other, so issue them together.
    const [performanceRows, openPositionRows] = await Promise.all([
      db
        .select()
        .from(agentPerformanceMetrics)
        .where(eq(agentPerformanceMetrics.userId, agent.id))
        .limit(1),
      db
        .select()
        .from(poolPositions)
        .where(
          and(eq(poolPositions.userId, agent.id), isNull(poolPositions.closedAt))
        ),
    ]);
    const performanceMetrics = performanceRows[0];

    // Infer archetype from username/displayName
    const agentName = agent.displayName || agent.username || 'Unknown';
    const lowerName = agentName.toLowerCase();
    const matched = archetypeMatchers.find(({ needles }) =>
      needles.some((needle) => lowerName.includes(needle))
    );
    const archetype: string = matched ? matched.archetype : 'default';

    results.push({
      agentId: agent.id,
      agentName,
      archetype,
      lifetimePnL: Number(agent.lifetimePnL) || 0,
      totalTrades: performanceMetrics?.totalTrades || 0,
      winRate: performanceMetrics?.winRate || 0,
      openPositions: openPositionRows.length,
      reputationPoints: agent.reputationPoints || 0,
    });
  }

  console.log(`\nProcessed ${results.length} agents\n`);

  // Group by archetype
  const grouped = new Map<string, RealAgentMetrics[]>();
  for (const entry of results) {
    const bucket = grouped.get(entry.archetype);
    if (bucket) {
      bucket.push(entry);
    } else {
      grouped.set(entry.archetype, [entry]);
    }
  }

  // Arithmetic mean of one numeric field over a group; 0 for an empty group.
  const average = (
    rows: RealAgentMetrics[],
    pick: (row: RealAgentMetrics) => number
  ): number =>
    rows.length > 0
      ? rows.reduce((sum, row) => sum + pick(row), 0) / rows.length
      : 0;

  // One summary row per archetype, best average PnL first.
  const benchmarkResults = [...grouped]
    .map(([archetype, members]) => ({
      archetype,
      count: members.length,
      avgPnL: average(members, (m) => m.lifetimePnL),
      avgWinRate: average(members, (m) => m.winRate),
      avgReputation: average(members, (m) => m.reputationPoints),
    }))
    .sort((a, b) => b.avgPnL - a.avgPnL);

  // Print summary. The header cells are padded to the same widths as the data
  // cells below (the PnL column is one wider for '$', the win rate for '%').
  console.log('Archetype Performance Summary:');
  console.log('─'.repeat(70));
  console.log(
    `${'Archetype'.padEnd(20)} | ` +
      `${'Agents'.padStart(6)} | ` +
      `${'Avg PnL'.padStart(12)} | ` +
      `${'Win Rate'.padStart(8)} | ` +
      `${'Reputation'.padStart(10)}`
  );
  console.log('─'.repeat(70));

  for (const r of benchmarkResults) {
    console.log(
      `${r.archetype.padEnd(20)} | ` +
        `${r.count.toString().padStart(6)} | ` +
        `$${r.avgPnL.toFixed(2).padStart(11)} | ` +
        `${(r.avgWinRate * 100).toFixed(1).padStart(7)}% | ` +
        `${r.avgReputation.toFixed(0).padStart(10)}`
    );
  }

  console.log('─'.repeat(70));

  // Save report
  const { mkdirSync, writeFileSync } = await import('fs');
  const outputDir = './research-output/real-benchmarks';
  mkdirSync(outputDir, { recursive: true });

  const tableRows = benchmarkResults
    .map(
      (r) =>
        `| ${r.archetype} | ${r.count} | $${r.avgPnL.toFixed(2)} | ${(r.avgWinRate * 100).toFixed(1)}% | ${r.avgReputation.toFixed(0)} |`
    )
    .join('\n');

  // Joined with '\n'; the trailing empty entry yields the final newline.
  const report = [
    '# Babylon Real Archetype Benchmark',
    '',
    `Generated: ${new Date().toISOString()}`,
    '',
    `## Agents: ${results.length}`,
    '',
    '| Archetype | Count | Avg PnL | Win Rate | Reputation |',
    '|-----------|-------|---------|----------|------------|',
    tableRows,
    '',
  ].join('\n');

  const reportPath = `${outputDir}/benchmark-${Date.now()}.md`;
  writeFileSync(reportPath, report);

  console.log(`\n✓ Report saved to: ${reportPath}`);
  console.log(banner);
}
|
|
206
|
+
|
|
207
|
+
// Start the benchmark; any rejection is reported and turned into a non-zero exit status.
main().catch((failure) => {
  console.error('Benchmark failed:', failure);
  process.exit(1);
});
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Head-to-Head Benchmark Script
|
|
5
|
+
*
|
|
6
|
+
* Runs two parallel simulations on the exact same market conditions ("Fixed Seed").
|
|
7
|
+
* Compares a "Baseline" (random strategy) against a "Challenger" (momentum strategy, standing in for a smart LLM agent).
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* bun packages/training/scripts/run-baseline-comparison.ts
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
// Import initializeJsonMode to enable file-based DB for trajectory recording
|
|
14
|
+
// This prevents "Database not initialized" errors when saveTrajectory is true
|
|
15
|
+
import { initializeJsonMode } from '@elizaos/db';
|
|
16
|
+
import type { IAgentRuntime } from '@elizaos/core';
|
|
17
|
+
import { mkdirSync } from 'fs';
|
|
18
|
+
import * as path from 'path';
|
|
19
|
+
import { type BenchmarkConfig } from '../src/benchmark/BenchmarkDataGenerator';
|
|
20
|
+
import { BenchmarkRunner } from '../src/benchmark/BenchmarkRunner';
|
|
21
|
+
import { MetricsVisualizer } from '../src/benchmark/MetricsVisualizer';
|
|
22
|
+
import { logger } from '../src/utils/logger';
|
|
23
|
+
|
|
24
|
+
// Mock Agent Runtime for the runner structure
|
|
25
|
+
// Stand-in runtime handed to BenchmarkRunner: only character.settings.model is
// populated, and the object is force-cast to IAgentRuntime.
const mockRuntime = {
  character: { settings: { model: 'gpt-4-turbo' } },
} as unknown as IAgentRuntime;
|
|
32
|
+
|
|
33
|
+
/**
 * Runs the head-to-head benchmark.
 *
 * Creates a timestamped output directory under ./benchmark-results, switches the
 * database layer to JSON mode (best effort), runs BenchmarkRunner.runSingle twice
 * with the same generator configuration — once forcing the 'random' strategy and
 * once forcing 'momentum' — and passes both results to
 * MetricsVisualizer.generateComparisonReport. Exits the process with status 0
 * when everything has completed.
 */
async function main() {
  const rule =
    '═══════════════════════════════════════════════════════════════';
  console.log(rule);
  console.log(' 🥊 HEAD-TO-HEAD BENCHMARK: Random vs LLM Agent');
  console.log(`${rule}\n`);

  const runLabel = `h2h-${Date.now()}`;
  const outputDir = path.join(process.cwd(), 'benchmark-results', runLabel);
  mkdirSync(outputDir, { recursive: true });

  // 0. JSON-mode database, stored inside the run's own output directory.
  // A failure here is only logged: the runs below are still attempted.
  const dbPath = path.join(outputDir, 'db_storage');
  mkdirSync(dbPath, { recursive: true });
  try {
    initializeJsonMode(dbPath);
    logger.info(`Initialized JSON DB at ${dbPath}`);
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    logger.warn(
      'Could not initialize JSON DB mode. Trajectory recording might fail if no Postgres connection.',
      { error: reason }
    );
  }

  // 1. One generator configuration shared by both runs; the seed is fixed so
  // both runs are given identical generator input.
  const generatorConfig: BenchmarkConfig = {
    durationMinutes: 10,
    tickInterval: 1,
    numPredictionMarkets: 5,
    numPerpetualMarkets: 3,
    numAgents: 5,
    seed: 12345,
  };

  logger.info('Generating fixed benchmark snapshot...');

  // Options common to run A and run B.
  const sharedOptions = { generatorConfig, agentRuntime: mockRuntime };

  // 2. Run A: baseline, forced 'random' strategy, no trajectory recording.
  logger.info('>>> STARTING RUN A: BASELINE (RANDOM) <<<');
  const baselineResult = await BenchmarkRunner.runSingle({
    ...sharedOptions,
    agentUserId: 'baseline-agent',
    saveTrajectory: false,
    outputDir: path.join(outputDir, 'baseline'),
    forceStrategy: 'random',
  });

  // 3. Run B: challenger, forced 'momentum' strategy, trajectory recorded.
  logger.info('>>> STARTING RUN B: CHALLENGER (MOMENTUM/LLM) <<<');
  const challengerResult = await BenchmarkRunner.runSingle({
    ...sharedOptions,
    agentUserId: 'challenger-agent',
    saveTrajectory: true,
    outputDir: path.join(outputDir, 'challenger'),
    forceStrategy: 'momentum',
  });

  // 4. Side-by-side report written into the run's output directory.
  await MetricsVisualizer.generateComparisonReport(
    baselineResult,
    challengerResult,
    outputDir
  );

  console.log(`\n✅ Benchmark complete. Results saved to: ${outputDir}`);

  process.exit(0);
}
|
|
112
|
+
|
|
113
|
+
// Kick off the comparison; a rejected run is logged and ends the process with status 1.
main().catch((failure) => {
  console.error('Benchmark failed:', failure);
  process.exit(1);
});
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Full Training Pipeline Test
|
|
5
|
+
*
|
|
6
|
+
* This script runs the complete training pipeline end-to-end:
|
|
7
|
+
* 1. Initialize training package
|
|
8
|
+
* 2. Generate real trajectories (or use existing)
|
|
9
|
+
* 3. Score trajectories with LLM-as-judge
|
|
10
|
+
* 4. Export training data
|
|
11
|
+
* 5. Run archetype matchup benchmark
|
|
12
|
+
*
|
|
13
|
+
* Usage:
|
|
14
|
+
* bun run packages/training/scripts/run-full-pipeline.ts
|
|
15
|
+
*
|
|
16
|
+
* Options:
|
|
17
|
+
* --skip-generation Skip trajectory generation (use existing data)
|
|
18
|
+
* --skip-scoring Skip LLM scoring
|
|
19
|
+
* --archetypes Comma-separated archetypes (default: trader,researcher)
|
|
20
|
+
* --agents Agents per archetype (default: 2)
|
|
21
|
+
* --ticks Ticks per agent (default: 10)
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { count, db, eq, isNotNull, trajectories } from '@elizaos/db';
|
|
25
|
+
import { parseArgs } from 'util';
|
|
26
|
+
|
|
27
|
+
// Parse command line arguments
const { values } = parseArgs({
  args: process.argv.slice(2),
  options: {
    'skip-generation': { type: 'boolean', default: false },
    'skip-scoring': { type: 'boolean', default: false },
    archetypes: { type: 'string', default: 'trader,researcher' },
    agents: { type: 'string', default: '2' },
    ticks: { type: 'string', default: '10' },
    help: { type: 'boolean', short: 'h' },
  },
});

if (values.help) {
  console.log(`
Full Training Pipeline Test

Usage:
bun run packages/training/scripts/run-full-pipeline.ts [options]

Options:
--skip-generation Skip trajectory generation (use existing data)
--skip-scoring Skip LLM scoring
--archetypes Comma-separated archetypes (default: trader,researcher)
--agents Agents per archetype (default: 2)
--ticks Ticks per agent (default: 10)
-h, --help Show this help message
`);
  process.exit(0);
}

/**
 * Converts a numeric command-line option to a positive integer.
 *
 * Prints an error and exits with status 1 when the text is not an integer >= 1,
 * so that NaN or a non-positive count never reaches the pipeline.
 *
 * @param raw - Option text as supplied on the command line (or its default).
 * @param flag - Option name without the leading dashes, used in the error message.
 * @returns The parsed integer.
 */
function parsePositiveIntOption(raw: string, flag: string): number {
  const parsed = Number(raw);
  if (!Number.isInteger(parsed) || parsed < 1) {
    console.error(
      `Invalid value for --${flag}: "${raw}" (expected a positive integer).`
    );
    process.exit(1);
  }
  return parsed;
}

// Tolerate spaces around the commas and drop empty entries ("a, b," -> ["a", "b"]).
const archetypeNames = (values.archetypes as string)
  .split(',')
  .map((name) => name.trim())
  .filter((name) => name.length > 0);

if (archetypeNames.length === 0) {
  console.error(
    'Invalid value for --archetypes: expected at least one archetype name.'
  );
  process.exit(1);
}

// Effective settings for this run, derived from the command line.
const config = {
  skipGeneration: values['skip-generation'] as boolean,
  skipScoring: values['skip-scoring'] as boolean,
  archetypes: archetypeNames,
  agentsPerArchetype: parsePositiveIntOption(values.agents as string, 'agents'),
  ticksPerAgent: parsePositiveIntOption(values.ticks as string, 'ticks'),
};

console.log('═══════════════════════════════════════════════════════════════');
console.log(' Babylon Full Training Pipeline');
console.log('═══════════════════════════════════════════════════════════════');
console.log(` Archetypes: ${config.archetypes.join(', ')}`);
console.log(` Agents per archetype: ${config.agentsPerArchetype}`);
console.log(` Ticks per agent: ${config.ticksPerAgent}`);
console.log(` Skip generation: ${config.skipGeneration}`);
console.log(` Skip scoring: ${config.skipScoring}`);
console.log(
  '═══════════════════════════════════════════════════════════════\n'
);
|
|
77
|
+
|
|
78
|
+
/**
 * Runs the pipeline steps in order and prints progress to the console.
 *
 *   1. Verify the database is reachable (exits with status 1 if it is not).
 *   2. Initialize the training package (a failure is reported, then ignored).
 *   3. Generate trajectories unless --skip-generation was given.
 *   4. Score trajectories unless --skip-scoring was given.
 *   5. Run the archetype matchup benchmark.
 *   6. Report how many scored trajectories are available for export.
 *
 * Steps 2-6 each catch their own errors, print them and let the following steps
 * run, so a single failing step does not abort the whole pipeline.
 */
async function runPipeline() {
  const startTime = Date.now();

  // Step 1: Check database connection
  console.log('Step 1: Checking database connection...');
  try {
    const result = await db.select({ count: count() }).from(trajectories);
    console.log(
      ` ✅ Database connected. ${result[0]?.count || 0} existing trajectories.\n`
    );
  } catch (error) {
    console.log(` ❌ Database connection failed: ${error}`);
    console.log(' Make sure DATABASE_URL is set correctly.\n');
    process.exit(1);
  }

  // Step 2: Initialize training package
  console.log('Step 2: Initializing training package...');
  try {
    const { initializeTrainingPackage } = await import('../src/init-training');
    await initializeTrainingPackage();
    console.log(' ✅ Training package initialized.\n');
  } catch (error) {
    console.log(` ⚠️ Training package initialization failed: ${error}`);
    console.log(' Will continue with limited functionality.\n');
  }

  // Step 3: Generate trajectories
  if (!config.skipGeneration) {
    console.log('Step 3: Generating real trajectories...');
    try {
      const { TrajectoryGenerator } = await import(
        '../src/generation/TrajectoryGenerator'
      );

      // Use the most recently created user as the manager. Nothing is created
      // here: with no users at all, generation is skipped.
      const { users, desc } = await import('@elizaos/db');
      const managerResult = await db
        .select({ id: users.id })
        .from(users)
        .orderBy(desc(users.createdAt))
        .limit(1);

      if (managerResult.length === 0) {
        console.log(' ⚠️ No users found in database. Skipping generation.');
        console.log(' Create a user first or use --skip-generation.\n');
      } else {
        const managerId = managerResult[0].id;

        const generator = new TrajectoryGenerator({
          archetypes: config.archetypes,
          agentsPerArchetype: config.agentsPerArchetype,
          ticksPerAgent: config.ticksPerAgent,
          parallelAgents: 3,
          recordTrajectories: true,
          managerId,
        });

        try {
          const result = await generator.generate();
          console.log(
            ` ✅ Generated ${result.trajectoryIds.length} trajectories.`
          );
          console.log(` Agents created: ${result.agentsCreated.length}`);
          console.log(` Duration: ${result.duration}ms\n`);
        } finally {
          // Cleanup test agents, also when generate() throws part-way through,
          // so a failed run does not leave its agents behind.
          await generator.cleanup();
        }
      }
    } catch (error) {
      console.log(` ❌ Generation failed: ${error}`);
      console.log(
        ' Make sure the server is running or use --skip-generation.\n'
      );
    }
  } else {
    console.log(
      'Step 3: Skipping trajectory generation (--skip-generation).\n'
    );
  }

  // Step 4: Score trajectories
  if (!config.skipScoring) {
    console.log('Step 4: Scoring trajectories with LLM-as-judge...');
    try {
      const { archetypeScoringService } = await import(
        '../src/scoring/ArchetypeScoringService'
      );

      // Trajectories flagged as training data (scored or not).
      const trainingCount = await db
        .select({ count: count() })
        .from(trajectories)
        .where(eq(trajectories.isTrainingData, true));

      // Trajectories that already carry an AI-judge reward.
      const scoredCount = await db
        .select({ count: count() })
        .from(trajectories)
        .where(isNotNull(trajectories.aiJudgeReward));

      console.log(` Training trajectories: ${trainingCount[0]?.count || 0}`);
      console.log(` Already scored: ${scoredCount[0]?.count || 0}`);

      // Score a batch of unscored trajectories.
      // NOTE(review): the arguments are fixed ('default', 10) and do not depend
      // on --archetypes — confirm that this is intended.
      const result = await archetypeScoringService.scoreUnscoredTrajectories(
        'default',
        10
      );
      console.log(
        ` ✅ Scored ${result.scored} trajectories (${result.errors} errors).\n`
      );
    } catch (error) {
      console.log(` ❌ Scoring failed: ${error}`);
      console.log(' Make sure GROQ_API_KEY is set.\n');
    }
  } else {
    console.log('Step 4: Skipping scoring (--skip-scoring).\n');
  }

  // Step 5: Run archetype matchup benchmark
  console.log('Step 5: Running archetype matchup benchmark...');
  try {
    const { ArchetypeMatchupBenchmark } = await import(
      '../src/benchmark/ArchetypeMatchupBenchmark'
    );

    // Only the archetype list comes from the command line; every other
    // benchmark setting is fixed here and is independent of --agents/--ticks.
    const benchmark = new ArchetypeMatchupBenchmark({
      archetypes: config.archetypes,
      agentsPerArchetype: 2,
      rounds: 3,
      ticksPerRound: 50,
      marketConditions: ['bull', 'bear'],
      availableVramGb: 16,
    });

    const results = await benchmark.run();

    console.log(` ✅ Benchmark complete.`);
    for (const result of results) {
      console.log(` ${result.marketCondition.toUpperCase()} market:`);
      // Print at most the first three ranking entries per market condition.
      const top3 = result.archetypeRankings.slice(0, 3);
      for (const r of top3) {
        console.log(
          ` ${r.avgRank.toFixed(1)}. ${r.archetype} (avg PnL: ${r.avgPnl.toFixed(2)})`
        );
      }
    }
    console.log('');
  } catch (error) {
    console.log(` ❌ Benchmark failed: ${error}\n`);
  }

  // Step 6: Export training data (only reports readiness; nothing is exported here)
  console.log('Step 6: Checking training data export...');
  try {
    const scoredResult = await db
      .select({ count: count() })
      .from(trajectories)
      .where(isNotNull(trajectories.aiJudgeReward));

    const scored = scoredResult[0]?.count || 0;
    if (scored > 0) {
      console.log(` ✅ ${scored} trajectories ready for export.`);
      console.log(' Run "babylon train export" to export training data.\n');
    } else {
      console.log(' ⚠️ No scored trajectories available for export.');
      console.log(' Generate and score trajectories first.\n');
    }
  } catch (error) {
    console.log(` ❌ Export check failed: ${error}\n`);
  }

  // Summary
  const totalTime = Date.now() - startTime;
  const banner =
    '═══════════════════════════════════════════════════════════════';
  console.log(banner);
  console.log(' Pipeline Complete');
  console.log(banner);
  console.log(` Total time: ${(totalTime / 1000).toFixed(1)}s`);
  console.log('');
  console.log(' Next steps:');
  console.log(' 1. Export data: babylon train export');
  console.log(' 2. Train models: python python/scripts/run_full_pipeline.py');
  console.log(' 3. Benchmark: babylon train benchmark');
  console.log(banner);
}
|
|
268
|
+
|
|
269
|
+
// Start the pipeline; an error that escapes runPipeline is logged and ends the process with status 1.
runPipeline().catch((failure) => {
  console.error('Pipeline failed:', failure);
  process.exit(1);
});
|