@elizaos/training 2.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. package/Dockerfile +75 -0
  2. package/LICENSE +21 -0
  3. package/Makefile +374 -0
  4. package/README.md +346 -0
  5. package/config/rubrics.json +137 -0
  6. package/docker-compose.test.yml +57 -0
  7. package/package.json +57 -0
  8. package/python/config/babylon_atropos.yaml +90 -0
  9. package/python/config/profiles/12gb.json +11 -0
  10. package/python/config/profiles/16gb.json +10 -0
  11. package/python/config/profiles/24gb.json +10 -0
  12. package/python/config/profiles/48gb.json +10 -0
  13. package/python/config/profiles/cpu.json +11 -0
  14. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  15. package/python/config/profiles/l40-2gpu.json +22 -0
  16. package/python/config/profiles/l40-4gpu.json +21 -0
  17. package/python/config/profiles/l40.json +17 -0
  18. package/python/config/tinker_training.yaml +143 -0
  19. package/python/curriculum_state.json +165 -0
  20. package/python/env.template +86 -0
  21. package/python/env.training.template +46 -0
  22. package/python/pyproject.toml +41 -0
  23. package/python/requirements-ci.txt +31 -0
  24. package/python/requirements.txt +87 -0
  25. package/python/scripts/__init__.py +4 -0
  26. package/python/scripts/benchmark_should_respond.py +190 -0
  27. package/python/scripts/debug_inference.py +62 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/optimize_prompt_grpo.py +269 -0
  36. package/python/scripts/run_ab_test.py +143 -0
  37. package/python/scripts/run_full_pipeline.py +544 -0
  38. package/python/scripts/run_tinker_training.py +192 -0
  39. package/python/scripts/run_training.py +914 -0
  40. package/python/scripts/test_generation.py +29 -0
  41. package/python/scripts/test_judge.py +155 -0
  42. package/python/scripts/test_pipeline.py +356 -0
  43. package/python/scripts/test_trained_model.py +380 -0
  44. package/python/scripts/train_grpo.py +360 -0
  45. package/python/scripts/train_jsonl.py +223 -0
  46. package/python/scripts/train_local.py +528 -0
  47. package/python/setup.py +20 -0
  48. package/python/src/__init__.py +190 -0
  49. package/python/src/data_bridge/__init__.py +24 -0
  50. package/python/src/data_bridge/converter.py +435 -0
  51. package/python/src/data_bridge/reader.py +393 -0
  52. package/python/src/models.py +283 -0
  53. package/python/src/training/__init__.py +605 -0
  54. package/python/src/training/ab_testing.py +404 -0
  55. package/python/src/training/action_executor.py +621 -0
  56. package/python/src/training/archetype_trainer.py +347 -0
  57. package/python/src/training/atropos_trainer.py +980 -0
  58. package/python/src/training/babylon_env.py +1254 -0
  59. package/python/src/training/error_recovery.py +647 -0
  60. package/python/src/training/evaluation.py +856 -0
  61. package/python/src/training/fast_simulator.py +880 -0
  62. package/python/src/training/format_validator.py +584 -0
  63. package/python/src/training/hybrid_env.py +522 -0
  64. package/python/src/training/kl_controller.py +628 -0
  65. package/python/src/training/multi_prompt_dataset.py +883 -0
  66. package/python/src/training/multi_turn.py +656 -0
  67. package/python/src/training/online_env.py +1084 -0
  68. package/python/src/training/quality_scorer.py +391 -0
  69. package/python/src/training/quality_utils.py +633 -0
  70. package/python/src/training/rewards.py +1344 -0
  71. package/python/src/training/rlaif_env.py +17 -0
  72. package/python/src/training/rollout_generator.py +502 -0
  73. package/python/src/training/rubric_loader.py +198 -0
  74. package/python/src/training/scenario_pool.py +1072 -0
  75. package/python/src/training/schemas.py +481 -0
  76. package/python/src/training/service_manager.py +552 -0
  77. package/python/src/training/simulation_bridge.py +535 -0
  78. package/python/src/training/tick_reward_attribution.py +399 -0
  79. package/python/src/training/tinker_client.py +575 -0
  80. package/python/src/training/tinker_trainer.py +646 -0
  81. package/python/src/training/tokenization_utils.py +402 -0
  82. package/python/tests/e2e/__init__.py +13 -0
  83. package/python/tests/e2e/conftest.py +258 -0
  84. package/python/tests/e2e/test_full_pipeline.py +643 -0
  85. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  86. package/python/tests/integration/__init__.py +12 -0
  87. package/python/tests/integration/conftest.py +383 -0
  88. package/python/tests/integration/test_db_integration.py +649 -0
  89. package/python/tests/integration/test_json_mode_integration.py +554 -0
  90. package/python/tests/test_action_executor.py +594 -0
  91. package/python/tests/test_archetype_scoring.py +1027 -0
  92. package/python/tests/test_atropos_integration.py +360 -0
  93. package/python/tests/test_evaluation.py +727 -0
  94. package/python/tests/test_format_validator.py +486 -0
  95. package/python/tests/test_kl_controller.py +432 -0
  96. package/python/tests/test_lr_scheduler.py +579 -0
  97. package/python/tests/test_multi_turn.py +590 -0
  98. package/python/tests/test_online_env.py +519 -0
  99. package/python/tests/test_quality_scorer.py +474 -0
  100. package/python/tests/test_scenario_pool.py +735 -0
  101. package/python/tests/test_service_manager.py +585 -0
  102. package/python/tests/test_simulation_rollout.py +581 -0
  103. package/python/tests/test_tokenization_utils.py +501 -0
  104. package/python/tests/test_training_orchestrator.py +497 -0
  105. package/python/tests/test_training_output_structure.py +661 -0
  106. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  107. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  108. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  109. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  110. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  111. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  112. package/research-output/training-runs/training-run-1771276293257.json +38 -0
  113. package/research-output/training-runs/training-run-1771276389280.json +38 -0
  114. package/research-output/training-runs/training-run-1771276502776.json +38 -0
  115. package/research-output/training-runs/training-run-1771277340748.json +38 -0
  116. package/research-output/training-runs/training-run-1773013658993.json +38 -0
  117. package/research-output/training-runs/training-run-1773013861014.json +38 -0
  118. package/research-output/training-runs/training-run-1773014215983.json +38 -0
  119. package/scripts/assess-training-data.ts +422 -0
  120. package/scripts/e2e-training-test.ts +550 -0
  121. package/scripts/export-rubrics.ts +64 -0
  122. package/scripts/generate-research-report.ts +1523 -0
  123. package/scripts/generate_dataset.sh +173 -0
  124. package/scripts/generate_should_respond.ts +267 -0
  125. package/scripts/generate_should_respond_dataset.ts +162 -0
  126. package/scripts/json-mode-benchmark.ts +399 -0
  127. package/scripts/rank_trajectories.ts +207 -0
  128. package/scripts/real-archetype-benchmark.ts +210 -0
  129. package/scripts/run-baseline-comparison.ts +116 -0
  130. package/scripts/run-full-pipeline.ts +272 -0
  131. package/scripts/run_rlaif_loop.ts +78 -0
  132. package/scripts/run_task_benchmark.ts +247 -0
  133. package/scripts/runpod_setup.sh +137 -0
  134. package/scripts/runpod_validate.sh +147 -0
  135. package/scripts/test-model-in-game.ts +955 -0
  136. package/scripts/test-scoring.ts +73 -0
  137. package/scripts/test-trained-model.ts +209 -0
  138. package/scripts/train-and-test.ts +824 -0
  139. package/scripts/verify-final.ts +118 -0
  140. package/src/adapter.ts +516 -0
  141. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  142. package/src/archetypes/derive-archetype.ts +249 -0
  143. package/src/archetypes/index.ts +22 -0
  144. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  145. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  146. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  147. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  148. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  149. package/src/benchmark/BenchmarkRunner.ts +685 -0
  150. package/src/benchmark/BenchmarkValidator.ts +204 -0
  151. package/src/benchmark/FastEvalRunner.ts +225 -0
  152. package/src/benchmark/MetricsValidator.ts +165 -0
  153. package/src/benchmark/MetricsVisualizer.ts +909 -0
  154. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  155. package/src/benchmark/ModelRegistry.ts +158 -0
  156. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  157. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  158. package/src/benchmark/SimulationEngine.ts +832 -0
  159. package/src/benchmark/TaskRunner.ts +94 -0
  160. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  161. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  162. package/src/benchmark/index.ts +91 -0
  163. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  164. package/src/benchmark/simulation-types.ts +78 -0
  165. package/src/dependencies.ts +475 -0
  166. package/src/generation/TrajectoryGenerator.ts +387 -0
  167. package/src/generation/index.ts +12 -0
  168. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  169. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  170. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  171. package/src/huggingface/index.ts +27 -0
  172. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  173. package/src/index.ts +102 -0
  174. package/src/init-training.ts +53 -0
  175. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  176. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  177. package/src/metrics/index.ts +8 -0
  178. package/src/metrics/types.ts +200 -0
  179. package/src/rubrics/__tests__/index.test.ts +184 -0
  180. package/src/rubrics/ass-kisser.ts +85 -0
  181. package/src/rubrics/degen.ts +80 -0
  182. package/src/rubrics/goody-twoshoes.ts +84 -0
  183. package/src/rubrics/index.ts +236 -0
  184. package/src/rubrics/information-trader.ts +84 -0
  185. package/src/rubrics/infosec.ts +101 -0
  186. package/src/rubrics/liar.ts +104 -0
  187. package/src/rubrics/perps-trader.ts +87 -0
  188. package/src/rubrics/researcher.ts +81 -0
  189. package/src/rubrics/scammer.ts +82 -0
  190. package/src/rubrics/social-butterfly.ts +73 -0
  191. package/src/rubrics/super-predictor.ts +97 -0
  192. package/src/rubrics/trader.ts +67 -0
  193. package/src/scoring/ArchetypeScoringService.ts +486 -0
  194. package/src/scoring/JudgePromptBuilder.ts +556 -0
  195. package/src/scoring/LLMJudgeCache.ts +401 -0
  196. package/src/scoring/index.ts +9 -0
  197. package/src/training/AutomationPipeline.ts +916 -0
  198. package/src/training/BenchmarkService.ts +518 -0
  199. package/src/training/ConfigValidator.ts +220 -0
  200. package/src/training/MarketOutcomesTracker.ts +187 -0
  201. package/src/training/ModelDeployer.ts +186 -0
  202. package/src/training/ModelFetcher.ts +76 -0
  203. package/src/training/ModelSelectionService.ts +341 -0
  204. package/src/training/ModelUsageVerifier.ts +160 -0
  205. package/src/training/MultiModelOrchestrator.ts +580 -0
  206. package/src/training/RLModelConfig.ts +407 -0
  207. package/src/training/RewardBackpropagationService.ts +149 -0
  208. package/src/training/RulerScoringService.ts +666 -0
  209. package/src/training/TrainingMonitor.ts +166 -0
  210. package/src/training/TrajectoryRecorder.ts +399 -0
  211. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  212. package/src/training/index.ts +100 -0
  213. package/src/training/logRLConfig.ts +34 -0
  214. package/src/training/pipeline.ts +129 -0
  215. package/src/training/storage/ModelStorageService.ts +279 -0
  216. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  217. package/src/training/storage/index.ts +17 -0
  218. package/src/training/types.ts +207 -0
  219. package/src/training/window-utils.ts +138 -0
  220. package/src/utils/index.ts +101 -0
  221. package/src/utils/logger.ts +59 -0
  222. package/src/utils/snowflake.ts +17 -0
  223. package/src/utils/synthetic-detector.ts +111 -0
  224. package/tsconfig.json +20 -0
@@ -0,0 +1,824 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * Automated Training and Game Testing Pipeline
5
+ *
6
+ * This script automates the complete training and testing workflow:
7
+ * 1. Train a model locally (MLX on Mac, CUDA on Linux/Windows)
8
+ * 2. Test the trained adapter
9
+ * 3. Import to Ollama
10
+ * 4. Run game tests - actual trades on markets
11
+ *
12
+ * Run: bun run packages/training/scripts/train-and-test.ts
13
+ *
14
+ * Options:
15
+ * --skip-training Skip model training (use existing adapter)
16
+ * --skip-test Skip the game testing after training
17
+ * --adapter-path Path to existing adapter (default: auto-detect)
18
+ * --ticks <n> Number of game ticks (default: 100)
19
+ * --archetype <type> Agent archetype (default: trader)
20
+ * --verbose Enable verbose logging
21
+ */
22
+
23
+ import { type Subprocess, spawn } from 'bun';
24
+ import { existsSync, mkdirSync, writeFileSync } from 'fs';
25
+ import { dirname, join, resolve } from 'path';
26
+ import { fileURLToPath } from 'url';
27
+ import { parseArgs } from 'util';
28
+
29
+ const SCRIPT_DIR = dirname(fileURLToPath(import.meta.url));
30
+ const REPO_ROOT = resolve(SCRIPT_DIR, '../../..');
31
+ const TRAINING_DIR = resolve(SCRIPT_DIR, '../python');
32
+
33
+ // Configuration
34
+ interface PipelineConfig {
35
+ skipTraining: boolean;
36
+ skipBenchmark: boolean;
37
+ adapterPath?: string;
38
+ ticks: number;
39
+ archetype: string;
40
+ verbose: boolean;
41
+ }
42
+
43
+ interface StepResult {
44
+ name: string;
45
+ success: boolean;
46
+ duration: number;
47
+ message: string;
48
+ details?: Record<string, unknown>;
49
+ }
50
+
51
+ const results: StepResult[] = [];
52
+
53
+ // Show help
54
+ function showHelp(): void {
55
+ console.log(`
56
+ ElizaOS Automated Training & Game Testing Pipeline
57
+
58
+ Usage: bun run packages/training/scripts/train-and-test.ts [options]
59
+
60
+ Options:
61
+ --skip-training Skip model training (use existing adapter)
62
+ --skip-test Skip the game testing after training
63
+ --adapter-path Path to existing adapter (default: auto-detect)
64
+ --ticks <n> Number of game ticks to run (default: 100)
65
+ --archetype <type> Agent archetype (default: trader)
66
+ --verbose Enable verbose logging
67
+ --help Show this help message
68
+
69
+ Examples:
70
+ # Full pipeline (train + test)
71
+ bun run packages/training/scripts/train-and-test.ts
72
+
73
+ # Skip training, test existing model
74
+ bun run packages/training/scripts/train-and-test.ts --skip-training
75
+
76
+ # Train only, no testing
77
+ bun run packages/training/scripts/train-and-test.ts --skip-test
78
+
79
+ # Run 500 game ticks with verbose output
80
+ bun run packages/training/scripts/train-and-test.ts --ticks 500 --verbose
81
+ `);
82
+ }
83
+
84
+ // Parse command line arguments
85
+ function parseConfig(): PipelineConfig {
86
+ // Check for help flag first
87
+ if (process.argv.includes('--help') || process.argv.includes('-h')) {
88
+ showHelp();
89
+ process.exit(0);
90
+ }
91
+
92
+ const { values } = parseArgs({
93
+ args: process.argv.slice(2),
94
+ options: {
95
+ 'skip-training': { type: 'boolean', default: false },
96
+ 'skip-test': { type: 'boolean', default: false },
97
+ 'adapter-path': { type: 'string' },
98
+ ticks: { type: 'string', default: '100' },
99
+ archetype: { type: 'string', default: 'trader' },
100
+ verbose: { type: 'boolean', default: false },
101
+ },
102
+ });
103
+
104
+ return {
105
+ skipTraining: values['skip-training'] ?? false,
106
+ skipBenchmark: values['skip-test'] ?? false,
107
+ adapterPath: values['adapter-path'],
108
+ ticks: parseInt(values.ticks ?? '100', 10),
109
+ archetype: values.archetype ?? 'trader',
110
+ verbose: values.verbose ?? false,
111
+ };
112
+ }
113
+
114
+ // Utility to run a command and capture output
115
+ async function runCommand(
116
+ command: string[],
117
+ options: {
118
+ cwd?: string;
119
+ timeout?: number;
120
+ env?: Record<string, string>;
121
+ } = {}
122
+ ): Promise<{ success: boolean; output: string; exitCode: number }> {
123
+ const { cwd, timeout = 600000, env } = options;
124
+
125
+ try {
126
+ const proc = spawn(command, {
127
+ cwd,
128
+ stdout: 'pipe',
129
+ stderr: 'pipe',
130
+ env: { ...process.env, ...env },
131
+ });
132
+
133
+ const timeoutPromise = new Promise<never>((_, reject) => {
134
+ setTimeout(() => reject(new Error('Command timed out')), timeout);
135
+ });
136
+
137
+ const [exitCode, stdout, stderr] = await Promise.race([
138
+ Promise.all([
139
+ proc.exited,
140
+ new Response(proc.stdout).text(),
141
+ new Response(proc.stderr).text(),
142
+ ]),
143
+ timeoutPromise,
144
+ ]);
145
+
146
+ const output = stdout + stderr;
147
+ return { success: exitCode === 0, output, exitCode };
148
+ } catch (error) {
149
+ return {
150
+ success: false,
151
+ output: error instanceof Error ? error.message : String(error),
152
+ exitCode: -1,
153
+ };
154
+ }
155
+ }
156
+
157
+ // Run a pipeline step with logging
158
+ async function runStep(
159
+ name: string,
160
+ fn: () => Promise<{
161
+ success: boolean;
162
+ message: string;
163
+ details?: Record<string, unknown>;
164
+ }>
165
+ ): Promise<boolean> {
166
+ const start = Date.now();
167
+ console.log(`\n${'═'.repeat(60)}`);
168
+ console.log(` STEP: ${name}`);
169
+ console.log(`${'═'.repeat(60)}\n`);
170
+
171
+ try {
172
+ const result = await fn();
173
+ const duration = Date.now() - start;
174
+
175
+ results.push({ name, ...result, duration });
176
+
177
+ if (result.success) {
178
+ console.log(`\n✅ ${name} completed in ${(duration / 1000).toFixed(1)}s`);
179
+ console.log(` ${result.message}`);
180
+ } else {
181
+ console.log(`\n❌ ${name} failed after ${(duration / 1000).toFixed(1)}s`);
182
+ console.log(` ${result.message}`);
183
+ }
184
+
185
+ return result.success;
186
+ } catch (error) {
187
+ const duration = Date.now() - start;
188
+ const message = error instanceof Error ? error.message : String(error);
189
+
190
+ results.push({ name, success: false, duration, message });
191
+ console.log(`\n💥 ${name} errored after ${(duration / 1000).toFixed(1)}s`);
192
+ console.log(` ${message}`);
193
+
194
+ return false;
195
+ }
196
+ }
197
+
198
+ // Step 1: Check prerequisites
199
+ async function checkPrerequisites(config: PipelineConfig): Promise<{
200
+ success: boolean;
201
+ message: string;
202
+ details?: Record<string, unknown>;
203
+ }> {
204
+ const checks: Record<string, boolean> = {};
205
+
206
+ // Check Python
207
+ console.log('Checking Python...');
208
+ const pythonResult = await runCommand(['python3', '--version']);
209
+ checks.python = pythonResult.success;
210
+ if (pythonResult.success) {
211
+ console.log(` ✓ Python: ${pythonResult.output.trim()}`);
212
+ } else {
213
+ console.log(' ✗ Python not found');
214
+ }
215
+
216
+ // Check if training directory exists
217
+ checks.trainingDir = existsSync(TRAINING_DIR);
218
+ console.log(
219
+ checks.trainingDir
220
+ ? ` ✓ Training directory exists`
221
+ : ` ✗ Training directory not found: ${TRAINING_DIR}`
222
+ );
223
+
224
+ // Check for MLX (macOS) or CUDA
225
+ const platform = process.platform;
226
+ if (platform === 'darwin') {
227
+ console.log('Checking MLX (macOS)...');
228
+ const mlxResult = await runCommand([
229
+ 'python3',
230
+ '-c',
231
+ 'import mlx; print(mlx.__version__)',
232
+ ]);
233
+ checks.mlx = mlxResult.success;
234
+ console.log(
235
+ mlxResult.success
236
+ ? ` ✓ MLX: ${mlxResult.output.trim()}`
237
+ : ' ⚠ MLX not installed (will be installed during training)'
238
+ );
239
+ } else {
240
+ console.log('Checking CUDA...');
241
+ const cudaResult = await runCommand([
242
+ 'python3',
243
+ '-c',
244
+ 'import torch; print(torch.cuda.is_available())',
245
+ ]);
246
+ checks.cuda = cudaResult.success && cudaResult.output.includes('True');
247
+ console.log(
248
+ checks.cuda
249
+ ? ' ✓ CUDA available'
250
+ : ' ⚠ CUDA not available (will use CPU)'
251
+ );
252
+ }
253
+
254
+ // Check Ollama
255
+ console.log('Checking Ollama...');
256
+ const ollamaResult = await runCommand(['which', 'ollama']);
257
+ checks.ollama = ollamaResult.success;
258
+ console.log(
259
+ ollamaResult.success
260
+ ? ` ✓ Ollama: ${ollamaResult.output.trim()}`
261
+ : ' ✗ Ollama not installed'
262
+ );
263
+
264
+ if (!checks.ollama) {
265
+ console.log('\n 📦 To install Ollama:');
266
+ console.log(' macOS: brew install ollama');
267
+ console.log(' Linux: curl -fsSL https://ollama.ai/install.sh | sh');
268
+ }
269
+
270
+ const allPassed = checks.python && checks.trainingDir;
271
+ const ollamaRequired = !config.skipBenchmark;
272
+
273
+ return {
274
+ success: allPassed && (!ollamaRequired || checks.ollama),
275
+ message: allPassed
276
+ ? `Prerequisites satisfied (Ollama: ${checks.ollama ? 'yes' : 'no'})`
277
+ : 'Missing required prerequisites',
278
+ details: checks,
279
+ };
280
+ }
281
+
282
+ // Step 2: Install Python dependencies
283
+ async function installDependencies(): Promise<{
284
+ success: boolean;
285
+ message: string;
286
+ details?: Record<string, unknown>;
287
+ }> {
288
+ const trainingDir = TRAINING_DIR;
289
+
290
+ console.log('Installing Python dependencies...');
291
+
292
+ // Check if requirements.txt exists
293
+ const requirementsPath = join(trainingDir, 'requirements.txt');
294
+ if (!existsSync(requirementsPath)) {
295
+ return {
296
+ success: false,
297
+ message: `requirements.txt not found at ${requirementsPath}`,
298
+ };
299
+ }
300
+
301
+ const result = await runCommand(
302
+ ['pip', 'install', '-r', 'requirements.txt', '--quiet'],
303
+ { cwd: trainingDir, timeout: 300000 }
304
+ );
305
+
306
+ if (!result.success) {
307
+ console.log('pip install output:', result.output);
308
+ }
309
+
310
+ return {
311
+ success: result.success,
312
+ message: result.success
313
+ ? 'Python dependencies installed'
314
+ : `Failed to install dependencies: ${result.output.slice(0, 200)}`,
315
+ };
316
+ }
317
+
318
+ // Step 3: Train model
319
+ async function trainModel(config: PipelineConfig): Promise<{
320
+ success: boolean;
321
+ message: string;
322
+ details?: Record<string, unknown>;
323
+ }> {
324
+ if (config.skipTraining) {
325
+ return {
326
+ success: true,
327
+ message: 'Training skipped (--skip-training)',
328
+ };
329
+ }
330
+
331
+ const trainingDir = TRAINING_DIR;
332
+ const backend = process.platform === 'darwin' ? 'mlx' : 'cuda';
333
+
334
+ console.log(`Training model with ${backend} backend...`);
335
+ console.log('This may take several minutes...\n');
336
+
337
+ const result = await runCommand(
338
+ [
339
+ 'python3',
340
+ 'scripts/train_local.py',
341
+ '--backend',
342
+ backend,
343
+ '--archetype',
344
+ config.archetype,
345
+ ],
346
+ {
347
+ cwd: trainingDir,
348
+ timeout: 3600000, // 1 hour
349
+ }
350
+ );
351
+
352
+ // Stream output for visibility
353
+ if (config.verbose) {
354
+ console.log(result.output);
355
+ }
356
+
357
+ // Find the adapter path
358
+ const adapterDir = join(trainingDir, 'trained_models/local/adapters');
359
+ const adapterExists = existsSync(adapterDir);
360
+
361
+ return {
362
+ success: result.success && adapterExists,
363
+ message: result.success
364
+ ? `Model trained successfully (adapter: ${adapterDir})`
365
+ : `Training failed: ${result.output.slice(-500)}`,
366
+ details: {
367
+ backend,
368
+ adapterPath: adapterDir,
369
+ exitCode: result.exitCode,
370
+ },
371
+ };
372
+ }
373
+
374
+ // Step 4: Test trained adapter
375
+ async function testAdapter(config: PipelineConfig): Promise<{
376
+ success: boolean;
377
+ message: string;
378
+ details?: Record<string, unknown>;
379
+ }> {
380
+ const trainingDir = TRAINING_DIR;
381
+
382
+ // Find adapter path
383
+ let adapterPath = config.adapterPath;
384
+ if (!adapterPath) {
385
+ const defaultPath = join(trainingDir, 'trained_models/local/adapters');
386
+ if (existsSync(defaultPath)) {
387
+ adapterPath = defaultPath;
388
+ }
389
+ }
390
+
391
+ if (!adapterPath || !existsSync(adapterPath)) {
392
+ return {
393
+ success: false,
394
+ message: `Adapter not found at: ${adapterPath || 'not specified'}`,
395
+ };
396
+ }
397
+
398
+ console.log(`Testing adapter at: ${adapterPath}`);
399
+
400
+ const result = await runCommand(
401
+ [
402
+ 'python3',
403
+ 'scripts/test_trained_model.py',
404
+ '--adapter-path',
405
+ adapterPath,
406
+ '--validate',
407
+ ],
408
+ {
409
+ cwd: trainingDir,
410
+ timeout: 300000,
411
+ }
412
+ );
413
+
414
+ if (config.verbose) {
415
+ console.log(result.output);
416
+ }
417
+
418
+ return {
419
+ success: result.success,
420
+ message: result.success
421
+ ? 'Adapter validation passed'
422
+ : `Adapter test failed: ${result.output.slice(-300)}`,
423
+ details: { adapterPath },
424
+ };
425
+ }
426
+
427
+ // Step 5: Start Ollama
428
+ let ollamaProcess: Subprocess | null = null;
429
+
430
+ async function startOllama(): Promise<{
431
+ success: boolean;
432
+ message: string;
433
+ details?: Record<string, unknown>;
434
+ }> {
435
+ // Check if already running
436
+ try {
437
+ const response = await fetch('http://localhost:11434/api/tags', {
438
+ signal: AbortSignal.timeout(3000),
439
+ });
440
+ if (response.ok) {
441
+ return {
442
+ success: true,
443
+ message: 'Ollama already running',
444
+ };
445
+ }
446
+ } catch {
447
+ // Not running, start it
448
+ }
449
+
450
+ console.log('Starting Ollama server...');
451
+
452
+ ollamaProcess = spawn(['ollama', 'serve'], {
453
+ stdout: 'ignore',
454
+ stderr: 'ignore',
455
+ });
456
+
457
+ // Wait for Ollama to be ready
458
+ for (let i = 0; i < 30; i++) {
459
+ await new Promise((resolve) => setTimeout(resolve, 1000));
460
+
461
+ try {
462
+ const response = await fetch('http://localhost:11434/api/tags', {
463
+ signal: AbortSignal.timeout(2000),
464
+ });
465
+ if (response.ok) {
466
+ return {
467
+ success: true,
468
+ message: 'Ollama started successfully',
469
+ details: { startedByUs: true },
470
+ };
471
+ }
472
+ } catch {
473
+ // Keep waiting
474
+ }
475
+
476
+ if (i % 5 === 4) {
477
+ console.log(` Waiting for Ollama... (${i + 1}s)`);
478
+ }
479
+ }
480
+
481
+ return {
482
+ success: false,
483
+ message: 'Ollama failed to start within 30 seconds',
484
+ };
485
+ }
486
+
487
+ // Step 6: Import adapter to Ollama
488
+ async function importToOllama(config: PipelineConfig): Promise<{
489
+ success: boolean;
490
+ message: string;
491
+ details?: Record<string, unknown>;
492
+ }> {
493
+ const trainingDir = TRAINING_DIR;
494
+
495
+ // Find adapter path
496
+ let adapterPath = config.adapterPath;
497
+ if (!adapterPath) {
498
+ const defaultPath = join(trainingDir, 'trained_models/local/adapters');
499
+ if (existsSync(defaultPath)) {
500
+ adapterPath = defaultPath;
501
+ }
502
+ }
503
+
504
+ if (!adapterPath || !existsSync(adapterPath)) {
505
+ return {
506
+ success: false,
507
+ message: `Adapter not found: ${adapterPath}`,
508
+ };
509
+ }
510
+
511
+ const modelName = `babylon-${config.archetype}:latest`;
512
+ const baseModel = process.env.OLLAMA_BASE_MODEL || 'qwen2.5:7b-instruct';
513
+
514
+ console.log(`Importing adapter as ${modelName}...`);
515
+ console.log(` Base model: ${baseModel}`);
516
+ console.log(` Adapter: ${adapterPath}`);
517
+
518
+ // Create Modelfile
519
+ const modelfile = `FROM ${baseModel}
520
+ ADAPTER ${adapterPath}
521
+ PARAMETER temperature 0.7
522
+ PARAMETER num_predict 8192
523
+ `;
524
+
525
+ try {
526
+ const response = await fetch('http://localhost:11434/api/create', {
527
+ method: 'POST',
528
+ headers: { 'Content-Type': 'application/json' },
529
+ body: JSON.stringify({
530
+ name: modelName,
531
+ modelfile,
532
+ stream: false,
533
+ }),
534
+ signal: AbortSignal.timeout(600000), // 10 minutes
535
+ });
536
+
537
+ if (!response.ok) {
538
+ const error = await response.text();
539
+ return {
540
+ success: false,
541
+ message: `Failed to import: ${error}`,
542
+ };
543
+ }
544
+
545
+ return {
546
+ success: true,
547
+ message: `Model imported as ${modelName}`,
548
+ details: { modelName, baseModel, adapterPath },
549
+ };
550
+ } catch (error) {
551
+ return {
552
+ success: false,
553
+ message: `Import error: ${error instanceof Error ? error.message : String(error)}`,
554
+ };
555
+ }
556
+ }
557
+
558
+ // Step 7: Run game test
559
+ async function runGameTest(config: PipelineConfig): Promise<{
560
+ success: boolean;
561
+ message: string;
562
+ details?: Record<string, unknown>;
563
+ }> {
564
+ if (config.skipBenchmark) {
565
+ return {
566
+ success: true,
567
+ message: 'Game test skipped (--skip-test)',
568
+ };
569
+ }
570
+
571
+ console.log('Running game test...');
572
+
573
+ // First, check if the dev server is running
574
+ let serverRunning = false;
575
+ try {
576
+ const response = await fetch('http://localhost:3000/api/health', {
577
+ signal: AbortSignal.timeout(3000),
578
+ });
579
+ serverRunning = response.ok;
580
+ } catch {
581
+ serverRunning = false;
582
+ }
583
+
584
+ if (!serverRunning) {
585
+ console.log('⚠️ Dev server not running. Starting it...');
586
+ console.log(' (This may take a minute)\n');
587
+
588
+ // Start dev server in background
589
+ const devServer = spawn(['bun', 'run', 'dev:web'], {
590
+ cwd: REPO_ROOT,
591
+ stdout: 'ignore',
592
+ stderr: 'ignore',
593
+ });
594
+
595
+ // Wait for server to be ready
596
+ for (let i = 0; i < 60; i++) {
597
+ await new Promise((resolve) => setTimeout(resolve, 2000));
598
+ try {
599
+ const response = await fetch('http://localhost:3000/api/health', {
600
+ signal: AbortSignal.timeout(3000),
601
+ });
602
+ if (response.ok) {
603
+ serverRunning = true;
604
+ console.log(' ✓ Dev server started\n');
605
+ break;
606
+ }
607
+ } catch {
608
+ // Keep waiting
609
+ }
610
+ if (i % 10 === 9) {
611
+ console.log(` Still starting... (${(i + 1) * 2}s)`);
612
+ }
613
+ }
614
+
615
+ if (!serverRunning) {
616
+ devServer.kill();
617
+ return {
618
+ success: false,
619
+ message:
620
+ 'Failed to start dev server. Run "bun run dev" manually first.',
621
+ };
622
+ }
623
+ } else {
624
+ console.log('✓ Dev server already running\n');
625
+ }
626
+
627
+ // Run game ticks using the CLI
628
+ console.log(`Executing ${config.ticks} game ticks with trained model...\n`);
629
+
630
+ const result = await runCommand(
631
+ [
632
+ 'bun',
633
+ 'run',
634
+ 'apps/cli/src/index.ts',
635
+ 'game',
636
+ 'tick',
637
+ '--count',
638
+ String(config.ticks),
639
+ '--agent-archetype',
640
+ config.archetype,
641
+ ],
642
+ {
643
+ cwd: REPO_ROOT,
644
+ timeout: config.ticks * 10000, // 10s per tick max
645
+ env: {
646
+ AGENT_LLM_PROVIDER: 'ollama',
647
+ OLLAMA_MODEL: `babylon-${config.archetype}:latest`,
648
+ },
649
+ }
650
+ );
651
+
652
+ if (config.verbose) {
653
+ console.log(result.output);
654
+ }
655
+
656
+ // Also run autonomous agent tick if available
657
+ console.log('\nRunning autonomous agent with trained model...\n');
658
+
659
+ const agentResult = await runCommand(
660
+ [
661
+ 'bun',
662
+ 'run',
663
+ 'apps/cli/src/index.ts',
664
+ 'agent',
665
+ 'tick',
666
+ '--archetype',
667
+ config.archetype,
668
+ ],
669
+ {
670
+ cwd: REPO_ROOT,
671
+ timeout: 120000,
672
+ env: {
673
+ AGENT_LLM_PROVIDER: 'ollama',
674
+ OLLAMA_MODEL: `babylon-${config.archetype}:latest`,
675
+ },
676
+ }
677
+ );
678
+
679
+ if (config.verbose) {
680
+ console.log(agentResult.output);
681
+ }
682
+
683
+ const gameSuccess = result.success;
684
+ const agentSuccess = agentResult.success;
685
+
686
+ return {
687
+ success: gameSuccess,
688
+ message: gameSuccess
689
+ ? `Game test completed: ${config.ticks} ticks executed, agent ${agentSuccess ? 'passed' : 'had issues'}`
690
+ : `Game test failed: ${result.output.slice(-300)}`,
691
+ details: {
692
+ ticks: config.ticks,
693
+ gameSuccess,
694
+ agentSuccess,
695
+ serverWasRunning: serverRunning,
696
+ },
697
+ };
698
+ }
699
+
700
+ // Cleanup
701
+ async function cleanup(): Promise<void> {
702
+ if (ollamaProcess) {
703
+ console.log('\nStopping Ollama...');
704
+ ollamaProcess.kill();
705
+ ollamaProcess = null;
706
+ }
707
+ }
708
+
709
+ // Main pipeline
710
+ async function main(): Promise<void> {
711
+ const config = parseConfig();
712
+
713
+ console.log(`
714
+ ${'═'.repeat(60)}
715
+ ELIZAOS AUTOMATED TRAINING & TESTING PIPELINE
716
+ ${'═'.repeat(60)}
717
+
718
+ Configuration:
719
+ - Skip Training: ${config.skipTraining}
720
+ - Skip Benchmark: ${config.skipBenchmark}
721
+ - Archetype: ${config.archetype}
722
+ - Benchmark Ticks: ${config.ticks}
723
+ - Verbose: ${config.verbose}
724
+ `);
725
+
726
+ const startTime = Date.now();
727
+
728
+ // Run pipeline steps
729
+ const steps: Array<{
730
+ name: string;
731
+ fn: () => Promise<{
732
+ success: boolean;
733
+ message: string;
734
+ details?: Record<string, unknown>;
735
+ }>;
736
+ }> = [
737
+ { name: 'Check Prerequisites', fn: () => checkPrerequisites(config) },
738
+ { name: 'Install Dependencies', fn: () => installDependencies() },
739
+ { name: 'Train Model', fn: () => trainModel(config) },
740
+ ];
741
+
742
+ const defaultAdapterPath = join(TRAINING_DIR, 'trained_models/local/adapters');
743
+ const hasAdapterCandidate = Boolean(config.adapterPath) || existsSync(defaultAdapterPath);
744
+
745
+ if (!config.skipBenchmark) {
746
+ steps.push(
747
+ { name: 'Test Adapter', fn: () => testAdapter(config) },
748
+ { name: 'Start Ollama', fn: () => startOllama() },
749
+ { name: 'Import to Ollama', fn: () => importToOllama(config) },
750
+ { name: 'Run Game Test', fn: () => runGameTest(config) }
751
+ );
752
+ } else if (!config.skipTraining || hasAdapterCandidate) {
753
+ // When benchmark is skipped, only validate adapter if we trained one or a path exists.
754
+ steps.push({ name: 'Test Adapter', fn: () => testAdapter(config) });
755
+ }
756
+
757
+ let allPassed = true;
758
+ for (const step of steps) {
759
+ const success = await runStep(step.name, step.fn);
760
+ if (!success) {
761
+ allPassed = false;
762
+ console.log(`\n⛔ Pipeline stopped at: ${step.name}`);
763
+ break;
764
+ }
765
+ }
766
+
767
+ // Cleanup
768
+ await cleanup();
769
+
770
+ // Summary
771
+ const totalDuration = Date.now() - startTime;
772
+ console.log(`
773
+ ${'═'.repeat(60)}
774
+ PIPELINE SUMMARY
775
+ ${'═'.repeat(60)}
776
+ `);
777
+
778
+ for (const result of results) {
779
+ const icon = result.success ? '✅' : '❌';
780
+ console.log(`${icon} ${result.name}: ${result.message}`);
781
+ }
782
+
783
+ console.log(`
784
+ Total Duration: ${(totalDuration / 1000).toFixed(1)}s
785
+ Status: ${allPassed ? '✅ ALL STEPS PASSED' : '❌ PIPELINE FAILED'}
786
+ `);
787
+
788
+ // Save report
789
+ const outputDir = './research-output/training-runs';
790
+ mkdirSync(outputDir, { recursive: true });
791
+
792
+ const report = {
793
+ timestamp: new Date().toISOString(),
794
+ config,
795
+ results,
796
+ totalDuration,
797
+ success: allPassed,
798
+ };
799
+
800
+ const reportPath = join(outputDir, `training-run-${Date.now()}.json`);
801
+ writeFileSync(reportPath, JSON.stringify(report, null, 2));
802
+ console.log(`📄 Report saved to: ${reportPath}`);
803
+
804
+ process.exit(allPassed ? 0 : 1);
805
+ }
806
+
807
+ // Handle signals
808
+ process.on('SIGINT', async () => {
809
+ console.log('\n\n🛑 Interrupted. Cleaning up...');
810
+ await cleanup();
811
+ process.exit(130);
812
+ });
813
+
814
+ process.on('SIGTERM', async () => {
815
+ console.log('\n\n🛑 Terminated. Cleaning up...');
816
+ await cleanup();
817
+ process.exit(143);
818
+ });
819
+
820
+ main().catch(async (error) => {
821
+ console.error('Pipeline failed:', error);
822
+ await cleanup();
823
+ process.exit(1);
824
+ });