@elizaos/training 2.0.0-alpha.41 → 2.0.0-alpha.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/package.json +2 -2
  2. package/research-output/training-runs/training-run-1773726941205.json +38 -0
  3. package/research-output/training-runs/training-run-1773742857616.json +38 -0
  4. package/research-output/training-runs/training-run-1773742946977.json +38 -0
  5. package/research-output/training-runs/training-run-1773743278891.json +38 -0
  6. package/research-output/training-runs/training-run-1773743409754.json +38 -0
  7. package/research-output/training-runs/training-run-1773743651086.json +38 -0
  8. package/research-output/training-runs/training-run-1773743782883.json +38 -0
  9. package/research-output/training-runs/training-run-1773755075895.json +38 -0
  10. package/research-output/training-runs/training-run-1773755142682.json +38 -0
  11. package/scripts/rank_trajectories.ts +20 -6
  12. package/scripts/run_task_benchmark.ts +7 -13
  13. package/src/adapter.ts +96 -49
  14. package/src/archetypes/ArchetypeConfigService.ts +276 -264
  15. package/src/archetypes/derive-archetype.ts +47 -47
  16. package/src/archetypes/index.ts +2 -2
  17. package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
  18. package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
  19. package/src/benchmark/BenchmarkDataGenerator.ts +162 -152
  20. package/src/benchmark/BenchmarkDataViewer.ts +98 -97
  21. package/src/benchmark/BenchmarkHistoryService.ts +13 -12
  22. package/src/benchmark/BenchmarkRunner.ts +94 -85
  23. package/src/benchmark/BenchmarkValidator.ts +48 -46
  24. package/src/benchmark/FastEvalRunner.ts +17 -16
  25. package/src/benchmark/MetricsValidator.ts +141 -141
  26. package/src/benchmark/MetricsVisualizer.ts +92 -85
  27. package/src/benchmark/ModelBenchmarkService.ts +90 -82
  28. package/src/benchmark/ModelRegistry.ts +44 -44
  29. package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
  30. package/src/benchmark/SimulationA2AInterface.ts +118 -118
  31. package/src/benchmark/SimulationEngine.ts +55 -54
  32. package/src/benchmark/TaskRunner.ts +87 -79
  33. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +82 -82
  34. package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
  35. package/src/benchmark/index.ts +27 -27
  36. package/src/benchmark/parseSimulationMetrics.ts +32 -32
  37. package/src/benchmark/simulation-types.ts +10 -10
  38. package/src/dependencies.ts +34 -34
  39. package/src/generation/TrajectoryGenerator.ts +39 -37
  40. package/src/generation/index.ts +1 -1
  41. package/src/huggingface/HuggingFaceDatasetUploader.ts +74 -73
  42. package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
  43. package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
  44. package/src/huggingface/index.ts +6 -6
  45. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +66 -59
  46. package/src/index.ts +30 -27
  47. package/src/init-training.ts +6 -6
  48. package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
  49. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
  50. package/src/metrics/index.ts +2 -2
  51. package/src/rubrics/__tests__/index.test.ts +73 -73
  52. package/src/rubrics/ass-kisser.ts +6 -6
  53. package/src/rubrics/degen.ts +6 -6
  54. package/src/rubrics/goody-twoshoes.ts +6 -6
  55. package/src/rubrics/index.ts +50 -50
  56. package/src/rubrics/information-trader.ts +6 -6
  57. package/src/rubrics/infosec.ts +6 -6
  58. package/src/rubrics/liar.ts +6 -6
  59. package/src/rubrics/perps-trader.ts +6 -6
  60. package/src/rubrics/researcher.ts +6 -6
  61. package/src/rubrics/scammer.ts +6 -6
  62. package/src/rubrics/social-butterfly.ts +7 -7
  63. package/src/rubrics/super-predictor.ts +6 -6
  64. package/src/rubrics/trader.ts +5 -5
  65. package/src/scoring/ArchetypeScoringService.ts +56 -54
  66. package/src/scoring/JudgePromptBuilder.ts +96 -96
  67. package/src/scoring/LLMJudgeCache.ts +26 -23
  68. package/src/scoring/index.ts +3 -3
  69. package/src/training/AutomationPipeline.ts +166 -154
  70. package/src/training/BenchmarkService.ts +53 -47
  71. package/src/training/ConfigValidator.ts +202 -190
  72. package/src/training/MarketOutcomesTracker.ts +22 -12
  73. package/src/training/ModelDeployer.ts +15 -15
  74. package/src/training/ModelFetcher.ts +7 -7
  75. package/src/training/ModelSelectionService.ts +32 -32
  76. package/src/training/ModelUsageVerifier.ts +31 -24
  77. package/src/training/MultiModelOrchestrator.ts +44 -44
  78. package/src/training/RLModelConfig.ts +57 -57
  79. package/src/training/RewardBackpropagationService.ts +18 -17
  80. package/src/training/RulerScoringService.ts +86 -79
  81. package/src/training/TrainingMonitor.ts +29 -29
  82. package/src/training/TrajectoryRecorder.ts +40 -30
  83. package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
  84. package/src/training/index.ts +36 -36
  85. package/src/training/logRLConfig.ts +7 -7
  86. package/src/training/pipeline.ts +13 -16
  87. package/src/training/storage/ModelStorageService.ts +32 -32
  88. package/src/training/storage/TrainingDataArchiver.ts +21 -21
  89. package/src/training/storage/index.ts +2 -2
  90. package/src/training/types.ts +6 -6
  91. package/src/training/window-utils.ts +14 -14
  92. package/src/utils/index.ts +7 -7
  93. package/src/utils/logger.ts +5 -5
  94. package/src/utils/snowflake.ts +1 -1
  95. package/src/utils/synthetic-detector.ts +7 -7
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@elizaos/training",
3
- "version": "2.0.0-alpha.41",
3
+ "version": "2.0.0-alpha.44",
4
4
  "description": "ElizaOS RL training pipeline with benchmarking and model publishing support",
5
5
  "main": "./src/index.ts",
6
6
  "types": "./src/index.ts",
@@ -53,5 +53,5 @@
53
53
  "bun-types": "^1.3.2",
54
54
  "typescript": "^5.9.3"
55
55
  },
56
- "gitHead": "b3e37e421bcd49b6bc7a34373edc7b3b3a282b8b"
56
+ "gitHead": "2b27a4e70ebdf054b117b87ed9e8f9f709fe006b"
57
57
  }
@@ -0,0 +1,38 @@
1
+ {
2
+ "timestamp": "2026-03-17T05:55:41.205Z",
3
+ "config": {
4
+ "skipTraining": true,
5
+ "skipBenchmark": true,
6
+ "ticks": 100,
7
+ "archetype": "trader",
8
+ "verbose": false
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "Check Prerequisites",
13
+ "success": true,
14
+ "message": "Prerequisites satisfied (Ollama: yes)",
15
+ "details": {
16
+ "python": true,
17
+ "trainingDir": true,
18
+ "mlx": false,
19
+ "ollama": true
20
+ },
21
+ "duration": 139
22
+ },
23
+ {
24
+ "name": "Install Dependencies",
25
+ "success": true,
26
+ "message": "Python dependencies installed",
27
+ "duration": 2130
28
+ },
29
+ {
30
+ "name": "Train Model",
31
+ "success": true,
32
+ "message": "Training skipped (--skip-training)",
33
+ "duration": 0
34
+ }
35
+ ],
36
+ "totalDuration": 2270,
37
+ "success": true
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "timestamp": "2026-03-17T10:20:57.616Z",
3
+ "config": {
4
+ "skipTraining": true,
5
+ "skipBenchmark": true,
6
+ "ticks": 100,
7
+ "archetype": "trader",
8
+ "verbose": false
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "Check Prerequisites",
13
+ "success": true,
14
+ "message": "Prerequisites satisfied (Ollama: yes)",
15
+ "details": {
16
+ "python": true,
17
+ "trainingDir": true,
18
+ "mlx": false,
19
+ "ollama": true
20
+ },
21
+ "duration": 132
22
+ },
23
+ {
24
+ "name": "Install Dependencies",
25
+ "success": true,
26
+ "message": "Python dependencies installed",
27
+ "duration": 1528
28
+ },
29
+ {
30
+ "name": "Train Model",
31
+ "success": true,
32
+ "message": "Training skipped (--skip-training)",
33
+ "duration": 0
34
+ }
35
+ ],
36
+ "totalDuration": 1660,
37
+ "success": true
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "timestamp": "2026-03-17T10:22:26.977Z",
3
+ "config": {
4
+ "skipTraining": true,
5
+ "skipBenchmark": true,
6
+ "ticks": 100,
7
+ "archetype": "trader",
8
+ "verbose": false
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "Check Prerequisites",
13
+ "success": true,
14
+ "message": "Prerequisites satisfied (Ollama: yes)",
15
+ "details": {
16
+ "python": true,
17
+ "trainingDir": true,
18
+ "mlx": false,
19
+ "ollama": true
20
+ },
21
+ "duration": 55
22
+ },
23
+ {
24
+ "name": "Install Dependencies",
25
+ "success": true,
26
+ "message": "Python dependencies installed",
27
+ "duration": 1215
28
+ },
29
+ {
30
+ "name": "Train Model",
31
+ "success": true,
32
+ "message": "Training skipped (--skip-training)",
33
+ "duration": 0
34
+ }
35
+ ],
36
+ "totalDuration": 1270,
37
+ "success": true
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "timestamp": "2026-03-17T10:27:58.891Z",
3
+ "config": {
4
+ "skipTraining": true,
5
+ "skipBenchmark": true,
6
+ "ticks": 100,
7
+ "archetype": "trader",
8
+ "verbose": false
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "Check Prerequisites",
13
+ "success": true,
14
+ "message": "Prerequisites satisfied (Ollama: yes)",
15
+ "details": {
16
+ "python": true,
17
+ "trainingDir": true,
18
+ "mlx": false,
19
+ "ollama": true
20
+ },
21
+ "duration": 315
22
+ },
23
+ {
24
+ "name": "Install Dependencies",
25
+ "success": true,
26
+ "message": "Python dependencies installed",
27
+ "duration": 3870
28
+ },
29
+ {
30
+ "name": "Train Model",
31
+ "success": true,
32
+ "message": "Training skipped (--skip-training)",
33
+ "duration": 0
34
+ }
35
+ ],
36
+ "totalDuration": 4185,
37
+ "success": true
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "timestamp": "2026-03-17T10:30:09.754Z",
3
+ "config": {
4
+ "skipTraining": true,
5
+ "skipBenchmark": true,
6
+ "ticks": 100,
7
+ "archetype": "trader",
8
+ "verbose": false
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "Check Prerequisites",
13
+ "success": true,
14
+ "message": "Prerequisites satisfied (Ollama: yes)",
15
+ "details": {
16
+ "python": true,
17
+ "trainingDir": true,
18
+ "mlx": false,
19
+ "ollama": true
20
+ },
21
+ "duration": 145
22
+ },
23
+ {
24
+ "name": "Install Dependencies",
25
+ "success": true,
26
+ "message": "Python dependencies installed",
27
+ "duration": 2265
28
+ },
29
+ {
30
+ "name": "Train Model",
31
+ "success": true,
32
+ "message": "Training skipped (--skip-training)",
33
+ "duration": 1
34
+ }
35
+ ],
36
+ "totalDuration": 2412,
37
+ "success": true
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "timestamp": "2026-03-17T10:34:11.086Z",
3
+ "config": {
4
+ "skipTraining": true,
5
+ "skipBenchmark": true,
6
+ "ticks": 100,
7
+ "archetype": "trader",
8
+ "verbose": false
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "Check Prerequisites",
13
+ "success": true,
14
+ "message": "Prerequisites satisfied (Ollama: yes)",
15
+ "details": {
16
+ "python": true,
17
+ "trainingDir": true,
18
+ "mlx": false,
19
+ "ollama": true
20
+ },
21
+ "duration": 138
22
+ },
23
+ {
24
+ "name": "Install Dependencies",
25
+ "success": true,
26
+ "message": "Python dependencies installed",
27
+ "duration": 1809
28
+ },
29
+ {
30
+ "name": "Train Model",
31
+ "success": true,
32
+ "message": "Training skipped (--skip-training)",
33
+ "duration": 1
34
+ }
35
+ ],
36
+ "totalDuration": 1949,
37
+ "success": true
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "timestamp": "2026-03-17T10:36:22.883Z",
3
+ "config": {
4
+ "skipTraining": true,
5
+ "skipBenchmark": true,
6
+ "ticks": 100,
7
+ "archetype": "trader",
8
+ "verbose": false
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "Check Prerequisites",
13
+ "success": true,
14
+ "message": "Prerequisites satisfied (Ollama: yes)",
15
+ "details": {
16
+ "python": true,
17
+ "trainingDir": true,
18
+ "mlx": false,
19
+ "ollama": true
20
+ },
21
+ "duration": 40
22
+ },
23
+ {
24
+ "name": "Install Dependencies",
25
+ "success": true,
26
+ "message": "Python dependencies installed",
27
+ "duration": 1138
28
+ },
29
+ {
30
+ "name": "Train Model",
31
+ "success": true,
32
+ "message": "Training skipped (--skip-training)",
33
+ "duration": 0
34
+ }
35
+ ],
36
+ "totalDuration": 1178,
37
+ "success": true
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "timestamp": "2026-03-17T13:44:35.895Z",
3
+ "config": {
4
+ "skipTraining": true,
5
+ "skipBenchmark": true,
6
+ "ticks": 100,
7
+ "archetype": "trader",
8
+ "verbose": false
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "Check Prerequisites",
13
+ "success": true,
14
+ "message": "Prerequisites satisfied (Ollama: yes)",
15
+ "details": {
16
+ "python": true,
17
+ "trainingDir": true,
18
+ "mlx": false,
19
+ "ollama": true
20
+ },
21
+ "duration": 138
22
+ },
23
+ {
24
+ "name": "Install Dependencies",
25
+ "success": true,
26
+ "message": "Python dependencies installed",
27
+ "duration": 2068
28
+ },
29
+ {
30
+ "name": "Train Model",
31
+ "success": true,
32
+ "message": "Training skipped (--skip-training)",
33
+ "duration": 0
34
+ }
35
+ ],
36
+ "totalDuration": 2208,
37
+ "success": true
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "timestamp": "2026-03-17T13:45:42.682Z",
3
+ "config": {
4
+ "skipTraining": true,
5
+ "skipBenchmark": true,
6
+ "ticks": 100,
7
+ "archetype": "trader",
8
+ "verbose": false
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "Check Prerequisites",
13
+ "success": true,
14
+ "message": "Prerequisites satisfied (Ollama: yes)",
15
+ "details": {
16
+ "python": true,
17
+ "trainingDir": true,
18
+ "mlx": false,
19
+ "ollama": true
20
+ },
21
+ "duration": 132
22
+ },
23
+ {
24
+ "name": "Install Dependencies",
25
+ "success": true,
26
+ "message": "Python dependencies installed",
27
+ "duration": 2688
28
+ },
29
+ {
30
+ "name": "Train Model",
31
+ "success": true,
32
+ "message": "Training skipped (--skip-training)",
33
+ "duration": 0
34
+ }
35
+ ],
36
+ "totalDuration": 2820,
37
+ "success": true
38
+ }
@@ -67,7 +67,6 @@ async function main() {
67
67
  // Initialize Judge Runtime
68
68
  const character = {
69
69
  name: 'JudgeAgent',
70
- modelProvider: "openai" as any,
71
70
  bio: ['I am an impartial AI judge.'],
72
71
  settings: {
73
72
  secrets: {
@@ -138,7 +137,18 @@ async function main() {
138
137
 
139
138
  console.log(`Found ${lines.length} trajectories to rank.`);
140
139
 
141
- const scoredTrajectories = [];
140
+ interface ScoredTrajectory {
141
+ trajectoryId?: string;
142
+ steps?: Array<{
143
+ action?: { parameters?: { text?: string } };
144
+ }>;
145
+ metadata?: { task?: string };
146
+ score?: number;
147
+ reasoning?: string;
148
+ isScored?: boolean;
149
+ }
150
+
151
+ const scoredTrajectories: ScoredTrajectory[] = [];
142
152
 
143
153
  // Clear output file first if overwriting
144
154
  if (fs.existsSync(outputFile)) {
@@ -147,12 +157,12 @@ async function main() {
147
157
 
148
158
  for (const line of lines) {
149
159
  try {
150
- const trajectory = JSON.parse(line);
160
+ const trajectory = JSON.parse(line) as ScoredTrajectory;
151
161
  const { steps, metadata } = trajectory;
152
162
  const task = metadata?.task || 'Unknown Task';
153
163
 
154
164
  // Extract the last step's action/response
155
- const lastStep = steps[steps.length - 1];
165
+ const lastStep = steps && steps.length > 0 ? steps[steps.length - 1] : undefined;
156
166
  const response = lastStep?.action?.parameters?.text || "No response found";
157
167
 
158
168
  console.log(`Ranking trajectory ${trajectory.trajectoryId}...`);
@@ -177,11 +187,15 @@ Return ONLY valid JSON.
177
187
  const resultText = typeof result === 'string' ? result : result.text;
178
188
 
179
189
  // Parse JSON
180
- let scoreData;
190
+ interface ScoreData {
191
+ score: number;
192
+ reasoning: string;
193
+ }
194
+ let scoreData: ScoreData;
181
195
  try {
182
196
  // simple cleanup for markdown code blocks
183
197
  const jsonStr = resultText.replace(/```json/g, '').replace(/```/g, '').trim();
184
- scoreData = JSON.parse(jsonStr);
198
+ scoreData = JSON.parse(jsonStr) as ScoreData;
185
199
  } catch (e) {
186
200
  console.warn(`Failed to parse judge output for ${trajectory.trajectoryId}: ${resultText}`);
187
201
  scoreData = { score: 0, reasoning: "Parse Error" };
@@ -50,8 +50,7 @@ class BenchmarkRuntimeManager implements IAgentRuntimeManager {
50
50
  // Create a new runtime
51
51
  const character = {
52
52
  name: 'BenchmarkAgent',
53
- modelProvider: "openai" as any,
54
- bio: 'A helpful assistant for benchmarking.',
53
+ bio: ['A helpful assistant for benchmarking.'],
55
54
  settings: {
56
55
  secrets: {
57
56
  OPENAI_API_KEY: process.env.OPENAI_API_KEY || ''
@@ -60,13 +59,7 @@ class BenchmarkRuntimeManager implements IAgentRuntimeManager {
60
59
  };
61
60
 
62
61
  const runtime = new AgentRuntime({
63
- token: process.env.OPENAI_API_KEY || '',
64
- modelProvider: "openai" as any,
65
62
  character,
66
- plugins: [],
67
- providers: [],
68
- actions: [],
69
- evaluators: [],
70
63
  });
71
64
 
72
65
  // We must initialize with allowNoDatabase to avoid DB error
@@ -116,7 +109,7 @@ class BenchmarkTaskInteractor implements ITaskInteractor {
116
109
 
117
110
  const userMemory: Memory = {
118
111
  id: messageId as `${string}-${string}-${string}-${string}-${string}`,
119
- userId: userId as `${string}-${string}-${string}-${string}-${string}`,
112
+ entityId: userId as `${string}-${string}-${string}-${string}-${string}`,
120
113
  agentId: runtime.agentId,
121
114
  roomId: roomId as `${string}-${string}-${string}-${string}-${string}`,
122
115
  content: {
@@ -143,6 +136,7 @@ Assistant:`;
143
136
  // Signature: generateText(input: string, options?: GenerateTextOptions)
144
137
  const result = await runtime.generateText(context, {
145
138
  modelType: ModelType.TEXT_SMALL,
139
+ stopSequences: [],
146
140
  });
147
141
  // Handle both string and object return types for safety
148
142
  const response = typeof result === 'string' ? result : result.text;
@@ -227,11 +221,11 @@ async function main() {
227
221
  agentService: new BenchmarkAgentService(),
228
222
  agentRuntimeManager: new BenchmarkRuntimeManager(),
229
223
  autonomousCoordinator: {
230
- executeAutonomousTick: async () => ({ success: true })
231
- } as any,
224
+ executeAutonomousTick: async () => ({ success: true }),
225
+ },
232
226
  llmCaller: {
233
- callGroqDirect: async () => "mock response"
234
- } as any,
227
+ callGroqDirect: async () => "mock response",
228
+ },
235
229
  });
236
230
 
237
231
  // Import task interactor config