@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/package.json +2 -2
  2. package/research-output/training-runs/training-run-1773726941205.json +38 -0
  3. package/scripts/rank_trajectories.ts +0 -1
  4. package/scripts/run_task_benchmark.ts +4 -11
  5. package/src/adapter.ts +96 -49
  6. package/src/archetypes/ArchetypeConfigService.ts +188 -185
  7. package/src/archetypes/derive-archetype.ts +47 -47
  8. package/src/archetypes/index.ts +2 -2
  9. package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
  10. package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
  11. package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
  12. package/src/benchmark/BenchmarkDataViewer.ts +32 -30
  13. package/src/benchmark/BenchmarkHistoryService.ts +13 -12
  14. package/src/benchmark/BenchmarkRunner.ts +87 -83
  15. package/src/benchmark/BenchmarkValidator.ts +48 -46
  16. package/src/benchmark/FastEvalRunner.ts +17 -16
  17. package/src/benchmark/MetricsValidator.ts +20 -21
  18. package/src/benchmark/MetricsVisualizer.ts +92 -85
  19. package/src/benchmark/ModelBenchmarkService.ts +90 -82
  20. package/src/benchmark/ModelRegistry.ts +44 -44
  21. package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
  22. package/src/benchmark/SimulationA2AInterface.ts +118 -118
  23. package/src/benchmark/SimulationEngine.ts +51 -51
  24. package/src/benchmark/TaskRunner.ts +87 -79
  25. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
  26. package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
  27. package/src/benchmark/index.ts +27 -27
  28. package/src/benchmark/parseSimulationMetrics.ts +32 -32
  29. package/src/benchmark/simulation-types.ts +10 -10
  30. package/src/dependencies.ts +34 -34
  31. package/src/generation/TrajectoryGenerator.ts +39 -37
  32. package/src/generation/index.ts +1 -1
  33. package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
  34. package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
  35. package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
  36. package/src/huggingface/index.ts +6 -6
  37. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
  38. package/src/index.ts +27 -27
  39. package/src/init-training.ts +6 -6
  40. package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
  41. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
  42. package/src/metrics/index.ts +2 -2
  43. package/src/rubrics/__tests__/index.test.ts +73 -73
  44. package/src/rubrics/ass-kisser.ts +6 -6
  45. package/src/rubrics/degen.ts +6 -6
  46. package/src/rubrics/goody-twoshoes.ts +6 -6
  47. package/src/rubrics/index.ts +50 -50
  48. package/src/rubrics/information-trader.ts +6 -6
  49. package/src/rubrics/infosec.ts +6 -6
  50. package/src/rubrics/liar.ts +6 -6
  51. package/src/rubrics/perps-trader.ts +6 -6
  52. package/src/rubrics/researcher.ts +6 -6
  53. package/src/rubrics/scammer.ts +6 -6
  54. package/src/rubrics/social-butterfly.ts +7 -7
  55. package/src/rubrics/super-predictor.ts +6 -6
  56. package/src/rubrics/trader.ts +5 -5
  57. package/src/scoring/ArchetypeScoringService.ts +56 -54
  58. package/src/scoring/JudgePromptBuilder.ts +96 -96
  59. package/src/scoring/LLMJudgeCache.ts +26 -23
  60. package/src/scoring/index.ts +3 -3
  61. package/src/training/AutomationPipeline.ts +149 -140
  62. package/src/training/BenchmarkService.ts +49 -45
  63. package/src/training/ConfigValidator.ts +38 -32
  64. package/src/training/MarketOutcomesTracker.ts +22 -12
  65. package/src/training/ModelDeployer.ts +15 -15
  66. package/src/training/ModelFetcher.ts +7 -7
  67. package/src/training/ModelSelectionService.ts +32 -32
  68. package/src/training/ModelUsageVerifier.ts +31 -24
  69. package/src/training/MultiModelOrchestrator.ts +44 -44
  70. package/src/training/RLModelConfig.ts +57 -57
  71. package/src/training/RewardBackpropagationService.ts +18 -17
  72. package/src/training/RulerScoringService.ts +73 -72
  73. package/src/training/TrainingMonitor.ts +29 -29
  74. package/src/training/TrajectoryRecorder.ts +25 -27
  75. package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
  76. package/src/training/index.ts +36 -36
  77. package/src/training/logRLConfig.ts +7 -7
  78. package/src/training/pipeline.ts +13 -16
  79. package/src/training/storage/ModelStorageService.ts +32 -32
  80. package/src/training/storage/TrainingDataArchiver.ts +21 -21
  81. package/src/training/storage/index.ts +2 -2
  82. package/src/training/types.ts +6 -6
  83. package/src/training/window-utils.ts +14 -14
  84. package/src/utils/index.ts +7 -7
  85. package/src/utils/logger.ts +5 -5
  86. package/src/utils/snowflake.ts +1 -1
  87. package/src/utils/synthetic-detector.ts +7 -7
@@ -63,11 +63,11 @@ Social quality matters too - genuine engagement (meaningful conversations, helpf
63
63
  `;
64
64
 
65
65
  export const SOCIAL_BUTTERFLY_PRIORITY_METRICS = [
66
- 'social.uniqueUsersInteracted',
67
- 'social.groupChatsJoined',
68
- 'social.dmsInitiated',
69
- 'social.postsCreated',
70
- 'social.commentsMade',
71
- 'behavior.socialToTradeRatio',
72
- 'influence.followersGained',
66
+ "social.uniqueUsersInteracted",
67
+ "social.groupChatsJoined",
68
+ "social.dmsInitiated",
69
+ "social.postsCreated",
70
+ "social.commentsMade",
71
+ "behavior.socialToTradeRatio",
72
+ "influence.followersGained",
73
73
  ];
@@ -88,10 +88,10 @@ Look for evidence of domain expertise:
88
88
  `;
89
89
 
90
90
  export const SUPER_PREDICTOR_PRIORITY_METRICS = [
91
- 'information.predictionAccuracy',
92
- 'trading.winRate',
93
- 'trading.totalPnL',
94
- 'information.researchActions',
95
- 'information.predictionsMade',
96
- 'information.correctPredictions',
91
+ "information.predictionAccuracy",
92
+ "trading.winRate",
93
+ "trading.totalPnL",
94
+ "information.researchActions",
95
+ "information.predictionsMade",
96
+ "information.correctPredictions",
97
97
  ];
@@ -59,9 +59,9 @@ If two trajectories have similar P&L, the one with better risk metrics (lower dr
59
59
  `;
60
60
 
61
61
  export const TRADER_PRIORITY_METRICS = [
62
- 'trading.totalPnL',
63
- 'trading.sharpeRatio',
64
- 'trading.winRate',
65
- 'trading.marketsTraded',
66
- 'behavior.socialToTradeRatio',
62
+ "trading.totalPnL",
63
+ "trading.sharpeRatio",
64
+ "trading.winRate",
65
+ "trading.marketsTraded",
66
+ "behavior.socialToTradeRatio",
67
67
  ];
@@ -7,16 +7,16 @@
7
7
  * @packageDocumentation
8
8
  */
9
9
 
10
- import { getTrainingDataAdapter } from '../adapter';
11
- import { getLLMCaller } from '../dependencies';
12
- import { type BehavioralMetrics, trajectoryMetricsExtractor } from '../metrics';
13
- import { hasCustomRubric } from '../rubrics';
14
- import type { TrajectoryStep } from '../training/types';
15
- import { logger, splitIntoBatches } from '../utils';
10
+ import { getTrainingDataAdapter } from "../adapter";
11
+ import { getLLMCaller } from "../dependencies";
12
+ import { type BehavioralMetrics, trajectoryMetricsExtractor } from "../metrics";
13
+ import { hasCustomRubric } from "../rubrics";
14
+ import type { TrajectoryStep } from "../training/types";
15
+ import { logger, splitIntoBatches } from "../utils";
16
16
  import {
17
17
  judgePromptBuilder,
18
18
  type TrajectoryContext,
19
- } from './JudgePromptBuilder';
19
+ } from "./JudgePromptBuilder";
20
20
 
21
21
  /**
22
22
  * Score result for a single trajectory.
@@ -86,17 +86,17 @@ export class ArchetypeScoringService {
86
86
  */
87
87
  async scoreTrajectory(
88
88
  trajectoryId: string,
89
- options: ScoringOptions = {}
89
+ options: ScoringOptions = {},
90
90
  ): Promise<ArchetypeScore | null> {
91
91
  const opts = { ...DEFAULT_OPTIONS, ...options };
92
92
 
93
93
  const traj = await getTrainingDataAdapter().getTrajectoryById(trajectoryId);
94
94
  if (!traj) {
95
- logger.warn('Trajectory not found', { trajectoryId }, 'ArchetypeScoring');
95
+ logger.warn("Trajectory not found", { trajectoryId }, "ArchetypeScoring");
96
96
  return null;
97
97
  }
98
98
 
99
- const archetype = traj.archetype || opts.archetype || 'default';
99
+ const archetype = traj.archetype || opts.archetype || "default";
100
100
  const steps = JSON.parse(traj.stepsJson) as TrajectoryStep[];
101
101
 
102
102
  const metrics = trajectoryMetricsExtractor.extractFromRaw({
@@ -109,7 +109,7 @@ export class ArchetypeScoringService {
109
109
 
110
110
  if (!metrics) {
111
111
  throw new Error(
112
- `Failed to extract metrics for trajectory ${trajectoryId}`
112
+ `Failed to extract metrics for trajectory ${trajectoryId}`,
113
113
  );
114
114
  }
115
115
 
@@ -131,7 +131,7 @@ export class ArchetypeScoringService {
131
131
  const response = await this.callSingleJudge(system, user);
132
132
  if (!response) {
133
133
  throw new Error(
134
- `Judge returned no response for trajectory ${trajectoryId}`
134
+ `Judge returned no response for trajectory ${trajectoryId}`,
135
135
  );
136
136
  }
137
137
 
@@ -151,18 +151,18 @@ export class ArchetypeScoringService {
151
151
  await getTrainingDataAdapter().updateTrajectoryScore(
152
152
  trajectoryId,
153
153
  score.score,
154
- score.reasoning
154
+ score.reasoning,
155
155
  );
156
156
  }
157
157
 
158
158
  logger.info(
159
- 'Scored trajectory',
159
+ "Scored trajectory",
160
160
  {
161
161
  trajectoryId,
162
162
  archetype: score.archetype,
163
163
  score: score.score,
164
164
  },
165
- 'ArchetypeScoring'
165
+ "ArchetypeScoring",
166
166
  );
167
167
 
168
168
  return score;
@@ -176,38 +176,39 @@ export class ArchetypeScoringService {
176
176
  */
177
177
  async scoreTrajectoryGroup(
178
178
  trajectoryIds: string[],
179
- options: ScoringOptions = {}
179
+ options: ScoringOptions = {},
180
180
  ): Promise<ArchetypeScore[]> {
181
181
  const opts = { ...DEFAULT_OPTIONS, ...options };
182
182
 
183
183
  if (trajectoryIds.length < this.minGroupSize) {
184
184
  logger.warn(
185
- 'Group too small for RULER scoring',
185
+ "Group too small for RULER scoring",
186
186
  {
187
187
  size: trajectoryIds.length,
188
188
  minRequired: this.minGroupSize,
189
189
  },
190
- 'ArchetypeScoring'
190
+ "ArchetypeScoring",
191
191
  );
192
192
  return [];
193
193
  }
194
194
 
195
- const trajResults = await getTrainingDataAdapter().getTrajectoriesByIds(trajectoryIds);
195
+ const trajResults =
196
+ await getTrainingDataAdapter().getTrajectoriesByIds(trajectoryIds);
196
197
 
197
198
  if (trajResults.length < this.minGroupSize) {
198
199
  logger.warn(
199
- 'Not enough valid trajectories',
200
+ "Not enough valid trajectories",
200
201
  {
201
202
  requested: trajectoryIds.length,
202
203
  found: trajResults.length,
203
204
  },
204
- 'ArchetypeScoring'
205
+ "ArchetypeScoring",
205
206
  );
206
207
  return [];
207
208
  }
208
209
 
209
210
  const contexts: TrajectoryContext[] = [];
210
- const fallbackArchetype = opts.archetype || 'default';
211
+ const fallbackArchetype = opts.archetype || "default";
211
212
 
212
213
  for (const traj of trajResults) {
213
214
  const steps = JSON.parse(traj.stepsJson) as TrajectoryStep[];
@@ -223,7 +224,7 @@ export class ArchetypeScoringService {
223
224
 
224
225
  if (!metrics) {
225
226
  throw new Error(
226
- `Failed to extract metrics for trajectory ${traj.trajectoryId}`
227
+ `Failed to extract metrics for trajectory ${traj.trajectoryId}`,
227
228
  );
228
229
  }
229
230
 
@@ -243,15 +244,15 @@ export class ArchetypeScoringService {
243
244
  const scores: ArchetypeScore[] = [];
244
245
 
245
246
  for (const batch of batches) {
246
- const scenarioId = batch[0]?.archetype || 'unknown';
247
+ const scenarioId = batch[0]?.archetype || "unknown";
247
248
  const { system, user } = judgePromptBuilder.buildComparisonPrompt(
248
249
  batch,
249
- scenarioId
250
+ scenarioId,
250
251
  );
251
252
  const response = await this.callComparisonJudge(system, user);
252
253
 
253
254
  if (!response) {
254
- throw new Error('Judge returned no response for batch');
255
+ throw new Error("Judge returned no response for batch");
255
256
  }
256
257
 
257
258
  for (let i = 0; i < batch.length; i++) {
@@ -260,7 +261,7 @@ export class ArchetypeScoringService {
260
261
 
261
262
  const expectedId = `trajectory-${i + 1}`;
262
263
  const scoreData = response.scores.find(
263
- (s) => s.trajectory_id === expectedId
264
+ (s) => s.trajectory_id === expectedId,
264
265
  );
265
266
 
266
267
  if (!scoreData) {
@@ -270,7 +271,7 @@ export class ArchetypeScoringService {
270
271
  const score: ArchetypeScore = {
271
272
  trajectoryId: ctx.trajectoryId,
272
273
  agentId: ctx.agentId,
273
- archetype: ctx.archetype || 'default',
274
+ archetype: ctx.archetype || "default",
274
275
  score: Math.max(0, Math.min(1, scoreData.score)),
275
276
  reasoning: scoreData.explanation,
276
277
  strengths: [],
@@ -285,19 +286,19 @@ export class ArchetypeScoringService {
285
286
  await getTrainingDataAdapter().updateTrajectoryScore(
286
287
  ctx.trajectoryId,
287
288
  score.score,
288
- score.reasoning
289
+ score.reasoning,
289
290
  );
290
291
  }
291
292
  }
292
293
  }
293
294
 
294
295
  logger.info(
295
- 'Scored trajectory group',
296
+ "Scored trajectory group",
296
297
  {
297
298
  requested: trajectoryIds.length,
298
299
  scored: scores.length,
299
300
  },
300
- 'ArchetypeScoring'
301
+ "ArchetypeScoring",
301
302
  );
302
303
 
303
304
  return scores;
@@ -311,13 +312,13 @@ export class ArchetypeScoringService {
311
312
  */
312
313
  async scoreByArchetype(
313
314
  archetype: string,
314
- trajectoryIds: string[]
315
+ trajectoryIds: string[],
315
316
  ): Promise<{ scored: number; errors: number }> {
316
317
  if (!hasCustomRubric(archetype)) {
317
318
  logger.warn(
318
- 'No custom rubric for archetype, using default',
319
+ "No custom rubric for archetype, using default",
319
320
  { archetype },
320
- 'ArchetypeScoring'
321
+ "ArchetypeScoring",
321
322
  );
322
323
  }
323
324
 
@@ -343,13 +344,14 @@ export class ArchetypeScoringService {
343
344
  * @returns Count of scored and errors
344
345
  */
345
346
  async scoreUnscoredTrajectories(
346
- archetype: string = 'default',
347
- limit: number = 100
347
+ archetype: string = "default",
348
+ limit: number = 100,
348
349
  ): Promise<{ scored: number; errors: number }> {
349
- const unscoredResult = await getTrainingDataAdapter().getUnscoredTrajectories({ limit });
350
+ const unscoredResult =
351
+ await getTrainingDataAdapter().getUnscoredTrajectories({ limit });
350
352
 
351
353
  if (unscoredResult.length === 0) {
352
- logger.info('No unscored trajectories found', {}, 'ArchetypeScoring');
354
+ logger.info("No unscored trajectories found", {}, "ArchetypeScoring");
353
355
  return { scored: 0, errors: 0 };
354
356
  }
355
357
 
@@ -367,25 +369,25 @@ export class ArchetypeScoringService {
367
369
  async scoreTrajectoriesParallel(
368
370
  trajectoryIds: string[],
369
371
  options: ScoringOptions = {},
370
- concurrency: number = 5
372
+ concurrency: number = 5,
371
373
  ): Promise<ArchetypeScore[]> {
372
374
  const results: ArchetypeScore[] = [];
373
375
  const batches = splitIntoBatches(trajectoryIds, concurrency);
374
376
 
375
377
  logger.info(
376
- 'Starting parallel scoring',
378
+ "Starting parallel scoring",
377
379
  {
378
380
  total: trajectoryIds.length,
379
381
  batches: batches.length,
380
382
  concurrency,
381
383
  },
382
- 'ArchetypeScoring'
384
+ "ArchetypeScoring",
383
385
  );
384
386
 
385
387
  for (let i = 0; i < batches.length; i++) {
386
388
  const batch = batches[i] ?? [];
387
389
  const batchPromises = batch.map((id) =>
388
- this.scoreTrajectory(id, options)
390
+ this.scoreTrajectory(id, options),
389
391
  );
390
392
  const batchResults = await Promise.all(batchPromises);
391
393
 
@@ -401,12 +403,12 @@ export class ArchetypeScoringService {
401
403
  }
402
404
 
403
405
  logger.info(
404
- 'Parallel scoring complete',
406
+ "Parallel scoring complete",
405
407
  {
406
408
  requested: trajectoryIds.length,
407
409
  scored: results.length,
408
410
  },
409
- 'ArchetypeScoring'
411
+ "ArchetypeScoring",
410
412
  );
411
413
 
412
414
  return results;
@@ -417,7 +419,7 @@ export class ArchetypeScoringService {
417
419
  */
418
420
  private async callSingleJudge(
419
421
  system: string,
420
- user: string
422
+ user: string,
421
423
  ): Promise<TrajectoryScoreResponse | null> {
422
424
  const llmCaller = getLLMCaller();
423
425
  const prompt = `${user}\n\nReturn ONLY valid JSON, no other text.`;
@@ -425,10 +427,10 @@ export class ArchetypeScoringService {
425
427
  const response = await llmCaller.callGroqDirect({
426
428
  prompt,
427
429
  system,
428
- modelSize: 'large',
430
+ modelSize: "large",
429
431
  temperature: 0.3,
430
432
  maxTokens: 1000,
431
- actionType: 'archetype_score_trajectory',
433
+ actionType: "archetype_score_trajectory",
432
434
  });
433
435
 
434
436
  return this.parseJudgeResponse<TrajectoryScoreResponse>(response);
@@ -439,7 +441,7 @@ export class ArchetypeScoringService {
439
441
  */
440
442
  private async callComparisonJudge(
441
443
  system: string,
442
- user: string
444
+ user: string,
443
445
  ): Promise<RulerScoreResponse | null> {
444
446
  const llmCaller = getLLMCaller();
445
447
  const prompt = `${user}\n\nReturn ONLY valid JSON, no other text.`;
@@ -447,10 +449,10 @@ export class ArchetypeScoringService {
447
449
  const response = await llmCaller.callGroqDirect({
448
450
  prompt,
449
451
  system,
450
- modelSize: 'large',
452
+ modelSize: "large",
451
453
  temperature: 0.3,
452
454
  maxTokens: 2000,
453
- actionType: 'archetype_ruler_score',
455
+ actionType: "archetype_ruler_score",
454
456
  });
455
457
 
456
458
  return this.parseJudgeResponse<RulerScoreResponse>(response);
@@ -462,18 +464,18 @@ export class ArchetypeScoringService {
462
464
  private parseJudgeResponse<T>(response: string): T | null {
463
465
  const jsonText = response
464
466
  .trim()
465
- .replace(/```json\n?/g, '')
466
- .replace(/```\n?/g, '')
467
+ .replace(/```json\n?/g, "")
468
+ .replace(/```\n?/g, "")
467
469
  .trim();
468
470
 
469
471
  const jsonMatch = jsonText.match(/\{[\s\S]*\}/);
470
472
  if (!jsonMatch) {
471
473
  logger.error(
472
- 'No JSON found in response',
474
+ "No JSON found in response",
473
475
  {
474
476
  preview: response.substring(0, 200),
475
477
  },
476
- 'ArchetypeScoring'
478
+ "ArchetypeScoring",
477
479
  );
478
480
  return null;
479
481
  }