@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/package.json +2 -2
  2. package/research-output/training-runs/training-run-1773726941205.json +38 -0
  3. package/scripts/rank_trajectories.ts +0 -1
  4. package/scripts/run_task_benchmark.ts +4 -11
  5. package/src/adapter.ts +96 -49
  6. package/src/archetypes/ArchetypeConfigService.ts +188 -185
  7. package/src/archetypes/derive-archetype.ts +47 -47
  8. package/src/archetypes/index.ts +2 -2
  9. package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
  10. package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
  11. package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
  12. package/src/benchmark/BenchmarkDataViewer.ts +32 -30
  13. package/src/benchmark/BenchmarkHistoryService.ts +13 -12
  14. package/src/benchmark/BenchmarkRunner.ts +87 -83
  15. package/src/benchmark/BenchmarkValidator.ts +48 -46
  16. package/src/benchmark/FastEvalRunner.ts +17 -16
  17. package/src/benchmark/MetricsValidator.ts +20 -21
  18. package/src/benchmark/MetricsVisualizer.ts +92 -85
  19. package/src/benchmark/ModelBenchmarkService.ts +90 -82
  20. package/src/benchmark/ModelRegistry.ts +44 -44
  21. package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
  22. package/src/benchmark/SimulationA2AInterface.ts +118 -118
  23. package/src/benchmark/SimulationEngine.ts +51 -51
  24. package/src/benchmark/TaskRunner.ts +87 -79
  25. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
  26. package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
  27. package/src/benchmark/index.ts +27 -27
  28. package/src/benchmark/parseSimulationMetrics.ts +32 -32
  29. package/src/benchmark/simulation-types.ts +10 -10
  30. package/src/dependencies.ts +34 -34
  31. package/src/generation/TrajectoryGenerator.ts +39 -37
  32. package/src/generation/index.ts +1 -1
  33. package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
  34. package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
  35. package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
  36. package/src/huggingface/index.ts +6 -6
  37. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
  38. package/src/index.ts +27 -27
  39. package/src/init-training.ts +6 -6
  40. package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
  41. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
  42. package/src/metrics/index.ts +2 -2
  43. package/src/rubrics/__tests__/index.test.ts +73 -73
  44. package/src/rubrics/ass-kisser.ts +6 -6
  45. package/src/rubrics/degen.ts +6 -6
  46. package/src/rubrics/goody-twoshoes.ts +6 -6
  47. package/src/rubrics/index.ts +50 -50
  48. package/src/rubrics/information-trader.ts +6 -6
  49. package/src/rubrics/infosec.ts +6 -6
  50. package/src/rubrics/liar.ts +6 -6
  51. package/src/rubrics/perps-trader.ts +6 -6
  52. package/src/rubrics/researcher.ts +6 -6
  53. package/src/rubrics/scammer.ts +6 -6
  54. package/src/rubrics/social-butterfly.ts +7 -7
  55. package/src/rubrics/super-predictor.ts +6 -6
  56. package/src/rubrics/trader.ts +5 -5
  57. package/src/scoring/ArchetypeScoringService.ts +56 -54
  58. package/src/scoring/JudgePromptBuilder.ts +96 -96
  59. package/src/scoring/LLMJudgeCache.ts +26 -23
  60. package/src/scoring/index.ts +3 -3
  61. package/src/training/AutomationPipeline.ts +149 -140
  62. package/src/training/BenchmarkService.ts +49 -45
  63. package/src/training/ConfigValidator.ts +38 -32
  64. package/src/training/MarketOutcomesTracker.ts +22 -12
  65. package/src/training/ModelDeployer.ts +15 -15
  66. package/src/training/ModelFetcher.ts +7 -7
  67. package/src/training/ModelSelectionService.ts +32 -32
  68. package/src/training/ModelUsageVerifier.ts +31 -24
  69. package/src/training/MultiModelOrchestrator.ts +44 -44
  70. package/src/training/RLModelConfig.ts +57 -57
  71. package/src/training/RewardBackpropagationService.ts +18 -17
  72. package/src/training/RulerScoringService.ts +73 -72
  73. package/src/training/TrainingMonitor.ts +29 -29
  74. package/src/training/TrajectoryRecorder.ts +25 -27
  75. package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
  76. package/src/training/index.ts +36 -36
  77. package/src/training/logRLConfig.ts +7 -7
  78. package/src/training/pipeline.ts +13 -16
  79. package/src/training/storage/ModelStorageService.ts +32 -32
  80. package/src/training/storage/TrainingDataArchiver.ts +21 -21
  81. package/src/training/storage/index.ts +2 -2
  82. package/src/training/types.ts +6 -6
  83. package/src/training/window-utils.ts +14 -14
  84. package/src/utils/index.ts +7 -7
  85. package/src/utils/logger.ts +5 -5
  86. package/src/utils/snowflake.ts +1 -1
  87. package/src/utils/synthetic-detector.ts +7 -7
@@ -12,14 +12,14 @@
12
12
  * - Tracks agent actions for performance evaluation
13
13
  */
14
14
 
15
- import type { JsonValue } from '../adapter';
16
- import { logger } from '../utils/logger';
15
+ import type { JsonValue } from "../adapter";
16
+ import { logger } from "../utils/logger";
17
17
  import type {
18
18
  BenchmarkGameSnapshot,
19
19
  GameState,
20
20
  Tick,
21
- } from './BenchmarkDataGenerator';
22
- import { MetricsValidator } from './MetricsValidator';
21
+ } from "./BenchmarkDataGenerator";
22
+ import { MetricsValidator } from "./MetricsValidator";
23
23
 
24
24
  export interface SimulationConfig {
25
25
  /** The benchmark snapshot to replay */
@@ -52,7 +52,7 @@ export interface AgentAction {
52
52
  perpCorrect?: boolean;
53
53
  sentimentAtTrade?: number;
54
54
  priceChange?: number;
55
- expectedDirection?: 'up' | 'down';
55
+ expectedDirection?: "up" | "down";
56
56
 
57
57
  /** Sentiment analysis accuracy tracking */
58
58
  sentimentAccuracy?: number;
@@ -62,14 +62,14 @@ export interface AgentAction {
62
62
  }
63
63
 
64
64
  export type AgentActionType =
65
- | 'query_state'
66
- | 'buy_prediction'
67
- | 'sell_prediction'
68
- | 'open_perp'
69
- | 'close_perp'
70
- | 'create_post'
71
- | 'join_group'
72
- | 'send_message';
65
+ | "query_state"
66
+ | "buy_prediction"
67
+ | "sell_prediction"
68
+ | "open_perp"
69
+ | "close_perp"
70
+ | "create_post"
71
+ | "join_group"
72
+ | "send_message";
73
73
 
74
74
  export type AgentActionResult =
75
75
  | { positionId: string; shares: number } // buy_prediction
@@ -197,23 +197,23 @@ export class SimulationEngine {
197
197
  const validation = MetricsValidator.validate(
198
198
  metrics,
199
199
  this.actions,
200
- this.config.snapshot.groundTruth
200
+ this.config.snapshot.groundTruth,
201
201
  );
202
202
 
203
203
  if (!validation.valid) {
204
- logger.error('Metrics validation failed', {
204
+ logger.error("Metrics validation failed", {
205
205
  errors: validation.errors,
206
206
  warnings: validation.warnings,
207
207
  });
208
208
  }
209
209
  if (validation.warnings.length > 0) {
210
- logger.warn('Metrics validation warnings', {
210
+ logger.warn("Metrics validation warnings", {
211
211
  warnings: validation.warnings,
212
212
  });
213
213
  }
214
214
  const trajectory = this.buildTrajectory();
215
215
 
216
- logger.info('Simulation completed', {
216
+ logger.info("Simulation completed", {
217
217
  duration: endTime - this.startTime,
218
218
  ticksProcessed: this.currentTick,
219
219
  totalPnl: metrics.totalPnl,
@@ -243,7 +243,7 @@ export class SimulationEngine {
243
243
  this.currentTick = 0;
244
244
  this.pnlHistory = [];
245
245
 
246
- logger.info('Simulation initialized', {
246
+ logger.info("Simulation initialized", {
247
247
  benchmarkId: this.config.snapshot.id,
248
248
  agentId: this.config.agentId,
249
249
  totalTicks: this.config.snapshot.ticks.length,
@@ -308,27 +308,27 @@ export class SimulationEngine {
308
308
  */
309
309
  async performAction(
310
310
  type: AgentActionType,
311
- data: Record<string, JsonValue>
311
+ data: Record<string, JsonValue>,
312
312
  ): Promise<{ success: boolean; result?: AgentActionResult; error?: string }> {
313
313
  const actionStart = Date.now();
314
314
 
315
315
  let result: AgentActionResult;
316
- let correctness: AgentAction['correctness'];
316
+ let correctness: AgentAction["correctness"];
317
317
 
318
318
  try {
319
319
  switch (type) {
320
- case 'buy_prediction': {
320
+ case "buy_prediction": {
321
321
  result = this.handleBuyPrediction(data);
322
322
  const { marketId, outcome } = data as {
323
323
  marketId: string;
324
- outcome: 'YES' | 'NO';
324
+ outcome: "YES" | "NO";
325
325
  };
326
326
 
327
327
  // Track correctness for prediction markets
328
328
  const marketOutcome =
329
329
  this.config.snapshot.groundTruth.marketOutcomes[marketId];
330
330
  if (marketOutcome !== undefined) {
331
- const predictedOutcome = outcome === 'YES';
331
+ const predictedOutcome = outcome === "YES";
332
332
  const isCorrect = predictedOutcome === marketOutcome;
333
333
 
334
334
  correctness = {
@@ -340,17 +340,17 @@ export class SimulationEngine {
340
340
  break;
341
341
  }
342
342
 
343
- case 'open_perp': {
343
+ case "open_perp": {
344
344
  result = this.handleOpenPerp(data);
345
345
  const { ticker, side } = data as {
346
346
  ticker: string;
347
- side: 'LONG' | 'SHORT';
347
+ side: "LONG" | "SHORT";
348
348
  };
349
349
 
350
350
  // Track correctness for perp trades based on sentiment and price movement
351
351
  const state = this.getGameState();
352
352
  const market = state.perpetualMarkets.find(
353
- (m: { ticker: string }) => m.ticker === ticker
353
+ (m: { ticker: string }) => m.ticker === ticker,
354
354
  );
355
355
 
356
356
  if (market) {
@@ -370,8 +370,8 @@ export class SimulationEngine {
370
370
  // Determine if trade was correct
371
371
  // If sentiment is negative and we went short, that's correct
372
372
  // If sentiment is positive and we went long, that's correct
373
- const expectedDirection = sentimentAtTrade < 0 ? 'down' : 'up';
374
- const tradeDirection = side === 'SHORT' ? 'down' : 'up';
373
+ const expectedDirection = sentimentAtTrade < 0 ? "down" : "up";
374
+ const tradeDirection = side === "SHORT" ? "down" : "up";
375
375
  const isCorrect = expectedDirection === tradeDirection;
376
376
 
377
377
  correctness = {
@@ -385,15 +385,15 @@ export class SimulationEngine {
385
385
  break;
386
386
  }
387
387
 
388
- case 'close_perp':
388
+ case "close_perp":
389
389
  result = this.handleClosePerp(data);
390
390
  break;
391
391
 
392
- case 'join_group':
392
+ case "join_group":
393
393
  result = this.handleJoinGroup(data);
394
394
  break;
395
395
 
396
- case 'create_post':
396
+ case "create_post":
397
397
  result = this.handleCreatePost(data);
398
398
  break;
399
399
 
@@ -471,13 +471,13 @@ export class SimulationEngine {
471
471
  } {
472
472
  const { marketId, outcome, amount } = data as {
473
473
  marketId: string;
474
- outcome: 'YES' | 'NO';
474
+ outcome: "YES" | "NO";
475
475
  amount: number;
476
476
  };
477
477
 
478
478
  const state = this.getGameState();
479
479
  const market = state.predictionMarkets.find(
480
- (m: { id: string }) => m.id === marketId
480
+ (m: { id: string }) => m.id === marketId,
481
481
  );
482
482
 
483
483
  if (!market) {
@@ -485,7 +485,7 @@ export class SimulationEngine {
485
485
  }
486
486
 
487
487
  // Calculate shares based on current price
488
- const price = outcome === 'YES' ? market.yesPrice : market.noPrice;
488
+ const price = outcome === "YES" ? market.yesPrice : market.noPrice;
489
489
  const shares = amount / price;
490
490
 
491
491
  // Record position
@@ -510,14 +510,14 @@ export class SimulationEngine {
510
510
  } {
511
511
  const { ticker, side, size, leverage } = data as {
512
512
  ticker: string;
513
- side: 'LONG' | 'SHORT';
513
+ side: "LONG" | "SHORT";
514
514
  size: number;
515
515
  leverage: number;
516
516
  };
517
517
 
518
518
  const state = this.getGameState();
519
519
  const market = state.perpetualMarkets.find(
520
- (m: { ticker: string }) => m.ticker === ticker
520
+ (m: { ticker: string }) => m.ticker === ticker,
521
521
  );
522
522
 
523
523
  if (!market) {
@@ -551,7 +551,7 @@ export class SimulationEngine {
551
551
 
552
552
  const state = this.getGameState();
553
553
  const market = state.perpetualMarkets.find(
554
- (m: { ticker: string }) => m.ticker === position.ticker
554
+ (m: { ticker: string }) => m.ticker === position.ticker,
555
555
  );
556
556
 
557
557
  if (!market) {
@@ -561,7 +561,7 @@ export class SimulationEngine {
561
561
  // Calculate realized P&L
562
562
  const priceChange = market.price - position.entryPrice;
563
563
  const pnl =
564
- position.side === 'LONG'
564
+ position.side === "LONG"
565
565
  ? priceChange * position.size * position.leverage
566
566
  : -priceChange * position.size * position.leverage;
567
567
 
@@ -601,13 +601,13 @@ export class SimulationEngine {
601
601
  if (position.closedAt) continue; // Skip closed positions
602
602
 
603
603
  const market = tick.state.perpetualMarkets.find(
604
- (m: { ticker: string; price: number }) => m.ticker === position.ticker
604
+ (m: { ticker: string; price: number }) => m.ticker === position.ticker,
605
605
  );
606
606
  if (!market) continue;
607
607
 
608
608
  const priceChange = market.price - position.entryPrice;
609
609
  position.unrealizedPnl =
610
- position.side === 'LONG'
610
+ position.side === "LONG"
611
611
  ? priceChange * position.size * position.leverage
612
612
  : -priceChange * position.size * position.leverage;
613
613
  }
@@ -629,8 +629,8 @@ export class SimulationEngine {
629
629
  const marketOutcome =
630
630
  this.config.snapshot.groundTruth.marketOutcomes[position.marketId];
631
631
  const isCorrect =
632
- (position.outcome === 'YES' && marketOutcome) ||
633
- (position.outcome === 'NO' && !marketOutcome);
632
+ (position.outcome === "YES" && marketOutcome) ||
633
+ (position.outcome === "NO" && !marketOutcome);
634
634
 
635
635
  if (isCorrect) {
636
636
  correctPredictions++;
@@ -736,12 +736,12 @@ export class SimulationEngine {
736
736
 
737
737
  // Match action type and target
738
738
  if (
739
- optimalAction.type === 'buy_prediction' &&
740
- a.type === 'buy_prediction'
739
+ optimalAction.type === "buy_prediction" &&
740
+ a.type === "buy_prediction"
741
741
  ) {
742
742
  return a.data.marketId === optimalAction.target;
743
743
  }
744
- if (optimalAction.type === 'open_perp' && a.type === 'open_perp') {
744
+ if (optimalAction.type === "open_perp" && a.type === "open_perp") {
745
745
  return a.data.ticker === optimalAction.target;
746
746
  }
747
747
 
@@ -772,9 +772,9 @@ export class SimulationEngine {
772
772
  // Calculate reward based on action outcome
773
773
  let reward = 0;
774
774
 
775
- if (action.type === 'buy_prediction') {
775
+ if (action.type === "buy_prediction") {
776
776
  const positionId = Object.keys(
777
- Object.fromEntries(this.predictionPositions)
777
+ Object.fromEntries(this.predictionPositions),
778
778
  ).find((id) => {
779
779
  const pos = this.predictionPositions.get(id)!;
780
780
  return pos.openedAt === action.tick;
@@ -787,8 +787,8 @@ export class SimulationEngine {
787
787
  position.marketId
788
788
  ];
789
789
  const isCorrect =
790
- (position.outcome === 'YES' && marketOutcome) ||
791
- (position.outcome === 'NO' && !marketOutcome);
790
+ (position.outcome === "YES" && marketOutcome) ||
791
+ (position.outcome === "NO" && !marketOutcome);
792
792
  reward = isCorrect ? 1.0 : -1.0;
793
793
  }
794
794
  }
@@ -812,7 +812,7 @@ export class SimulationEngine {
812
812
 
813
813
  interface PredictionPosition {
814
814
  marketId: string;
815
- outcome: 'YES' | 'NO';
815
+ outcome: "YES" | "NO";
816
816
  shares: number;
817
817
  entryPrice: number;
818
818
  amount: number;
@@ -821,7 +821,7 @@ interface PredictionPosition {
821
821
 
822
822
  interface PerpPosition {
823
823
  ticker: string;
824
- side: 'LONG' | 'SHORT';
824
+ side: "LONG" | "SHORT";
825
825
  size: number;
826
826
  leverage: number;
827
827
  entryPrice: number;
@@ -1,94 +1,102 @@
1
-
2
1
  import {
3
- type IAgentRuntimeLike,
4
- getAgentRuntimeManager,
5
- getAgentService,
6
- getTaskInteractor,
7
- } from '../dependencies';
8
- import { logger } from '../utils/logger';
2
+ getAgentRuntimeManager,
3
+ getAgentService,
4
+ getTaskInteractor,
5
+ } from "../dependencies";
6
+ import { logger } from "../utils/logger";
9
7
 
10
8
  export interface TaskRunnerConfig {
11
- agentName: string;
12
- taskPrompt: string;
13
- iterations: number;
14
- model: string;
9
+ agentName: string;
10
+ taskPrompt: string;
11
+ iterations: number;
12
+ model: string;
15
13
  }
16
14
 
17
15
  export interface TaskRunResult {
18
- iteration: number;
19
- success: boolean;
20
- response: string;
21
- trajectoryId?: string;
22
- error?: string;
23
- duration: number;
16
+ iteration: number;
17
+ success: boolean;
18
+ response: string;
19
+ trajectoryId?: string;
20
+ error?: string;
21
+ duration: number;
24
22
  }
25
23
 
26
24
  export class TaskRunner {
27
- private config: TaskRunnerConfig;
28
-
29
- constructor(config: TaskRunnerConfig) {
30
- this.config = config;
25
+ private config: TaskRunnerConfig;
26
+
27
+ constructor(config: TaskRunnerConfig) {
28
+ this.config = config;
29
+ }
30
+
31
+ async run(): Promise<TaskRunResult[]> {
32
+ logger.info(
33
+ "Starting Task Benchmark",
34
+ { config: this.config },
35
+ "TaskRunner",
36
+ );
37
+
38
+ const agentService = getAgentService();
39
+ const runtimeManager = getAgentRuntimeManager();
40
+ const taskInteractor = getTaskInteractor();
41
+
42
+ // 1. Create or get agent
43
+ // For simplicity, we assume we create a temp agent or use existing.
44
+ // Let's create a temporary agent for this run to ensure clean state.
45
+ const agentUser = await agentService.createAgent({
46
+ userId: "task-benchmark-manager", // Dummy manager ID
47
+ name: this.config.agentName,
48
+ system: "You are a helpful assistant.", // Base system prompt
49
+ bio: ["Helpful", "Smart"],
50
+ modelTier: "standard", // or whatever maps to config.model internally
51
+ });
52
+
53
+ const runtime = await runtimeManager.getRuntime(agentUser.id);
54
+ if (!runtime) {
55
+ throw new Error(`Failed to get runtime for agent ${agentUser.id}`);
31
56
  }
32
57
 
33
- async run(): Promise<TaskRunResult[]> {
34
- logger.info('Starting Task Benchmark', { config: this.config }, 'TaskRunner');
35
-
36
- const agentService = getAgentService();
37
- const runtimeManager = getAgentRuntimeManager();
38
- const taskInteractor = getTaskInteractor();
39
-
40
- // 1. Create or get agent
41
- // For simplicity, we assume we create a temp agent or use existing.
42
- // Let's create a temporary agent for this run to ensure clean state.
43
- const agentUser = await agentService.createAgent({
44
- userId: 'task-benchmark-manager', // Dummy manager ID
45
- name: this.config.agentName,
46
- system: 'You are a helpful assistant.', // Base system prompt
47
- bio: ['Helpful', 'Smart'],
48
- modelTier: 'standard', // or whatever maps to config.model internally
58
+ const results: TaskRunResult[] = [];
59
+
60
+ // 2. Run iterations
61
+ for (let i = 0; i < this.config.iterations; i++) {
62
+ logger.info(
63
+ `Running iteration ${i + 1}/${this.config.iterations}...`,
64
+ {},
65
+ "TaskRunner",
66
+ );
67
+ const start = Date.now();
68
+
69
+ try {
70
+ const result = await taskInteractor.executeTask(
71
+ runtime,
72
+ this.config.taskPrompt,
73
+ );
74
+
75
+ results.push({
76
+ iteration: i + 1,
77
+ success: result.success,
78
+ response: result.response,
79
+ trajectoryId: result.trajectoryId,
80
+ error: result.error,
81
+ duration: Date.now() - start,
49
82
  });
83
+ } catch (err) {
84
+ logger.error("Iteration failed", { error: err }, "TaskRunner");
85
+ results.push({
86
+ iteration: i + 1,
87
+ success: false,
88
+ response: "",
89
+ error: err instanceof Error ? err.message : String(err),
90
+ duration: Date.now() - start,
91
+ });
92
+ }
93
+ }
50
94
 
51
- const runtime = await runtimeManager.getRuntime(agentUser.id);
52
- if (!runtime) {
53
- throw new Error(`Failed to get runtime for agent ${agentUser.id}`);
54
- }
55
-
56
- const results: TaskRunResult[] = [];
57
-
58
- // 2. Run iterations
59
- for (let i = 0; i < this.config.iterations; i++) {
60
- logger.info(`Running iteration ${i + 1}/${this.config.iterations}...`, {}, 'TaskRunner');
61
- const start = Date.now();
62
-
63
- try {
64
- const result = await taskInteractor.executeTask(runtime, this.config.taskPrompt);
65
-
66
- results.push({
67
- iteration: i + 1,
68
- success: result.success,
69
- response: result.response,
70
- trajectoryId: result.trajectoryId,
71
- error: result.error,
72
- duration: Date.now() - start,
73
- });
74
-
75
- } catch (err) {
76
- logger.error('Iteration failed', { error: err }, 'TaskRunner');
77
- results.push({
78
- iteration: i + 1,
79
- success: false,
80
- response: '',
81
- error: err instanceof Error ? err.message : String(err),
82
- duration: Date.now() - start,
83
- });
84
- }
85
- }
86
-
87
- // 3. Cleanup ?
88
- // AgentService might not have delete method exposed in interface?
89
- // Dependencies has `resetRuntime` but not deleteAgent.
90
- // Access adapter if needed, but for now we leave it.
95
+ // 3. Cleanup ?
96
+ // AgentService might not have delete method exposed in interface?
97
+ // Dependencies has `resetRuntime` but not deleteAgent.
98
+ // Access adapter if needed, but for now we leave it.
91
99
 
92
- return results;
93
- }
100
+ return results;
101
+ }
94
102
  }