@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/package.json +2 -2
  2. package/research-output/training-runs/training-run-1773726941205.json +38 -0
  3. package/scripts/rank_trajectories.ts +0 -1
  4. package/scripts/run_task_benchmark.ts +4 -11
  5. package/src/adapter.ts +96 -49
  6. package/src/archetypes/ArchetypeConfigService.ts +188 -185
  7. package/src/archetypes/derive-archetype.ts +47 -47
  8. package/src/archetypes/index.ts +2 -2
  9. package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
  10. package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
  11. package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
  12. package/src/benchmark/BenchmarkDataViewer.ts +32 -30
  13. package/src/benchmark/BenchmarkHistoryService.ts +13 -12
  14. package/src/benchmark/BenchmarkRunner.ts +87 -83
  15. package/src/benchmark/BenchmarkValidator.ts +48 -46
  16. package/src/benchmark/FastEvalRunner.ts +17 -16
  17. package/src/benchmark/MetricsValidator.ts +20 -21
  18. package/src/benchmark/MetricsVisualizer.ts +92 -85
  19. package/src/benchmark/ModelBenchmarkService.ts +90 -82
  20. package/src/benchmark/ModelRegistry.ts +44 -44
  21. package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
  22. package/src/benchmark/SimulationA2AInterface.ts +118 -118
  23. package/src/benchmark/SimulationEngine.ts +51 -51
  24. package/src/benchmark/TaskRunner.ts +87 -79
  25. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
  26. package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
  27. package/src/benchmark/index.ts +27 -27
  28. package/src/benchmark/parseSimulationMetrics.ts +32 -32
  29. package/src/benchmark/simulation-types.ts +10 -10
  30. package/src/dependencies.ts +34 -34
  31. package/src/generation/TrajectoryGenerator.ts +39 -37
  32. package/src/generation/index.ts +1 -1
  33. package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
  34. package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
  35. package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
  36. package/src/huggingface/index.ts +6 -6
  37. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
  38. package/src/index.ts +27 -27
  39. package/src/init-training.ts +6 -6
  40. package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
  41. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
  42. package/src/metrics/index.ts +2 -2
  43. package/src/rubrics/__tests__/index.test.ts +73 -73
  44. package/src/rubrics/ass-kisser.ts +6 -6
  45. package/src/rubrics/degen.ts +6 -6
  46. package/src/rubrics/goody-twoshoes.ts +6 -6
  47. package/src/rubrics/index.ts +50 -50
  48. package/src/rubrics/information-trader.ts +6 -6
  49. package/src/rubrics/infosec.ts +6 -6
  50. package/src/rubrics/liar.ts +6 -6
  51. package/src/rubrics/perps-trader.ts +6 -6
  52. package/src/rubrics/researcher.ts +6 -6
  53. package/src/rubrics/scammer.ts +6 -6
  54. package/src/rubrics/social-butterfly.ts +7 -7
  55. package/src/rubrics/super-predictor.ts +6 -6
  56. package/src/rubrics/trader.ts +5 -5
  57. package/src/scoring/ArchetypeScoringService.ts +56 -54
  58. package/src/scoring/JudgePromptBuilder.ts +96 -96
  59. package/src/scoring/LLMJudgeCache.ts +26 -23
  60. package/src/scoring/index.ts +3 -3
  61. package/src/training/AutomationPipeline.ts +149 -140
  62. package/src/training/BenchmarkService.ts +49 -45
  63. package/src/training/ConfigValidator.ts +38 -32
  64. package/src/training/MarketOutcomesTracker.ts +22 -12
  65. package/src/training/ModelDeployer.ts +15 -15
  66. package/src/training/ModelFetcher.ts +7 -7
  67. package/src/training/ModelSelectionService.ts +32 -32
  68. package/src/training/ModelUsageVerifier.ts +31 -24
  69. package/src/training/MultiModelOrchestrator.ts +44 -44
  70. package/src/training/RLModelConfig.ts +57 -57
  71. package/src/training/RewardBackpropagationService.ts +18 -17
  72. package/src/training/RulerScoringService.ts +73 -72
  73. package/src/training/TrainingMonitor.ts +29 -29
  74. package/src/training/TrajectoryRecorder.ts +25 -27
  75. package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
  76. package/src/training/index.ts +36 -36
  77. package/src/training/logRLConfig.ts +7 -7
  78. package/src/training/pipeline.ts +13 -16
  79. package/src/training/storage/ModelStorageService.ts +32 -32
  80. package/src/training/storage/TrainingDataArchiver.ts +21 -21
  81. package/src/training/storage/index.ts +2 -2
  82. package/src/training/types.ts +6 -6
  83. package/src/training/window-utils.ts +14 -14
  84. package/src/utils/index.ts +7 -7
  85. package/src/utils/logger.ts +5 -5
  86. package/src/utils/snowflake.ts +1 -1
  87. package/src/utils/synthetic-detector.ts +7 -7
@@ -5,11 +5,11 @@
5
5
  * This allows RULER to evaluate agent trajectories against known benchmark outcomes.
6
6
  */
7
7
 
8
- import type { MarketOutcomes } from '../training/RulerScoringService';
8
+ import type { MarketOutcomes } from "../training/RulerScoringService";
9
9
  import type {
10
10
  BenchmarkGameSnapshot,
11
11
  GroundTruth,
12
- } from './BenchmarkDataGenerator';
12
+ } from "./BenchmarkDataGenerator";
13
13
 
14
14
  /**
15
15
  * Extract market outcomes from benchmark ground truth for RULER scoring
@@ -28,15 +28,15 @@ import type {
28
28
  * ```
29
29
  */
30
30
  export function extractMarketOutcomesFromBenchmark(
31
- snapshot: BenchmarkGameSnapshot
31
+ snapshot: BenchmarkGameSnapshot,
32
32
  ): MarketOutcomes {
33
33
  const gt = snapshot.groundTruth;
34
34
 
35
35
  // Extract prediction market outcomes
36
- const predictions: Array<{ marketId: string; outcome: 'YES' | 'NO' }> =
36
+ const predictions: Array<{ marketId: string; outcome: "YES" | "NO" }> =
37
37
  Object.entries(gt.marketOutcomes).map(([marketId, outcome]) => ({
38
38
  marketId,
39
- outcome: outcome ? 'YES' : 'NO',
39
+ outcome: outcome ? "YES" : "NO",
40
40
  }));
41
41
 
42
42
  // Extract stock/perpetual outcomes from price history
@@ -77,10 +77,10 @@ export function extractMarketOutcomesFromBenchmark(
77
77
  */
78
78
  export function getHiddenFactsForTick(
79
79
  snapshot: BenchmarkGameSnapshot,
80
- tickNumber: number
81
- ): GroundTruth['hiddenFacts'] {
80
+ tickNumber: number,
81
+ ): GroundTruth["hiddenFacts"] {
82
82
  return (snapshot.groundTruth.hiddenFacts || []).filter(
83
- (f) => f.tick === tickNumber
83
+ (f) => f.tick === tickNumber,
84
84
  );
85
85
  }
86
86
 
@@ -96,10 +96,10 @@ export function getHiddenFactsForTick(
96
96
  */
97
97
  export function getHiddenEventsForTick(
98
98
  snapshot: BenchmarkGameSnapshot,
99
- tickNumber: number
100
- ): GroundTruth['hiddenEvents'] {
99
+ tickNumber: number,
100
+ ): GroundTruth["hiddenEvents"] {
101
101
  return (snapshot.groundTruth.hiddenEvents || []).filter(
102
- (e) => e.tick === tickNumber
102
+ (e) => e.tick === tickNumber,
103
103
  );
104
104
  }
105
105
 
@@ -119,7 +119,7 @@ export function wasDecisionOptimal(
119
119
  snapshot: BenchmarkGameSnapshot,
120
120
  tickNumber: number,
121
121
  actionType: string,
122
- target: string
122
+ target: string,
123
123
  ): boolean {
124
124
  const optimalActions = snapshot.groundTruth.optimalActions;
125
125
 
@@ -129,7 +129,7 @@ export function wasDecisionOptimal(
129
129
  (a) =>
130
130
  Math.abs(a.tick - tickNumber) <= window &&
131
131
  a.type === actionType &&
132
- a.target === target
132
+ a.target === target,
133
133
  );
134
134
 
135
135
  return relevantActions.length > 0;
@@ -145,8 +145,8 @@ export function wasDecisionOptimal(
145
145
  * @returns Object containing true facts about the world state
146
146
  */
147
147
  export function getTrueFacts(
148
- snapshot: BenchmarkGameSnapshot
149
- ): GroundTruth['trueFacts'] {
148
+ snapshot: BenchmarkGameSnapshot,
149
+ ): GroundTruth["trueFacts"] {
150
150
  return snapshot.groundTruth.trueFacts || {};
151
151
  }
152
152
 
@@ -166,10 +166,10 @@ export function getTrueFacts(
166
166
  */
167
167
  export function createRulerContext(snapshot: BenchmarkGameSnapshot): {
168
168
  marketOutcomes: MarketOutcomes;
169
- trueFacts: GroundTruth['trueFacts'];
170
- hiddenFacts: GroundTruth['hiddenFacts'];
171
- hiddenEvents: GroundTruth['hiddenEvents'];
172
- optimalActions: GroundTruth['optimalActions'];
169
+ trueFacts: GroundTruth["trueFacts"];
170
+ hiddenFacts: GroundTruth["hiddenFacts"];
171
+ hiddenEvents: GroundTruth["hiddenEvents"];
172
+ optimalActions: GroundTruth["optimalActions"];
173
173
  } {
174
174
  return {
175
175
  marketOutcomes: extractMarketOutcomesFromBenchmark(snapshot),
@@ -201,14 +201,14 @@ export function scoreActionAgainstGroundTruth(
201
201
  snapshot: BenchmarkGameSnapshot,
202
202
  tickNumber: number,
203
203
  actionType: string,
204
- target: string
204
+ target: string,
205
205
  ): number {
206
206
  // Check if action was optimal
207
207
  const wasOptimal = wasDecisionOptimal(
208
208
  snapshot,
209
209
  tickNumber,
210
210
  actionType,
211
- target
211
+ target,
212
212
  );
213
213
 
214
214
  if (wasOptimal) {
@@ -220,9 +220,9 @@ export function scoreActionAgainstGroundTruth(
220
220
  const relevantFacts = hiddenFacts.filter(
221
221
  (f) =>
222
222
  f.value &&
223
- typeof f.value === 'object' &&
224
- 'marketId' in f.value &&
225
- (f.value as { marketId: string }).marketId === target
223
+ typeof f.value === "object" &&
224
+ "marketId" in f.value &&
225
+ (f.value as { marketId: string }).marketId === target,
226
226
  );
227
227
 
228
228
  if (relevantFacts.length > 0) {