@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/package.json +2 -2
  2. package/research-output/training-runs/training-run-1773726941205.json +38 -0
  3. package/scripts/rank_trajectories.ts +0 -1
  4. package/scripts/run_task_benchmark.ts +4 -11
  5. package/src/adapter.ts +96 -49
  6. package/src/archetypes/ArchetypeConfigService.ts +188 -185
  7. package/src/archetypes/derive-archetype.ts +47 -47
  8. package/src/archetypes/index.ts +2 -2
  9. package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
  10. package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
  11. package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
  12. package/src/benchmark/BenchmarkDataViewer.ts +32 -30
  13. package/src/benchmark/BenchmarkHistoryService.ts +13 -12
  14. package/src/benchmark/BenchmarkRunner.ts +87 -83
  15. package/src/benchmark/BenchmarkValidator.ts +48 -46
  16. package/src/benchmark/FastEvalRunner.ts +17 -16
  17. package/src/benchmark/MetricsValidator.ts +20 -21
  18. package/src/benchmark/MetricsVisualizer.ts +92 -85
  19. package/src/benchmark/ModelBenchmarkService.ts +90 -82
  20. package/src/benchmark/ModelRegistry.ts +44 -44
  21. package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
  22. package/src/benchmark/SimulationA2AInterface.ts +118 -118
  23. package/src/benchmark/SimulationEngine.ts +51 -51
  24. package/src/benchmark/TaskRunner.ts +87 -79
  25. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
  26. package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
  27. package/src/benchmark/index.ts +27 -27
  28. package/src/benchmark/parseSimulationMetrics.ts +32 -32
  29. package/src/benchmark/simulation-types.ts +10 -10
  30. package/src/dependencies.ts +34 -34
  31. package/src/generation/TrajectoryGenerator.ts +39 -37
  32. package/src/generation/index.ts +1 -1
  33. package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
  34. package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
  35. package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
  36. package/src/huggingface/index.ts +6 -6
  37. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
  38. package/src/index.ts +27 -27
  39. package/src/init-training.ts +6 -6
  40. package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
  41. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
  42. package/src/metrics/index.ts +2 -2
  43. package/src/rubrics/__tests__/index.test.ts +73 -73
  44. package/src/rubrics/ass-kisser.ts +6 -6
  45. package/src/rubrics/degen.ts +6 -6
  46. package/src/rubrics/goody-twoshoes.ts +6 -6
  47. package/src/rubrics/index.ts +50 -50
  48. package/src/rubrics/information-trader.ts +6 -6
  49. package/src/rubrics/infosec.ts +6 -6
  50. package/src/rubrics/liar.ts +6 -6
  51. package/src/rubrics/perps-trader.ts +6 -6
  52. package/src/rubrics/researcher.ts +6 -6
  53. package/src/rubrics/scammer.ts +6 -6
  54. package/src/rubrics/social-butterfly.ts +7 -7
  55. package/src/rubrics/super-predictor.ts +6 -6
  56. package/src/rubrics/trader.ts +5 -5
  57. package/src/scoring/ArchetypeScoringService.ts +56 -54
  58. package/src/scoring/JudgePromptBuilder.ts +96 -96
  59. package/src/scoring/LLMJudgeCache.ts +26 -23
  60. package/src/scoring/index.ts +3 -3
  61. package/src/training/AutomationPipeline.ts +149 -140
  62. package/src/training/BenchmarkService.ts +49 -45
  63. package/src/training/ConfigValidator.ts +38 -32
  64. package/src/training/MarketOutcomesTracker.ts +22 -12
  65. package/src/training/ModelDeployer.ts +15 -15
  66. package/src/training/ModelFetcher.ts +7 -7
  67. package/src/training/ModelSelectionService.ts +32 -32
  68. package/src/training/ModelUsageVerifier.ts +31 -24
  69. package/src/training/MultiModelOrchestrator.ts +44 -44
  70. package/src/training/RLModelConfig.ts +57 -57
  71. package/src/training/RewardBackpropagationService.ts +18 -17
  72. package/src/training/RulerScoringService.ts +73 -72
  73. package/src/training/TrainingMonitor.ts +29 -29
  74. package/src/training/TrajectoryRecorder.ts +25 -27
  75. package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
  76. package/src/training/index.ts +36 -36
  77. package/src/training/logRLConfig.ts +7 -7
  78. package/src/training/pipeline.ts +13 -16
  79. package/src/training/storage/ModelStorageService.ts +32 -32
  80. package/src/training/storage/TrainingDataArchiver.ts +21 -21
  81. package/src/training/storage/index.ts +2 -2
  82. package/src/training/types.ts +6 -6
  83. package/src/training/window-utils.ts +14 -14
  84. package/src/utils/index.ts +7 -7
  85. package/src/utils/logger.ts +5 -5
  86. package/src/utils/snowflake.ts +1 -1
  87. package/src/utils/synthetic-detector.ts +7 -7
@@ -5,15 +5,15 @@
5
5
  * Useful for validation and understanding benchmark structure.
6
6
  */
7
7
 
8
- import type { JsonValue } from '../adapter';
9
- import { promises as fs } from 'fs';
8
+ import { promises as fs } from "node:fs";
9
+ import type { JsonValue } from "../adapter";
10
10
  import type {
11
11
  BenchmarkGameSnapshot,
12
12
  GameState,
13
13
  GroundTruth,
14
14
  Tick,
15
- } from './BenchmarkDataGenerator';
16
- import * as BenchmarkValidator from './BenchmarkValidator';
15
+ } from "./BenchmarkDataGenerator";
16
+ import * as BenchmarkValidator from "./BenchmarkValidator";
17
17
 
18
18
  export interface BenchmarkViewOptions {
19
19
  /** Show detailed information */
@@ -81,9 +81,9 @@ export class BenchmarkDataViewer {
81
81
  */
82
82
  static async view(
83
83
  filePath: string,
84
- options: BenchmarkViewOptions = {}
84
+ options: BenchmarkViewOptions = {},
85
85
  ): Promise<BenchmarkView> {
86
- const data = await fs.readFile(filePath, 'utf-8');
86
+ const data = await fs.readFile(filePath, "utf-8");
87
87
  const snapshot = JSON.parse(data) as BenchmarkGameSnapshot;
88
88
 
89
89
  // Validate
@@ -105,13 +105,15 @@ export class BenchmarkDataViewer {
105
105
  groupChats: snapshot.initialState.groupChats?.length || 0,
106
106
  },
107
107
 
108
- ticks: this.analyzeTicks(snapshot.ticks),
108
+ ticks: BenchmarkDataViewer.analyzeTicks(snapshot.ticks),
109
109
 
110
110
  validation,
111
111
  };
112
112
 
113
113
  if (options.showGroundTruth || options.verbose) {
114
- view.groundTruth = this.analyzeGroundTruth(snapshot.groundTruth);
114
+ view.groundTruth = BenchmarkDataViewer.analyzeGroundTruth(
115
+ snapshot.groundTruth,
116
+ );
115
117
  }
116
118
 
117
119
  return view;
@@ -120,7 +122,7 @@ export class BenchmarkDataViewer {
120
122
  /**
121
123
  * Analyze ticks
122
124
  */
123
- private static analyzeTicks(ticks: Tick[]): BenchmarkView['ticks'] {
125
+ private static analyzeTicks(ticks: Tick[]): BenchmarkView["ticks"] {
124
126
  const eventTypes: Record<string, number> = {};
125
127
  let withEvents = 0;
126
128
 
@@ -145,15 +147,15 @@ export class BenchmarkDataViewer {
145
147
  * Analyze ground truth
146
148
  */
147
149
  private static analyzeGroundTruth(
148
- groundTruth: GroundTruth
149
- ): BenchmarkView['groundTruth'] {
150
+ groundTruth: GroundTruth,
151
+ ): BenchmarkView["groundTruth"] {
150
152
  return {
151
153
  marketOutcomes: Object.keys(groundTruth.marketOutcomes).length,
152
154
  priceHistory: Object.fromEntries(
153
155
  Object.entries(groundTruth.priceHistory).map(([ticker, history]) => [
154
156
  ticker,
155
157
  history.length,
156
- ])
158
+ ]),
157
159
  ),
158
160
  optimalActions: groundTruth.optimalActions.length,
159
161
  socialOpportunities: groundTruth.socialOpportunities.length,
@@ -167,21 +169,21 @@ export class BenchmarkDataViewer {
167
169
  * Print view to console
168
170
  */
169
171
  static print(view: BenchmarkView, options: BenchmarkViewOptions = {}): void {
170
- console.log('\n📊 Benchmark Data View\n');
172
+ console.log("\n📊 Benchmark Data View\n");
171
173
  console.log(`ID: ${view.id}`);
172
174
  console.log(`Version: ${view.version}`);
173
175
  console.log(`Created: ${new Date(view.createdAt).toISOString()}`);
174
176
  console.log(`Duration: ${(view.duration / 60).toFixed(1)} minutes`);
175
177
  console.log(`Tick Interval: ${view.tickInterval}s`);
176
178
 
177
- console.log('\n📈 Initial State:');
179
+ console.log("\n📈 Initial State:");
178
180
  console.log(` Prediction Markets: ${view.initialState.predictionMarkets}`);
179
181
  console.log(` Perpetual Markets: ${view.initialState.perpetualMarkets}`);
180
182
  console.log(` Agents: ${view.initialState.agents}`);
181
183
  console.log(` Posts: ${view.initialState.posts}`);
182
184
  console.log(` Group Chats: ${view.initialState.groupChats}`);
183
185
 
184
- console.log('\n⏱️ Ticks:');
186
+ console.log("\n⏱️ Ticks:");
185
187
  console.log(` Total: ${view.ticks.total}`);
186
188
  console.log(` With Events: ${view.ticks.withEvents}`);
187
189
  if (options.verbose) {
@@ -192,27 +194,27 @@ export class BenchmarkDataViewer {
192
194
  }
193
195
 
194
196
  if (view.groundTruth) {
195
- console.log('\n🎯 Ground Truth:');
197
+ console.log("\n🎯 Ground Truth:");
196
198
  console.log(` Market Outcomes: ${view.groundTruth.marketOutcomes}`);
197
199
  console.log(` Price History:`);
198
200
  for (const [ticker, count] of Object.entries(
199
- view.groundTruth.priceHistory
201
+ view.groundTruth.priceHistory,
200
202
  )) {
201
203
  console.log(` ${ticker}: ${count} ticks`);
202
204
  }
203
205
  console.log(` Optimal Actions: ${view.groundTruth.optimalActions}`);
204
206
  console.log(
205
- ` Social Opportunities: ${view.groundTruth.socialOpportunities}`
207
+ ` Social Opportunities: ${view.groundTruth.socialOpportunities}`,
206
208
  );
207
209
  if (options.showHidden) {
208
210
  console.log(` Hidden Facts: ${view.groundTruth.hiddenFacts}`);
209
211
  console.log(` Hidden Events: ${view.groundTruth.hiddenEvents}`);
210
- console.log(` True Facts: ${view.groundTruth.trueFacts.join(', ')}`);
212
+ console.log(` True Facts: ${view.groundTruth.trueFacts.join(", ")}`);
211
213
  }
212
214
  }
213
215
 
214
- console.log('\n✅ Validation:');
215
- console.log(` Valid: ${view.validation.valid ? '' : ''}`);
216
+ console.log("\n✅ Validation:");
217
+ console.log(` Valid: ${view.validation.valid ? "" : ""}`);
216
218
  if (view.validation.errors.length > 0) {
217
219
  console.log(` Errors: ${view.validation.errors.length}`);
218
220
  if (options.verbose) {
@@ -230,7 +232,7 @@ export class BenchmarkDataViewer {
230
232
  }
231
233
  }
232
234
 
233
- console.log('');
235
+ console.log("");
234
236
  }
235
237
 
236
238
  /**
@@ -238,7 +240,7 @@ export class BenchmarkDataViewer {
238
240
  */
239
241
  static getTickDetails(
240
242
  snapshot: BenchmarkGameSnapshot,
241
- tickNumber: number
243
+ tickNumber: number,
242
244
  ): {
243
245
  tick: Tick | null;
244
246
  state: GameState | null;
@@ -265,7 +267,7 @@ export class BenchmarkDataViewer {
265
267
  */
266
268
  static getGroundTruthForTick(
267
269
  snapshot: BenchmarkGameSnapshot,
268
- tickNumber: number
270
+ tickNumber: number,
269
271
  ): {
270
272
  hiddenFacts: Array<{ fact: string; category: string }>;
271
273
  hiddenEvents: Array<{ type: string; description: string }>;
@@ -302,14 +304,14 @@ export class BenchmarkDataViewer {
302
304
  // Check if ground truth is accidentally in state
303
305
  const stateKeys = Object.keys(state);
304
306
  const hasGroundTruthInState =
305
- stateKeys.includes('groundTruth') ||
306
- stateKeys.includes('hiddenFacts') ||
307
- stateKeys.includes('hiddenEvents');
307
+ stateKeys.includes("groundTruth") ||
308
+ stateKeys.includes("hiddenFacts") ||
309
+ stateKeys.includes("hiddenEvents");
308
310
 
309
311
  if (hasGroundTruthInState) {
310
312
  return {
311
313
  canAccess: true,
312
- reason: 'Ground truth found in game state (security issue!)',
314
+ reason: "Ground truth found in game state (security issue!)",
313
315
  };
314
316
  }
315
317
 
@@ -317,8 +319,8 @@ export class BenchmarkDataViewer {
317
319
  canAccess: false,
318
320
  reason:
319
321
  hasGroundTruth && hasHiddenFacts
320
- ? 'Ground truth exists but is properly isolated from game state'
321
- : 'No ground truth data found',
322
+ ? "Ground truth exists but is properly isolated from game state"
323
+ : "No ground truth data found",
322
324
  };
323
325
  }
324
326
  }
@@ -5,13 +5,13 @@
5
5
  */
6
6
 
7
7
  import {
8
- getTrainingDataAdapter,
9
8
  type BenchmarkResultRecord,
9
+ getTrainingDataAdapter,
10
10
  type JsonValue,
11
- } from '../adapter';
12
- import { logger } from '../utils/logger';
13
- import { generateSnowflakeId } from '../utils/snowflake';
14
- import type { SimulationMetrics } from './SimulationEngine';
11
+ } from "../adapter";
12
+ import { logger } from "../utils/logger";
13
+ import { generateSnowflakeId } from "../utils/snowflake";
14
+ import type { SimulationMetrics } from "./SimulationEngine";
15
15
 
16
16
  export interface BenchmarkResultInput {
17
17
  modelId: string;
@@ -45,12 +45,13 @@ export interface BenchmarkTrendData {
45
45
  /**
46
46
  * Service for managing benchmark result history
47
47
  */
48
+ // biome-ignore lint/complexity/noStaticOnlyClass: Service namespace - methods are logically grouped
48
49
  export class BenchmarkHistoryService {
49
50
  /**
50
51
  * Save a benchmark result to the database
51
52
  */
52
53
  static async saveResult(
53
- input: BenchmarkResultInput
54
+ input: BenchmarkResultInput,
54
55
  ): Promise<BenchmarkResultRecord> {
55
56
  const id = await generateSnowflakeId();
56
57
  const now = new Date();
@@ -74,7 +75,7 @@ export class BenchmarkHistoryService {
74
75
 
75
76
  await getTrainingDataAdapter().insertBenchmarkResult(insertData);
76
77
 
77
- logger.info('Saved benchmark result', {
78
+ logger.info("Saved benchmark result", {
78
79
  id,
79
80
  modelId: input.modelId,
80
81
  benchmarkId: input.benchmarkId,
@@ -88,7 +89,7 @@ export class BenchmarkHistoryService {
88
89
  * Get benchmark results by query
89
90
  */
90
91
  static async getResults(
91
- query: BenchmarkHistoryQuery
92
+ query: BenchmarkHistoryQuery,
92
93
  ): Promise<BenchmarkResultRecord[]> {
93
94
  return getTrainingDataAdapter().queryBenchmarkResults({
94
95
  modelId: query.modelId,
@@ -103,7 +104,7 @@ export class BenchmarkHistoryService {
103
104
  * Get the latest result for a model
104
105
  */
105
106
  static async getLatestResult(
106
- modelId: string
107
+ modelId: string,
107
108
  ): Promise<BenchmarkResultRecord | null> {
108
109
  const results = await getTrainingDataAdapter().queryBenchmarkResults({
109
110
  modelId,
@@ -117,7 +118,7 @@ export class BenchmarkHistoryService {
117
118
  */
118
119
  static async getTrendData(
119
120
  modelId: string,
120
- limit = 20
121
+ limit = 20,
121
122
  ): Promise<BenchmarkTrendData> {
122
123
  const results = await getTrainingDataAdapter().queryBenchmarkResults({
123
124
  modelId,
@@ -141,7 +142,7 @@ export class BenchmarkHistoryService {
141
142
  */
142
143
  static async getModelComparison(
143
144
  modelIds: string[],
144
- benchmarkId?: string
145
+ benchmarkId?: string,
145
146
  ): Promise<Map<string, BenchmarkResultRecord[]>> {
146
147
  const adapter = getTrainingDataAdapter();
147
148
  const comparison = new Map<string, BenchmarkResultRecord[]>();
@@ -181,7 +182,7 @@ export class BenchmarkHistoryService {
181
182
  static async checkImprovement(
182
183
  modelId: string,
183
184
  baselineModelId: string,
184
- benchmarkId: string
185
+ benchmarkId: string,
185
186
  ): Promise<{
186
187
  improved: boolean;
187
188
  modelPnl: number;