@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/package.json +2 -2
  2. package/scripts/rank_trajectories.ts +0 -1
  3. package/scripts/run_task_benchmark.ts +4 -11
  4. package/src/adapter.ts +96 -49
  5. package/src/archetypes/ArchetypeConfigService.ts +188 -185
  6. package/src/archetypes/derive-archetype.ts +47 -47
  7. package/src/archetypes/index.ts +2 -2
  8. package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
  9. package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
  10. package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
  11. package/src/benchmark/BenchmarkDataViewer.ts +32 -30
  12. package/src/benchmark/BenchmarkHistoryService.ts +13 -12
  13. package/src/benchmark/BenchmarkRunner.ts +87 -83
  14. package/src/benchmark/BenchmarkValidator.ts +48 -46
  15. package/src/benchmark/FastEvalRunner.ts +17 -16
  16. package/src/benchmark/MetricsValidator.ts +20 -21
  17. package/src/benchmark/MetricsVisualizer.ts +92 -85
  18. package/src/benchmark/ModelBenchmarkService.ts +90 -82
  19. package/src/benchmark/ModelRegistry.ts +44 -44
  20. package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
  21. package/src/benchmark/SimulationA2AInterface.ts +118 -118
  22. package/src/benchmark/SimulationEngine.ts +51 -51
  23. package/src/benchmark/TaskRunner.ts +87 -79
  24. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
  25. package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
  26. package/src/benchmark/index.ts +27 -27
  27. package/src/benchmark/parseSimulationMetrics.ts +32 -32
  28. package/src/benchmark/simulation-types.ts +10 -10
  29. package/src/dependencies.ts +34 -34
  30. package/src/generation/TrajectoryGenerator.ts +39 -37
  31. package/src/generation/index.ts +1 -1
  32. package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
  33. package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
  34. package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
  35. package/src/huggingface/index.ts +6 -6
  36. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
  37. package/src/index.ts +27 -27
  38. package/src/init-training.ts +6 -6
  39. package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
  40. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
  41. package/src/metrics/index.ts +2 -2
  42. package/src/rubrics/__tests__/index.test.ts +73 -73
  43. package/src/rubrics/ass-kisser.ts +6 -6
  44. package/src/rubrics/degen.ts +6 -6
  45. package/src/rubrics/goody-twoshoes.ts +6 -6
  46. package/src/rubrics/index.ts +50 -50
  47. package/src/rubrics/information-trader.ts +6 -6
  48. package/src/rubrics/infosec.ts +6 -6
  49. package/src/rubrics/liar.ts +6 -6
  50. package/src/rubrics/perps-trader.ts +6 -6
  51. package/src/rubrics/researcher.ts +6 -6
  52. package/src/rubrics/scammer.ts +6 -6
  53. package/src/rubrics/social-butterfly.ts +7 -7
  54. package/src/rubrics/super-predictor.ts +6 -6
  55. package/src/rubrics/trader.ts +5 -5
  56. package/src/scoring/ArchetypeScoringService.ts +56 -54
  57. package/src/scoring/JudgePromptBuilder.ts +96 -96
  58. package/src/scoring/LLMJudgeCache.ts +26 -23
  59. package/src/scoring/index.ts +3 -3
  60. package/src/training/AutomationPipeline.ts +149 -140
  61. package/src/training/BenchmarkService.ts +49 -45
  62. package/src/training/ConfigValidator.ts +38 -32
  63. package/src/training/MarketOutcomesTracker.ts +22 -12
  64. package/src/training/ModelDeployer.ts +15 -15
  65. package/src/training/ModelFetcher.ts +7 -7
  66. package/src/training/ModelSelectionService.ts +32 -32
  67. package/src/training/ModelUsageVerifier.ts +31 -24
  68. package/src/training/MultiModelOrchestrator.ts +44 -44
  69. package/src/training/RLModelConfig.ts +57 -57
  70. package/src/training/RewardBackpropagationService.ts +18 -17
  71. package/src/training/RulerScoringService.ts +73 -72
  72. package/src/training/TrainingMonitor.ts +29 -29
  73. package/src/training/TrajectoryRecorder.ts +25 -27
  74. package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
  75. package/src/training/index.ts +36 -36
  76. package/src/training/logRLConfig.ts +7 -7
  77. package/src/training/pipeline.ts +13 -16
  78. package/src/training/storage/ModelStorageService.ts +32 -32
  79. package/src/training/storage/TrainingDataArchiver.ts +21 -21
  80. package/src/training/storage/index.ts +2 -2
  81. package/src/training/types.ts +6 -6
  82. package/src/training/window-utils.ts +14 -14
  83. package/src/utils/index.ts +7 -7
  84. package/src/utils/logger.ts +5 -5
  85. package/src/utils/snowflake.ts +1 -1
  86. package/src/utils/synthetic-detector.ts +7 -7
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@elizaos/training",
3
- "version": "2.0.0-alpha.13",
3
+ "version": "2.0.0-alpha.14",
4
4
  "description": "ElizaOS RL training pipeline with benchmarking and model publishing support",
5
5
  "main": "./src/index.ts",
6
6
  "types": "./src/index.ts",
@@ -53,5 +53,5 @@
53
53
  "bun-types": "^1.3.2",
54
54
  "typescript": "^5.9.3"
55
55
  },
56
- "gitHead": "9448dcfc32d38873e1e2596d4ff4eca444fadca0"
56
+ "gitHead": "4eb31c47081d48bec956e6b9751f3c9aee3eb38d"
57
57
  }
package/scripts/rank_trajectories.ts CHANGED
@@ -67,7 +67,6 @@ async function main() {
67
67
  // Initialize Judge Runtime
68
68
  const character = {
69
69
  name: 'JudgeAgent',
70
- modelProvider: "openai" as any,
71
70
  bio: ['I am an impartial AI judge.'],
72
71
  settings: {
73
72
  secrets: {
package/scripts/run_task_benchmark.ts CHANGED
@@ -50,7 +50,6 @@ class BenchmarkRuntimeManager implements IAgentRuntimeManager {
50
50
  // Create a new runtime
51
51
  const character = {
52
52
  name: 'BenchmarkAgent',
53
- modelProvider: "openai" as any,
54
53
  bio: 'A helpful assistant for benchmarking.',
55
54
  settings: {
56
55
  secrets: {
@@ -60,13 +59,7 @@ class BenchmarkRuntimeManager implements IAgentRuntimeManager {
60
59
  };
61
60
 
62
61
  const runtime = new AgentRuntime({
63
- token: process.env.OPENAI_API_KEY || '',
64
- modelProvider: "openai" as any,
65
62
  character,
66
- plugins: [],
67
- providers: [],
68
- actions: [],
69
- evaluators: [],
70
63
  });
71
64
 
72
65
  // We must initialize with allowNoDatabase to avoid DB error
@@ -227,11 +220,11 @@ async function main() {
227
220
  agentService: new BenchmarkAgentService(),
228
221
  agentRuntimeManager: new BenchmarkRuntimeManager(),
229
222
  autonomousCoordinator: {
230
- executeAutonomousTick: async () => ({ success: true })
231
- } as any,
223
+ executeAutonomousTick: async () => ({ success: true }),
224
+ },
232
225
  llmCaller: {
233
- callGroqDirect: async () => "mock response"
234
- } as any,
226
+ callGroqDirect: async () => "mock response",
227
+ },
235
228
  });
236
229
 
237
230
  // Import task interactor config
package/src/adapter.ts CHANGED
@@ -24,7 +24,7 @@ export type JsonValue =
24
24
  /**
25
25
  * UUID-like string identifier.
26
26
  */
27
- export type UUID = string & { readonly __brand: 'UUID' };
27
+ export type UUID = string & { readonly __brand: "UUID" };
28
28
 
29
29
  // ─── Record types (replace schema-derived types from @elizaos/db) ───────
30
30
 
@@ -170,7 +170,9 @@ export interface ITrainingDataAdapter {
170
170
  * Get scenario groups with counts.
171
171
  * Returns groups where count >= minGroupSize.
172
172
  */
173
- getScenarioGroups(minGroupSize: number): Promise<Array<{ scenarioId: string | null; count: number }>>;
173
+ getScenarioGroups(
174
+ minGroupSize: number,
175
+ ): Promise<Array<{ scenarioId: string | null; count: number }>>;
174
176
 
175
177
  /**
176
178
  * Sample recent trajectories for data quality assessment.
@@ -194,10 +196,17 @@ export interface ITrainingDataAdapter {
194
196
  getTrajectoryById(trajectoryId: string): Promise<TrajectoryRecord | null>;
195
197
 
196
198
  /** Mark trajectories as used in a training batch. */
197
- markTrajectoriesAsUsed(trajectoryIds: string[], batchId: string): Promise<void>;
199
+ markTrajectoriesAsUsed(
200
+ trajectoryIds: string[],
201
+ batchId: string,
202
+ ): Promise<void>;
198
203
 
199
204
  /** Update trajectory reward data. */
200
- updateTrajectoryRewards(id: string, stepsJson: string, totalReward: number): Promise<void>;
205
+ updateTrajectoryRewards(
206
+ id: string,
207
+ stepsJson: string,
208
+ totalReward: number,
209
+ ): Promise<void>;
201
210
 
202
211
  /** Update trajectory with judge score. */
203
212
  updateTrajectoryScore(
@@ -207,7 +216,9 @@ export interface ITrainingDataAdapter {
207
216
  ): Promise<void>;
208
217
 
209
218
  /** Insert a new trajectory record. */
210
- insertTrajectory(data: Omit<TrajectoryRecord, 'createdAt' | 'updatedAt'>): Promise<void>;
219
+ insertTrajectory(
220
+ data: Omit<TrajectoryRecord, "createdAt" | "updatedAt">,
221
+ ): Promise<void>;
211
222
 
212
223
  /**
213
224
  * Count trajectories created since a given timestamp.
@@ -227,7 +238,10 @@ export interface ITrainingDataAdapter {
227
238
  getModelByVersion(version: string): Promise<TrainedModelRecord | null>;
228
239
 
229
240
  /** Get model associated with a training batch and status. */
230
- getModelByBatchAndStatus(batchId: string, status: string): Promise<TrainedModelRecord | null>;
241
+ getModelByBatchAndStatus(
242
+ batchId: string,
243
+ status: string,
244
+ ): Promise<TrainedModelRecord | null>;
231
245
 
232
246
  /** Count deployed models. */
233
247
  countDeployedModels(): Promise<number>;
@@ -254,7 +268,7 @@ export interface ITrainingDataAdapter {
254
268
  updateModelHuggingFaceRepo(modelId: string, repoName: string): Promise<void>;
255
269
 
256
270
  /** Insert a new trained model record. */
257
- insertModel(data: Omit<TrainedModelRecord, 'createdAt'>): Promise<void>;
271
+ insertModel(data: Omit<TrainedModelRecord, "createdAt">): Promise<void>;
258
272
 
259
273
  // ── Batch operations ───────────────────────────────────────────────
260
274
 
@@ -271,10 +285,16 @@ export interface ITrainingDataAdapter {
271
285
  getLastCompletedBatch(): Promise<TrainingBatchRecord | null>;
272
286
 
273
287
  /** Update batch status. */
274
- updateBatchStatus(batchId: string, status: string, error?: string): Promise<void>;
288
+ updateBatchStatus(
289
+ batchId: string,
290
+ status: string,
291
+ error?: string,
292
+ ): Promise<void>;
275
293
 
276
294
  /** Insert a new training batch. */
277
- insertBatch(data: Omit<TrainingBatchRecord, 'startedAt' | 'completedAt'>): Promise<string>;
295
+ insertBatch(
296
+ data: Omit<TrainingBatchRecord, "startedAt" | "completedAt">,
297
+ ): Promise<string>;
278
298
 
279
299
  // ── Benchmark operations ───────────────────────────────────────────
280
300
 
@@ -285,13 +305,15 @@ export interface ITrainingDataAdapter {
285
305
  countBenchmarksSince(since: Date): Promise<number>;
286
306
 
287
307
  /** Insert a benchmark result. */
288
- insertBenchmarkResult(data: Omit<BenchmarkResultRecord, 'createdAt'>): Promise<void>;
308
+ insertBenchmarkResult(
309
+ data: Omit<BenchmarkResultRecord, "createdAt">,
310
+ ): Promise<void>;
289
311
 
290
312
  // ── User/Agent operations ──────────────────────────────────────────
291
313
 
292
314
  /** Get agent users (isAgent=true). Supports optional strategy filtering. */
293
315
  getAgentUsers(filter?: {
294
- strategy?: 'all' | 'gradual' | 'test';
316
+ strategy?: "all" | "gradual" | "test";
295
317
  rolloutPercentage?: number;
296
318
  testAgentIds?: string[];
297
319
  }): Promise<UserRecord[]>;
@@ -317,7 +339,10 @@ export interface ITrainingDataAdapter {
317
339
  createAgentConfig(data: Record<string, unknown>): Promise<void>;
318
340
 
319
341
  /** Update an agent configuration by userId. */
320
- updateAgentConfig(userId: string, data: Record<string, unknown>): Promise<void>;
342
+ updateAgentConfig(
343
+ userId: string,
344
+ data: Record<string, unknown>,
345
+ ): Promise<void>;
321
346
 
322
347
  /**
323
348
  * Flexible benchmark result query with optional filters.
@@ -332,15 +357,17 @@ export interface ITrainingDataAdapter {
332
357
  }): Promise<BenchmarkResultRecord[]>;
333
358
 
334
359
  /** Aggregate benchmark statistics per model, ordered by avgPnl descending. */
335
- getBenchmarkModelSummary(): Promise<Array<{
336
- modelId: string;
337
- runCount: number;
338
- avgPnl: number;
339
- avgAccuracy: number;
340
- avgOptimality: number;
341
- bestPnl: number;
342
- latestRun: Date;
343
- }>>;
360
+ getBenchmarkModelSummary(): Promise<
361
+ Array<{
362
+ modelId: string;
363
+ runCount: number;
364
+ avgPnl: number;
365
+ avgAccuracy: number;
366
+ avgOptimality: number;
367
+ bestPnl: number;
368
+ latestRun: Date;
369
+ }>
370
+ >;
344
371
 
345
372
  /**
346
373
  * Get scored training trajectories (isTrainingData=true with judge scores).
@@ -366,10 +393,15 @@ export interface ITrainingDataAdapter {
366
393
  // ── Additional operations (added for service refactoring) ────────
367
394
 
368
395
  /** Get the best benchmarked model, optionally excluding a model ID. Status 'ready'/'deployed', non-null benchmarkScore, ordered by score desc. */
369
- getBestBenchmarkedModel(excludeModelId?: string): Promise<TrainedModelRecord | null>;
396
+ getBestBenchmarkedModel(
397
+ excludeModelId?: string,
398
+ ): Promise<TrainedModelRecord | null>;
370
399
 
371
400
  /** Update model with detailed benchmark results (score, accuracy, eval metrics). */
372
- updateModelBenchmarkResults(modelId: string, data: { benchmarkScore: number; accuracy: number; evalMetrics: JsonValue }): Promise<void>;
401
+ updateModelBenchmarkResults(
402
+ modelId: string,
403
+ data: { benchmarkScore: number; accuracy: number; evalMetrics: JsonValue },
404
+ ): Promise<void>;
373
405
 
374
406
  /** Get models with benchmark scores, ordered by score descending. */
375
407
  getBenchmarkedModels(limit: number): Promise<TrainedModelRecord[]>;
@@ -387,7 +419,10 @@ export interface ITrainingDataAdapter {
387
419
  getTrajectoriesByIds(trajectoryIds: string[]): Promise<TrajectoryRecord[]>;
388
420
 
389
421
  /** Get unscored trajectories, optionally filtered by IDs or limited. */
390
- getUnscoredTrajectories(options?: { trajectoryIds?: string[]; limit?: number }): Promise<TrajectoryRecord[]>;
422
+ getUnscoredTrajectories(options?: {
423
+ trajectoryIds?: string[];
424
+ limit?: number;
425
+ }): Promise<TrajectoryRecord[]>;
391
426
 
392
427
  /** Get unscored trajectory IDs for a specific window. */
393
428
  getUnscoredWindowTrajectoryIds(windowId: string): Promise<string[]>;
@@ -399,32 +434,44 @@ export interface ITrainingDataAdapter {
399
434
  */
400
435
  export interface IMarketDataAdapter {
401
436
  /** Get perpetual positions within a time window. */
402
- getPerpPositionsForWindow(windowStart: Date, windowEnd: Date): Promise<Array<{
403
- id: string;
404
- ticker?: string;
405
- direction: string;
406
- entryPrice: number;
407
- currentPrice?: number | null;
408
- exitPrice: number | null;
409
- closedAt?: Date | null;
410
- pnl: number | null;
411
- [key: string]: JsonValue | Date | null | undefined;
412
- }>>;
437
+ getPerpPositionsForWindow(
438
+ windowStart: Date,
439
+ windowEnd: Date,
440
+ ): Promise<
441
+ Array<{
442
+ id: string;
443
+ ticker?: string;
444
+ direction: string;
445
+ entryPrice: number;
446
+ currentPrice?: number | null;
447
+ exitPrice: number | null;
448
+ closedAt?: Date | null;
449
+ pnl: number | null;
450
+ [key: string]: JsonValue | Date | null | undefined;
451
+ }>
452
+ >;
413
453
 
414
454
  /** Get resolved prediction markets within a time window. */
415
- getResolvedMarketsForWindow(windowStart: Date, windowEnd: Date): Promise<Array<{
416
- id: string;
417
- question: string;
418
- outcome: boolean | null;
419
- finalProbability: number | null;
420
- [key: string]: JsonValue | boolean | Date | null | undefined;
421
- }>>;
455
+ getResolvedMarketsForWindow(
456
+ windowStart: Date,
457
+ windowEnd: Date,
458
+ ): Promise<
459
+ Array<{
460
+ id: string;
461
+ question: string;
462
+ outcome: boolean | null;
463
+ finalProbability: number | null;
464
+ [key: string]: JsonValue | boolean | Date | null | undefined;
465
+ }>
466
+ >;
422
467
 
423
468
  /** Get market outcomes for a window ID. */
424
- getMarketOutcomesByWindow(windowId: string): Promise<Array<{
425
- windowId: string;
426
- [key: string]: JsonValue | undefined;
427
- }>>;
469
+ getMarketOutcomesByWindow(windowId: string): Promise<
470
+ Array<{
471
+ windowId: string;
472
+ [key: string]: JsonValue | undefined;
473
+ }>
474
+ >;
428
475
 
429
476
  /** Insert a market outcome record. */
430
477
  insertMarketOutcome(data: Record<string, JsonValue>): Promise<void>;
@@ -462,9 +509,9 @@ let _llmLogAdapter: ILlmLogAdapter | null = null;
462
509
  * Must be called before any training operations that need database access.
463
510
  */
464
511
  export function setTrainingDataAdapter(adapter: ITrainingDataAdapter): void {
465
- if (!adapter || typeof adapter.countScoredTrajectoriesReady !== 'function') {
512
+ if (!adapter || typeof adapter.countScoredTrajectoriesReady !== "function") {
466
513
  throw new TypeError(
467
- 'setTrainingDataAdapter: provided object does not implement ITrainingDataAdapter'
514
+ "setTrainingDataAdapter: provided object does not implement ITrainingDataAdapter",
468
515
  );
469
516
  }
470
517
  _dataAdapter = adapter;
@@ -487,7 +534,7 @@ export function setLlmLogAdapter(adapter: ILlmLogAdapter): void {
487
534
  export function getTrainingDataAdapter(): ITrainingDataAdapter {
488
535
  if (!_dataAdapter) {
489
536
  throw new Error(
490
- 'Training data adapter not registered. Call setTrainingDataAdapter() before using training operations.'
537
+ "Training data adapter not registered. Call setTrainingDataAdapter() before using training operations.",
491
538
  );
492
539
  }
493
540
  return _dataAdapter;