@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/package.json +2 -2
  2. package/research-output/training-runs/training-run-1773726941205.json +38 -0
  3. package/scripts/rank_trajectories.ts +0 -1
  4. package/scripts/run_task_benchmark.ts +4 -11
  5. package/src/adapter.ts +96 -49
  6. package/src/archetypes/ArchetypeConfigService.ts +188 -185
  7. package/src/archetypes/derive-archetype.ts +47 -47
  8. package/src/archetypes/index.ts +2 -2
  9. package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
  10. package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
  11. package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
  12. package/src/benchmark/BenchmarkDataViewer.ts +32 -30
  13. package/src/benchmark/BenchmarkHistoryService.ts +13 -12
  14. package/src/benchmark/BenchmarkRunner.ts +87 -83
  15. package/src/benchmark/BenchmarkValidator.ts +48 -46
  16. package/src/benchmark/FastEvalRunner.ts +17 -16
  17. package/src/benchmark/MetricsValidator.ts +20 -21
  18. package/src/benchmark/MetricsVisualizer.ts +92 -85
  19. package/src/benchmark/ModelBenchmarkService.ts +90 -82
  20. package/src/benchmark/ModelRegistry.ts +44 -44
  21. package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
  22. package/src/benchmark/SimulationA2AInterface.ts +118 -118
  23. package/src/benchmark/SimulationEngine.ts +51 -51
  24. package/src/benchmark/TaskRunner.ts +87 -79
  25. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
  26. package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
  27. package/src/benchmark/index.ts +27 -27
  28. package/src/benchmark/parseSimulationMetrics.ts +32 -32
  29. package/src/benchmark/simulation-types.ts +10 -10
  30. package/src/dependencies.ts +34 -34
  31. package/src/generation/TrajectoryGenerator.ts +39 -37
  32. package/src/generation/index.ts +1 -1
  33. package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
  34. package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
  35. package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
  36. package/src/huggingface/index.ts +6 -6
  37. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
  38. package/src/index.ts +27 -27
  39. package/src/init-training.ts +6 -6
  40. package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
  41. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
  42. package/src/metrics/index.ts +2 -2
  43. package/src/rubrics/__tests__/index.test.ts +73 -73
  44. package/src/rubrics/ass-kisser.ts +6 -6
  45. package/src/rubrics/degen.ts +6 -6
  46. package/src/rubrics/goody-twoshoes.ts +6 -6
  47. package/src/rubrics/index.ts +50 -50
  48. package/src/rubrics/information-trader.ts +6 -6
  49. package/src/rubrics/infosec.ts +6 -6
  50. package/src/rubrics/liar.ts +6 -6
  51. package/src/rubrics/perps-trader.ts +6 -6
  52. package/src/rubrics/researcher.ts +6 -6
  53. package/src/rubrics/scammer.ts +6 -6
  54. package/src/rubrics/social-butterfly.ts +7 -7
  55. package/src/rubrics/super-predictor.ts +6 -6
  56. package/src/rubrics/trader.ts +5 -5
  57. package/src/scoring/ArchetypeScoringService.ts +56 -54
  58. package/src/scoring/JudgePromptBuilder.ts +96 -96
  59. package/src/scoring/LLMJudgeCache.ts +26 -23
  60. package/src/scoring/index.ts +3 -3
  61. package/src/training/AutomationPipeline.ts +149 -140
  62. package/src/training/BenchmarkService.ts +49 -45
  63. package/src/training/ConfigValidator.ts +38 -32
  64. package/src/training/MarketOutcomesTracker.ts +22 -12
  65. package/src/training/ModelDeployer.ts +15 -15
  66. package/src/training/ModelFetcher.ts +7 -7
  67. package/src/training/ModelSelectionService.ts +32 -32
  68. package/src/training/ModelUsageVerifier.ts +31 -24
  69. package/src/training/MultiModelOrchestrator.ts +44 -44
  70. package/src/training/RLModelConfig.ts +57 -57
  71. package/src/training/RewardBackpropagationService.ts +18 -17
  72. package/src/training/RulerScoringService.ts +73 -72
  73. package/src/training/TrainingMonitor.ts +29 -29
  74. package/src/training/TrajectoryRecorder.ts +25 -27
  75. package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
  76. package/src/training/index.ts +36 -36
  77. package/src/training/logRLConfig.ts +7 -7
  78. package/src/training/pipeline.ts +13 -16
  79. package/src/training/storage/ModelStorageService.ts +32 -32
  80. package/src/training/storage/TrainingDataArchiver.ts +21 -21
  81. package/src/training/storage/index.ts +2 -2
  82. package/src/training/types.ts +6 -6
  83. package/src/training/window-utils.ts +14 -14
  84. package/src/utils/index.ts +7 -7
  85. package/src/utils/logger.ts +5 -5
  86. package/src/utils/snowflake.ts +1 -1
  87. package/src/utils/synthetic-detector.ts +7 -7
@@ -5,9 +5,9 @@
5
5
  * Creates interactive HTML reports with embedded charts.
6
6
  */
7
7
 
8
- import { promises as fs } from 'fs';
9
- import * as path from 'path';
10
- import type { SimulationMetrics } from './SimulationEngine';
8
+ import { promises as fs } from "node:fs";
9
+ import * as path from "node:path";
10
+ import type { SimulationMetrics } from "./SimulationEngine";
11
11
 
12
12
  export interface ChartData {
13
13
  labels: string[];
@@ -41,19 +41,20 @@ export interface BenchmarkHistoryEntry {
41
41
  * Color palette for charts
42
42
  */
43
43
  const CHART_COLORS = {
44
- primary: '#3b82f6',
45
- success: '#10b981',
46
- warning: '#f59e0b',
47
- danger: '#ef4444',
48
- purple: '#8b5cf6',
49
- cyan: '#06b6d4',
50
- pink: '#ec4899',
51
- gray: '#6b7280',
44
+ primary: "#3b82f6",
45
+ success: "#10b981",
46
+ warning: "#f59e0b",
47
+ danger: "#ef4444",
48
+ purple: "#8b5cf6",
49
+ cyan: "#06b6d4",
50
+ pink: "#ec4899",
51
+ gray: "#6b7280",
52
52
  };
53
53
 
54
54
  /**
55
55
  * Generates benchmark charts and reports
56
56
  */
57
+ // biome-ignore lint/complexity/noStaticOnlyClass: Chart generator namespace - methods are logically grouped
57
58
  export class BenchmarkChartGenerator {
58
59
  /**
59
60
  * Generate a comprehensive HTML report with charts
@@ -65,10 +66,10 @@ export class BenchmarkChartGenerator {
65
66
  title?: string;
66
67
  benchmarkId?: string;
67
68
  includeHistory?: BenchmarkHistoryEntry[];
68
- } = {}
69
+ } = {},
69
70
  ): Promise<string> {
70
- const title = options.title ?? 'Benchmark Report';
71
- const benchmarkId = options.benchmarkId ?? 'unknown';
71
+ const title = options.title ?? "Benchmark Report";
72
+ const benchmarkId = options.benchmarkId ?? "unknown";
72
73
 
73
74
  const html = `
74
75
  <!DOCTYPE html>
@@ -273,21 +274,21 @@ export class BenchmarkChartGenerator {
273
274
  <p class="subtitle">Benchmark: ${benchmarkId} | Models: ${results.length}</p>
274
275
  </header>
275
276
 
276
- ${this.generateSummaryStats(results)}
277
+ ${BenchmarkChartGenerator.generateSummaryStats(results)}
277
278
 
278
279
  <div class="grid grid-2" style="margin-top: 1.5rem;">
279
- ${this.generatePnLChartCard()}
280
- ${this.generateAccuracyChartCard()}
280
+ ${BenchmarkChartGenerator.generatePnLChartCard()}
281
+ ${BenchmarkChartGenerator.generateAccuracyChartCard()}
281
282
  </div>
282
283
 
283
284
  <div class="grid grid-2" style="margin-top: 1.5rem;">
284
- ${this.generatePerpMetricsChartCard()}
285
- ${this.generateTimingChartCard()}
285
+ ${BenchmarkChartGenerator.generatePerpMetricsChartCard()}
286
+ ${BenchmarkChartGenerator.generateTimingChartCard()}
286
287
  </div>
287
288
 
288
- ${this.generateComparisonTable(results)}
289
+ ${BenchmarkChartGenerator.generateComparisonTable(results)}
289
290
 
290
- ${options.includeHistory ? this.generateHistorySection(options.includeHistory) : ''}
291
+ ${options.includeHistory ? BenchmarkChartGenerator.generateHistorySection(options.includeHistory) : ""}
291
292
 
292
293
  <p class="timestamp">Generated: ${new Date().toLocaleString()}</p>
293
294
  </div>
@@ -296,13 +297,13 @@ export class BenchmarkChartGenerator {
296
297
  Chart.defaults.color = '#94a3b8';
297
298
  Chart.defaults.borderColor = '#475569';
298
299
 
299
- ${this.generateChartScripts(results)}
300
+ ${BenchmarkChartGenerator.generateChartScripts(results)}
300
301
  </script>
301
302
  </body>
302
303
  </html>`;
303
304
 
304
305
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
305
- await fs.writeFile(outputPath, html, 'utf-8');
306
+ await fs.writeFile(outputPath, html, "utf-8");
306
307
 
307
308
  return outputPath;
308
309
  }
@@ -311,31 +312,31 @@ export class BenchmarkChartGenerator {
311
312
  * Generate summary stats section
312
313
  */
313
314
  private static generateSummaryStats(results: ModelComparisonData[]): string {
314
- if (results.length === 0) return '';
315
+ if (results.length === 0) return "";
315
316
 
316
317
  // Find best model for each metric
317
318
  const bestPnl = results.reduce((best, curr) =>
318
- curr.metrics.totalPnl > best.metrics.totalPnl ? curr : best
319
+ curr.metrics.totalPnl > best.metrics.totalPnl ? curr : best,
319
320
  );
320
321
  const bestAccuracy = results.reduce((best, curr) =>
321
322
  curr.metrics.predictionMetrics.accuracy >
322
323
  best.metrics.predictionMetrics.accuracy
323
324
  ? curr
324
- : best
325
+ : best,
325
326
  );
326
327
  const avgPnl =
327
328
  results.reduce((sum, r) => sum + r.metrics.totalPnl, 0) / results.length;
328
329
  const avgAccuracy =
329
330
  results.reduce(
330
331
  (sum, r) => sum + r.metrics.predictionMetrics.accuracy,
331
- 0
332
+ 0,
332
333
  ) / results.length;
333
334
 
334
335
  return `
335
336
  <div class="stats-grid">
336
337
  <div class="stat-card">
337
- <div class="stat-value ${bestPnl.metrics.totalPnl >= 0 ? 'positive' : 'negative'}">
338
- ${bestPnl.metrics.totalPnl >= 0 ? '+' : ''}$${bestPnl.metrics.totalPnl.toFixed(0)}
338
+ <div class="stat-value ${bestPnl.metrics.totalPnl >= 0 ? "positive" : "negative"}">
339
+ ${bestPnl.metrics.totalPnl >= 0 ? "+" : ""}$${bestPnl.metrics.totalPnl.toFixed(0)}
339
340
  </div>
340
341
  <div class="stat-label">Best P&L (${bestPnl.modelName})</div>
341
342
  </div>
@@ -344,8 +345,8 @@ export class BenchmarkChartGenerator {
344
345
  <div class="stat-label">Best Accuracy (${bestAccuracy.modelName})</div>
345
346
  </div>
346
347
  <div class="stat-card">
347
- <div class="stat-value ${avgPnl >= 0 ? 'positive' : 'negative'}">
348
- ${avgPnl >= 0 ? '+' : ''}$${avgPnl.toFixed(0)}
348
+ <div class="stat-value ${avgPnl >= 0 ? "positive" : "negative"}">
349
+ ${avgPnl >= 0 ? "+" : ""}$${avgPnl.toFixed(0)}
349
350
  </div>
350
351
  <div class="stat-label">Average P&L</div>
351
352
  </div>
@@ -416,33 +417,33 @@ export class BenchmarkChartGenerator {
416
417
  * Generate comparison table
417
418
  */
418
419
  private static generateComparisonTable(
419
- results: ModelComparisonData[]
420
+ results: ModelComparisonData[],
420
421
  ): string {
421
422
  // Sort by P&L descending
422
423
  const sorted = [...results].sort(
423
- (a, b) => b.metrics.totalPnl - a.metrics.totalPnl
424
+ (a, b) => b.metrics.totalPnl - a.metrics.totalPnl,
424
425
  );
425
426
  const bestPnlModel = sorted[0]?.modelId;
426
427
 
427
428
  const rows = sorted
428
429
  .map((r) => {
429
- const pnlClass = r.metrics.totalPnl >= 0 ? 'positive' : 'negative';
430
+ const pnlClass = r.metrics.totalPnl >= 0 ? "positive" : "negative";
430
431
  const isWinner = r.modelId === bestPnlModel;
431
432
  const accuracyBadge =
432
433
  r.metrics.predictionMetrics.accuracy >= 0.6
433
- ? 'badge-success'
434
+ ? "badge-success"
434
435
  : r.metrics.predictionMetrics.accuracy >= 0.4
435
- ? 'badge-warning'
436
- : 'badge-danger';
436
+ ? "badge-warning"
437
+ : "badge-danger";
437
438
 
438
439
  return `
439
440
  <tr>
440
441
  <td>
441
442
  <strong>${r.modelName}</strong>
442
- ${isWinner ? '<span class="winner-tag">🏆 Winner</span>' : ''}
443
+ ${isWinner ? '<span class="winner-tag">🏆 Winner</span>' : ""}
443
444
  </td>
444
445
  <td class="${pnlClass}">
445
- ${r.metrics.totalPnl >= 0 ? '+' : ''}$${r.metrics.totalPnl.toFixed(2)}
446
+ ${r.metrics.totalPnl >= 0 ? "+" : ""}$${r.metrics.totalPnl.toFixed(2)}
446
447
  </td>
447
448
  <td>
448
449
  <span class="badge ${accuracyBadge}">
@@ -456,7 +457,7 @@ export class BenchmarkChartGenerator {
456
457
  <td>${(r.metrics.timing.totalDuration / 1000).toFixed(1)}s</td>
457
458
  </tr>`;
458
459
  })
459
- .join('');
460
+ .join("");
460
461
 
461
462
  return `
462
463
  <div class="card" style="margin-top: 1.5rem;">
@@ -485,9 +486,9 @@ export class BenchmarkChartGenerator {
485
486
  * Generate history section
486
487
  */
487
488
  private static generateHistorySection(
488
- history: BenchmarkHistoryEntry[]
489
+ history: BenchmarkHistoryEntry[],
489
490
  ): string {
490
- if (history.length === 0) return '';
491
+ if (history.length === 0) return "";
491
492
 
492
493
  // Group by model
493
494
  const byModel = new Map<string, BenchmarkHistoryEntry[]>();
@@ -513,16 +514,16 @@ export class BenchmarkChartGenerator {
513
514
  const labels = results.map((r) => r.modelName);
514
515
  const pnlData = results.map((r) => r.metrics.totalPnl);
515
516
  const accuracyData = results.map(
516
- (r) => r.metrics.predictionMetrics.accuracy * 100
517
+ (r) => r.metrics.predictionMetrics.accuracy * 100,
517
518
  );
518
519
  const winRateData = results.map((r) => r.metrics.perpMetrics.winRate * 100);
519
520
  const optimalityData = results.map((r) => r.metrics.optimalityScore);
520
521
  const durationData = results.map(
521
- (r) => r.metrics.timing.totalDuration / 1000
522
+ (r) => r.metrics.timing.totalDuration / 1000,
522
523
  );
523
524
 
524
525
  const pnlColors = pnlData.map((v) =>
525
- v >= 0 ? CHART_COLORS.success : CHART_COLORS.danger
526
+ v >= 0 ? CHART_COLORS.success : CHART_COLORS.danger,
526
527
  );
527
528
 
528
529
  return `
@@ -660,7 +661,7 @@ export class BenchmarkChartGenerator {
660
661
  static generateTerminalChart(
661
662
  title: string,
662
663
  data: Array<{ label: string; value: number }>,
663
- options: { width?: number; valueFormat?: (v: number) => string } = {}
664
+ options: { width?: number; valueFormat?: (v: number) => string } = {},
664
665
  ): string {
665
666
  const width = options.width ?? 40;
666
667
  const formatValue = options.valueFormat ?? ((v: number) => v.toFixed(2));
@@ -670,25 +671,25 @@ export class BenchmarkChartGenerator {
670
671
 
671
672
  const lines: string[] = [];
672
673
  lines.push(`\n ${title}`);
673
- lines.push(' ' + ''.repeat(width + maxLabelLen + 20));
674
+ lines.push(` ${"".repeat(width + maxLabelLen + 20)}`);
674
675
 
675
676
  for (const item of data) {
676
677
  const normalizedValue =
677
678
  maxValue > 0 ? Math.abs(item.value) / maxValue : 0;
678
679
  const barLen = Math.round(normalizedValue * width);
679
- const bar = item.value >= 0 ? ''.repeat(barLen) : ''.repeat(barLen);
680
- const color = item.value >= 0 ? '\x1b[32m' : '\x1b[31m';
681
- const reset = '\x1b[0m';
680
+ const bar = item.value >= 0 ? "".repeat(barLen) : "".repeat(barLen);
681
+ const color = item.value >= 0 ? "\x1b[32m" : "\x1b[31m";
682
+ const reset = "\x1b[0m";
682
683
  const paddedLabel = item.label.padEnd(maxLabelLen);
683
684
 
684
685
  lines.push(
685
- ` ${paddedLabel} │${color}${bar}${reset} ${formatValue(item.value)}`
686
+ ` ${paddedLabel} │${color}${bar}${reset} ${formatValue(item.value)}`,
686
687
  );
687
688
  }
688
689
 
689
- lines.push(' ' + ''.repeat(width + maxLabelLen + 20));
690
+ lines.push(` ${"".repeat(width + maxLabelLen + 20)}`);
690
691
 
691
- return lines.join('\n');
692
+ return lines.join("\n");
692
693
  }
693
694
 
694
695
  /**
@@ -696,34 +697,34 @@ export class BenchmarkChartGenerator {
696
697
  */
697
698
  static generateTerminalSummary(results: ModelComparisonData[]): string {
698
699
  const sorted = [...results].sort(
699
- (a, b) => b.metrics.totalPnl - a.metrics.totalPnl
700
+ (a, b) => b.metrics.totalPnl - a.metrics.totalPnl,
700
701
  );
701
702
  const winner = sorted[0];
702
703
 
703
704
  const lines: string[] = [];
704
- lines.push('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
705
- lines.push('📊 BENCHMARK RESULTS');
706
- lines.push('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
705
+ lines.push("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
706
+ lines.push("📊 BENCHMARK RESULTS");
707
+ lines.push("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
707
708
 
708
709
  // P&L Chart
709
710
  lines.push(
710
- this.generateTerminalChart(
711
- '💰 Total P&L',
711
+ BenchmarkChartGenerator.generateTerminalChart(
712
+ "💰 Total P&L",
712
713
  sorted.map((r) => ({ label: r.modelName, value: r.metrics.totalPnl })),
713
- { valueFormat: (v) => `$${v.toFixed(2)}` }
714
- )
714
+ { valueFormat: (v) => `$${v.toFixed(2)}` },
715
+ ),
715
716
  );
716
717
 
717
718
  // Accuracy Chart
718
719
  lines.push(
719
- this.generateTerminalChart(
720
- '🎯 Prediction Accuracy',
720
+ BenchmarkChartGenerator.generateTerminalChart(
721
+ "🎯 Prediction Accuracy",
721
722
  sorted.map((r) => ({
722
723
  label: r.modelName,
723
724
  value: r.metrics.predictionMetrics.accuracy * 100,
724
725
  })),
725
- { valueFormat: (v) => `${v.toFixed(1)}%` }
726
- )
726
+ { valueFormat: (v) => `${v.toFixed(1)}%` },
727
+ ),
727
728
  );
728
729
 
729
730
  // Winner
@@ -731,18 +732,18 @@ export class BenchmarkChartGenerator {
731
732
  const loser = sorted[sorted.length - 1];
732
733
  const pnlDelta = winner.metrics.totalPnl - (loser?.metrics.totalPnl ?? 0);
733
734
 
734
- lines.push('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
735
+ lines.push("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
735
736
  lines.push(`🏆 WINNER: ${winner.modelName}`);
736
737
  lines.push(` P&L: $${winner.metrics.totalPnl.toFixed(2)}`);
737
738
  lines.push(
738
- ` Accuracy: ${(winner.metrics.predictionMetrics.accuracy * 100).toFixed(1)}%`
739
+ ` Accuracy: ${(winner.metrics.predictionMetrics.accuracy * 100).toFixed(1)}%`,
739
740
  );
740
741
  if (results.length > 1 && loser) {
741
742
  lines.push(` Lead: $${pnlDelta.toFixed(2)} over ${loser.modelName}`);
742
743
  }
743
- lines.push('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');
744
+ lines.push("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
744
745
  }
745
746
 
746
- return lines.join('\n');
747
+ return lines.join("\n");
747
748
  }
748
749
  }