@elizaos/training 2.0.0-alpha.77 → 2.0.0-alpha.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/package.json +2 -2
  2. package/.turbo/turbo-lint.log +0 -3
  3. package/.turbo/turbo-typecheck.log +0 -1
  4. package/dist/.tsbuildinfo +0 -1
  5. package/dist/adapter.js +0 -59
  6. package/dist/archetypes/ArchetypeConfigService.js +0 -510
  7. package/dist/archetypes/derive-archetype.js +0 -196
  8. package/dist/archetypes/index.js +0 -7
  9. package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
  10. package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
  11. package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
  12. package/dist/benchmark/BenchmarkDataViewer.js +0 -197
  13. package/dist/benchmark/BenchmarkHistoryService.js +0 -135
  14. package/dist/benchmark/BenchmarkRunner.js +0 -483
  15. package/dist/benchmark/BenchmarkValidator.js +0 -158
  16. package/dist/benchmark/FastEvalRunner.js +0 -133
  17. package/dist/benchmark/MetricsValidator.js +0 -104
  18. package/dist/benchmark/MetricsVisualizer.js +0 -775
  19. package/dist/benchmark/ModelBenchmarkService.js +0 -433
  20. package/dist/benchmark/ModelRegistry.js +0 -122
  21. package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
  22. package/dist/benchmark/SimulationA2AInterface.js +0 -683
  23. package/dist/benchmark/SimulationEngine.js +0 -522
  24. package/dist/benchmark/TaskRunner.js +0 -60
  25. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
  26. package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
  27. package/dist/benchmark/index.js +0 -23
  28. package/dist/benchmark/parseSimulationMetrics.js +0 -86
  29. package/dist/benchmark/simulation-types.js +0 -1
  30. package/dist/dependencies.js +0 -197
  31. package/dist/generation/TrajectoryGenerator.js +0 -244
  32. package/dist/generation/index.js +0 -6
  33. package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
  34. package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
  35. package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
  36. package/dist/huggingface/index.js +0 -9
  37. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
  38. package/dist/index.js +0 -41
  39. package/dist/init-training.js +0 -43
  40. package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
  41. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
  42. package/dist/metrics/index.js +0 -7
  43. package/dist/metrics/types.js +0 -21
  44. package/dist/rubrics/__tests__/index.test.js +0 -150
  45. package/dist/rubrics/ass-kisser.js +0 -83
  46. package/dist/rubrics/degen.js +0 -78
  47. package/dist/rubrics/goody-twoshoes.js +0 -82
  48. package/dist/rubrics/index.js +0 -184
  49. package/dist/rubrics/information-trader.js +0 -82
  50. package/dist/rubrics/infosec.js +0 -99
  51. package/dist/rubrics/liar.js +0 -102
  52. package/dist/rubrics/perps-trader.js +0 -85
  53. package/dist/rubrics/researcher.js +0 -79
  54. package/dist/rubrics/scammer.js +0 -80
  55. package/dist/rubrics/social-butterfly.js +0 -71
  56. package/dist/rubrics/super-predictor.js +0 -95
  57. package/dist/rubrics/trader.js +0 -65
  58. package/dist/scoring/ArchetypeScoringService.js +0 -301
  59. package/dist/scoring/JudgePromptBuilder.js +0 -401
  60. package/dist/scoring/LLMJudgeCache.js +0 -263
  61. package/dist/scoring/index.js +0 -8
  62. package/dist/training/AutomationPipeline.js +0 -714
  63. package/dist/training/BenchmarkService.js +0 -370
  64. package/dist/training/ConfigValidator.js +0 -153
  65. package/dist/training/MarketOutcomesTracker.js +0 -142
  66. package/dist/training/ModelDeployer.js +0 -128
  67. package/dist/training/ModelFetcher.js +0 -48
  68. package/dist/training/ModelSelectionService.js +0 -248
  69. package/dist/training/ModelUsageVerifier.js +0 -106
  70. package/dist/training/MultiModelOrchestrator.js +0 -349
  71. package/dist/training/RLModelConfig.js +0 -295
  72. package/dist/training/RewardBackpropagationService.js +0 -117
  73. package/dist/training/RulerScoringService.js +0 -450
  74. package/dist/training/TrainingMonitor.js +0 -108
  75. package/dist/training/TrajectoryRecorder.js +0 -281
  76. package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
  77. package/dist/training/index.js +0 -30
  78. package/dist/training/logRLConfig.js +0 -29
  79. package/dist/training/pipeline.js +0 -80
  80. package/dist/training/storage/ModelStorageService.js +0 -190
  81. package/dist/training/storage/TrainingDataArchiver.js +0 -136
  82. package/dist/training/storage/index.js +0 -7
  83. package/dist/training/types.js +0 -6
  84. package/dist/training/window-utils.js +0 -100
  85. package/dist/utils/index.js +0 -73
  86. package/dist/utils/logger.js +0 -55
  87. package/dist/utils/snowflake.js +0 -15
  88. package/dist/utils/synthetic-detector.js +0 -67
  89. package/vitest.config.ts +0 -8
@@ -1,133 +0,0 @@ package/dist/benchmark/FastEvalRunner.js (file removed)
1
- /**
2
- * Fast Evaluation Runner
3
- *
4
- * Provides efficient evaluation of agents on benchmarks with:
5
- * - Fast-forward mode (skip waiting)
6
- * - Batch processing
7
- * - Parallel execution
8
- * - Progress tracking
9
- */
10
- import { logger } from "../utils/logger";
11
- import { BenchmarkRunner } from "./BenchmarkRunner";
12
/**
 * Namespace-style class that runs a benchmark several times against one agent
 * and aggregates the per-run metrics.
 *
 * Every run is delegated to `BenchmarkRunner.runSingle`; this class only
 * handles batching, progress reporting and the summary statistics.
 */
// biome-ignore lint/complexity/noStaticOnlyClass: Runner namespace - run/runWithProgress are logically grouped
export class FastEvalRunner {
  /**
   * Run fast evaluation.
   *
   * Executes `iterations` benchmark runs in sequential batches of at most
   * `parallelRuns` concurrent runs, then aggregates the results.
   *
   * @param config - Fast evaluation configuration. Fields read here:
   *   `benchmarkPath`, `agentRuntime`, `agentUserId`, `outputDir`,
   *   `iterations` (default 1), `parallelRuns` (default 1),
   *   `saveTrajectory` (default false) and the optional `onProgress` callback.
   * @returns Object with `results` (one entry per run, in run order),
   *   `summary` (`avgPnl`, `avgAccuracy`, `avgOptimality`, `totalDuration` in
   *   milliseconds, `runsCompleted`), `bestRun` and `worstRun` (by `totalPnl`).
   * @throws RangeError if `iterations` or `parallelRuns` is negative,
   *   non-numeric or infinite.
   * @throws Whatever a run rejects with; the first rejection in a batch aborts
   *   the evaluation (`Promise.all` semantics).
   *
   * @example
   * ```typescript
   * const result = await FastEvalRunner.run({
   *   benchmarkPath: './benchmarks/test.json',
   *   agentRuntime: runtime,
   *   agentUserId: 'agent-123',
   *   parallelRuns: 3,
   *   iterations: 10,
   *   outputDir: './results'
   * });
   * console.log(`Average P&L: ${result.summary.avgPnl}`);
   * ```
   */
  static async run(config) {
    const startTime = Date.now();
    // Validate up front: a negative parallelRuns used to make the batch loop
    // below spin forever, and a negative iterations left `results` empty so
    // the summary reducers threw an unhelpful TypeError.
    const iterations = toPositiveInteger(config.iterations, "iterations");
    const parallelRuns = toPositiveInteger(config.parallelRuns, "parallelRuns");
    logger.info("Starting fast evaluation", {
      benchmarkPath: config.benchmarkPath,
      agentUserId: config.agentUserId,
      iterations,
      parallelRuns,
    });

    const results = [];
    let completed = 0;

    // Batches run one after another; runs inside a batch run concurrently.
    for (let batchStart = 0; batchStart < iterations; batchStart += parallelRuns) {
      const batchEnd = Math.min(batchStart + parallelRuns, iterations);
      logger.info(`Running batch ${batchStart + 1}-${batchEnd} of ${iterations}`);

      const batchPromises = Array.from(
        { length: batchEnd - batchStart },
        async (_, offset) => {
          // Run labels are 1-based: run-1, run-2, ...
          const runLabel = `run-${batchStart + offset + 1}`;
          const result = await BenchmarkRunner.runSingle({
            benchmarkPath: config.benchmarkPath,
            agentRuntime: config.agentRuntime,
            agentUserId: config.agentUserId,
            saveTrajectory: config.saveTrajectory ?? false,
            outputDir: `${config.outputDir}/${runLabel}`,
          });
          completed += 1;
          config.onProgress?.({
            completed,
            total: iterations,
            currentRun: runLabel,
          });
          return result;
        },
      );
      results.push(...(await Promise.all(batchPromises)));
    }

    const totalDuration = Date.now() - startTime;

    // `results` holds at least one entry here because iterations >= 1.
    const averageOf = (pick) =>
      results.reduce((sum, run) => sum + pick(run), 0) / results.length;
    const summary = {
      avgPnl: averageOf((run) => run.metrics.totalPnl),
      avgAccuracy: averageOf((run) => run.metrics.predictionMetrics.accuracy),
      avgOptimality: averageOf((run) => run.metrics.optimalityScore),
      totalDuration,
      runsCompleted: results.length,
    };

    // Strict comparisons keep the earliest run when P&L values tie.
    const bestRun = results.reduce((best, current) =>
      current.metrics.totalPnl > best.metrics.totalPnl ? current : best,
    );
    const worstRun = results.reduce((worst, current) =>
      current.metrics.totalPnl < worst.metrics.totalPnl ? current : worst,
    );

    logger.info("Fast evaluation completed", summary);
    return {
      results,
      summary,
      bestRun,
      worstRun,
    };
  }

  /**
   * Run evaluation while drawing a 40-character progress bar on stdout.
   *
   * The bar is redrawn only when the rounded percentage changes. Any
   * `onProgress` callback supplied in `config` is still invoked for every
   * completed run.
   *
   * @param config - Same configuration object accepted by `run`.
   * @returns The result of `FastEvalRunner.run`.
   * @throws Whatever `FastEvalRunner.run` throws.
   */
  static async runWithProgress(config) {
    const barLength = 40;
    let lastPercent = 0;
    try {
      return await FastEvalRunner.run({
        ...config,
        onProgress: (progress) => {
          const fraction = progress.completed / progress.total;
          const percent = Math.round(fraction * 100);
          if (percent !== lastPercent) {
            const filled = Math.round(fraction * barLength);
            const bar = "█".repeat(filled) + "░".repeat(barLength - filled);
            process.stdout.write(
              `\r[${bar}] ${percent}% (${progress.completed}/${progress.total})`,
            );
            lastPercent = percent;
          }
          config.onProgress?.(progress);
        },
      });
    } finally {
      // Terminate the bar line on failure too, so later output (including
      // the error itself) does not get appended to the half-drawn bar.
      process.stdout.write("\n");
    }
  }
}

/**
 * Normalise a run-count option to a positive integer.
 *
 * Missing or zero values fall back to 1, matching the historical `|| 1`
 * default. Fractional values are floored.
 *
 * @param value - Raw option value from the config.
 * @param name - Option name, used in the error message.
 * @returns A finite integer >= 1.
 * @throws RangeError if the value is negative, non-numeric or infinite.
 */
function toPositiveInteger(value, name) {
  if (!value) {
    return 1;
  }
  const count = Math.floor(Number(value));
  if (!Number.isFinite(count) || count < 1) {
    throw new RangeError(
      `FastEvalRunner: ${name} must be a positive number, received ${value}`,
    );
  }
  return count;
}
@@ -1,104 +0,0 @@ package/dist/benchmark/MetricsValidator.js (file removed)
1
- /**
2
- * Metrics Validator
3
- *
4
- * Validates that benchmark metrics are calculated correctly against ground truth.
5
- */
6
- import { logger } from "../utils/logger";
7
/**
 * Check that every prediction action refers to a market with known ground truth.
 *
 * Only actions whose `type` is "buy_prediction" are inspected. A missing
 * market produces a warning, never an error, so `valid` is always true here.
 *
 * @param _predictionMetrics - Unused; kept for signature compatibility.
 * @param actions - Recorded actions; each prediction action carries `data.marketId`.
 * @param groundTruth - Object whose `marketOutcomes` is keyed by market id.
 * @returns `{ valid, errors, warnings }`.
 */
function validatePredictionMetrics(_predictionMetrics, actions, groundTruth) {
  const errors = [];
  const warnings = [];
  const { marketOutcomes } = groundTruth;
  // Own-property lookup: the `in` operator also matches inherited names, so a
  // market id such as "constructor" or "toString" was wrongly treated as
  // having ground truth.
  const hasOutcome = (marketId) =>
    Object.prototype.hasOwnProperty.call(marketOutcomes, marketId);

  for (const action of actions) {
    if (action.type !== "buy_prediction") {
      continue;
    }
    const { marketId } = action.data;
    if (!hasOutcome(marketId)) {
      warnings.push(`No ground truth for market ${marketId}`);
    }
    // Correctness of the predicted outcome itself is not checked here
    // (the original comment says that happens in SimulationEngine).
  }
  return { valid: errors.length === 0, errors, warnings };
}
25
/**
 * Validate metrics against ground truth.
 *
 * Cross-checks the reported benchmark metrics for internal consistency and
 * against the recorded actions. Checks are written so that `NaN`, `undefined`
 * and other non-numeric values fail; with plain `<` / `>` comparisons they
 * slipped through as valid.
 *
 * @param metrics - Object with `optimalityScore`, `timing`
 *   (`avgResponseTime`, `maxResponseTime`), `predictionMetrics`
 *   (`totalPositions`, `correctPredictions`, `incorrectPredictions`,
 *   `accuracy`) and `perpMetrics` (`totalTrades`, `profitableTrades`, `winRate`).
 * @param actions - Recorded actions; those with `type === "buy_prediction"`
 *   are counted against `predictionMetrics.totalPositions`.
 * @param groundTruth - Ground truth passed through to `validatePredictionMetrics`.
 * @returns `{ valid, errors, warnings }` where `valid` is true when there are no errors.
 */
export function validateMetrics(metrics, actions, groundTruth) {
  const errors = [];
  const warnings = [];
  // Reported ratios may differ from the recomputed ones by at most this
  // absolute amount.
  const tolerance = 0.01;
  const { optimalityScore, timing, predictionMetrics, perpMetrics } = metrics;

  // 1. Validate prediction actions against ground truth
  const predictionValidation = validatePredictionMetrics(
    predictionMetrics,
    actions,
    groundTruth,
  );
  errors.push(...predictionValidation.errors);
  warnings.push(...predictionValidation.warnings);

  // 2. Validate optimality score is in valid range (NaN fails the range test)
  if (!(optimalityScore >= 0 && optimalityScore <= 100)) {
    errors.push(`Optimality score out of range: ${optimalityScore}`);
  }

  // 3. Validate timing metrics are reasonable
  const { avgResponseTime, maxResponseTime } = timing;
  if (!(avgResponseTime >= 0)) {
    errors.push(`Invalid average response time: ${avgResponseTime}`);
  }
  if (Number.isNaN(Number(maxResponseTime))) {
    errors.push(`Invalid max response time: ${maxResponseTime}`);
  } else if (maxResponseTime < avgResponseTime) {
    errors.push(
      `Max response time less than average: ${maxResponseTime} < ${avgResponseTime}`,
    );
  }

  // 4. Validate action counts match (mismatch is only a warning)
  const predictionCount = actions.filter(
    (action) => action.type === "buy_prediction",
  ).length;
  if (predictionCount !== predictionMetrics.totalPositions) {
    warnings.push(
      `Prediction action count mismatch: ${predictionCount} actions vs ${predictionMetrics.totalPositions} positions`,
    );
  }

  // 5. Validate accuracy calculation
  const { correctPredictions, incorrectPredictions, totalPositions, accuracy } =
    predictionMetrics;
  const calculatedAccuracy =
    totalPositions > 0 ? correctPredictions / totalPositions : 0;
  // Negated `<=` so a NaN difference counts as a mismatch.
  if (!(Math.abs(calculatedAccuracy - accuracy) <= tolerance)) {
    errors.push(
      `Accuracy calculation mismatch: reported ${accuracy}, calculated ${calculatedAccuracy}`,
    );
  }

  // 6. Validate correct + incorrect = total
  if (correctPredictions + incorrectPredictions !== totalPositions) {
    errors.push(
      `Prediction count mismatch: ${correctPredictions} + ${incorrectPredictions} != ${totalPositions}`,
    );
  }

  // 7. Validate perp win rate calculation (skipped when there are no trades)
  if (perpMetrics.totalTrades > 0) {
    const calculatedWinRate =
      perpMetrics.profitableTrades / perpMetrics.totalTrades;
    if (!(Math.abs(calculatedWinRate - perpMetrics.winRate) <= tolerance)) {
      errors.push(
        `Win rate calculation mismatch: reported ${perpMetrics.winRate}, calculated ${calculatedWinRate}`,
      );
    }
  }

  const valid = errors.length === 0;
  logger.info("Metrics validation complete", {
    valid,
    errors: errors.length,
    warnings: warnings.length,
  });
  return {
    valid,
    errors,
    warnings,
  };
}
82
/**
 * Quick sanity check for metrics.
 *
 * Returns true only when every inspected value is a number inside its
 * expected range:
 * - `optimalityScore` in [0, 100]
 * - `predictionMetrics.accuracy` and `perpMetrics.winRate` in [0, 1]
 * - `timing.avgResponseTime` and `timing.maxResponseTime` >= 0
 *
 * The checks are phrased positively (`>=` / `<=`) so that `NaN` and
 * `undefined` fail; the previous `<` / `>` rejections let them pass.
 *
 * @param metrics - Metrics object containing the fields listed above.
 * @returns true when all checks pass, false otherwise.
 */
export function metricsSanityCheck(metrics) {
  const { optimalityScore, predictionMetrics, perpMetrics, timing } = metrics;
  const isWithin = (value, min, max) => value >= min && value <= max;

  // Checks short-circuit in the original order, so a later group is only
  // read once the earlier ones have passed.
  return (
    isWithin(optimalityScore, 0, 100) &&
    isWithin(predictionMetrics.accuracy, 0, 1) &&
    isWithin(perpMetrics.winRate, 0, 1) &&
    timing.avgResponseTime >= 0 &&
    timing.maxResponseTime >= 0
  );
}
100
/**
 * Legacy object-style facade over the standalone validation functions.
 *
 * `validate` is `validateMetrics` and `sanityCheck` is `metricsSanityCheck`;
 * the properties hold the very same function references.
 *
 * @deprecated Use validateMetrics and metricsSanityCheck instead
 */
export const MetricsValidator = {
  validate: validateMetrics,
  sanityCheck: metricsSanityCheck,
};