npm - @elizaos/training - Versions diffs - 2.0.0-alpha.13 → 2.0.0-alpha.15 - Mend

@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (87)

package/package.json +2 -2
package/research-output/training-runs/training-run-1773726941205.json +38 -0
package/scripts/rank_trajectories.ts +0 -1
package/scripts/run_task_benchmark.ts +4 -11
package/src/adapter.ts +96 -49
package/src/archetypes/ArchetypeConfigService.ts +188 -185
package/src/archetypes/derive-archetype.ts +47 -47
package/src/archetypes/index.ts +2 -2
package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
package/src/benchmark/BenchmarkDataViewer.ts +32 -30
package/src/benchmark/BenchmarkHistoryService.ts +13 -12
package/src/benchmark/BenchmarkRunner.ts +87 -83
package/src/benchmark/BenchmarkValidator.ts +48 -46
package/src/benchmark/FastEvalRunner.ts +17 -16
package/src/benchmark/MetricsValidator.ts +20 -21
package/src/benchmark/MetricsVisualizer.ts +92 -85
package/src/benchmark/ModelBenchmarkService.ts +90 -82
package/src/benchmark/ModelRegistry.ts +44 -44
package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
package/src/benchmark/SimulationA2AInterface.ts +118 -118
package/src/benchmark/SimulationEngine.ts +51 -51
package/src/benchmark/TaskRunner.ts +87 -79
package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
package/src/benchmark/index.ts +27 -27
package/src/benchmark/parseSimulationMetrics.ts +32 -32
package/src/benchmark/simulation-types.ts +10 -10
package/src/dependencies.ts +34 -34
package/src/generation/TrajectoryGenerator.ts +39 -37
package/src/generation/index.ts +1 -1
package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
package/src/huggingface/index.ts +6 -6
package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
package/src/index.ts +27 -27
package/src/init-training.ts +6 -6
package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
package/src/metrics/index.ts +2 -2
package/src/rubrics/__tests__/index.test.ts +73 -73
package/src/rubrics/ass-kisser.ts +6 -6
package/src/rubrics/degen.ts +6 -6
package/src/rubrics/goody-twoshoes.ts +6 -6
package/src/rubrics/index.ts +50 -50
package/src/rubrics/information-trader.ts +6 -6
package/src/rubrics/infosec.ts +6 -6
package/src/rubrics/liar.ts +6 -6
package/src/rubrics/perps-trader.ts +6 -6
package/src/rubrics/researcher.ts +6 -6
package/src/rubrics/scammer.ts +6 -6
package/src/rubrics/social-butterfly.ts +7 -7
package/src/rubrics/super-predictor.ts +6 -6
package/src/rubrics/trader.ts +5 -5
package/src/scoring/ArchetypeScoringService.ts +56 -54
package/src/scoring/JudgePromptBuilder.ts +96 -96
package/src/scoring/LLMJudgeCache.ts +26 -23
package/src/scoring/index.ts +3 -3
package/src/training/AutomationPipeline.ts +149 -140
package/src/training/BenchmarkService.ts +49 -45
package/src/training/ConfigValidator.ts +38 -32
package/src/training/MarketOutcomesTracker.ts +22 -12
package/src/training/ModelDeployer.ts +15 -15
package/src/training/ModelFetcher.ts +7 -7
package/src/training/ModelSelectionService.ts +32 -32
package/src/training/ModelUsageVerifier.ts +31 -24
package/src/training/MultiModelOrchestrator.ts +44 -44
package/src/training/RLModelConfig.ts +57 -57
package/src/training/RewardBackpropagationService.ts +18 -17
package/src/training/RulerScoringService.ts +73 -72
package/src/training/TrainingMonitor.ts +29 -29
package/src/training/TrajectoryRecorder.ts +25 -27
package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
package/src/training/index.ts +36 -36
package/src/training/logRLConfig.ts +7 -7
package/src/training/pipeline.ts +13 -16
package/src/training/storage/ModelStorageService.ts +32 -32
package/src/training/storage/TrainingDataArchiver.ts +21 -21
package/src/training/storage/index.ts +2 -2
package/src/training/types.ts +6 -6
package/src/training/window-utils.ts +14 -14
package/src/utils/index.ts +7 -7
package/src/utils/logger.ts +5 -5
package/src/utils/snowflake.ts +1 -1
package/src/utils/synthetic-detector.ts +7 -7

package/src/benchmark/BenchmarkValidator.ts CHANGED Viewed

@@ -5,9 +5,9 @@
  * and contains all required fields.
  */
-import type { JsonValue } from '../adapter';
-import { logger } from '../utils/logger';
-import type { BenchmarkGameSnapshot } from './BenchmarkDataGenerator';
+import type { JsonValue } from "../adapter";
+import { logger } from "../utils/logger";
+import type { BenchmarkGameSnapshot } from "./BenchmarkDataGenerator";
 export interface BenchmarkValidationResult {
   valid: boolean;
@@ -23,58 +23,58 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
   const warnings: string[] = [];
   // 1. Check required top-level fields
-  if (!snapshot || typeof snapshot !== 'object') {
-    errors.push('Snapshot is null, undefined, or not an object');
+  if (!snapshot || typeof snapshot !== "object") {
+    errors.push("Snapshot is null, undefined, or not an object");
     return { valid: false, errors, warnings };
   }
   const snap = snapshot as Record<string, JsonValue>;
-  if (!snap.id) errors.push('Missing required field: id');
-  if (!snap.version) errors.push('Missing required field: version');
-  if (typeof snap.duration !== 'number')
-    errors.push('Missing or invalid field: duration');
-  if (typeof snap.tickInterval !== 'number')
-    errors.push('Missing or invalid field: tickInterval');
-  if (!snap.initialState) errors.push('Missing required field: initialState');
+  if (!snap.id) errors.push("Missing required field: id");
+  if (!snap.version) errors.push("Missing required field: version");
+  if (typeof snap.duration !== "number")
+    errors.push("Missing or invalid field: duration");
+  if (typeof snap.tickInterval !== "number")
+    errors.push("Missing or invalid field: tickInterval");
+  if (!snap.initialState) errors.push("Missing required field: initialState");
   if (!Array.isArray(snap.ticks))
-    errors.push('Missing or invalid field: ticks (must be array)');
-  if (!snap.groundTruth) errors.push('Missing required field: groundTruth');
+    errors.push("Missing or invalid field: ticks (must be array)");
+  if (!snap.groundTruth) errors.push("Missing required field: groundTruth");
   // 2. Validate initial state
-  if (snap.initialState && typeof snap.initialState === 'object') {
+  if (snap.initialState && typeof snap.initialState === "object") {
     const state = snap.initialState as Record<string, JsonValue>;
-    if (typeof state.tick !== 'number')
-      errors.push('initialState.tick must be a number');
-    if (state.tick !== 0) warnings.push('initialState.tick should be 0');
+    if (typeof state.tick !== "number")
+      errors.push("initialState.tick must be a number");
+    if (state.tick !== 0) warnings.push("initialState.tick should be 0");
     if (!Array.isArray(state.predictionMarkets)) {
-      errors.push('initialState.predictionMarkets must be an array');
+      errors.push("initialState.predictionMarkets must be an array");
     }
     if (!Array.isArray(state.perpetualMarkets)) {
-      errors.push('initialState.perpetualMarkets must be an array');
+      errors.push("initialState.perpetualMarkets must be an array");
     }
     if (!Array.isArray(state.agents)) {
-      errors.push('initialState.agents must be an array');
+      errors.push("initialState.agents must be an array");
     }
   }
   // 3. Validate ticks
   if (Array.isArray(snap.ticks)) {
     if (snap.ticks.length === 0) {
-      warnings.push('Ticks array is empty');
+      warnings.push("Ticks array is empty");
     }
     snap.ticks.forEach((tick: JsonValue, index: number) => {
-      if (!tick || typeof tick !== 'object') {
+      if (!tick || typeof tick !== "object") {
         errors.push(`Tick ${index}: invalid tick object`);
         return;
       }
       const tickObj = tick as Record<string, JsonValue>;
-      if (typeof tickObj.number !== 'number') {
+      if (typeof tickObj.number !== "number") {
         errors.push(`Tick ${index}: missing or invalid 'number' field`);
       }
@@ -90,51 +90,51 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
     // Check tick numbering is sequential
     for (let i = 0; i < snap.ticks.length; i++) {
       const tick = snap.ticks[i] as Record<string, JsonValue> | undefined;
-      if (tick && typeof tick.number === 'number' && tick.number !== i) {
+      if (tick && typeof tick.number === "number" && tick.number !== i) {
         warnings.push(`Tick ${i}: number ${tick.number} doesn't match index`);
       }
     }
   }
   // 4. Validate ground truth
-  if (snap.groundTruth && typeof snap.groundTruth === 'object') {
+  if (snap.groundTruth && typeof snap.groundTruth === "object") {
     const gt = snap.groundTruth as Record<string, JsonValue>;
-    if (!gt.marketOutcomes || typeof gt.marketOutcomes !== 'object') {
-      errors.push('groundTruth.marketOutcomes must be an object');
+    if (!gt.marketOutcomes || typeof gt.marketOutcomes !== "object") {
+      errors.push("groundTruth.marketOutcomes must be an object");
     }
-    if (!gt.priceHistory || typeof gt.priceHistory !== 'object') {
-      errors.push('groundTruth.priceHistory must be an object');
+    if (!gt.priceHistory || typeof gt.priceHistory !== "object") {
+      errors.push("groundTruth.priceHistory must be an object");
     }
     if (!Array.isArray(gt.optimalActions)) {
-      errors.push('groundTruth.optimalActions must be an array');
+      errors.push("groundTruth.optimalActions must be an array");
     }
     if (!Array.isArray(gt.socialOpportunities)) {
-      errors.push('groundTruth.socialOpportunities must be an array');
+      errors.push("groundTruth.socialOpportunities must be an array");
     }
     if (!Array.isArray(gt.hiddenFacts)) {
-      errors.push('groundTruth.hiddenFacts must be an array');
+      errors.push("groundTruth.hiddenFacts must be an array");
     }
     if (!Array.isArray(gt.hiddenEvents)) {
-      errors.push('groundTruth.hiddenEvents must be an array');
+      errors.push("groundTruth.hiddenEvents must be an array");
     }
-    if (!gt.trueFacts || typeof gt.trueFacts !== 'object') {
-      errors.push('groundTruth.trueFacts must be an object');
+    if (!gt.trueFacts || typeof gt.trueFacts !== "object") {
+      errors.push("groundTruth.trueFacts must be an object");
     }
   }
   // 5. Cross-validate: markets in initialState should have outcomes in groundTruth
   if (
     snap.initialState &&
-    typeof snap.initialState === 'object' &&
+    typeof snap.initialState === "object" &&
     snap.groundTruth &&
-    typeof snap.groundTruth === 'object'
+    typeof snap.groundTruth === "object"
   ) {
     const initialState = snap.initialState as Record<string, JsonValue>;
     const groundTruth = snap.groundTruth as Record<string, JsonValue>;
@@ -145,7 +145,7 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
     ) as Array<Record<string, JsonValue>>;
     const outcomes = (
       groundTruth.marketOutcomes &&
-      typeof groundTruth.marketOutcomes === 'object'
+      typeof groundTruth.marketOutcomes === "object"
         ? groundTruth.marketOutcomes
         : {}
     ) as Record<string, JsonValue>;
@@ -153,17 +153,17 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
     markets.forEach((market) => {
       if (
         market.id &&
-        typeof market.id === 'string' &&
+        typeof market.id === "string" &&
         !(market.id in outcomes)
       ) {
         warnings.push(
-          `Market ${market.id} in initialState but no outcome in groundTruth`
+          `Market ${market.id} in initialState but no outcome in groundTruth`,
         );
       }
     });
   }
-  logger.info('Benchmark validation complete', {
+  logger.info("Benchmark validation complete", {
     valid: errors.length === 0,
     errors: errors.length,
     warnings: warnings.length,
@@ -179,8 +179,10 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
 /**
  * Quick sanity check (fast, minimal validation)
  */
-export function sanityCheck(snapshot: unknown): snapshot is BenchmarkGameSnapshot {
-  if (!snapshot || typeof snapshot !== 'object') return false;
+export function sanityCheck(
+  snapshot: unknown,
+): snapshot is BenchmarkGameSnapshot {
+  if (!snapshot || typeof snapshot !== "object") return false;
   const snap = snapshot as Record<string, JsonValue>;
   return !!(
     snap.id &&
@@ -194,11 +196,11 @@ export function sanityCheck(snapshot: unknown): snapshot is BenchmarkGameSnapsho
  * Validate and throw if invalid
  */
 export function validateOrThrow(
-  snapshot: unknown
+  snapshot: unknown,
 ): asserts snapshot is BenchmarkGameSnapshot {
   const result = validate(snapshot);
   if (!result.valid) {
-    throw new Error(`Invalid benchmark data: ${result.errors.join(', ')}`);
+    throw new Error(`Invalid benchmark data: ${result.errors.join(", ")}`);
   }
 }

package/src/benchmark/FastEvalRunner.ts CHANGED Viewed

@@ -8,16 +8,16 @@
  * - Progress tracking
  */
-import { logger } from '../utils/logger';
-import { type BenchmarkRunConfig, BenchmarkRunner } from './BenchmarkRunner';
-import type { SimulationResult } from './SimulationEngine';
+import { logger } from "../utils/logger";
+import { type BenchmarkRunConfig, BenchmarkRunner } from "./BenchmarkRunner";
+import type { SimulationResult } from "./SimulationEngine";
 export interface FastEvalConfig {
   /** Benchmark file path */
   benchmarkPath: string;
   /** Agent runtime to test */
-  agentRuntime: BenchmarkRunConfig['agentRuntime'];
+  agentRuntime: BenchmarkRunConfig["agentRuntime"];
   /** Agent user ID */
   agentUserId: string;
@@ -63,6 +63,7 @@ export interface FastEvalResult {
   worstRun: SimulationResult;
 }
+// biome-ignore lint/complexity/noStaticOnlyClass: Runner namespace - run/runWithProgress are logically grouped
 export class FastEvalRunner {
   /**
    * Run fast evaluation
@@ -98,7 +99,7 @@ export class FastEvalRunner {
     const iterations = config.iterations || 1;
     const parallelRuns = config.parallelRuns || 1;
-    logger.info('Starting fast evaluation', {
+    logger.info("Starting fast evaluation", {
       benchmarkPath: config.benchmarkPath,
       agentUserId: config.agentUserId,
       iterations,
@@ -118,7 +119,7 @@ export class FastEvalRunner {
       const batchSize = batchEnd - batchStart;
       logger.info(
-        `Running batch ${batchStart + 1}-${batchEnd} of ${iterations}`
+        `Running batch ${batchStart + 1}-${batchEnd} of ${iterations}`,
       );
       // Run batch in parallel
@@ -157,18 +158,18 @@ export class FastEvalRunner {
     const avgAccuracy =
       results.reduce(
         (sum, r) => sum + r.metrics.predictionMetrics.accuracy,
-        0
+        0,
       ) / results.length;
     const avgOptimality =
       results.reduce((sum, r) => sum + r.metrics.optimalityScore, 0) /
       results.length;
     const bestRun = results.reduce((best, current) =>
-      current.metrics.totalPnl > best.metrics.totalPnl ? current : best
+      current.metrics.totalPnl > best.metrics.totalPnl ? current : best,
     );
     const worstRun = results.reduce((worst, current) =>
-      current.metrics.totalPnl < worst.metrics.totalPnl ? current : worst
+      current.metrics.totalPnl < worst.metrics.totalPnl ? current : worst,
     );
     const summary = {
@@ -179,7 +180,7 @@ export class FastEvalRunner {
       runsCompleted: results.length,
     };
-    logger.info('Fast evaluation completed', summary);
+    logger.info("Fast evaluation completed", summary);
     return {
       results,
@@ -193,22 +194,22 @@ export class FastEvalRunner {
    * Run evaluation with progress bar
    */
   static async runWithProgress(
-    config: FastEvalConfig
+    config: FastEvalConfig,
   ): Promise<FastEvalResult> {
     let lastProgress = 0;
-    return this.run({
+    return FastEvalRunner.run({
       ...config,
       onProgress: (progress) => {
         const percent = Math.round((progress.completed / progress.total) * 100);
         if (percent !== lastProgress) {
           const barLength = 40;
           const filled = Math.round(
-            (progress.completed / progress.total) * barLength
+            (progress.completed / progress.total) * barLength,
           );
-          const bar = '█'.repeat(filled) + '░'.repeat(barLength - filled);
+          const bar = "█".repeat(filled) + "░".repeat(barLength - filled);
           process.stdout.write(
-            `\r[${bar}] ${percent}% (${progress.completed}/${progress.total})`
+            `\r[${bar}] ${percent}% (${progress.completed}/${progress.total})`,
           );
           lastProgress = percent;
         }
@@ -218,7 +219,7 @@ export class FastEvalRunner {
         }
       },
     }).then((result) => {
-      process.stdout.write('\n');
+      process.stdout.write("\n");
       return result;
     });
   }

package/src/benchmark/MetricsValidator.ts CHANGED Viewed

@@ -4,10 +4,10 @@
  * Validates that benchmark metrics are calculated correctly against ground truth.
  */
-import type { ValidationResult } from '../training/ConfigValidator';
-import { logger } from '../utils/logger';
-import type { GroundTruth } from './BenchmarkDataGenerator';
-import type { AgentAction, SimulationMetrics } from './simulation-types';
+import type { ValidationResult } from "../training/ConfigValidator";
+import { logger } from "../utils/logger";
+import type { GroundTruth } from "./BenchmarkDataGenerator";
+import type { AgentAction, SimulationMetrics } from "./simulation-types";
 export class MetricsValidator {
   /**
@@ -16,16 +16,16 @@ export class MetricsValidator {
   static validate(
     metrics: SimulationMetrics,
     actions: AgentAction[],
-    groundTruth: GroundTruth
+    groundTruth: GroundTruth,
   ): ValidationResult {
     const errors: string[] = [];
     const warnings: string[] = [];
     // 1. Validate prediction accuracy calculation
-    const predictionValidation = this.validatePredictionMetrics(
+    const predictionValidation = MetricsValidator.validatePredictionMetrics(
       metrics.predictionMetrics,
       actions,
-      groundTruth
+      groundTruth,
     );
     errors.push(...predictionValidation.errors);
     warnings.push(...predictionValidation.warnings);
@@ -38,23 +38,23 @@ export class MetricsValidator {
     // 3. Validate timing metrics are reasonable
     if (metrics.timing.avgResponseTime < 0) {
       errors.push(
-        `Invalid average response time: ${metrics.timing.avgResponseTime}`
+        `Invalid average response time: ${metrics.timing.avgResponseTime}`,
       );
     }
     if (metrics.timing.maxResponseTime < metrics.timing.avgResponseTime) {
       errors.push(
-        `Max response time less than average: ${metrics.timing.maxResponseTime} < ${metrics.timing.avgResponseTime}`
+        `Max response time less than average: ${metrics.timing.maxResponseTime} < ${metrics.timing.avgResponseTime}`,
       );
     }
     // 4. Validate action counts match
     const predictionActions = actions.filter(
-      (a) => a.type === 'buy_prediction'
+      (a) => a.type === "buy_prediction",
     );
     if (predictionActions.length !== metrics.predictionMetrics.totalPositions) {
       warnings.push(
-        `Prediction action count mismatch: ${predictionActions.length} actions vs ${metrics.predictionMetrics.totalPositions} positions`
+        `Prediction action count mismatch: ${predictionActions.length} actions vs ${metrics.predictionMetrics.totalPositions} positions`,
       );
     }
@@ -64,20 +64,20 @@ export class MetricsValidator {
     const calculatedAccuracy =
       totalPositions > 0 ? correctPredictions / totalPositions : 0;
     const accuracyDiff = Math.abs(
-      calculatedAccuracy - metrics.predictionMetrics.accuracy
+      calculatedAccuracy - metrics.predictionMetrics.accuracy,
     );
     if (accuracyDiff > 0.01) {
       // Allow 1% tolerance for floating point
       errors.push(
-        `Accuracy calculation mismatch: reported ${metrics.predictionMetrics.accuracy}, calculated ${calculatedAccuracy}`
+        `Accuracy calculation mismatch: reported ${metrics.predictionMetrics.accuracy}, calculated ${calculatedAccuracy}`,
       );
     }
     // 6. Validate correct + incorrect = total
     if (correctPredictions + incorrectPredictions !== totalPositions) {
       errors.push(
-        `Prediction count mismatch: ${correctPredictions} + ${incorrectPredictions} != ${totalPositions}`
+        `Prediction count mismatch: ${correctPredictions} + ${incorrectPredictions} != ${totalPositions}`,
       );
     }
@@ -86,17 +86,17 @@ export class MetricsValidator {
       const calculatedWinRate =
         metrics.perpMetrics.profitableTrades / metrics.perpMetrics.totalTrades;
       const winRateDiff = Math.abs(
-        calculatedWinRate - metrics.perpMetrics.winRate
+        calculatedWinRate - metrics.perpMetrics.winRate,
       );
       if (winRateDiff > 0.01) {
         errors.push(
-          `Win rate calculation mismatch: reported ${metrics.perpMetrics.winRate}, calculated ${calculatedWinRate}`
+          `Win rate calculation mismatch: reported ${metrics.perpMetrics.winRate}, calculated ${calculatedWinRate}`,
         );
       }
     }
-    logger.info('Metrics validation complete', {
+    logger.info("Metrics validation complete", {
       valid: errors.length === 0,
       errors: errors.length,
       warnings: warnings.length,
@@ -113,16 +113,16 @@ export class MetricsValidator {
    * Validate prediction metrics against ground truth
    */
   private static validatePredictionMetrics(
-    _predictionMetrics: SimulationMetrics['predictionMetrics'],
+    _predictionMetrics: SimulationMetrics["predictionMetrics"],
     actions: AgentAction[],
-    groundTruth: GroundTruth
+    groundTruth: GroundTruth,
   ): ValidationResult {
     const errors: string[] = [];
     const warnings: string[] = [];
     // Get all prediction actions
     const predictionActions = actions.filter(
-      (a) => a.type === 'buy_prediction'
+      (a) => a.type === "buy_prediction",
     );
     // Validate each action against ground truth
@@ -133,7 +133,6 @@ export class MetricsValidator {
       // Check if we have ground truth for this market
       if (!(marketId in groundTruth.marketOutcomes)) {
         warnings.push(`No ground truth for market ${marketId}`);
-        continue;
       }
       // Verify the outcome exists in ground truth