npm - @elizaos/training - Versions diffs - 2.0.0-alpha.13 → 2.0.0-alpha.15 - Mend

@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

package/package.json +2 -2
package/research-output/training-runs/training-run-1773726941205.json +38 -0
package/scripts/rank_trajectories.ts +0 -1
package/scripts/run_task_benchmark.ts +4 -11
package/src/adapter.ts +96 -49
package/src/archetypes/ArchetypeConfigService.ts +188 -185
package/src/archetypes/derive-archetype.ts +47 -47
package/src/archetypes/index.ts +2 -2
package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
package/src/benchmark/BenchmarkDataViewer.ts +32 -30
package/src/benchmark/BenchmarkHistoryService.ts +13 -12
package/src/benchmark/BenchmarkRunner.ts +87 -83
package/src/benchmark/BenchmarkValidator.ts +48 -46
package/src/benchmark/FastEvalRunner.ts +17 -16
package/src/benchmark/MetricsValidator.ts +20 -21
package/src/benchmark/MetricsVisualizer.ts +92 -85
package/src/benchmark/ModelBenchmarkService.ts +90 -82
package/src/benchmark/ModelRegistry.ts +44 -44
package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
package/src/benchmark/SimulationA2AInterface.ts +118 -118
package/src/benchmark/SimulationEngine.ts +51 -51
package/src/benchmark/TaskRunner.ts +87 -79
package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
package/src/benchmark/index.ts +27 -27
package/src/benchmark/parseSimulationMetrics.ts +32 -32
package/src/benchmark/simulation-types.ts +10 -10
package/src/dependencies.ts +34 -34
package/src/generation/TrajectoryGenerator.ts +39 -37
package/src/generation/index.ts +1 -1
package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
package/src/huggingface/index.ts +6 -6
package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
package/src/index.ts +27 -27
package/src/init-training.ts +6 -6
package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
package/src/metrics/index.ts +2 -2
package/src/rubrics/__tests__/index.test.ts +73 -73
package/src/rubrics/ass-kisser.ts +6 -6
package/src/rubrics/degen.ts +6 -6
package/src/rubrics/goody-twoshoes.ts +6 -6
package/src/rubrics/index.ts +50 -50
package/src/rubrics/information-trader.ts +6 -6
package/src/rubrics/infosec.ts +6 -6
package/src/rubrics/liar.ts +6 -6
package/src/rubrics/perps-trader.ts +6 -6
package/src/rubrics/researcher.ts +6 -6
package/src/rubrics/scammer.ts +6 -6
package/src/rubrics/social-butterfly.ts +7 -7
package/src/rubrics/super-predictor.ts +6 -6
package/src/rubrics/trader.ts +5 -5
package/src/scoring/ArchetypeScoringService.ts +56 -54
package/src/scoring/JudgePromptBuilder.ts +96 -96
package/src/scoring/LLMJudgeCache.ts +26 -23
package/src/scoring/index.ts +3 -3
package/src/training/AutomationPipeline.ts +149 -140
package/src/training/BenchmarkService.ts +49 -45
package/src/training/ConfigValidator.ts +38 -32
package/src/training/MarketOutcomesTracker.ts +22 -12
package/src/training/ModelDeployer.ts +15 -15
package/src/training/ModelFetcher.ts +7 -7
package/src/training/ModelSelectionService.ts +32 -32
package/src/training/ModelUsageVerifier.ts +31 -24
package/src/training/MultiModelOrchestrator.ts +44 -44
package/src/training/RLModelConfig.ts +57 -57
package/src/training/RewardBackpropagationService.ts +18 -17
package/src/training/RulerScoringService.ts +73 -72
package/src/training/TrainingMonitor.ts +29 -29
package/src/training/TrajectoryRecorder.ts +25 -27
package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
package/src/training/index.ts +36 -36
package/src/training/logRLConfig.ts +7 -7
package/src/training/pipeline.ts +13 -16
package/src/training/storage/ModelStorageService.ts +32 -32
package/src/training/storage/TrainingDataArchiver.ts +21 -21
package/src/training/storage/index.ts +2 -2
package/src/training/types.ts +6 -6
package/src/training/window-utils.ts +14 -14
package/src/utils/index.ts +7 -7
package/src/utils/logger.ts +5 -5
package/src/utils/snowflake.ts +1 -1
package/src/utils/synthetic-detector.ts +7 -7

package/src/benchmark/BenchmarkDataViewer.ts CHANGED Viewed

@@ -5,15 +5,15 @@
  * Useful for validation and understanding benchmark structure.
  */
-import type { JsonValue } from '../adapter';
-import { promises as fs } from 'fs';
+import { promises as fs } from "node:fs";
+import type { JsonValue } from "../adapter";
 import type {
   BenchmarkGameSnapshot,
   GameState,
   GroundTruth,
   Tick,
-} from './BenchmarkDataGenerator';
-import * as BenchmarkValidator from './BenchmarkValidator';
+} from "./BenchmarkDataGenerator";
+import * as BenchmarkValidator from "./BenchmarkValidator";
 export interface BenchmarkViewOptions {
   /** Show detailed information */
@@ -81,9 +81,9 @@ export class BenchmarkDataViewer {
    */
   static async view(
     filePath: string,
-    options: BenchmarkViewOptions = {}
+    options: BenchmarkViewOptions = {},
   ): Promise<BenchmarkView> {
-    const data = await fs.readFile(filePath, 'utf-8');
+    const data = await fs.readFile(filePath, "utf-8");
     const snapshot = JSON.parse(data) as BenchmarkGameSnapshot;
     // Validate
@@ -105,13 +105,15 @@ export class BenchmarkDataViewer {
         groupChats: snapshot.initialState.groupChats?.length || 0,
       },
-      ticks: this.analyzeTicks(snapshot.ticks),
+      ticks: BenchmarkDataViewer.analyzeTicks(snapshot.ticks),
       validation,
     };
     if (options.showGroundTruth || options.verbose) {
-      view.groundTruth = this.analyzeGroundTruth(snapshot.groundTruth);
+      view.groundTruth = BenchmarkDataViewer.analyzeGroundTruth(
+        snapshot.groundTruth,
+      );
     }
     return view;
@@ -120,7 +122,7 @@ export class BenchmarkDataViewer {
   /**
    * Analyze ticks
    */
-  private static analyzeTicks(ticks: Tick[]): BenchmarkView['ticks'] {
+  private static analyzeTicks(ticks: Tick[]): BenchmarkView["ticks"] {
     const eventTypes: Record<string, number> = {};
     let withEvents = 0;
@@ -145,15 +147,15 @@ export class BenchmarkDataViewer {
    * Analyze ground truth
    */
   private static analyzeGroundTruth(
-    groundTruth: GroundTruth
-  ): BenchmarkView['groundTruth'] {
+    groundTruth: GroundTruth,
+  ): BenchmarkView["groundTruth"] {
     return {
       marketOutcomes: Object.keys(groundTruth.marketOutcomes).length,
       priceHistory: Object.fromEntries(
         Object.entries(groundTruth.priceHistory).map(([ticker, history]) => [
           ticker,
           history.length,
-        ])
+        ]),
       ),
       optimalActions: groundTruth.optimalActions.length,
       socialOpportunities: groundTruth.socialOpportunities.length,
@@ -167,21 +169,21 @@ export class BenchmarkDataViewer {
    * Print view to console
    */
   static print(view: BenchmarkView, options: BenchmarkViewOptions = {}): void {
-    console.log('\n📊 Benchmark Data View\n');
+    console.log("\n📊 Benchmark Data View\n");
     console.log(`ID: ${view.id}`);
     console.log(`Version: ${view.version}`);
     console.log(`Created: ${new Date(view.createdAt).toISOString()}`);
     console.log(`Duration: ${(view.duration / 60).toFixed(1)} minutes`);
     console.log(`Tick Interval: ${view.tickInterval}s`);
-    console.log('\n📈 Initial State:');
+    console.log("\n📈 Initial State:");
     console.log(`  Prediction Markets: ${view.initialState.predictionMarkets}`);
     console.log(`  Perpetual Markets: ${view.initialState.perpetualMarkets}`);
     console.log(`  Agents: ${view.initialState.agents}`);
     console.log(`  Posts: ${view.initialState.posts}`);
     console.log(`  Group Chats: ${view.initialState.groupChats}`);
-    console.log('\n⏱️  Ticks:');
+    console.log("\n⏱️  Ticks:");
     console.log(`  Total: ${view.ticks.total}`);
     console.log(`  With Events: ${view.ticks.withEvents}`);
     if (options.verbose) {
@@ -192,27 +194,27 @@ export class BenchmarkDataViewer {
     }
     if (view.groundTruth) {
-      console.log('\n🎯 Ground Truth:');
+      console.log("\n🎯 Ground Truth:");
       console.log(`  Market Outcomes: ${view.groundTruth.marketOutcomes}`);
       console.log(`  Price History:`);
       for (const [ticker, count] of Object.entries(
-        view.groundTruth.priceHistory
+        view.groundTruth.priceHistory,
       )) {
         console.log(`    ${ticker}: ${count} ticks`);
       }
       console.log(`  Optimal Actions: ${view.groundTruth.optimalActions}`);
       console.log(
-        `  Social Opportunities: ${view.groundTruth.socialOpportunities}`
+        `  Social Opportunities: ${view.groundTruth.socialOpportunities}`,
       );
       if (options.showHidden) {
         console.log(`  Hidden Facts: ${view.groundTruth.hiddenFacts}`);
         console.log(`  Hidden Events: ${view.groundTruth.hiddenEvents}`);
-        console.log(`  True Facts: ${view.groundTruth.trueFacts.join(', ')}`);
+        console.log(`  True Facts: ${view.groundTruth.trueFacts.join(", ")}`);
       }
     }
-    console.log('\n✅ Validation:');
-    console.log(`  Valid: ${view.validation.valid ? '✅' : '❌'}`);
+    console.log("\n✅ Validation:");
+    console.log(`  Valid: ${view.validation.valid ? "✅" : "❌"}`);
     if (view.validation.errors.length > 0) {
       console.log(`  Errors: ${view.validation.errors.length}`);
       if (options.verbose) {
@@ -230,7 +232,7 @@ export class BenchmarkDataViewer {
       }
     }
-    console.log('');
+    console.log("");
   }
   /**
@@ -238,7 +240,7 @@ export class BenchmarkDataViewer {
    */
   static getTickDetails(
     snapshot: BenchmarkGameSnapshot,
-    tickNumber: number
+    tickNumber: number,
   ): {
     tick: Tick | null;
     state: GameState | null;
@@ -265,7 +267,7 @@ export class BenchmarkDataViewer {
    */
   static getGroundTruthForTick(
     snapshot: BenchmarkGameSnapshot,
-    tickNumber: number
+    tickNumber: number,
   ): {
     hiddenFacts: Array<{ fact: string; category: string }>;
     hiddenEvents: Array<{ type: string; description: string }>;
@@ -302,14 +304,14 @@ export class BenchmarkDataViewer {
     // Check if ground truth is accidentally in state
     const stateKeys = Object.keys(state);
     const hasGroundTruthInState =
-      stateKeys.includes('groundTruth') ||
-      stateKeys.includes('hiddenFacts') ||
-      stateKeys.includes('hiddenEvents');
+      stateKeys.includes("groundTruth") ||
+      stateKeys.includes("hiddenFacts") ||
+      stateKeys.includes("hiddenEvents");
     if (hasGroundTruthInState) {
       return {
         canAccess: true,
-        reason: 'Ground truth found in game state (security issue!)',
+        reason: "Ground truth found in game state (security issue!)",
       };
     }
@@ -317,8 +319,8 @@ export class BenchmarkDataViewer {
       canAccess: false,
       reason:
         hasGroundTruth && hasHiddenFacts
-          ? 'Ground truth exists but is properly isolated from game state'
-          : 'No ground truth data found',
+          ? "Ground truth exists but is properly isolated from game state"
+          : "No ground truth data found",
     };
   }
 }

package/src/benchmark/BenchmarkHistoryService.ts CHANGED Viewed

@@ -5,13 +5,13 @@
  */
 import {
-  getTrainingDataAdapter,
   type BenchmarkResultRecord,
+  getTrainingDataAdapter,
   type JsonValue,
-} from '../adapter';
-import { logger } from '../utils/logger';
-import { generateSnowflakeId } from '../utils/snowflake';
-import type { SimulationMetrics } from './SimulationEngine';
+} from "../adapter";
+import { logger } from "../utils/logger";
+import { generateSnowflakeId } from "../utils/snowflake";
+import type { SimulationMetrics } from "./SimulationEngine";
 export interface BenchmarkResultInput {
   modelId: string;
@@ -45,12 +45,13 @@ export interface BenchmarkTrendData {
 /**
  * Service for managing benchmark result history
  */
+// biome-ignore lint/complexity/noStaticOnlyClass: Service namespace - methods are logically grouped
 export class BenchmarkHistoryService {
   /**
    * Save a benchmark result to the database
    */
   static async saveResult(
-    input: BenchmarkResultInput
+    input: BenchmarkResultInput,
   ): Promise<BenchmarkResultRecord> {
     const id = await generateSnowflakeId();
     const now = new Date();
@@ -74,7 +75,7 @@ export class BenchmarkHistoryService {
     await getTrainingDataAdapter().insertBenchmarkResult(insertData);
-    logger.info('Saved benchmark result', {
+    logger.info("Saved benchmark result", {
       id,
       modelId: input.modelId,
       benchmarkId: input.benchmarkId,
@@ -88,7 +89,7 @@ export class BenchmarkHistoryService {
    * Get benchmark results by query
    */
   static async getResults(
-    query: BenchmarkHistoryQuery
+    query: BenchmarkHistoryQuery,
   ): Promise<BenchmarkResultRecord[]> {
     return getTrainingDataAdapter().queryBenchmarkResults({
       modelId: query.modelId,
@@ -103,7 +104,7 @@ export class BenchmarkHistoryService {
    * Get the latest result for a model
    */
   static async getLatestResult(
-    modelId: string
+    modelId: string,
   ): Promise<BenchmarkResultRecord | null> {
     const results = await getTrainingDataAdapter().queryBenchmarkResults({
       modelId,
@@ -117,7 +118,7 @@ export class BenchmarkHistoryService {
    */
   static async getTrendData(
     modelId: string,
-    limit = 20
+    limit = 20,
   ): Promise<BenchmarkTrendData> {
     const results = await getTrainingDataAdapter().queryBenchmarkResults({
       modelId,
@@ -141,7 +142,7 @@ export class BenchmarkHistoryService {
    */
   static async getModelComparison(
     modelIds: string[],
-    benchmarkId?: string
+    benchmarkId?: string,
   ): Promise<Map<string, BenchmarkResultRecord[]>> {
     const adapter = getTrainingDataAdapter();
     const comparison = new Map<string, BenchmarkResultRecord[]>();
@@ -181,7 +182,7 @@ export class BenchmarkHistoryService {
   static async checkImprovement(
     modelId: string,
     baselineModelId: string,
-    benchmarkId: string
+    benchmarkId: string,
   ): Promise<{
     improved: boolean;
     modelPnl: number;