npm - @elizaos/training - Versions diffs - 2.0.0-alpha.10 - Mend

@elizaos/training 2.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (224) hide show

package/Dockerfile +75 -0
package/LICENSE +21 -0
package/Makefile +374 -0
package/README.md +346 -0
package/config/rubrics.json +137 -0
package/docker-compose.test.yml +57 -0
package/package.json +57 -0
package/python/config/babylon_atropos.yaml +90 -0
package/python/config/profiles/12gb.json +11 -0
package/python/config/profiles/16gb.json +10 -0
package/python/config/profiles/24gb.json +10 -0
package/python/config/profiles/48gb.json +10 -0
package/python/config/profiles/cpu.json +11 -0
package/python/config/profiles/l40-2gpu-safe.json +20 -0
package/python/config/profiles/l40-2gpu.json +22 -0
package/python/config/profiles/l40-4gpu.json +21 -0
package/python/config/profiles/l40.json +17 -0
package/python/config/tinker_training.yaml +143 -0
package/python/curriculum_state.json +165 -0
package/python/env.template +86 -0
package/python/env.training.template +46 -0
package/python/pyproject.toml +41 -0
package/python/requirements-ci.txt +31 -0
package/python/requirements.txt +87 -0
package/python/scripts/__init__.py +4 -0
package/python/scripts/benchmark_should_respond.py +190 -0
package/python/scripts/debug_inference.py +62 -0
package/python/scripts/import_json_trajectories.py +412 -0
package/python/scripts/local-finetune/README.md +63 -0
package/python/scripts/local-finetune/ingest_and_score.py +139 -0
package/python/scripts/local-finetune/merge_model.py +32 -0
package/python/scripts/local-finetune/test_adapter.py +91 -0
package/python/scripts/local-finetune/train_from_csv.py +132 -0
package/python/scripts/merge_trajectories.py +318 -0
package/python/scripts/optimize_prompt_grpo.py +269 -0
package/python/scripts/run_ab_test.py +143 -0
package/python/scripts/run_full_pipeline.py +544 -0
package/python/scripts/run_tinker_training.py +192 -0
package/python/scripts/run_training.py +914 -0
package/python/scripts/test_generation.py +29 -0
package/python/scripts/test_judge.py +155 -0
package/python/scripts/test_pipeline.py +356 -0
package/python/scripts/test_trained_model.py +380 -0
package/python/scripts/train_grpo.py +360 -0
package/python/scripts/train_jsonl.py +223 -0
package/python/scripts/train_local.py +528 -0
package/python/setup.py +20 -0
package/python/src/__init__.py +190 -0
package/python/src/data_bridge/__init__.py +24 -0
package/python/src/data_bridge/converter.py +435 -0
package/python/src/data_bridge/reader.py +393 -0
package/python/src/models.py +283 -0
package/python/src/training/__init__.py +605 -0
package/python/src/training/ab_testing.py +404 -0
package/python/src/training/action_executor.py +621 -0
package/python/src/training/archetype_trainer.py +347 -0
package/python/src/training/atropos_trainer.py +980 -0
package/python/src/training/babylon_env.py +1254 -0
package/python/src/training/error_recovery.py +647 -0
package/python/src/training/evaluation.py +856 -0
package/python/src/training/fast_simulator.py +880 -0
package/python/src/training/format_validator.py +584 -0
package/python/src/training/hybrid_env.py +522 -0
package/python/src/training/kl_controller.py +628 -0
package/python/src/training/multi_prompt_dataset.py +883 -0
package/python/src/training/multi_turn.py +656 -0
package/python/src/training/online_env.py +1084 -0
package/python/src/training/quality_scorer.py +391 -0
package/python/src/training/quality_utils.py +633 -0
package/python/src/training/rewards.py +1344 -0
package/python/src/training/rlaif_env.py +17 -0
package/python/src/training/rollout_generator.py +502 -0
package/python/src/training/rubric_loader.py +198 -0
package/python/src/training/scenario_pool.py +1072 -0
package/python/src/training/schemas.py +481 -0
package/python/src/training/service_manager.py +552 -0
package/python/src/training/simulation_bridge.py +535 -0
package/python/src/training/tick_reward_attribution.py +399 -0
package/python/src/training/tinker_client.py +575 -0
package/python/src/training/tinker_trainer.py +646 -0
package/python/src/training/tokenization_utils.py +402 -0
package/python/tests/e2e/__init__.py +13 -0
package/python/tests/e2e/conftest.py +258 -0
package/python/tests/e2e/test_full_pipeline.py +643 -0
package/python/tests/e2e/test_online_training_e2e.py +365 -0
package/python/tests/integration/__init__.py +12 -0
package/python/tests/integration/conftest.py +383 -0
package/python/tests/integration/test_db_integration.py +649 -0
package/python/tests/integration/test_json_mode_integration.py +554 -0
package/python/tests/test_action_executor.py +594 -0
package/python/tests/test_archetype_scoring.py +1027 -0
package/python/tests/test_atropos_integration.py +360 -0
package/python/tests/test_evaluation.py +727 -0
package/python/tests/test_format_validator.py +486 -0
package/python/tests/test_kl_controller.py +432 -0
package/python/tests/test_lr_scheduler.py +579 -0
package/python/tests/test_multi_turn.py +590 -0
package/python/tests/test_online_env.py +519 -0
package/python/tests/test_quality_scorer.py +474 -0
package/python/tests/test_scenario_pool.py +735 -0
package/python/tests/test_service_manager.py +585 -0
package/python/tests/test_simulation_rollout.py +581 -0
package/python/tests/test_tokenization_utils.py +501 -0
package/python/tests/test_training_orchestrator.py +497 -0
package/python/tests/test_training_output_structure.py +661 -0
package/research-output/training-runs/training-run-1770772042899.json +26 -0
package/research-output/training-runs/training-run-1770930079670.json +32 -0
package/research-output/training-runs/training-run-1770930143700.json +44 -0
package/research-output/training-runs/training-run-1770930183638.json +38 -0
package/research-output/training-runs/training-run-1770930442049.json +38 -0
package/research-output/training-runs/training-run-1770930793243.json +38 -0
package/research-output/training-runs/training-run-1771276293257.json +38 -0
package/research-output/training-runs/training-run-1771276389280.json +38 -0
package/research-output/training-runs/training-run-1771276502776.json +38 -0
package/research-output/training-runs/training-run-1771277340748.json +38 -0
package/research-output/training-runs/training-run-1773013658993.json +38 -0
package/research-output/training-runs/training-run-1773013861014.json +38 -0
package/research-output/training-runs/training-run-1773014215983.json +38 -0
package/scripts/assess-training-data.ts +422 -0
package/scripts/e2e-training-test.ts +550 -0
package/scripts/export-rubrics.ts +64 -0
package/scripts/generate-research-report.ts +1523 -0
package/scripts/generate_dataset.sh +173 -0
package/scripts/generate_should_respond.ts +267 -0
package/scripts/generate_should_respond_dataset.ts +162 -0
package/scripts/json-mode-benchmark.ts +399 -0
package/scripts/rank_trajectories.ts +207 -0
package/scripts/real-archetype-benchmark.ts +210 -0
package/scripts/run-baseline-comparison.ts +116 -0
package/scripts/run-full-pipeline.ts +272 -0
package/scripts/run_rlaif_loop.ts +78 -0
package/scripts/run_task_benchmark.ts +247 -0
package/scripts/runpod_setup.sh +137 -0
package/scripts/runpod_validate.sh +147 -0
package/scripts/test-model-in-game.ts +955 -0
package/scripts/test-scoring.ts +73 -0
package/scripts/test-trained-model.ts +209 -0
package/scripts/train-and-test.ts +824 -0
package/scripts/verify-final.ts +118 -0
package/src/adapter.ts +516 -0
package/src/archetypes/ArchetypeConfigService.ts +626 -0
package/src/archetypes/derive-archetype.ts +249 -0
package/src/archetypes/index.ts +22 -0
package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
package/src/benchmark/BenchmarkDataViewer.ts +324 -0
package/src/benchmark/BenchmarkHistoryService.ts +221 -0
package/src/benchmark/BenchmarkRunner.ts +685 -0
package/src/benchmark/BenchmarkValidator.ts +204 -0
package/src/benchmark/FastEvalRunner.ts +225 -0
package/src/benchmark/MetricsValidator.ts +165 -0
package/src/benchmark/MetricsVisualizer.ts +909 -0
package/src/benchmark/ModelBenchmarkService.ts +611 -0
package/src/benchmark/ModelRegistry.ts +158 -0
package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
package/src/benchmark/SimulationA2AInterface.ts +1169 -0
package/src/benchmark/SimulationEngine.ts +832 -0
package/src/benchmark/TaskRunner.ts +94 -0
package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
package/src/benchmark/index.ts +91 -0
package/src/benchmark/parseSimulationMetrics.ts +124 -0
package/src/benchmark/simulation-types.ts +78 -0
package/src/dependencies.ts +475 -0
package/src/generation/TrajectoryGenerator.ts +387 -0
package/src/generation/index.ts +12 -0
package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
package/src/huggingface/index.ts +27 -0
package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
package/src/index.ts +102 -0
package/src/init-training.ts +53 -0
package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
package/src/metrics/index.ts +8 -0
package/src/metrics/types.ts +200 -0
package/src/rubrics/__tests__/index.test.ts +184 -0
package/src/rubrics/ass-kisser.ts +85 -0
package/src/rubrics/degen.ts +80 -0
package/src/rubrics/goody-twoshoes.ts +84 -0
package/src/rubrics/index.ts +236 -0
package/src/rubrics/information-trader.ts +84 -0
package/src/rubrics/infosec.ts +101 -0
package/src/rubrics/liar.ts +104 -0
package/src/rubrics/perps-trader.ts +87 -0
package/src/rubrics/researcher.ts +81 -0
package/src/rubrics/scammer.ts +82 -0
package/src/rubrics/social-butterfly.ts +73 -0
package/src/rubrics/super-predictor.ts +97 -0
package/src/rubrics/trader.ts +67 -0
package/src/scoring/ArchetypeScoringService.ts +486 -0
package/src/scoring/JudgePromptBuilder.ts +556 -0
package/src/scoring/LLMJudgeCache.ts +401 -0
package/src/scoring/index.ts +9 -0
package/src/training/AutomationPipeline.ts +916 -0
package/src/training/BenchmarkService.ts +518 -0
package/src/training/ConfigValidator.ts +220 -0
package/src/training/MarketOutcomesTracker.ts +187 -0
package/src/training/ModelDeployer.ts +186 -0
package/src/training/ModelFetcher.ts +76 -0
package/src/training/ModelSelectionService.ts +341 -0
package/src/training/ModelUsageVerifier.ts +160 -0
package/src/training/MultiModelOrchestrator.ts +580 -0
package/src/training/RLModelConfig.ts +407 -0
package/src/training/RewardBackpropagationService.ts +149 -0
package/src/training/RulerScoringService.ts +666 -0
package/src/training/TrainingMonitor.ts +166 -0
package/src/training/TrajectoryRecorder.ts +399 -0
package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
package/src/training/index.ts +100 -0
package/src/training/logRLConfig.ts +34 -0
package/src/training/pipeline.ts +129 -0
package/src/training/storage/ModelStorageService.ts +279 -0
package/src/training/storage/TrainingDataArchiver.ts +197 -0
package/src/training/storage/index.ts +17 -0
package/src/training/types.ts +207 -0
package/src/training/window-utils.ts +138 -0
package/src/utils/index.ts +101 -0
package/src/utils/logger.ts +59 -0
package/src/utils/snowflake.ts +17 -0
package/src/utils/synthetic-detector.ts +111 -0
package/tsconfig.json +20 -0

package/src/benchmark/__tests__/HeadToHead.test.ts ADDED Viewed

@@ -0,0 +1,126 @@
+import { describe, expect, it } from 'bun:test';
+import type { BenchmarkGameSnapshot } from '../BenchmarkDataGenerator';
+import { MetricsVisualizer } from '../MetricsVisualizer';
+import { SimulationEngine, type SimulationResult } from '../SimulationEngine';
+describe('Head-to-Head Benchmark Infrastructure', () => {
+  // 1. SimulationEngine: run() on a snapshot that contains no ticks should report an empty pnlHistory
+  describe('SimulationEngine PnL History', () => {
+    it('should initialize with empty pnlHistory and return it after run()', async () => {
+      const mockSnapshot = {
+        id: 'test',
+        ticks: [],
+        initialState: {
+          predictionMarkets: [],
+          perpetualMarkets: [],
+          agents: [],
+        },
+        groundTruth: {
+          marketOutcomes: {},
+          priceHistory: {},
+          optimalActions: [],
+        },
+      } as unknown as BenchmarkGameSnapshot; // partial fixture, force-cast to the full snapshot type
+      const engine = new SimulationEngine({
+        snapshot: mockSnapshot,
+        agentId: 'test-agent',
+        fastForward: true,
+      });
+      engine.initialize();
+      // Use the public API only: the result resolved by run() exposes pnlHistory
+      const result = await engine.run();
+      expect(result.pnlHistory).toEqual([]);
+    });
+  });
+  // 2. MetricsVisualizer: merging two PnL histories and rendering the ASCII head-to-head comparison
+  describe('MetricsVisualizer Comparison Logic', () => {
+    // Fixture builder: `pnl` becomes metrics.totalPnl and each `history` value becomes a { tick: index, pnl: value } entry; every other field is a fixed placeholder
+    const createMockResult = (
+      id: string,
+      pnl: number,
+      history: number[]
+    ): SimulationResult => ({
+      id,
+      agentId: id,
+      benchmarkId: 'bench-1',
+      startTime: 0,
+      endTime: 1000,
+      ticksProcessed: history.length,
+      actions: [],
+      metrics: {
+        totalPnl: pnl,
+        predictionMetrics: {
+          accuracy: 0.5,
+          totalPositions: 0,
+          correctPredictions: 0,
+          incorrectPredictions: 0,
+          avgPnlPerPosition: 0,
+        },
+        perpMetrics: {
+          winRate: 0.5,
+          totalTrades: 0,
+          profitableTrades: 0,
+          avgPnlPerTrade: 0,
+          maxDrawdown: 0,
+        },
+        socialMetrics: {
+          postsCreated: 0,
+          groupsJoined: 0,
+          messagesReceived: 0,
+          reputationGained: 0,
+        },
+        timing: { totalDuration: 0, avgResponseTime: 0, maxResponseTime: 0 },
+        optimalityScore: 50,
+      },
+      trajectory: { states: [], actions: [], rewards: [], windowId: '' },
+      pnlHistory: history.map((val, idx) => ({ tick: idx, pnl: val })),
+    });
+    it('should correctly merge PnL histories of equal length', () => {
+      const baseline = createMockResult('baseline', 100, [10, 50, 100]);
+      const challenger = createMockResult('challenger', 200, [20, 100, 200]);
+      // Use the public static method; each merged entry is expected to pair both PnL values under one tick
+      const history = MetricsVisualizer.mergePnlHistory(baseline, challenger);
+      expect(history).toHaveLength(3);
+      expect(history[2]).toEqual({ tick: 2, baseline: 100, challenger: 200 });
+    });
+    it('should handle unequal history lengths (fill with final value)', () => {
+      // Baseline died early (e.g., bankruptcy or crash)
+      const baseline = createMockResult('baseline', -50, [10, -50]);
+      // Challenger kept going
+      const challenger = createMockResult('challenger', 100, [20, 60, 80, 100]);
+      const history = MetricsVisualizer.mergePnlHistory(baseline, challenger);
+      expect(history).toHaveLength(4); // Should match longest
+      // Tick 0
+      expect(history[0]).toEqual({ tick: 0, baseline: 10, challenger: 20 });
+      // Tick 1
+      expect(history[1]).toEqual({ tick: 1, baseline: -50, challenger: 60 });
+      // Tick 2 (Baseline stopped, should carry over -50)
+      expect(history[2]).toEqual({ tick: 2, baseline: -50, challenger: 80 });
+      // Tick 3 (baseline still padded with its last recorded value)
+      expect(history[3]).toEqual({ tick: 3, baseline: -50, challenger: 100 });
+    });
+    it('should generate ASCII chart string', () => {
+      const baseline = createMockResult('baseline', 100, [10, 100]);
+      const challenger = createMockResult('challenger', 200, [20, 200]);
+      const chart = MetricsVisualizer.generateAsciiComparison(
+        baseline,
+        challenger
+      );
+      expect(chart).toContain('HEAD-TO-HEAD RESULTS');
+      expect(chart).toContain('WINNER: Challenger');
+      expect(chart).toContain('Alpha Generated: +$100.00'); // NOTE(review): presumably challenger totalPnl (200) minus baseline totalPnl (100) - confirm against MetricsVisualizer
+    });
+  });
+});

package/src/benchmark/index.ts ADDED Viewed

@@ -0,0 +1,91 @@
+/**
+ * Benchmark Module
+ *
+ * Tools for evaluating agent performance through simulation.
+ *
+ * Barrel file: it declares nothing itself and only re-exports symbols from
+ * the sibling modules in this directory. Type-only symbols are re-exported
+ * with `export type` (or an inline `type` modifier, as for JsonValue), and
+ * the whole BenchmarkValidator module is re-exported as a single namespace
+ * object via `export * as`.
+ */
+// Multi-archetype matchup benchmarking
+export type {
+  ArchetypeVsResult,
+  MatchupAgent,
+  MatchupAgentResult,
+  MatchupBenchmarkConfig,
+  MatchupBenchmarkResult,
+} from './ArchetypeMatchupBenchmark';
+export {
+  ArchetypeMatchupBenchmark,
+  runQuickMatchupBenchmark,
+} from './ArchetypeMatchupBenchmark';
+export type {
+  BenchmarkHistoryEntry,
+  ModelComparisonData,
+} from './BenchmarkChartGenerator';
+export { BenchmarkChartGenerator } from './BenchmarkChartGenerator';
+export type {
+  BenchmarkConfig,
+  BenchmarkGameSnapshot,
+  CausalEventType,
+  GroundTruth,
+  HiddenNarrativeFact,
+  ScheduledCausalEvent,
+  VolatilityBucket,
+} from './BenchmarkDataGenerator';
+export { BenchmarkDataGenerator, SeededRandom } from './BenchmarkDataGenerator';
+export { BenchmarkDataViewer } from './BenchmarkDataViewer';
+export type {
+  BenchmarkHistoryQuery,
+  BenchmarkResultInput,
+  BenchmarkTrendData,
+} from './BenchmarkHistoryService';
+export { BenchmarkHistoryService } from './BenchmarkHistoryService';
+export type {
+  BenchmarkComparisonResult,
+  BenchmarkRunConfig,
+} from './BenchmarkRunner';
+export { BenchmarkRunner } from './BenchmarkRunner';
+export * as BenchmarkValidator from './BenchmarkValidator';
+export type { FastEvalConfig, FastEvalResult } from './FastEvalRunner';
+export { FastEvalRunner } from './FastEvalRunner';
+export { MetricsValidator } from './MetricsValidator';
+export { MetricsVisualizer } from './MetricsVisualizer';
+export type {
+  AverageMetrics,
+  ModelBenchmarkOptions,
+  ModelBenchmarkResult,
+  ModelComparisonResult,
+} from './ModelBenchmarkService';
+export { ModelBenchmarkService } from './ModelBenchmarkService';
+export type { ModelConfig } from './ModelRegistry';
+export {
+  getBaselineModels,
+  getModelById,
+  getModelByModelId,
+  getModelDisplayName,
+  getModelsByProvider,
+  getModelsByTier,
+  MODEL_REGISTRY,
+  validateModelId,
+} from './ModelRegistry';
+// Shared utilities
+export {
+  type JsonValue,
+  parseSimulationMetrics,
+} from './parseSimulationMetrics';
+export {
+  createRulerContext,
+  extractMarketOutcomesFromBenchmark,
+  getHiddenEventsForTick,
+  getHiddenFactsForTick,
+  getTrueFacts,
+  scoreActionAgainstGroundTruth,
+  wasDecisionOptimal,
+} from './RulerBenchmarkIntegration';
+export { SimulationA2AInterface } from './SimulationA2AInterface';
+export type {
+  SimulationConfig,
+  SimulationMetrics,
+  SimulationResult,
+} from './SimulationEngine';
+export { SimulationEngine } from './SimulationEngine';
+export type { TaskRunnerConfig, TaskRunResult } from './TaskRunner';
+export { TaskRunner } from './TaskRunner';

package/src/benchmark/parseSimulationMetrics.ts ADDED Viewed

@@ -0,0 +1,124 @@
+/**
+ * Simulation Metrics Parser
+ *
+ * Shared utility for validating and parsing SimulationMetrics from JSON data.
+ * Used by ModelBenchmarkService and HuggingFaceModelUploader.
+ */
+import type { SimulationMetrics } from './SimulationEngine';
+/**
+ * JSON value type for parsing untyped data
+ *
+ * Recursive union of the JSON primitives, arrays and string-keyed objects.
+ * It additionally admits `undefined`, which is not a JSON value.
+ * NOTE(review): presumably `undefined` is included so that reading an absent
+ * key from a `Record<string, JsonValue>` type-checks - confirm.
+ * NOTE(review): simulation-types.ts imports a `JsonValue` from '../adapter';
+ * confirm whether the two definitions are meant to stay in sync.
+ */
+export type JsonValue =
+  | string
+  | number
+  | boolean
+  | null
+  | undefined
+  | JsonValue[]
+  | { [key: string]: JsonValue };
+/**
+ * Parse and validate SimulationMetrics from JSON data
+ *
+ * Validation is strict only at the top level: `totalPnl` and
+ * `optimalityScore` must be numbers, and `predictionMetrics`, `perpMetrics`
+ * and `timing` must be non-null objects. Inside those objects every field is
+ * read leniently: a missing or non-numeric value becomes 0 instead of
+ * raising. `socialMetrics` is optional; when it is absent or not an object,
+ * all of its counters default to 0. The result is a newly built object that
+ * contains only the known fields, so extra keys in `data` are dropped.
+ *
+ * NOTE(review): the `typeof x === 'object'` checks also accept arrays. A
+ * top-level array is still rejected (via the `totalPnl` message), but an
+ * array supplied as a nested metrics object is accepted and yields all
+ * zeros. Likewise `typeof NaN === 'number'`, so NaN/Infinity pass the
+ * numeric checks. Consider `Array.isArray` / `Number.isFinite` guards.
+ *
+ * @param data - Raw JSON data to parse
+ * @returns Validated SimulationMetrics object
+ * @throws Error if data is not a non-null object, or if a required top-level field has the wrong type
+ */
+export function parseSimulationMetrics(data: JsonValue): SimulationMetrics {
+  if (typeof data !== 'object' || data === null) {
+    throw new Error('Invalid SimulationMetrics: expected object');
+  }
+  const metrics = data as Record<string, JsonValue>;
+  // Validate required top-level fields (type only; nested contents are not checked)
+  if (typeof metrics.totalPnl !== 'number') {
+    throw new Error('Invalid SimulationMetrics: totalPnl must be a number');
+  }
+  if (
+    typeof metrics.predictionMetrics !== 'object' ||
+    metrics.predictionMetrics === null
+  ) {
+    throw new Error(
+      'Invalid SimulationMetrics: predictionMetrics must be an object'
+    );
+  }
+  if (typeof metrics.perpMetrics !== 'object' || metrics.perpMetrics === null) {
+    throw new Error('Invalid SimulationMetrics: perpMetrics must be an object');
+  }
+  if (typeof metrics.optimalityScore !== 'number') {
+    throw new Error(
+      'Invalid SimulationMetrics: optimalityScore must be a number'
+    );
+  }
+  if (typeof metrics.timing !== 'object' || metrics.timing === null) {
+    throw new Error('Invalid SimulationMetrics: timing must be an object');
+  }
+  // View the nested objects (checked above) as key/value records so their fields can be read by name
+  const predictionMetrics = metrics.predictionMetrics as Record<
+    string,
+    JsonValue
+  >;
+  const perpMetrics = metrics.perpMetrics as Record<string, JsonValue>;
+  const timing = metrics.timing as Record<string, JsonValue>;
+  // Helper: returns obj[key] when it is a number, otherwise falls back to 0
+  const getNumber = (obj: Record<string, JsonValue>, key: string): number => {
+    const val = obj[key];
+    return typeof val === 'number' ? val : 0;
+  };
+  // Parse socialMetrics if present (null here means "absent or not an object")
+  const socialMetricsData = metrics.socialMetrics;
+  const socialMetrics =
+    typeof socialMetricsData === 'object' && socialMetricsData !== null
+      ? (socialMetricsData as Record<string, JsonValue>)
+      : null;
+  return {
+    totalPnl: metrics.totalPnl as number, // validated as a number above
+    predictionMetrics: {
+      totalPositions: getNumber(predictionMetrics, 'totalPositions'),
+      correctPredictions: getNumber(predictionMetrics, 'correctPredictions'),
+      incorrectPredictions: getNumber(
+        predictionMetrics,
+        'incorrectPredictions'
+      ),
+      accuracy: getNumber(predictionMetrics, 'accuracy'),
+      avgPnlPerPosition: getNumber(predictionMetrics, 'avgPnlPerPosition'),
+    },
+    perpMetrics: {
+      totalTrades: getNumber(perpMetrics, 'totalTrades'),
+      profitableTrades: getNumber(perpMetrics, 'profitableTrades'),
+      winRate: getNumber(perpMetrics, 'winRate'),
+      avgPnlPerTrade: getNumber(perpMetrics, 'avgPnlPerTrade'),
+      maxDrawdown: getNumber(perpMetrics, 'maxDrawdown'),
+    },
+    socialMetrics: socialMetrics
+      ? {
+          postsCreated: getNumber(socialMetrics, 'postsCreated'),
+          groupsJoined: getNumber(socialMetrics, 'groupsJoined'),
+          messagesReceived: getNumber(socialMetrics, 'messagesReceived'),
+          reputationGained: getNumber(socialMetrics, 'reputationGained'),
+        }
+      : {
+          postsCreated: 0,
+          groupsJoined: 0,
+          messagesReceived: 0,
+          reputationGained: 0,
+        },
+    timing: {
+      avgResponseTime: getNumber(timing, 'avgResponseTime'),
+      maxResponseTime: getNumber(timing, 'maxResponseTime'),
+      totalDuration: getNumber(timing, 'totalDuration'),
+    },
+    optimalityScore: metrics.optimalityScore as number, // validated as a number above
+  };
+}

package/src/benchmark/simulation-types.ts ADDED Viewed

@@ -0,0 +1,78 @@
+import type { JsonValue } from '../adapter';
+export type AgentActionType = // String-literal union of the supported agent action kinds; it is the type of AgentAction.type
+  | 'query_state'
+  | 'buy_prediction'
+  | 'sell_prediction'
+  | 'open_perp'
+  | 'close_perp'
+  | 'create_post'
+  | 'join_group'
+  | 'send_message';
+export interface AgentAction { // One agent action: its tick, timestamp, kind, payload and response duration, plus optional correctness annotations
+  tick: number;
+  timestamp: number; // NOTE(review): unit/epoch is not stated here - confirm (duration below is in milliseconds)
+  type: AgentActionType;
+  data: Record<string, JsonValue>; // free-form JSON payload; its schema per action type is not defined in this file
+  /** How long agent took to respond in milliseconds */
+  duration: number;
+  correctness?: { // optional, and every field inside it is itself optional
+    /** Prediction market correctness tracking */
+    predictionCorrect?: boolean;
+    actualOutcome?: boolean;
+    predictedOutcome?: boolean;
+    /** Perpetual trade correctness tracking */
+    perpCorrect?: boolean;
+    sentimentAtTrade?: number;
+    priceChange?: number;
+    expectedDirection?: 'up' | 'down';
+    /** Sentiment analysis accuracy tracking */
+    sentimentAccuracy?: number;
+    sentimentAtTime?: number;
+    actualSentiment?: number;
+  };
+}
+export interface SimulationMetrics { // Aggregate simulation metrics: overall P&L, prediction / perp / social stats, timing and an optimality score; every field is required
+  /** Total P&L from all positions */
+  totalPnl: number;
+  /** Prediction market metrics */
+  predictionMetrics: {
+    totalPositions: number;
+    correctPredictions: number;
+    incorrectPredictions: number;
+    accuracy: number;
+    avgPnlPerPosition: number;
+  };
+  /** Perpetual trading metrics */
+  perpMetrics: {
+    totalTrades: number;
+    profitableTrades: number;
+    winRate: number;
+    avgPnlPerTrade: number;
+    maxDrawdown: number;
+  };
+  /** Social metrics */
+  socialMetrics: {
+    postsCreated: number;
+    groupsJoined: number;
+    messagesReceived: number;
+    reputationGained: number;
+  };
+  /** Timing metrics (NOTE(review): units are not stated here; AgentAction.duration is in milliseconds - confirm these match) */
+  timing: {
+    avgResponseTime: number;
+    maxResponseTime: number;
+    totalDuration: number;
+  };
+  /** Compared to optimal actions */
+  optimalityScore: number; // 0-100, how close to optimal
+} // NOTE(review): parseSimulationMetrics.ts imports SimulationMetrics from './SimulationEngine', not from this file - confirm both refer to the same shape