npm - @elizaos/training - Versions diffs - 2.0.0-alpha.11 - Mend

@elizaos/training 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (207)

package/Dockerfile +75 -0
package/Makefile +374 -0
package/README.md +346 -0
package/config/rubrics.json +137 -0
package/data/.gitkeep +0 -0
package/data/degen/.gitkeep +2 -0
package/data/trader/.gitkeep +2 -0
package/docker-compose.test.yml +57 -0
package/package.json +58 -0
package/python/config/babylon_atropos.yaml +90 -0
package/python/config/profiles/12gb.json +11 -0
package/python/config/profiles/16gb.json +10 -0
package/python/config/profiles/24gb.json +10 -0
package/python/config/profiles/48gb.json +10 -0
package/python/config/profiles/cpu.json +11 -0
package/python/config/profiles/l40-2gpu-safe.json +20 -0
package/python/config/profiles/l40-2gpu.json +22 -0
package/python/config/profiles/l40-4gpu.json +21 -0
package/python/config/profiles/l40.json +17 -0
package/python/config/tinker_training.yaml +143 -0
package/python/curriculum_state.json +165 -0
package/python/env.template +86 -0
package/python/env.training.template +46 -0
package/python/pyproject.toml +41 -0
package/python/requirements-ci.txt +31 -0
package/python/requirements.txt +87 -0
package/python/scripts/__init__.py +4 -0
package/python/scripts/import_json_trajectories.py +412 -0
package/python/scripts/local-finetune/README.md +63 -0
package/python/scripts/local-finetune/ingest_and_score.py +139 -0
package/python/scripts/local-finetune/merge_model.py +32 -0
package/python/scripts/local-finetune/test_adapter.py +91 -0
package/python/scripts/local-finetune/train_from_csv.py +132 -0
package/python/scripts/merge_trajectories.py +318 -0
package/python/scripts/run_ab_test.py +143 -0
package/python/scripts/run_full_pipeline.py +544 -0
package/python/scripts/run_tinker_training.py +192 -0
package/python/scripts/run_training.py +914 -0
package/python/scripts/test_judge.py +155 -0
package/python/scripts/test_pipeline.py +356 -0
package/python/scripts/test_trained_model.py +380 -0
package/python/scripts/train_local.py +528 -0
package/python/setup.py +20 -0
package/python/src/__init__.py +190 -0
package/python/src/data_bridge/__init__.py +24 -0
package/python/src/data_bridge/converter.py +435 -0
package/python/src/data_bridge/reader.py +393 -0
package/python/src/models.py +283 -0
package/python/src/training/__init__.py +605 -0
package/python/src/training/ab_testing.py +404 -0
package/python/src/training/action_executor.py +621 -0
package/python/src/training/archetype_trainer.py +347 -0
package/python/src/training/atropos_trainer.py +980 -0
package/python/src/training/babylon_env.py +1254 -0
package/python/src/training/error_recovery.py +647 -0
package/python/src/training/evaluation.py +856 -0
package/python/src/training/fast_simulator.py +880 -0
package/python/src/training/format_validator.py +584 -0
package/python/src/training/hybrid_env.py +522 -0
package/python/src/training/kl_controller.py +628 -0
package/python/src/training/multi_prompt_dataset.py +883 -0
package/python/src/training/multi_turn.py +656 -0
package/python/src/training/online_env.py +1084 -0
package/python/src/training/quality_scorer.py +391 -0
package/python/src/training/quality_utils.py +633 -0
package/python/src/training/rewards.py +1344 -0
package/python/src/training/rlaif_env.py +17 -0
package/python/src/training/rollout_generator.py +502 -0
package/python/src/training/rubric_loader.py +198 -0
package/python/src/training/scenario_pool.py +1072 -0
package/python/src/training/schemas.py +481 -0
package/python/src/training/service_manager.py +552 -0
package/python/src/training/simulation_bridge.py +535 -0
package/python/src/training/tick_reward_attribution.py +399 -0
package/python/src/training/tinker_client.py +575 -0
package/python/src/training/tinker_trainer.py +646 -0
package/python/src/training/tokenization_utils.py +402 -0
package/python/tests/e2e/__init__.py +13 -0
package/python/tests/e2e/conftest.py +258 -0
package/python/tests/e2e/test_full_pipeline.py +643 -0
package/python/tests/e2e/test_online_training_e2e.py +365 -0
package/python/tests/integration/__init__.py +12 -0
package/python/tests/integration/conftest.py +383 -0
package/python/tests/integration/test_db_integration.py +649 -0
package/python/tests/integration/test_json_mode_integration.py +554 -0
package/python/tests/test_action_executor.py +594 -0
package/python/tests/test_archetype_scoring.py +1027 -0
package/python/tests/test_atropos_integration.py +360 -0
package/python/tests/test_evaluation.py +727 -0
package/python/tests/test_format_validator.py +486 -0
package/python/tests/test_kl_controller.py +432 -0
package/python/tests/test_lr_scheduler.py +579 -0
package/python/tests/test_multi_turn.py +590 -0
package/python/tests/test_online_env.py +519 -0
package/python/tests/test_quality_scorer.py +474 -0
package/python/tests/test_scenario_pool.py +735 -0
package/python/tests/test_service_manager.py +585 -0
package/python/tests/test_simulation_rollout.py +581 -0
package/python/tests/test_tokenization_utils.py +501 -0
package/python/tests/test_training_orchestrator.py +497 -0
package/python/tests/test_training_output_structure.py +661 -0
package/research-output/training-runs/training-run-1770772042899.json +26 -0
package/research-output/training-runs/training-run-1770930079670.json +32 -0
package/research-output/training-runs/training-run-1770930143700.json +44 -0
package/research-output/training-runs/training-run-1770930183638.json +38 -0
package/research-output/training-runs/training-run-1770930442049.json +38 -0
package/research-output/training-runs/training-run-1770930793243.json +38 -0
package/scripts/assess-training-data.ts +422 -0
package/scripts/e2e-training-test.ts +550 -0
package/scripts/export-rubrics.ts +64 -0
package/scripts/generate-research-report.ts +1523 -0
package/scripts/generate_dataset.sh +173 -0
package/scripts/json-mode-benchmark.ts +399 -0
package/scripts/real-archetype-benchmark.ts +210 -0
package/scripts/run-baseline-comparison.ts +116 -0
package/scripts/run-full-pipeline.ts +272 -0
package/scripts/runpod_setup.sh +137 -0
package/scripts/runpod_validate.sh +147 -0
package/scripts/test-model-in-game.ts +955 -0
package/scripts/test-scoring.ts +73 -0
package/scripts/test-trained-model.ts +209 -0
package/scripts/train-and-test.ts +824 -0
package/scripts/verify-final.ts +118 -0
package/src/adapter.ts +516 -0
package/src/archetypes/ArchetypeConfigService.ts +626 -0
package/src/archetypes/derive-archetype.ts +249 -0
package/src/archetypes/index.ts +22 -0
package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
package/src/benchmark/BenchmarkDataViewer.ts +324 -0
package/src/benchmark/BenchmarkHistoryService.ts +221 -0
package/src/benchmark/BenchmarkRunner.ts +685 -0
package/src/benchmark/BenchmarkValidator.ts +206 -0
package/src/benchmark/FastEvalRunner.ts +225 -0
package/src/benchmark/MetricsValidator.ts +165 -0
package/src/benchmark/MetricsVisualizer.ts +909 -0
package/src/benchmark/ModelBenchmarkService.ts +611 -0
package/src/benchmark/ModelRegistry.ts +158 -0
package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
package/src/benchmark/SimulationA2AInterface.ts +1169 -0
package/src/benchmark/SimulationEngine.ts +832 -0
package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
package/src/benchmark/index.ts +89 -0
package/src/benchmark/parseSimulationMetrics.ts +124 -0
package/src/benchmark/simulation-types.ts +78 -0
package/src/dependencies.ts +439 -0
package/src/generation/TrajectoryGenerator.ts +387 -0
package/src/generation/index.ts +12 -0
package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
package/src/huggingface/index.ts +27 -0
package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
package/src/index.ts +102 -0
package/src/init-training.ts +53 -0
package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
package/src/metrics/index.ts +8 -0
package/src/metrics/types.ts +200 -0
package/src/rubrics/__tests__/index.test.ts +184 -0
package/src/rubrics/ass-kisser.ts +85 -0
package/src/rubrics/degen.ts +80 -0
package/src/rubrics/goody-twoshoes.ts +84 -0
package/src/rubrics/index.ts +236 -0
package/src/rubrics/information-trader.ts +84 -0
package/src/rubrics/infosec.ts +101 -0
package/src/rubrics/liar.ts +104 -0
package/src/rubrics/perps-trader.ts +87 -0
package/src/rubrics/researcher.ts +81 -0
package/src/rubrics/scammer.ts +82 -0
package/src/rubrics/social-butterfly.ts +73 -0
package/src/rubrics/super-predictor.ts +97 -0
package/src/rubrics/trader.ts +67 -0
package/src/scoring/ArchetypeScoringService.ts +486 -0
package/src/scoring/JudgePromptBuilder.ts +556 -0
package/src/scoring/LLMJudgeCache.ts +401 -0
package/src/scoring/index.ts +9 -0
package/src/training/AutomationPipeline.ts +916 -0
package/src/training/BenchmarkService.ts +518 -0
package/src/training/ConfigValidator.ts +220 -0
package/src/training/MarketOutcomesTracker.ts +187 -0
package/src/training/ModelDeployer.ts +186 -0
package/src/training/ModelFetcher.ts +76 -0
package/src/training/ModelSelectionService.ts +341 -0
package/src/training/ModelUsageVerifier.ts +160 -0
package/src/training/MultiModelOrchestrator.ts +580 -0
package/src/training/RLModelConfig.ts +407 -0
package/src/training/RewardBackpropagationService.ts +149 -0
package/src/training/RulerScoringService.ts +666 -0
package/src/training/TrainingMonitor.ts +166 -0
package/src/training/TrajectoryRecorder.ts +399 -0
package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
package/src/training/index.ts +100 -0
package/src/training/logRLConfig.ts +34 -0
package/src/training/pipeline.ts +129 -0
package/src/training/storage/ModelStorageService.ts +279 -0
package/src/training/storage/TrainingDataArchiver.ts +197 -0
package/src/training/storage/index.ts +17 -0
package/src/training/types.ts +207 -0
package/src/training/window-utils.ts +138 -0
package/src/utils/index.ts +101 -0
package/src/utils/logger.ts +59 -0
package/src/utils/snowflake.ts +17 -0
package/src/utils/synthetic-detector.ts +111 -0
package/tsconfig.json +20 -0

package/src/benchmark/index.ts ADDED Viewed

@@ -0,0 +1,89 @@
+/**
+ * Benchmark Module
+ *
+ * Tools for evaluating agent performance through simulation.
+ */
+// Multi-archetype matchup benchmarking
+export type {
+  ArchetypeVsResult,
+  MatchupAgent,
+  MatchupAgentResult,
+  MatchupBenchmarkConfig,
+  MatchupBenchmarkResult,
+} from './ArchetypeMatchupBenchmark';
+export {
+  ArchetypeMatchupBenchmark,
+  runQuickMatchupBenchmark,
+} from './ArchetypeMatchupBenchmark';
+export type {
+  BenchmarkHistoryEntry,
+  ModelComparisonData,
+} from './BenchmarkChartGenerator';
+export { BenchmarkChartGenerator } from './BenchmarkChartGenerator';
+export type {
+  BenchmarkConfig,
+  BenchmarkGameSnapshot,
+  CausalEventType,
+  GroundTruth,
+  HiddenNarrativeFact,
+  ScheduledCausalEvent,
+  VolatilityBucket,
+} from './BenchmarkDataGenerator';
+export { BenchmarkDataGenerator, SeededRandom } from './BenchmarkDataGenerator';
+export { BenchmarkDataViewer } from './BenchmarkDataViewer';
+export type {
+  BenchmarkHistoryQuery,
+  BenchmarkResultInput,
+  BenchmarkTrendData,
+} from './BenchmarkHistoryService';
+export { BenchmarkHistoryService } from './BenchmarkHistoryService';
+export type {
+  BenchmarkComparisonResult,
+  BenchmarkRunConfig,
+} from './BenchmarkRunner';
+export { BenchmarkRunner } from './BenchmarkRunner';
+export { BenchmarkValidator } from './BenchmarkValidator';
+export type { FastEvalConfig, FastEvalResult } from './FastEvalRunner';
+export { FastEvalRunner } from './FastEvalRunner';
+export { MetricsValidator } from './MetricsValidator';
+export { MetricsVisualizer } from './MetricsVisualizer';
+export type {
+  AverageMetrics,
+  ModelBenchmarkOptions,
+  ModelBenchmarkResult,
+  ModelComparisonResult,
+} from './ModelBenchmarkService';
+export { ModelBenchmarkService } from './ModelBenchmarkService';
+export type { ModelConfig } from './ModelRegistry';
+export {
+  getBaselineModels,
+  getModelById,
+  getModelByModelId,
+  getModelDisplayName,
+  getModelsByProvider,
+  getModelsByTier,
+  MODEL_REGISTRY,
+  validateModelId,
+} from './ModelRegistry';
+// Shared utilities
+export {
+  type JsonValue,
+  parseSimulationMetrics,
+} from './parseSimulationMetrics';
+export {
+  createRulerContext,
+  extractMarketOutcomesFromBenchmark,
+  getHiddenEventsForTick,
+  getHiddenFactsForTick,
+  getTrueFacts,
+  scoreActionAgainstGroundTruth,
+  wasDecisionOptimal,
+} from './RulerBenchmarkIntegration';
+export { SimulationA2AInterface } from './SimulationA2AInterface';
+export type {
+  SimulationConfig,
+  SimulationMetrics,
+  SimulationResult,
+} from './SimulationEngine';
+export { SimulationEngine } from './SimulationEngine';

package/src/benchmark/parseSimulationMetrics.ts ADDED Viewed

@@ -0,0 +1,124 @@
+/**
+ * Simulation Metrics Parser
+ *
+ * Shared utility for validating and parsing SimulationMetrics from JSON data.
+ * Used by ModelBenchmarkService and HuggingFaceModelUploader.
+ */
+import type { SimulationMetrics } from './SimulationEngine';
+/**
+ * JSON value type for parsing untyped data
+ *
+ * Recursive union: primitives, arrays of JsonValue, and string-keyed objects
+ * whose values are JsonValue.
+ *
+ * NOTE: `undefined` is part of this union even though JSON itself has no
+ * `undefined` value, so consumers must handle it wherever they narrow.
+ */
+export type JsonValue =
+  | string
+  | number
+  | boolean
+  | null
+  | undefined
+  | JsonValue[]
+  | { [key: string]: JsonValue };
+/**
+ * Parse and validate SimulationMetrics from JSON data
+ *
+ * Validation is strict for the overall shape and lenient for leaf values:
+ * - `data`, `predictionMetrics`, `perpMetrics` and `timing` must be plain
+ *   objects. Arrays are rejected explicitly: `typeof [] === 'object'`, so a
+ *   bare typeof check would let an array through and silently produce
+ *   all-zero metrics.
+ * - `totalPnl` and `optimalityScore` must be numbers.
+ * - `socialMetrics` is optional; when it is absent or not a plain object,
+ *   every social counter is reported as 0.
+ * - Any nested field that is missing or not a number is reported as 0.
+ *
+ * @param data - Raw JSON data to parse
+ * @returns A new SimulationMetrics object containing only the known fields
+ * @throws Error if `data` or one of the required fields has the wrong type
+ */
+export function parseSimulationMetrics(data: JsonValue): SimulationMetrics {
+  // Plain-object guard; Array.isArray is required because arrays are 'object' too.
+  const isRecord = (value: JsonValue): value is Record<string, JsonValue> =>
+    typeof value === 'object' && value !== null && !Array.isArray(value);
+  // Read a numeric field, falling back to 0 when it is missing or mistyped.
+  const getNumber = (obj: Record<string, JsonValue>, key: string): number => {
+    const val = obj[key];
+    return typeof val === 'number' ? val : 0;
+  };
+  if (!isRecord(data)) {
+    throw new Error('Invalid SimulationMetrics: expected object');
+  }
+  const {
+    totalPnl,
+    predictionMetrics,
+    perpMetrics,
+    optimalityScore,
+    timing,
+    socialMetrics,
+  } = data;
+  // Validate required fields. The check order is kept stable so the first
+  // problem reported for a given input does not change.
+  if (typeof totalPnl !== 'number') {
+    throw new Error('Invalid SimulationMetrics: totalPnl must be a number');
+  }
+  if (!isRecord(predictionMetrics)) {
+    throw new Error(
+      'Invalid SimulationMetrics: predictionMetrics must be an object'
+    );
+  }
+  if (!isRecord(perpMetrics)) {
+    throw new Error('Invalid SimulationMetrics: perpMetrics must be an object');
+  }
+  if (typeof optimalityScore !== 'number') {
+    throw new Error(
+      'Invalid SimulationMetrics: optimalityScore must be a number'
+    );
+  }
+  if (!isRecord(timing)) {
+    throw new Error('Invalid SimulationMetrics: timing must be an object');
+  }
+  // An empty record makes every social counter fall back to 0 via getNumber.
+  const social: Record<string, JsonValue> = isRecord(socialMetrics)
+    ? socialMetrics
+    : {};
+  return {
+    totalPnl,
+    predictionMetrics: {
+      totalPositions: getNumber(predictionMetrics, 'totalPositions'),
+      correctPredictions: getNumber(predictionMetrics, 'correctPredictions'),
+      incorrectPredictions: getNumber(
+        predictionMetrics,
+        'incorrectPredictions'
+      ),
+      accuracy: getNumber(predictionMetrics, 'accuracy'),
+      avgPnlPerPosition: getNumber(predictionMetrics, 'avgPnlPerPosition'),
+    },
+    perpMetrics: {
+      totalTrades: getNumber(perpMetrics, 'totalTrades'),
+      profitableTrades: getNumber(perpMetrics, 'profitableTrades'),
+      winRate: getNumber(perpMetrics, 'winRate'),
+      avgPnlPerTrade: getNumber(perpMetrics, 'avgPnlPerTrade'),
+      maxDrawdown: getNumber(perpMetrics, 'maxDrawdown'),
+    },
+    socialMetrics: {
+      postsCreated: getNumber(social, 'postsCreated'),
+      groupsJoined: getNumber(social, 'groupsJoined'),
+      messagesReceived: getNumber(social, 'messagesReceived'),
+      reputationGained: getNumber(social, 'reputationGained'),
+    },
+    timing: {
+      avgResponseTime: getNumber(timing, 'avgResponseTime'),
+      maxResponseTime: getNumber(timing, 'maxResponseTime'),
+      totalDuration: getNumber(timing, 'totalDuration'),
+    },
+    optimalityScore,
+  };
+}

package/src/benchmark/simulation-types.ts ADDED Viewed

@@ -0,0 +1,78 @@
+import type { JsonValue } from '../adapter';
+export type AgentActionType = // string-literal union used as the `type` field of AgentAction
+  | 'query_state'
+  | 'buy_prediction'
+  | 'sell_prediction'
+  | 'open_perp'
+  | 'close_perp'
+  | 'create_post'
+  | 'join_group'
+  | 'send_message';
+export interface AgentAction { // one agent action record: when it happened, its kind, payload, latency and optional correctness data
+  tick: number; // presumably the simulation tick the action belongs to — TODO confirm
+  timestamp: number; // unit and epoch are not specified in this file — TODO confirm against the producer
+  type: AgentActionType;
+  data: Record<string, JsonValue>; // free-form payload; JsonValue here is the one imported from '../adapter'
+  /** How long agent took to respond in milliseconds */
+  duration: number;
+  correctness?: { // optional, and every field inside it is optional as well
+    /** Prediction market correctness tracking */
+    predictionCorrect?: boolean;
+    actualOutcome?: boolean;
+    predictedOutcome?: boolean;
+    /** Perpetual trade correctness tracking */
+    perpCorrect?: boolean;
+    sentimentAtTrade?: number;
+    priceChange?: number;
+    expectedDirection?: 'up' | 'down';
+    /** Sentiment analysis accuracy tracking */
+    sentimentAccuracy?: number;
+    sentimentAtTime?: number;
+    actualSentiment?: number;
+  };
+}
+export interface SimulationMetrics { // groups total P&L, prediction, perp, social, timing and optimality figures; all members are required
+  /** Total P&L from all positions */
+  totalPnl: number;
+  /** Prediction market metrics */
+  predictionMetrics: {
+    totalPositions: number;
+    correctPredictions: number;
+    incorrectPredictions: number;
+    accuracy: number;
+    avgPnlPerPosition: number;
+  };
+  /** Perpetual trading metrics */
+  perpMetrics: {
+    totalTrades: number;
+    profitableTrades: number;
+    winRate: number;
+    avgPnlPerTrade: number;
+    maxDrawdown: number;
+  };
+  /** Social metrics */
+  socialMetrics: {
+    postsCreated: number;
+    groupsJoined: number;
+    messagesReceived: number;
+    reputationGained: number;
+  };
+  /** Timing metrics */
+  timing: {
+    avgResponseTime: number;
+    maxResponseTime: number;
+    totalDuration: number;
+  };
+  /** Compared to optimal actions */
+  optimalityScore: number; // 0-100, how close to optimal
+} // NOTE(review): sibling files import a `SimulationMetrics` type from './SimulationEngine' — confirm the two declarations have the same shape