npm - @elizaos/training - Versions diffs - 2.0.0-alpha.10 - Mend

@elizaos/training 2.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (224) hide show

package/Dockerfile +75 -0
package/LICENSE +21 -0
package/Makefile +374 -0
package/README.md +346 -0
package/config/rubrics.json +137 -0
package/docker-compose.test.yml +57 -0
package/package.json +57 -0
package/python/config/babylon_atropos.yaml +90 -0
package/python/config/profiles/12gb.json +11 -0
package/python/config/profiles/16gb.json +10 -0
package/python/config/profiles/24gb.json +10 -0
package/python/config/profiles/48gb.json +10 -0
package/python/config/profiles/cpu.json +11 -0
package/python/config/profiles/l40-2gpu-safe.json +20 -0
package/python/config/profiles/l40-2gpu.json +22 -0
package/python/config/profiles/l40-4gpu.json +21 -0
package/python/config/profiles/l40.json +17 -0
package/python/config/tinker_training.yaml +143 -0
package/python/curriculum_state.json +165 -0
package/python/env.template +86 -0
package/python/env.training.template +46 -0
package/python/pyproject.toml +41 -0
package/python/requirements-ci.txt +31 -0
package/python/requirements.txt +87 -0
package/python/scripts/__init__.py +4 -0
package/python/scripts/benchmark_should_respond.py +190 -0
package/python/scripts/debug_inference.py +62 -0
package/python/scripts/import_json_trajectories.py +412 -0
package/python/scripts/local-finetune/README.md +63 -0
package/python/scripts/local-finetune/ingest_and_score.py +139 -0
package/python/scripts/local-finetune/merge_model.py +32 -0
package/python/scripts/local-finetune/test_adapter.py +91 -0
package/python/scripts/local-finetune/train_from_csv.py +132 -0
package/python/scripts/merge_trajectories.py +318 -0
package/python/scripts/optimize_prompt_grpo.py +269 -0
package/python/scripts/run_ab_test.py +143 -0
package/python/scripts/run_full_pipeline.py +544 -0
package/python/scripts/run_tinker_training.py +192 -0
package/python/scripts/run_training.py +914 -0
package/python/scripts/test_generation.py +29 -0
package/python/scripts/test_judge.py +155 -0
package/python/scripts/test_pipeline.py +356 -0
package/python/scripts/test_trained_model.py +380 -0
package/python/scripts/train_grpo.py +360 -0
package/python/scripts/train_jsonl.py +223 -0
package/python/scripts/train_local.py +528 -0
package/python/setup.py +20 -0
package/python/src/__init__.py +190 -0
package/python/src/data_bridge/__init__.py +24 -0
package/python/src/data_bridge/converter.py +435 -0
package/python/src/data_bridge/reader.py +393 -0
package/python/src/models.py +283 -0
package/python/src/training/__init__.py +605 -0
package/python/src/training/ab_testing.py +404 -0
package/python/src/training/action_executor.py +621 -0
package/python/src/training/archetype_trainer.py +347 -0
package/python/src/training/atropos_trainer.py +980 -0
package/python/src/training/babylon_env.py +1254 -0
package/python/src/training/error_recovery.py +647 -0
package/python/src/training/evaluation.py +856 -0
package/python/src/training/fast_simulator.py +880 -0
package/python/src/training/format_validator.py +584 -0
package/python/src/training/hybrid_env.py +522 -0
package/python/src/training/kl_controller.py +628 -0
package/python/src/training/multi_prompt_dataset.py +883 -0
package/python/src/training/multi_turn.py +656 -0
package/python/src/training/online_env.py +1084 -0
package/python/src/training/quality_scorer.py +391 -0
package/python/src/training/quality_utils.py +633 -0
package/python/src/training/rewards.py +1344 -0
package/python/src/training/rlaif_env.py +17 -0
package/python/src/training/rollout_generator.py +502 -0
package/python/src/training/rubric_loader.py +198 -0
package/python/src/training/scenario_pool.py +1072 -0
package/python/src/training/schemas.py +481 -0
package/python/src/training/service_manager.py +552 -0
package/python/src/training/simulation_bridge.py +535 -0
package/python/src/training/tick_reward_attribution.py +399 -0
package/python/src/training/tinker_client.py +575 -0
package/python/src/training/tinker_trainer.py +646 -0
package/python/src/training/tokenization_utils.py +402 -0
package/python/tests/e2e/__init__.py +13 -0
package/python/tests/e2e/conftest.py +258 -0
package/python/tests/e2e/test_full_pipeline.py +643 -0
package/python/tests/e2e/test_online_training_e2e.py +365 -0
package/python/tests/integration/__init__.py +12 -0
package/python/tests/integration/conftest.py +383 -0
package/python/tests/integration/test_db_integration.py +649 -0
package/python/tests/integration/test_json_mode_integration.py +554 -0
package/python/tests/test_action_executor.py +594 -0
package/python/tests/test_archetype_scoring.py +1027 -0
package/python/tests/test_atropos_integration.py +360 -0
package/python/tests/test_evaluation.py +727 -0
package/python/tests/test_format_validator.py +486 -0
package/python/tests/test_kl_controller.py +432 -0
package/python/tests/test_lr_scheduler.py +579 -0
package/python/tests/test_multi_turn.py +590 -0
package/python/tests/test_online_env.py +519 -0
package/python/tests/test_quality_scorer.py +474 -0
package/python/tests/test_scenario_pool.py +735 -0
package/python/tests/test_service_manager.py +585 -0
package/python/tests/test_simulation_rollout.py +581 -0
package/python/tests/test_tokenization_utils.py +501 -0
package/python/tests/test_training_orchestrator.py +497 -0
package/python/tests/test_training_output_structure.py +661 -0
package/research-output/training-runs/training-run-1770772042899.json +26 -0
package/research-output/training-runs/training-run-1770930079670.json +32 -0
package/research-output/training-runs/training-run-1770930143700.json +44 -0
package/research-output/training-runs/training-run-1770930183638.json +38 -0
package/research-output/training-runs/training-run-1770930442049.json +38 -0
package/research-output/training-runs/training-run-1770930793243.json +38 -0
package/research-output/training-runs/training-run-1771276293257.json +38 -0
package/research-output/training-runs/training-run-1771276389280.json +38 -0
package/research-output/training-runs/training-run-1771276502776.json +38 -0
package/research-output/training-runs/training-run-1771277340748.json +38 -0
package/research-output/training-runs/training-run-1773013658993.json +38 -0
package/research-output/training-runs/training-run-1773013861014.json +38 -0
package/research-output/training-runs/training-run-1773014215983.json +38 -0
package/scripts/assess-training-data.ts +422 -0
package/scripts/e2e-training-test.ts +550 -0
package/scripts/export-rubrics.ts +64 -0
package/scripts/generate-research-report.ts +1523 -0
package/scripts/generate_dataset.sh +173 -0
package/scripts/generate_should_respond.ts +267 -0
package/scripts/generate_should_respond_dataset.ts +162 -0
package/scripts/json-mode-benchmark.ts +399 -0
package/scripts/rank_trajectories.ts +207 -0
package/scripts/real-archetype-benchmark.ts +210 -0
package/scripts/run-baseline-comparison.ts +116 -0
package/scripts/run-full-pipeline.ts +272 -0
package/scripts/run_rlaif_loop.ts +78 -0
package/scripts/run_task_benchmark.ts +247 -0
package/scripts/runpod_setup.sh +137 -0
package/scripts/runpod_validate.sh +147 -0
package/scripts/test-model-in-game.ts +955 -0
package/scripts/test-scoring.ts +73 -0
package/scripts/test-trained-model.ts +209 -0
package/scripts/train-and-test.ts +824 -0
package/scripts/verify-final.ts +118 -0
package/src/adapter.ts +516 -0
package/src/archetypes/ArchetypeConfigService.ts +626 -0
package/src/archetypes/derive-archetype.ts +249 -0
package/src/archetypes/index.ts +22 -0
package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
package/src/benchmark/BenchmarkDataViewer.ts +324 -0
package/src/benchmark/BenchmarkHistoryService.ts +221 -0
package/src/benchmark/BenchmarkRunner.ts +685 -0
package/src/benchmark/BenchmarkValidator.ts +204 -0
package/src/benchmark/FastEvalRunner.ts +225 -0
package/src/benchmark/MetricsValidator.ts +165 -0
package/src/benchmark/MetricsVisualizer.ts +909 -0
package/src/benchmark/ModelBenchmarkService.ts +611 -0
package/src/benchmark/ModelRegistry.ts +158 -0
package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
package/src/benchmark/SimulationA2AInterface.ts +1169 -0
package/src/benchmark/SimulationEngine.ts +832 -0
package/src/benchmark/TaskRunner.ts +94 -0
package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
package/src/benchmark/index.ts +91 -0
package/src/benchmark/parseSimulationMetrics.ts +124 -0
package/src/benchmark/simulation-types.ts +78 -0
package/src/dependencies.ts +475 -0
package/src/generation/TrajectoryGenerator.ts +387 -0
package/src/generation/index.ts +12 -0
package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
package/src/huggingface/index.ts +27 -0
package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
package/src/index.ts +102 -0
package/src/init-training.ts +53 -0
package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
package/src/metrics/index.ts +8 -0
package/src/metrics/types.ts +200 -0
package/src/rubrics/__tests__/index.test.ts +184 -0
package/src/rubrics/ass-kisser.ts +85 -0
package/src/rubrics/degen.ts +80 -0
package/src/rubrics/goody-twoshoes.ts +84 -0
package/src/rubrics/index.ts +236 -0
package/src/rubrics/information-trader.ts +84 -0
package/src/rubrics/infosec.ts +101 -0
package/src/rubrics/liar.ts +104 -0
package/src/rubrics/perps-trader.ts +87 -0
package/src/rubrics/researcher.ts +81 -0
package/src/rubrics/scammer.ts +82 -0
package/src/rubrics/social-butterfly.ts +73 -0
package/src/rubrics/super-predictor.ts +97 -0
package/src/rubrics/trader.ts +67 -0
package/src/scoring/ArchetypeScoringService.ts +486 -0
package/src/scoring/JudgePromptBuilder.ts +556 -0
package/src/scoring/LLMJudgeCache.ts +401 -0
package/src/scoring/index.ts +9 -0
package/src/training/AutomationPipeline.ts +916 -0
package/src/training/BenchmarkService.ts +518 -0
package/src/training/ConfigValidator.ts +220 -0
package/src/training/MarketOutcomesTracker.ts +187 -0
package/src/training/ModelDeployer.ts +186 -0
package/src/training/ModelFetcher.ts +76 -0
package/src/training/ModelSelectionService.ts +341 -0
package/src/training/ModelUsageVerifier.ts +160 -0
package/src/training/MultiModelOrchestrator.ts +580 -0
package/src/training/RLModelConfig.ts +407 -0
package/src/training/RewardBackpropagationService.ts +149 -0
package/src/training/RulerScoringService.ts +666 -0
package/src/training/TrainingMonitor.ts +166 -0
package/src/training/TrajectoryRecorder.ts +399 -0
package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
package/src/training/index.ts +100 -0
package/src/training/logRLConfig.ts +34 -0
package/src/training/pipeline.ts +129 -0
package/src/training/storage/ModelStorageService.ts +279 -0
package/src/training/storage/TrainingDataArchiver.ts +197 -0
package/src/training/storage/index.ts +17 -0
package/src/training/types.ts +207 -0
package/src/training/window-utils.ts +138 -0
package/src/utils/index.ts +101 -0
package/src/utils/logger.ts +59 -0
package/src/utils/snowflake.ts +17 -0
package/src/utils/synthetic-detector.ts +111 -0
package/tsconfig.json +20 -0

package/scripts/test-scoring.ts ADDED Viewed

@@ -0,0 +1,73 @@
+#!/usr/bin/env bun
+/**
+ * Test scoring directly
+ */
+import { and, db, desc, eq, isNull, not, trajectories } from '@elizaos/db';
+import { archetypeScoringService } from '../src/scoring';
+/**
+ * Smoke-tests trajectory scoring end to end.
+ *
+ * Selects up to 10 trajectories that are flagged as training data, have no
+ * `aiJudgeReward` yet and whose `stepsJson` is neither 'null' nor '[]', passes
+ * their IDs to `archetypeScoringService.scoreByArchetype('default', ids)`, and
+ * then prints a sample of the most recently judged trajectories.
+ *
+ * Exit status: 0 when there is nothing to score or scoring completes, 1 when
+ * scoring (or the follow-up query) throws, so callers and CI can detect
+ * failures.
+ */
+async function main(): Promise<void> {
+  console.log('Testing trajectory scoring...\n');
+
+  // Candidate trajectories: training data that has not been judged yet and
+  // has a non-empty steps payload.
+  const unscored = await db
+    .select({ trajectoryId: trajectories.trajectoryId })
+    .from(trajectories)
+    .where(
+      and(
+        isNull(trajectories.aiJudgeReward),
+        eq(trajectories.isTrainingData, true),
+        not(eq(trajectories.stepsJson, 'null')),
+        not(eq(trajectories.stepsJson, '[]'))
+      )
+    )
+    .limit(10);
+  console.log(`Found ${unscored.length} unscored trajectories`);
+  if (unscored.length === 0) {
+    console.log('No trajectories to score!');
+    process.exit(0);
+  }
+
+  const ids = unscored.map((t) => t.trajectoryId);
+  console.log('Trajectory IDs:', ids);
+  console.log('\nAttempting to score...');
+
+  let exitCode = 0;
+  try {
+    const result = await archetypeScoringService.scoreByArchetype(
+      'default',
+      ids
+    );
+    console.log('\nResult:', result);
+
+    // Sample the five most recently judged trajectories. Note that this query
+    // is not restricted to `ids`, so it can also show rows judged earlier.
+    const scored = await db
+      .select({
+        trajectoryId: trajectories.trajectoryId,
+        aiJudgeReward: trajectories.aiJudgeReward,
+        aiJudgeReasoning: trajectories.aiJudgeReasoning,
+      })
+      .from(trajectories)
+      .where(not(isNull(trajectories.aiJudgeReward)))
+      .orderBy(desc(trajectories.judgedAt))
+      .limit(5);
+    console.log('\nScored trajectories:', scored.length);
+    if (scored.length > 0) {
+      console.log('Sample scores:');
+      for (const s of scored) {
+        // Reasoning may be absent; print a placeholder instead of "undefined".
+        const preview = s.aiJudgeReasoning?.substring(0, 50) ?? '(none)';
+        console.log(
+          `  ${s.trajectoryId}: score=${s.aiJudgeReward}, reasoning=${preview}...`
+        );
+      }
+    }
+  } catch (error) {
+    console.error('Scoring error:', error);
+    // Report the failure through the exit status instead of exiting 0.
+    exitCode = 1;
+  }
+  // Exit explicitly so the script ends with the intended status.
+  // NOTE(review): presumably an open db connection would otherwise keep the
+  // process alive — confirm.
+  process.exit(exitCode);
+}
+main().catch((error: unknown) => {
+  // Failures outside the try block (e.g. the initial query) must also fail the run.
+  console.error(error);
+  process.exit(1);
+});

package/scripts/test-trained-model.ts ADDED Viewed

@@ -0,0 +1,209 @@
+#!/usr/bin/env bun
+/**
+ * Test Trained Model - TypeScript/Bun
+ *
+ * Tests a trained model by:
+ * 1. Loading model from database or path
+ * 2. Running benchmark if available
+ * 3. Testing inference
+ * 4. Comparing to baseline
+ *
+ * Usage:
+ *   bun run packages/training/scripts/test-trained-model.ts --model-id <id>
+ *   bun run packages/training/scripts/test-trained-model.ts --model-path <path> --benchmark
+ */
+import { db, eq, trainedModels } from '@elizaos/db';
+import { BenchmarkService } from '../src/training/BenchmarkService';
+import { logger } from '../src/utils/logger';
+interface TestConfig { // Options for testModel, filled in from command-line flags by main()
+  modelId?: string; // --model-id: looked up in the trainedModels table; takes precedence over modelPath
+  modelPath?: string; // --model-path: used as storagePath of a mock model entry when modelId is absent
+  benchmark?: boolean; // --benchmark: run the benchmark step (only effective together with modelId)
+  benchmarkPath?: string; // --benchmark-path: forwarded as the second argument of benchmarkModel
+  compareToBaseline?: boolean; // --compare: after benchmarking, also call compareModels
+}
+/**
+ * Runs a sequence of checks against a trained model and logs the outcome.
+ *
+ * Steps:
+ * 1. Resolve the model: by `config.modelId` from the `trainedModels` table or,
+ *    failing that, as a mock entry built around `config.modelPath`.
+ * 2. Check that the model's storage path exists on disk (warning only).
+ * 3. If `config.benchmark` is set and a `modelId` was given, run
+ *    `BenchmarkService.benchmarkModel` and, when requested, `compareModels`.
+ * 4. Check that the `trainedModels` table can be queried. No inference is
+ *    actually executed by this step.
+ *
+ * Errors from the database lookup or the benchmark service are not caught
+ * here and propagate to the caller.
+ *
+ * @param config - Options parsed from the command line.
+ * @throws Error if neither `modelId` nor `modelPath` is provided, if the
+ *   `modelId` has no matching row, or if the resolved model has no storage path.
+ */
+async function testModel(config: TestConfig): Promise<void> {
+  // Prints a section title framed by two separator lines.
+  const banner = (title: string): void => {
+    logger.info('='.repeat(60));
+    logger.info(title);
+    logger.info('='.repeat(60));
+  };
+
+  logger.info('Testing trained model', config);
+  // Get model from database or path
+  let model;
+  if (config.modelId) {
+    const result = await db
+      .select()
+      .from(trainedModels)
+      .where(eq(trainedModels.modelId, config.modelId))
+      .limit(1);
+    model = result[0];
+    if (!model) {
+      throw new Error(`Model not found: ${config.modelId}`);
+    }
+    logger.info('Found model in database', {
+      modelId: model.modelId,
+      version: model.version,
+      status: model.status,
+      storagePath: model.storagePath,
+    });
+  } else if (config.modelPath) {
+    // Create mock model entry for testing
+    model = {
+      modelId: `test-${Date.now()}`,
+      version: 'test',
+      status: 'ready' as const,
+      storagePath: config.modelPath,
+      benchmarkScore: null,
+    };
+    logger.info('Using model from path', {
+      modelPath: config.modelPath,
+    });
+  } else {
+    throw new Error('Must provide either --model-id or --model-path');
+  }
+
+  // Test 1: Model loading validation
+  banner('TEST 1: Model Loading');
+  if (!model.storagePath) {
+    throw new Error('Model storage path not set');
+  }
+  // A failing existence check is treated the same as "not found".
+  const modelExists = await Bun.file(model.storagePath)
+    .exists()
+    .catch(() => false);
+  if (modelExists) {
+    logger.info('✅ Model path validated', {
+      path: model.storagePath,
+    });
+  } else {
+    // Warn for --model-path as well; previously a missing path supplied on the
+    // command line was still reported as validated.
+    // NOTE(review): Bun.file().exists() may report false for directories —
+    // confirm whether model artifacts can be directories.
+    logger.warn('Model file not found at storage path', {
+      storagePath: model.storagePath,
+    });
+  }
+
+  // Test 2: Benchmark if requested
+  if (config.benchmark) {
+    banner('TEST 2: Running Benchmark');
+    if (config.modelId) {
+      const benchmarkService = new BenchmarkService();
+      const results = await benchmarkService.benchmarkModel(
+        config.modelId,
+        config.benchmarkPath
+      );
+      logger.info('Benchmark Results:', {
+        score: results.benchmarkScore,
+        pnl: results.pnl,
+        accuracy: results.accuracy,
+        optimality: results.optimality,
+      });
+      // Compare to baseline if requested
+      if (config.compareToBaseline) {
+        const comparison = await benchmarkService.compareModels(config.modelId);
+        logger.info('Comparison to Baseline:', {
+          newScore: comparison.newScore,
+          previousScore: comparison.previousScore,
+          improvement: comparison.improvement,
+          shouldDeploy: comparison.shouldDeploy,
+          reason: comparison.reason,
+        });
+      }
+    } else {
+      logger.warn('Benchmark requires model-id (model must be in database)');
+    }
+  }
+
+  // Test 3: Inference test (if we can get runtime)
+  banner('TEST 3: Inference Test');
+  try {
+    // Only checks that at least one row exists in trainedModels; no model is
+    // loaded or invoked here.
+    const testAgentResult = await db.select().from(trainedModels).limit(1);
+    if (testAgentResult.length > 0) {
+      logger.info('✅ Inference test setup available');
+      logger.info('Run full benchmark to test inference with real agent');
+    } else {
+      logger.warn('No test agent available for inference test');
+    }
+  } catch (error) {
+    // Best effort: a failing query downgrades to a warning instead of aborting.
+    logger.warn('Inference test skipped', {
+      error: error instanceof Error ? error.message : String(error),
+    });
+  }
+
+  // Summary
+  banner('TESTING COMPLETE');
+  logger.info('Model:', {
+    id: model.modelId,
+    version: model.version,
+    status: model.status,
+  });
+  // This is the score stored on the resolved model entry, not the result of a
+  // benchmark run above. `!= null` also skips an undefined score.
+  if (model.benchmarkScore != null) {
+    logger.info('Benchmark Score:', model.benchmarkScore);
+  }
+}
+/**
+ * Command-line entry point: turns the process arguments into a TestConfig,
+ * prints usage and exits with status 1 when no model is specified, and
+ * otherwise runs testModel, exiting with status 1 if it throws.
+ *
+ * Flags that expect a value are ignored when they are the last argument;
+ * unrecognised arguments are ignored as well.
+ */
+async function main() {
+  const argv = process.argv.slice(2);
+  const options: TestConfig = {};
+
+  let idx = 0;
+  while (idx < argv.length) {
+    const flag = argv[idx];
+    const hasValue = idx + 1 < argv.length;
+    switch (flag) {
+      case '--model-id':
+        if (hasValue) {
+          options.modelId = argv[idx + 1];
+          idx++; // skip the consumed value
+        }
+        break;
+      case '--model-path':
+        if (hasValue) {
+          options.modelPath = argv[idx + 1];
+          idx++; // skip the consumed value
+        }
+        break;
+      case '--benchmark':
+        options.benchmark = true;
+        break;
+      case '--benchmark-path':
+        if (hasValue) {
+          options.benchmarkPath = argv[idx + 1];
+          idx++; // skip the consumed value
+        }
+        break;
+      case '--compare':
+        options.compareToBaseline = true;
+        break;
+      default:
+        break;
+    }
+    idx++;
+  }
+
+  // A model must be identified either by database ID or by path.
+  const hasTarget = Boolean(options.modelId) || Boolean(options.modelPath);
+  if (!hasTarget) {
+    const usageLines = [
+      'Usage:',
+      '  bun run test-trained-model.ts --model-id <id> [--benchmark] [--compare]',
+      '  bun run test-trained-model.ts --model-path <path> [--benchmark]',
+    ];
+    for (const line of usageLines) {
+      console.error(line);
+    }
+    process.exit(1);
+  }
+
+  try {
+    await testModel(options);
+  } catch (err) {
+    const message = err instanceof Error ? err.message : String(err);
+    logger.error('Testing failed', {
+      error: message,
+    });
+    process.exit(1);
+  }
+}
+void main();