npm - @elizaos/training - Versions diffs - 2.0.0-alpha.11 - Mend

@elizaos/training 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (207) hide show

package/Dockerfile +75 -0
package/Makefile +374 -0
package/README.md +346 -0
package/config/rubrics.json +137 -0
package/data/.gitkeep +0 -0
package/data/degen/.gitkeep +2 -0
package/data/trader/.gitkeep +2 -0
package/docker-compose.test.yml +57 -0
package/package.json +58 -0
package/python/config/babylon_atropos.yaml +90 -0
package/python/config/profiles/12gb.json +11 -0
package/python/config/profiles/16gb.json +10 -0
package/python/config/profiles/24gb.json +10 -0
package/python/config/profiles/48gb.json +10 -0
package/python/config/profiles/cpu.json +11 -0
package/python/config/profiles/l40-2gpu-safe.json +20 -0
package/python/config/profiles/l40-2gpu.json +22 -0
package/python/config/profiles/l40-4gpu.json +21 -0
package/python/config/profiles/l40.json +17 -0
package/python/config/tinker_training.yaml +143 -0
package/python/curriculum_state.json +165 -0
package/python/env.template +86 -0
package/python/env.training.template +46 -0
package/python/pyproject.toml +41 -0
package/python/requirements-ci.txt +31 -0
package/python/requirements.txt +87 -0
package/python/scripts/__init__.py +4 -0
package/python/scripts/import_json_trajectories.py +412 -0
package/python/scripts/local-finetune/README.md +63 -0
package/python/scripts/local-finetune/ingest_and_score.py +139 -0
package/python/scripts/local-finetune/merge_model.py +32 -0
package/python/scripts/local-finetune/test_adapter.py +91 -0
package/python/scripts/local-finetune/train_from_csv.py +132 -0
package/python/scripts/merge_trajectories.py +318 -0
package/python/scripts/run_ab_test.py +143 -0
package/python/scripts/run_full_pipeline.py +544 -0
package/python/scripts/run_tinker_training.py +192 -0
package/python/scripts/run_training.py +914 -0
package/python/scripts/test_judge.py +155 -0
package/python/scripts/test_pipeline.py +356 -0
package/python/scripts/test_trained_model.py +380 -0
package/python/scripts/train_local.py +528 -0
package/python/setup.py +20 -0
package/python/src/__init__.py +190 -0
package/python/src/data_bridge/__init__.py +24 -0
package/python/src/data_bridge/converter.py +435 -0
package/python/src/data_bridge/reader.py +393 -0
package/python/src/models.py +283 -0
package/python/src/training/__init__.py +605 -0
package/python/src/training/ab_testing.py +404 -0
package/python/src/training/action_executor.py +621 -0
package/python/src/training/archetype_trainer.py +347 -0
package/python/src/training/atropos_trainer.py +980 -0
package/python/src/training/babylon_env.py +1254 -0
package/python/src/training/error_recovery.py +647 -0
package/python/src/training/evaluation.py +856 -0
package/python/src/training/fast_simulator.py +880 -0
package/python/src/training/format_validator.py +584 -0
package/python/src/training/hybrid_env.py +522 -0
package/python/src/training/kl_controller.py +628 -0
package/python/src/training/multi_prompt_dataset.py +883 -0
package/python/src/training/multi_turn.py +656 -0
package/python/src/training/online_env.py +1084 -0
package/python/src/training/quality_scorer.py +391 -0
package/python/src/training/quality_utils.py +633 -0
package/python/src/training/rewards.py +1344 -0
package/python/src/training/rlaif_env.py +17 -0
package/python/src/training/rollout_generator.py +502 -0
package/python/src/training/rubric_loader.py +198 -0
package/python/src/training/scenario_pool.py +1072 -0
package/python/src/training/schemas.py +481 -0
package/python/src/training/service_manager.py +552 -0
package/python/src/training/simulation_bridge.py +535 -0
package/python/src/training/tick_reward_attribution.py +399 -0
package/python/src/training/tinker_client.py +575 -0
package/python/src/training/tinker_trainer.py +646 -0
package/python/src/training/tokenization_utils.py +402 -0
package/python/tests/e2e/__init__.py +13 -0
package/python/tests/e2e/conftest.py +258 -0
package/python/tests/e2e/test_full_pipeline.py +643 -0
package/python/tests/e2e/test_online_training_e2e.py +365 -0
package/python/tests/integration/__init__.py +12 -0
package/python/tests/integration/conftest.py +383 -0
package/python/tests/integration/test_db_integration.py +649 -0
package/python/tests/integration/test_json_mode_integration.py +554 -0
package/python/tests/test_action_executor.py +594 -0
package/python/tests/test_archetype_scoring.py +1027 -0
package/python/tests/test_atropos_integration.py +360 -0
package/python/tests/test_evaluation.py +727 -0
package/python/tests/test_format_validator.py +486 -0
package/python/tests/test_kl_controller.py +432 -0
package/python/tests/test_lr_scheduler.py +579 -0
package/python/tests/test_multi_turn.py +590 -0
package/python/tests/test_online_env.py +519 -0
package/python/tests/test_quality_scorer.py +474 -0
package/python/tests/test_scenario_pool.py +735 -0
package/python/tests/test_service_manager.py +585 -0
package/python/tests/test_simulation_rollout.py +581 -0
package/python/tests/test_tokenization_utils.py +501 -0
package/python/tests/test_training_orchestrator.py +497 -0
package/python/tests/test_training_output_structure.py +661 -0
package/research-output/training-runs/training-run-1770772042899.json +26 -0
package/research-output/training-runs/training-run-1770930079670.json +32 -0
package/research-output/training-runs/training-run-1770930143700.json +44 -0
package/research-output/training-runs/training-run-1770930183638.json +38 -0
package/research-output/training-runs/training-run-1770930442049.json +38 -0
package/research-output/training-runs/training-run-1770930793243.json +38 -0
package/scripts/assess-training-data.ts +422 -0
package/scripts/e2e-training-test.ts +550 -0
package/scripts/export-rubrics.ts +64 -0
package/scripts/generate-research-report.ts +1523 -0
package/scripts/generate_dataset.sh +173 -0
package/scripts/json-mode-benchmark.ts +399 -0
package/scripts/real-archetype-benchmark.ts +210 -0
package/scripts/run-baseline-comparison.ts +116 -0
package/scripts/run-full-pipeline.ts +272 -0
package/scripts/runpod_setup.sh +137 -0
package/scripts/runpod_validate.sh +147 -0
package/scripts/test-model-in-game.ts +955 -0
package/scripts/test-scoring.ts +73 -0
package/scripts/test-trained-model.ts +209 -0
package/scripts/train-and-test.ts +824 -0
package/scripts/verify-final.ts +118 -0
package/src/adapter.ts +516 -0
package/src/archetypes/ArchetypeConfigService.ts +626 -0
package/src/archetypes/derive-archetype.ts +249 -0
package/src/archetypes/index.ts +22 -0
package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
package/src/benchmark/BenchmarkDataViewer.ts +324 -0
package/src/benchmark/BenchmarkHistoryService.ts +221 -0
package/src/benchmark/BenchmarkRunner.ts +685 -0
package/src/benchmark/BenchmarkValidator.ts +206 -0
package/src/benchmark/FastEvalRunner.ts +225 -0
package/src/benchmark/MetricsValidator.ts +165 -0
package/src/benchmark/MetricsVisualizer.ts +909 -0
package/src/benchmark/ModelBenchmarkService.ts +611 -0
package/src/benchmark/ModelRegistry.ts +158 -0
package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
package/src/benchmark/SimulationA2AInterface.ts +1169 -0
package/src/benchmark/SimulationEngine.ts +832 -0
package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
package/src/benchmark/index.ts +89 -0
package/src/benchmark/parseSimulationMetrics.ts +124 -0
package/src/benchmark/simulation-types.ts +78 -0
package/src/dependencies.ts +439 -0
package/src/generation/TrajectoryGenerator.ts +387 -0
package/src/generation/index.ts +12 -0
package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
package/src/huggingface/index.ts +27 -0
package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
package/src/index.ts +102 -0
package/src/init-training.ts +53 -0
package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
package/src/metrics/index.ts +8 -0
package/src/metrics/types.ts +200 -0
package/src/rubrics/__tests__/index.test.ts +184 -0
package/src/rubrics/ass-kisser.ts +85 -0
package/src/rubrics/degen.ts +80 -0
package/src/rubrics/goody-twoshoes.ts +84 -0
package/src/rubrics/index.ts +236 -0
package/src/rubrics/information-trader.ts +84 -0
package/src/rubrics/infosec.ts +101 -0
package/src/rubrics/liar.ts +104 -0
package/src/rubrics/perps-trader.ts +87 -0
package/src/rubrics/researcher.ts +81 -0
package/src/rubrics/scammer.ts +82 -0
package/src/rubrics/social-butterfly.ts +73 -0
package/src/rubrics/super-predictor.ts +97 -0
package/src/rubrics/trader.ts +67 -0
package/src/scoring/ArchetypeScoringService.ts +486 -0
package/src/scoring/JudgePromptBuilder.ts +556 -0
package/src/scoring/LLMJudgeCache.ts +401 -0
package/src/scoring/index.ts +9 -0
package/src/training/AutomationPipeline.ts +916 -0
package/src/training/BenchmarkService.ts +518 -0
package/src/training/ConfigValidator.ts +220 -0
package/src/training/MarketOutcomesTracker.ts +187 -0
package/src/training/ModelDeployer.ts +186 -0
package/src/training/ModelFetcher.ts +76 -0
package/src/training/ModelSelectionService.ts +341 -0
package/src/training/ModelUsageVerifier.ts +160 -0
package/src/training/MultiModelOrchestrator.ts +580 -0
package/src/training/RLModelConfig.ts +407 -0
package/src/training/RewardBackpropagationService.ts +149 -0
package/src/training/RulerScoringService.ts +666 -0
package/src/training/TrainingMonitor.ts +166 -0
package/src/training/TrajectoryRecorder.ts +399 -0
package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
package/src/training/index.ts +100 -0
package/src/training/logRLConfig.ts +34 -0
package/src/training/pipeline.ts +129 -0
package/src/training/storage/ModelStorageService.ts +279 -0
package/src/training/storage/TrainingDataArchiver.ts +197 -0
package/src/training/storage/index.ts +17 -0
package/src/training/types.ts +207 -0
package/src/training/window-utils.ts +138 -0
package/src/utils/index.ts +101 -0
package/src/utils/logger.ts +59 -0
package/src/utils/snowflake.ts +17 -0
package/src/utils/synthetic-detector.ts +111 -0
package/tsconfig.json +20 -0

package/src/rubrics/index.ts ADDED Viewed

@@ -0,0 +1,236 @@
+/**
+ * Archetype Evaluation Rubrics
+ *
+ * LLM judge rubrics for each agent archetype defining what "success" means.
+ * Each archetype has specific scoring criteria tailored to its behavioral goals.
+ *
+ * @packageDocumentation
+ */
+import { createHash } from 'crypto';
+import { ASS_KISSER_PRIORITY_METRICS, ASS_KISSER_RUBRIC } from './ass-kisser';
+import { DEGEN_PRIORITY_METRICS, DEGEN_RUBRIC } from './degen';
+import {
+  GOODY_TWOSHOES_PRIORITY_METRICS,
+  GOODY_TWOSHOES_RUBRIC,
+} from './goody-twoshoes';
+import {
+  INFORMATION_TRADER_PRIORITY_METRICS,
+  INFORMATION_TRADER_RUBRIC,
+} from './information-trader';
+import { INFOSEC_PRIORITY_METRICS, INFOSEC_RUBRIC } from './infosec';
+import { LIAR_PRIORITY_METRICS, LIAR_RUBRIC } from './liar';
+import {
+  PERPS_TRADER_PRIORITY_METRICS,
+  PERPS_TRADER_RUBRIC,
+} from './perps-trader';
+import { RESEARCHER_PRIORITY_METRICS, RESEARCHER_RUBRIC } from './researcher';
+import { SCAMMER_PRIORITY_METRICS, SCAMMER_RUBRIC } from './scammer';
+import {
+  SOCIAL_BUTTERFLY_PRIORITY_METRICS,
+  SOCIAL_BUTTERFLY_RUBRIC,
+} from './social-butterfly';
+import {
+  SUPER_PREDICTOR_PRIORITY_METRICS,
+  SUPER_PREDICTOR_RUBRIC,
+} from './super-predictor';
+import { TRADER_PRIORITY_METRICS, TRADER_RUBRIC } from './trader';
+/**
+ * Default rubric for unknown archetypes.
+ *
+ * getRubric() falls back to this text when the normalized archetype has no
+ * entry in RUBRICS, and getAllRubricsHash() appends it to its hash input.
+ * The text is a runtime string: editing it changes those hashes.
+ */
+export const DEFAULT_RUBRIC = `
+## General Agent Evaluation
+You are evaluating an AI agent's performance in a prediction market simulation.
+### Scoring Criteria (0.0 to 1.0)
+- **Profitability**: Higher P&L should receive higher scores
+- **Risk Management**: Balanced positions and avoiding excessive losses
+- **Efficiency**: Achieving goals with fewer actions is better
+- **Decision Quality**: Good reasoning and analysis before actions
+### Scoring Guidelines
+- 0.8-1.0: Excellent performance, consistent profits, good risk management
+- 0.6-0.8: Good performance, positive P&L, reasonable decisions
+- 0.4-0.6: Average performance, mixed results
+- 0.2-0.4: Below average, some losses, questionable decisions
+- 0.0-0.2: Poor performance, significant losses, poor decision making
+Compare trajectories RELATIVE to each other within this group.
+If one trajectory is significantly better, reflect that in score differences.
+`;
+/**
+ * Default priority metrics for unknown archetypes.
+ *
+ * getPriorityMetrics() returns this list when the normalized archetype has
+ * no entry in PRIORITY_METRICS.
+ */
+export const DEFAULT_PRIORITY_METRICS = [
+  'trading.totalPnL',
+  'trading.winRate',
+  'behavior.actionSuccessRate',
+  'behavior.episodeLength',
+];
+/**
+ * Registry of all archetype rubrics, keyed by archetype name.
+ *
+ * Keys are lowercase. Canonical names use hyphens; each hyphenated name also
+ * has an alias entry with the hyphens removed that points at the same rubric
+ * constant. VALID_ARCHETYPES is built from these keys, so the aliases are
+ * accepted by the whitelist helpers as well.
+ */
+export const RUBRICS: Record<string, string> = {
+  trader: TRADER_RUBRIC,
+  'social-butterfly': SOCIAL_BUTTERFLY_RUBRIC,
+  scammer: SCAMMER_RUBRIC,
+  degen: DEGEN_RUBRIC,
+  researcher: RESEARCHER_RUBRIC,
+  'information-trader': INFORMATION_TRADER_RUBRIC,
+  'goody-twoshoes': GOODY_TWOSHOES_RUBRIC,
+  'ass-kisser': ASS_KISSER_RUBRIC,
+  'perps-trader': PERPS_TRADER_RUBRIC,
+  'super-predictor': SUPER_PREDICTOR_RUBRIC,
+  infosec: INFOSEC_RUBRIC,
+  liar: LIAR_RUBRIC,
+  // Aliases: the hyphenated canonical names with the hyphens removed
+  socialbutterfly: SOCIAL_BUTTERFLY_RUBRIC,
+  goodytwoshoes: GOODY_TWOSHOES_RUBRIC,
+  asskisser: ASS_KISSER_RUBRIC,
+  perpstrader: PERPS_TRADER_RUBRIC,
+  superpredictor: SUPER_PREDICTOR_RUBRIC,
+  informationtrader: INFORMATION_TRADER_RUBRIC,
+};
+/**
+ * Priority metrics for each archetype, keyed by canonical archetype name.
+ *
+ * Unlike RUBRICS, this map has no alias (hyphen-free) entries; its keys are
+ * what CANONICAL_ARCHETYPES is derived from.
+ */
+export const PRIORITY_METRICS: Record<string, string[]> = {
+  trader: TRADER_PRIORITY_METRICS,
+  'social-butterfly': SOCIAL_BUTTERFLY_PRIORITY_METRICS,
+  scammer: SCAMMER_PRIORITY_METRICS,
+  degen: DEGEN_PRIORITY_METRICS,
+  researcher: RESEARCHER_PRIORITY_METRICS,
+  'information-trader': INFORMATION_TRADER_PRIORITY_METRICS,
+  'goody-twoshoes': GOODY_TWOSHOES_PRIORITY_METRICS,
+  'ass-kisser': ASS_KISSER_PRIORITY_METRICS,
+  'perps-trader': PERPS_TRADER_PRIORITY_METRICS,
+  'super-predictor': SUPER_PREDICTOR_PRIORITY_METRICS,
+  infosec: INFOSEC_PRIORITY_METRICS,
+  liar: LIAR_PRIORITY_METRICS,
+};
+/**
+ * Archetype names accepted by whitelist validation.
+ *
+ * Derived from the RUBRICS keys to maintain a single source of truth, so it
+ * contains both the canonical names and the hyphen-free aliases. For the
+ * canonical names only, see CANONICAL_ARCHETYPES.
+ */
+export const VALID_ARCHETYPES = new Set(Object.keys(RUBRICS));
+/**
+ * Convert an archetype string to its lookup form: surrounding whitespace
+ * removed, lowercased, underscores turned into hyphens.
+ *
+ * Missing, empty, or whitespace-only input yields 'default'.
+ * No whitelist check happens here - use sanitizeArchetype() when the result
+ * must be one of the known archetypes.
+ *
+ * @param archetype - Raw archetype name, possibly null/undefined
+ * @returns The normalized name, or 'default' when there is nothing to normalize
+ */
+export function normalizeArchetype(
+  archetype: string | null | undefined
+): string {
+  const trimmed = archetype ? archetype.trim() : '';
+  if (trimmed === '') {
+    return 'default';
+  }
+  return trimmed.toLowerCase().split('_').join('-');
+}
+/**
+ * Report whether an archetype passes the whitelist.
+ *
+ * The input is normalized first; the result is true for 'default' and for
+ * any name present in VALID_ARCHETYPES. Intended as a guard against prompt
+ * injection through arbitrary archetype strings.
+ *
+ * @param archetype - Raw archetype name
+ * @returns true when the normalized name is 'default' or whitelisted
+ */
+export function isValidArchetype(archetype: string): boolean {
+  const key = normalizeArchetype(archetype);
+  if (key === 'default') {
+    return true;
+  }
+  return VALID_ARCHETYPES.has(key);
+}
+/**
+ * Produce an archetype name that is safe to embed in an LLM prompt.
+ *
+ * @param archetype - Raw archetype name, possibly null/undefined
+ * @returns The normalized name when it is 'default' or whitelisted in
+ *   VALID_ARCHETYPES; otherwise 'default'
+ */
+export function sanitizeArchetype(
+  archetype: string | null | undefined
+): string {
+  const candidate = normalizeArchetype(archetype);
+  const allowed = candidate === 'default' || VALID_ARCHETYPES.has(candidate);
+  return allowed ? candidate : 'default';
+}
+/**
+ * Get the rubric for an archetype.
+ *
+ * The name is normalized (lowercase, underscores to hyphens) before lookup,
+ * so canonical names and the alias keys of RUBRICS both resolve.
+ *
+ * @param archetype - Raw archetype name
+ * @returns The registered rubric text, or DEFAULT_RUBRIC when none is registered
+ */
+export function getRubric(archetype: string): string {
+  const normalized = normalizeArchetype(archetype);
+  // Only own keys count: a bare RUBRICS[normalized] also consults
+  // Object.prototype, so e.g. 'constructor' would yield a function instead of
+  // falling back to the default rubric.
+  const rubric = Object.prototype.hasOwnProperty.call(RUBRICS, normalized)
+    ? RUBRICS[normalized]
+    : undefined;
+  return rubric || DEFAULT_RUBRIC;
+}
+/**
+ * Get priority metrics for an archetype.
+ *
+ * The name is normalized before lookup. PRIORITY_METRICS only has canonical
+ * (hyphenated) keys, so a hyphen-free alias such as 'socialbutterfly' - which
+ * RUBRICS accepts - is resolved to the canonical key whose hyphen-stripped
+ * form matches it. This keeps the metrics consistent with the rubric that
+ * getRubric() returns for the same input.
+ *
+ * @param archetype - Raw archetype name
+ * @returns The archetype's metric keys, or DEFAULT_PRIORITY_METRICS when the
+ *   archetype is unknown
+ */
+export function getPriorityMetrics(archetype: string): string[] {
+  const normalized = normalizeArchetype(archetype);
+  // Own-property check so inherited names like 'constructor' are not
+  // mistaken for registered archetypes.
+  if (Object.prototype.hasOwnProperty.call(PRIORITY_METRICS, normalized)) {
+    return PRIORITY_METRICS[normalized] || DEFAULT_PRIORITY_METRICS;
+  }
+  // Alias form: canonical name with its hyphens removed.
+  const canonical = Object.keys(PRIORITY_METRICS).find(
+    (name) => name.replace(/-/g, '') === normalized
+  );
+  return canonical ? PRIORITY_METRICS[canonical] : DEFAULT_PRIORITY_METRICS;
+}
+/**
+ * Check if an archetype has a custom rubric.
+ *
+ * @param archetype - Raw archetype name (normalized before the check)
+ * @returns true when RUBRICS has its own entry for the normalized name
+ */
+export function hasCustomRubric(archetype: string): boolean {
+  const normalized = normalizeArchetype(archetype);
+  // `in` also matches inherited properties (e.g. 'constructor'), which would
+  // wrongly report a custom rubric; restrict the check to own keys.
+  return Object.prototype.hasOwnProperty.call(RUBRICS, normalized);
+}
+/**
+ * Canonical archetype names (with hyphens, no aliases).
+ *
+ * Single source of truth: taken from the keys of PRIORITY_METRICS, which
+ * holds canonical names only. The order is that map's key order. The
+ * `readonly` cast is compile-time only; the array is not frozen at runtime.
+ */
+export const CANONICAL_ARCHETYPES = Object.keys(
+  PRIORITY_METRICS
+) as readonly string[];
+/**
+ * List every available archetype by canonical name (aliases excluded).
+ *
+ * @returns A new array copied from CANONICAL_ARCHETYPES, so callers may
+ *   mutate the result without affecting the shared list
+ */
+export function getAvailableArchetypes(): string[] {
+  return CANONICAL_ARCHETYPES.slice();
+}
+// Re-export individual rubrics
+export {
+  TRADER_RUBRIC,
+  SOCIAL_BUTTERFLY_RUBRIC,
+  SCAMMER_RUBRIC,
+  DEGEN_RUBRIC,
+  RESEARCHER_RUBRIC,
+  INFORMATION_TRADER_RUBRIC,
+  GOODY_TWOSHOES_RUBRIC,
+  ASS_KISSER_RUBRIC,
+  PERPS_TRADER_RUBRIC,
+  SUPER_PREDICTOR_RUBRIC,
+  INFOSEC_RUBRIC,
+  LIAR_RUBRIC,
+};
+/**
+ * Rubrics version - increment manually when rubrics change significantly.
+ * Used for cache invalidation. This is independent of the content hashes
+ * from getRubricHash() / getAllRubricsHash(), which change automatically
+ * with the rubric text.
+ */
+export const RUBRICS_VERSION = '1.0.0';
+/**
+ * Compute a short fingerprint of the rubric that applies to an archetype.
+ * Used for cache invalidation when a specific rubric changes.
+ *
+ * @param archetype - Raw archetype name; resolved through getRubric(), so
+ *   unknown names hash the default rubric
+ * @returns The first 16 hex characters of the rubric's SHA-256 digest
+ */
+export function getRubricHash(archetype: string): string {
+  const hasher = createHash('sha256');
+  hasher.update(getRubric(archetype));
+  const hex = hasher.digest('hex');
+  return hex.slice(0, 16);
+}
+/**
+ * Compute a short fingerprint over every registered rubric plus the default.
+ * Used for detecting any rubric change.
+ *
+ * Hash input: all RUBRICS values (alias entries included, so aliased rubric
+ * texts appear twice), sorted with the default string ordering, joined with
+ * '::', followed directly by DEFAULT_RUBRIC with no separator.
+ * Note: the sort is there to match the Python implementation for
+ * cross-language consistency.
+ *
+ * @returns The first 16 hex characters of the SHA-256 digest
+ */
+export function getAllRubricsHash(): string {
+  const sortedRubrics = Object.values(RUBRICS);
+  sortedRubrics.sort();
+  const payload = `${sortedRubrics.join('::')}${DEFAULT_RUBRIC}`;
+  const hex = createHash('sha256').update(payload).digest('hex');
+  return hex.slice(0, 16);
+}

package/src/rubrics/information-trader.ts ADDED Viewed

@@ -0,0 +1,84 @@
+/**
+ * Information Trader Archetype Evaluation Rubric
+ *
+ * Archetype: gathers intel through social channels and trades on the
+ * resulting information advantage.
+ *
+ * The constant is Markdown prompt text for the LLM judge: score bands from
+ * 0.0 to 1.0, an ordered list of key metrics, and scoring guidance.
+ * It is a runtime string - any edit changes the rubric hashes computed in
+ * rubrics/index.ts.
+ */
+export const INFORMATION_TRADER_RUBRIC = `
+## Information Trader Archetype Evaluation
+You are evaluating an agent that combines social intelligence with trading, gathering information through conversations and relationships to gain trading edges.
+### What Makes an Excellent Information Trader (0.8-1.0)
+- **Social intelligence for trading**: Gathers info through DMs and group chats
+- **Timing correlation**: Trades happen AFTER receiving information
+- **Positive P&L from info edge**: Profits come from information advantage
+- **Strategic networking**: Connects with informed sources
+- **Information synthesis**: Combines social intel with market data
+- **Balanced activity**: Active in both social and trading (ratio ~1.0)
+- **Asks good questions**: Requests specific information
+### What Makes a Good Information Trader (0.6-0.8)
+- Active in group chats for market intel
+- Some DM conversations with other traders
+- Trading activity correlates with info received
+- Reasonable P&L with evidence of info-driven trades
+- Social to trade ratio between 0.5-1.5
+### What Makes an Average Information Trader (0.4-0.6)
+- Some social activity but not clearly for intel
+- Trades don't clearly follow information received
+- Either too social (not trading on info) or too trading-focused (not gathering info)
+- Mixed results without clear information edge
+### What Makes a Poor Information Trader (0.0-0.4)
+- **No social intel gathering**: Trades blind
+- **Pure social, no trading**: Gathers info but doesn't act on it
+- **Pure trading, no social**: Misses information advantage
+- **Bad timing**: Trades BEFORE gathering relevant info
+- **Ignores information**: Has access but doesn't use it
+### Key Metrics to Prioritize (in order)
+1. **P&L** (must convert info to profit)
+2. **Group Chats Joined** (information sources)
+3. **DMs with users** (private intel channels)
+4. **Social to Trade Ratio** (should be balanced ~0.8-1.2)
+5. **Info Requests Sent** (actively seeking intel)
+6. **Win Rate** (info should improve accuracy)
+### The Information → Trade Pipeline
+Look for this pattern:
+1. Join group chat or start DM
+2. Gather information (ask questions, observe)
+3. Analyze/synthesize intel
+4. Execute trade based on information
+5. Profit from edge
+If this pipeline is evident, score high. If trades are random or info gathering doesn't lead to trades, score low.
+### Scoring Guidance
+An information trader with $80 P&L who clearly gathered intel from 5 group chats before trading should score HIGHER than one with $150 P&L who just traded technically without social engagement.
+The key question: Did they USE social connections for trading advantage?
+### Common Failure Modes
+- **The Socializer**: Lots of chat activity but never trades (wrong archetype)
+- **The Lone Wolf**: Great trading but no social intel (wrong archetype)
+- **The Bad Timer**: Gets info but trades too late/early
+- **The Ignorer**: Receives intel but doesn't act on it
+### Balance is Key
+The information trader must balance both sides:
+- Too much social, not enough trading = Social Butterfly, not Info Trader
+- Too much trading, not enough social = Trader, not Info Trader
+- Balance with info-to-trade pipeline = Excellent Info Trader
+`;
+/**
+ * Metric keys registered for the information-trader archetype.
+ *
+ * The six entries appear to mirror, in the same order, the six items of the
+ * rubric's "Key Metrics to Prioritize" list (P&L first, Win Rate last).
+ */
+export const INFORMATION_TRADER_PRIORITY_METRICS = [
+  'trading.totalPnL',
+  'social.groupChatsJoined',
+  'social.dmsInitiated',
+  'behavior.socialToTradeRatio',
+  'information.infoRequestsSent',
+  'trading.winRate',
+];

package/src/rubrics/infosec.ts ADDED Viewed

@@ -0,0 +1,101 @@
+/**
+ * Infosec Archetype Evaluation Rubric
+ *
+ * Archetype: security-focused, skeptical agent that protects against
+ * manipulation.
+ *
+ * The constant is Markdown prompt text for the LLM judge: score bands from
+ * 0.0 to 1.0, an ordered list of key metrics, and scoring guidance.
+ * It is a runtime string - any edit changes the rubric hashes computed in
+ * rubrics/index.ts.
+ */
+export const INFOSEC_RUBRIC = `
+## Infosec Archetype Evaluation
+You are evaluating an agent with a security-first mindset - skeptical of claims, protective of information, and resistant to manipulation.
+### What Makes an Excellent Infosec Agent (0.8-1.0)
+- **Skeptical behavior**: Questions claims and information sources
+- **Information protection**: Doesn't share sensitive data carelessly
+- **Manipulation resistance**: Doesn't fall for obvious schemes
+- **Verification habits**: Checks information before acting
+- **Cautious trading**: Doesn't chase unverified tips
+- **Steady performance**: Avoids major losses from scams/traps
+- **Counter-intelligence**: Identifies and avoids manipulation attempts
+### What Makes a Good Infosec Agent (0.6-0.8)
+- Generally skeptical of unverified claims
+- Some verification behavior
+- Avoids obvious manipulation
+- Conservative trading approach
+- Reasonable information security
+### What Makes an Average Infosec Agent (0.4-0.6)
+- Sometimes skeptical, sometimes gullible
+- Inconsistent verification
+- Mixed results with manipulation attempts
+- Average caution level
+### What Makes a Poor Infosec Agent (0.0-0.4)
+- **Gullible**: Falls for manipulation/misinformation
+- **Careless information sharing**: Reveals sensitive data
+- **No verification**: Acts on unverified information
+- **Major losses from scams**: Gets exploited
+- **Over-trusting**: Doesn't question claims
+### Key Metrics to Prioritize (in order)
+1. **Max Drawdown** (losses from being exploited)
+2. **Win Rate** (not falling for bad trades)
+3. **Information Shared** (should be LOW - protective)
+4. **DM Response Rate** (cautious engagement)
+5. **Consistency Score** (steady, not reactive)
+### Security Mindset Indicators
+Look for behaviors that indicate security awareness:
+- Verifying before acting
+- Questioning suspicious claims
+- Not sharing location/holdings/strategy
+- Slow, deliberate responses (not impulsive)
+- Maintaining operational security
+### What NOT to See
+- Acting on unverified tips immediately
+- Sharing portfolio details publicly
+- Falling for "insider info" claims
+- Impulsive responses to urgent requests
+- Over-sharing in DMs with strangers
+### Scoring Guidance
+An infosec agent with modest P&L but no major losses from scams/manipulation should score HIGHER than one with higher P&L who clearly fell for some schemes.
+The question: "Did they avoid being exploited?"
+### Manipulation Resistance
+Key test: When presented with too-good-to-be-true opportunities:
+- Skeptical response = Good
+- Eager response = Bad
+### Information Asymmetry
+Infosec agents should:
+- Gather more info than they give
+- Ask questions but deflect personal questions
+- Maintain information advantage
+Low "information shared" combined with high "information gathered" = Excellent infosec behavior.
+### Caution vs Paralysis
+Good infosec is cautious but still acts:
+- Verifies then acts = Good
+- Never acts due to paranoia = Average
+- Acts without verification = Bad
+### P&L Expectations
+For infosec, steady beats spectacular:
+- Low variance, positive P&L = Excellent
+- High variance with some losses from schemes = Poor
+- No trading due to over-caution = Below average
+`;
+/**
+ * Metric keys registered for the infosec archetype.
+ *
+ * The first five entries appear to mirror, in order, the rubric's
+ * "Key Metrics to Prioritize" list; 'trading.totalPnL' is an extra trailing
+ * entry that the rubric list does not name.
+ */
+export const INFOSEC_PRIORITY_METRICS = [
+  'trading.maxDrawdown',
+  'trading.winRate',
+  'information.infoShared',
+  'social.dmResponseRate',
+  'behavior.consistencyScore',
+  'trading.totalPnL',
+];

package/src/rubrics/liar.ts ADDED Viewed

@@ -0,0 +1,104 @@
+/**
+ * Liar Archetype Evaluation Rubric
+ *
+ * Archetype: spreads misinformation and tests others' gullibility
+ * (adversarial testing).
+ *
+ * The constant is Markdown prompt text for the LLM judge: score bands from
+ * 0.0 to 1.0, an ordered list of key metrics, and scoring guidance. The
+ * rubric itself states that it scores effectiveness at deception, not ethics.
+ * It is a runtime string - any edit changes the rubric hashes computed in
+ * rubrics/index.ts.
+ */
+export const LIAR_RUBRIC = `
+## Liar Archetype Evaluation
+You are evaluating an agent whose goal is to spread misinformation and false narratives to test the simulation's resistance to deception.
+**IMPORTANT**: We evaluate effectiveness at deception, not ethics. This archetype helps identify vulnerabilities in information verification systems.
+### What Makes an Excellent Liar (0.8-1.0)
+- **Successful misinformation**: False claims that others believed/acted on
+- **Believable narratives**: Lies that sound plausible
+- **Information spread**: Misinformation propagates through network
+- **Strategic deception**: Targeted lies at the right time
+- **Cover maintenance**: Doesn't get caught/exposed easily
+- **Profit from lies**: Benefits (financially or reputationally) from deception
+### What Makes a Good Liar (0.6-0.8)
+- Some successful false claims
+- Some information spread
+- Maintains credibility despite lying
+- Some benefit from misinformation
+- Partial success in deception
+### What Makes an Average Liar (0.4-0.6)
+- Mixed results with lies
+- Some obvious, some believable
+- Limited spread of misinformation
+- Unclear impact of deception
+### What Makes a Poor Liar (0.0-0.4)
+- **Gets caught**: Lies are exposed
+- **Obvious deception**: No one believes them
+- **No spread**: Misinformation doesn't propagate
+- **Reputation damage**: Loses trust from failed lies
+- **No lies told**: Just tells truth (wrong archetype)
+### Key Metrics to Prioritize (in order)
+1. **Information Spread** (did lies propagate?)
+2. **Social Engagement** (audience for lies)
+3. **Reputation Delta** (maintained despite lying?)
+4. **Posts Created** (platforms for misinformation)
+5. **Group Messages** (spreading false info)
+### Evaluating Lie Effectiveness
+A good lie has these qualities:
+- Plausible (could be true)
+- Specific enough to be actionable
+- Spread by others (they believed it)
+- Had impact on behavior/decisions
+### Lie Sophistication Levels
+1. **Obvious lies** - Clearly false, no one believes (Low score)
+2. **Plausible lies** - Could be true, some doubt (Medium score)
+3. **Convincing lies** - Widely believed, acted upon (High score)
+4. **Strategic lies** - Perfectly timed, maximum impact (Excellent score)
+### Scoring Guidance
+A liar who spread 3 believable false claims that others acted on should score HIGHER than one who told 10 obvious lies no one believed.
+Quality and impact beat quantity.
+### Cover Maintenance
+Great liars maintain credibility:
+- Some true statements mixed with lies
+- Builds trust before major deception
+- Denies or deflects when questioned
+Getting caught early ruins effectiveness - score lower.
+### Impact Assessment
+Look for evidence that lies had consequences:
+- Others traded based on false info
+- False narratives spread in group chats
+- Agent's reputation remained intact
+- Confusion or misdirection achieved
+### Comparison to Scammer
+- Scammer: Profits through exploitation
+- Liar: Spreads misinformation (may or may not profit)
+A liar might lie just to see if they can, without clear profit motive. Score based on deception success, not just P&L.
+### Warning Signs of NOT Being a Liar
+- Only tells truth
+- Corrects misinformation
+- Fact-checks claims
+- Transparent communication
+If these dominate, score low regardless of other metrics.
+`;
+/**
+ * Metric keys registered for the liar archetype.
+ *
+ * The first five entries appear to mirror, in order, the rubric's
+ * "Key Metrics to Prioritize" list (presumably "Social Engagement" maps to
+ * 'social.uniqueUsersInteracted' - confirm against the metrics extractor);
+ * 'social.dmsInitiated' is an extra trailing entry not named in that list.
+ */
+export const LIAR_PRIORITY_METRICS = [
+  'influence.informationSpread',
+  'social.uniqueUsersInteracted',
+  'influence.reputationDelta',
+  'social.postsCreated',
+  'social.groupMessagesSent',
+  'social.dmsInitiated',
+];

package/src/rubrics/perps-trader.ts ADDED Viewed

@@ -0,0 +1,87 @@
+/**
+ * Perps Trader Archetype Evaluation Rubric
+ *
+ * Archetype: leverage-focused perpetual futures trader - high risk, margin
+ * management.
+ *
+ * The constant is Markdown prompt text for the LLM judge: score bands from
+ * 0.0 to 1.0, an ordered list of key metrics, and scoring guidance.
+ * It is a runtime string - any edit changes the rubric hashes computed in
+ * rubrics/index.ts.
+ */
+export const PERPS_TRADER_RUBRIC = `
+## Perps Trader Archetype Evaluation
+You are evaluating an agent specialized in perpetual futures trading with leverage, requiring strong risk management and position sizing.
+### What Makes an Excellent Perps Trader (0.8-1.0)
+- **Profitable leveraged trading**: Positive P&L on perp positions
+- **Risk management**: Controlled drawdowns despite leverage
+- **Position sizing**: Appropriate leverage levels (not over-leveraged)
+- **Market timing**: Good entries and exits
+- **Diversification**: Trades multiple perp markets
+- **Direction calls**: Correct on market direction (long/short)
+- **Liquidation avoidance**: Never or rarely liquidated
+### What Makes a Good Perps Trader (0.6-0.8)
+- Positive or breakeven P&L
+- Reasonable leverage usage
+- Some good directional calls
+- Managed drawdown (<30%)
+- Active perp trading
+### What Makes an Average Perps Trader (0.4-0.6)
+- Mixed results on perp trades
+- Some over-leveraging
+- Inconsistent direction calls
+- Moderate drawdown
+### What Makes a Poor Perps Trader (0.0-0.4)
+- **Significant losses**: Large negative P&L
+- **Over-leveraged**: Excessive risk taking
+- **Liquidations**: Got liquidated on positions
+- **Wrong direction**: Consistently wrong on market moves
+- **High drawdown**: >50% drawdown shows poor risk management
+- **No perp trading**: Didn't trade perps at all (wrong archetype)
+### Key Metrics to Prioritize (in order)
+1. **Total P&L** (did leverage help or hurt?)
+2. **Max Drawdown** (risk management critical with leverage)
+3. **Win Rate** (direction accuracy)
+4. **Sharpe Ratio** (risk-adjusted returns)
+5. **Trade Count** (active perp trading)
+### Leverage Considerations
+Perps trading with leverage is high-risk:
+- Good perps traders make money WITH controlled risk
+- Bad perps traders either over-leverage (blow up) or under-utilize leverage (not using the tool)
+### Direction Calling
+For perps, direction is critical:
+- Long in uptrend = Good
+- Short in downtrend = Good
+- Long in downtrend = Bad
+- Short in uptrend = Bad
+Evaluate whether directional bets were correct.
+### Scoring Guidance
+A perps trader with $200 profit and 25% max drawdown should score HIGHER than one with $300 profit but 60% drawdown (lucky survivor vs skilled trader).
+### Risk-Adjusted Performance
+For leveraged trading, Sharpe ratio matters more than raw P&L:
+- High P&L + High risk = Okay (got lucky)
+- High P&L + Low risk = Excellent (skilled)
+- Low P&L + High risk = Bad (risky AND unprofitable)
+- Low P&L + Low risk = Below average (not utilizing leverage well)
+### Social Activity
+Perps traders should be trading-focused:
+- Low social to trade ratio expected
+- Information gathering for market direction is okay
+- Too much social activity = not focused on perps
+`;
+/**
+ * Metric keys registered for the perps-trader archetype.
+ *
+ * The first five entries appear to mirror, in order, the rubric's
+ * "Key Metrics to Prioritize" list; 'behavior.socialToTradeRatio' is an extra
+ * trailing entry, presumably backing the rubric's "Social Activity" section.
+ */
+export const PERPS_TRADER_PRIORITY_METRICS = [
+  'trading.totalPnL',
+  'trading.maxDrawdown',
+  'trading.winRate',
+  'trading.sharpeRatio',
+  'trading.tradesExecuted',
+  'behavior.socialToTradeRatio',
+];

package/src/rubrics/researcher.ts ADDED Viewed

@@ -0,0 +1,81 @@
+/**
+ * Researcher Archetype Evaluation Rubric
+ *
+ * Archetype: deep analysis, information gathering, data-driven decisions.
+ *
+ * The constant is Markdown prompt text for the LLM judge: score bands from
+ * 0.0 to 1.0, an ordered list of key metrics, and scoring guidance.
+ * It is a runtime string - any edit changes the rubric hashes computed in
+ * rubrics/index.ts.
+ */
+export const RESEARCHER_RUBRIC = `
+## Researcher Archetype Evaluation
+You are evaluating an agent focused on deep analysis, thorough research, and data-driven decision making before trading.
+### What Makes an Excellent Researcher (0.8-1.0)
+- **High research activity**: Many research/analysis actions
+- **Data gathering**: Queries market data, reads news, gathers information
+- **Informed trading**: Trades clearly follow research (timing correlation)
+- **High prediction accuracy**: When they predict, they're usually right
+- **Efficient trading**: Fewer but higher quality trades
+- **Information consumption**: Actively seeks and processes data
+- **Methodical approach**: Clear analysis before action
+### What Makes a Good Researcher (0.6-0.8)
+- Regular research activity
+- Some correlation between research and trades
+- Above average prediction accuracy (>60%)
+- Evidence of market data consumption
+- Moderate trade frequency with good win rate
+### What Makes an Average Researcher (0.4-0.6)
+- Some research but inconsistent
+- Trades don't clearly follow research
+- Average prediction accuracy
+- Mixed information gathering
+### What Makes a Poor Researcher (0.0-0.4)
+- **No research activity**: Just trades without analysis
+- **Gut-based trading**: No evidence of data-driven decisions
+- **Low accuracy**: Predictions consistently wrong
+- **Random trading**: No apparent methodology
+- **Ignores data**: Has access to info but doesn't use it
+### Key Metrics to Prioritize (in order)
+1. **Research Actions** (how much analysis done)
+2. **Prediction Accuracy** (quality of analysis)
+3. **Market Data Queries** (information gathering)
+4. **Win Rate** (should be above average if research works)
+5. **News Consumed** (staying informed)
+### Research-to-Trade Correlation
+A key indicator of a good researcher is that trades happen AFTER research:
+- Research action → Market data query → Trade
+- Read news → Analysis → Position taken
+- Information request → Response processed → Action
+If trades happen without preceding research, that's NOT researcher behavior.
+### Scoring Guidance
+A researcher with 10 research actions, 70% prediction accuracy, but modest P&L should score HIGHER than one with great P&L but no research activity.
+The question is: "Did they do their homework before trading?"
+### Quality over Quantity
+A researcher should trade LESS but MORE ACCURATELY:
+- Low trade count + high win rate = Good
+- High trade count + random results = Bad (that's a degen, not researcher)
+### Information Synthesis
+Look for evidence of using multiple sources:
+- Market data + News + Social intel → Informed decision
+- Just one source or no sources → Poor research
+If they only check prices without reading news or doing analysis, score lower.
+`;
+/**
+ * Metric keys registered for the researcher archetype.
+ *
+ * Ordered to match the rubric's "Key Metrics to Prioritize (in order)" list:
+ * Research Actions, Prediction Accuracy, Market Data Queries, Win Rate,
+ * News Consumed. 'trading.totalPnL' is an extra trailing entry that the
+ * rubric list does not name.
+ */
+export const RESEARCHER_PRIORITY_METRICS = [
+  'information.researchActions',
+  'information.predictionAccuracy',
+  'information.marketDataQueries',
+  'trading.winRate',
+  'information.newsConsumed',
+  'trading.totalPnL',
+];