@elizaos/training 2.0.0-alpha.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +75 -0
- package/Makefile +374 -0
- package/README.md +346 -0
- package/config/rubrics.json +137 -0
- package/data/.gitkeep +0 -0
- package/data/degen/.gitkeep +2 -0
- package/data/trader/.gitkeep +2 -0
- package/docker-compose.test.yml +57 -0
- package/package.json +58 -0
- package/python/config/babylon_atropos.yaml +90 -0
- package/python/config/profiles/12gb.json +11 -0
- package/python/config/profiles/16gb.json +10 -0
- package/python/config/profiles/24gb.json +10 -0
- package/python/config/profiles/48gb.json +10 -0
- package/python/config/profiles/cpu.json +11 -0
- package/python/config/profiles/l40-2gpu-safe.json +20 -0
- package/python/config/profiles/l40-2gpu.json +22 -0
- package/python/config/profiles/l40-4gpu.json +21 -0
- package/python/config/profiles/l40.json +17 -0
- package/python/config/tinker_training.yaml +143 -0
- package/python/curriculum_state.json +165 -0
- package/python/env.template +86 -0
- package/python/env.training.template +46 -0
- package/python/pyproject.toml +41 -0
- package/python/requirements-ci.txt +31 -0
- package/python/requirements.txt +87 -0
- package/python/scripts/__init__.py +4 -0
- package/python/scripts/import_json_trajectories.py +412 -0
- package/python/scripts/local-finetune/README.md +63 -0
- package/python/scripts/local-finetune/ingest_and_score.py +139 -0
- package/python/scripts/local-finetune/merge_model.py +32 -0
- package/python/scripts/local-finetune/test_adapter.py +91 -0
- package/python/scripts/local-finetune/train_from_csv.py +132 -0
- package/python/scripts/merge_trajectories.py +318 -0
- package/python/scripts/run_ab_test.py +143 -0
- package/python/scripts/run_full_pipeline.py +544 -0
- package/python/scripts/run_tinker_training.py +192 -0
- package/python/scripts/run_training.py +914 -0
- package/python/scripts/test_judge.py +155 -0
- package/python/scripts/test_pipeline.py +356 -0
- package/python/scripts/test_trained_model.py +380 -0
- package/python/scripts/train_local.py +528 -0
- package/python/setup.py +20 -0
- package/python/src/__init__.py +190 -0
- package/python/src/data_bridge/__init__.py +24 -0
- package/python/src/data_bridge/converter.py +435 -0
- package/python/src/data_bridge/reader.py +393 -0
- package/python/src/models.py +283 -0
- package/python/src/training/__init__.py +605 -0
- package/python/src/training/ab_testing.py +404 -0
- package/python/src/training/action_executor.py +621 -0
- package/python/src/training/archetype_trainer.py +347 -0
- package/python/src/training/atropos_trainer.py +980 -0
- package/python/src/training/babylon_env.py +1254 -0
- package/python/src/training/error_recovery.py +647 -0
- package/python/src/training/evaluation.py +856 -0
- package/python/src/training/fast_simulator.py +880 -0
- package/python/src/training/format_validator.py +584 -0
- package/python/src/training/hybrid_env.py +522 -0
- package/python/src/training/kl_controller.py +628 -0
- package/python/src/training/multi_prompt_dataset.py +883 -0
- package/python/src/training/multi_turn.py +656 -0
- package/python/src/training/online_env.py +1084 -0
- package/python/src/training/quality_scorer.py +391 -0
- package/python/src/training/quality_utils.py +633 -0
- package/python/src/training/rewards.py +1344 -0
- package/python/src/training/rlaif_env.py +17 -0
- package/python/src/training/rollout_generator.py +502 -0
- package/python/src/training/rubric_loader.py +198 -0
- package/python/src/training/scenario_pool.py +1072 -0
- package/python/src/training/schemas.py +481 -0
- package/python/src/training/service_manager.py +552 -0
- package/python/src/training/simulation_bridge.py +535 -0
- package/python/src/training/tick_reward_attribution.py +399 -0
- package/python/src/training/tinker_client.py +575 -0
- package/python/src/training/tinker_trainer.py +646 -0
- package/python/src/training/tokenization_utils.py +402 -0
- package/python/tests/e2e/__init__.py +13 -0
- package/python/tests/e2e/conftest.py +258 -0
- package/python/tests/e2e/test_full_pipeline.py +643 -0
- package/python/tests/e2e/test_online_training_e2e.py +365 -0
- package/python/tests/integration/__init__.py +12 -0
- package/python/tests/integration/conftest.py +383 -0
- package/python/tests/integration/test_db_integration.py +649 -0
- package/python/tests/integration/test_json_mode_integration.py +554 -0
- package/python/tests/test_action_executor.py +594 -0
- package/python/tests/test_archetype_scoring.py +1027 -0
- package/python/tests/test_atropos_integration.py +360 -0
- package/python/tests/test_evaluation.py +727 -0
- package/python/tests/test_format_validator.py +486 -0
- package/python/tests/test_kl_controller.py +432 -0
- package/python/tests/test_lr_scheduler.py +579 -0
- package/python/tests/test_multi_turn.py +590 -0
- package/python/tests/test_online_env.py +519 -0
- package/python/tests/test_quality_scorer.py +474 -0
- package/python/tests/test_scenario_pool.py +735 -0
- package/python/tests/test_service_manager.py +585 -0
- package/python/tests/test_simulation_rollout.py +581 -0
- package/python/tests/test_tokenization_utils.py +501 -0
- package/python/tests/test_training_orchestrator.py +497 -0
- package/python/tests/test_training_output_structure.py +661 -0
- package/research-output/training-runs/training-run-1770772042899.json +26 -0
- package/research-output/training-runs/training-run-1770930079670.json +32 -0
- package/research-output/training-runs/training-run-1770930143700.json +44 -0
- package/research-output/training-runs/training-run-1770930183638.json +38 -0
- package/research-output/training-runs/training-run-1770930442049.json +38 -0
- package/research-output/training-runs/training-run-1770930793243.json +38 -0
- package/scripts/assess-training-data.ts +422 -0
- package/scripts/e2e-training-test.ts +550 -0
- package/scripts/export-rubrics.ts +64 -0
- package/scripts/generate-research-report.ts +1523 -0
- package/scripts/generate_dataset.sh +173 -0
- package/scripts/json-mode-benchmark.ts +399 -0
- package/scripts/real-archetype-benchmark.ts +210 -0
- package/scripts/run-baseline-comparison.ts +116 -0
- package/scripts/run-full-pipeline.ts +272 -0
- package/scripts/runpod_setup.sh +137 -0
- package/scripts/runpod_validate.sh +147 -0
- package/scripts/test-model-in-game.ts +955 -0
- package/scripts/test-scoring.ts +73 -0
- package/scripts/test-trained-model.ts +209 -0
- package/scripts/train-and-test.ts +824 -0
- package/scripts/verify-final.ts +118 -0
- package/src/adapter.ts +516 -0
- package/src/archetypes/ArchetypeConfigService.ts +626 -0
- package/src/archetypes/derive-archetype.ts +249 -0
- package/src/archetypes/index.ts +22 -0
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
- package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
- package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
- package/src/benchmark/BenchmarkDataViewer.ts +324 -0
- package/src/benchmark/BenchmarkHistoryService.ts +221 -0
- package/src/benchmark/BenchmarkRunner.ts +685 -0
- package/src/benchmark/BenchmarkValidator.ts +206 -0
- package/src/benchmark/FastEvalRunner.ts +225 -0
- package/src/benchmark/MetricsValidator.ts +165 -0
- package/src/benchmark/MetricsVisualizer.ts +909 -0
- package/src/benchmark/ModelBenchmarkService.ts +611 -0
- package/src/benchmark/ModelRegistry.ts +158 -0
- package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
- package/src/benchmark/SimulationA2AInterface.ts +1169 -0
- package/src/benchmark/SimulationEngine.ts +832 -0
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
- package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
- package/src/benchmark/index.ts +89 -0
- package/src/benchmark/parseSimulationMetrics.ts +124 -0
- package/src/benchmark/simulation-types.ts +78 -0
- package/src/dependencies.ts +439 -0
- package/src/generation/TrajectoryGenerator.ts +387 -0
- package/src/generation/index.ts +12 -0
- package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
- package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
- package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
- package/src/huggingface/index.ts +27 -0
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
- package/src/index.ts +102 -0
- package/src/init-training.ts +53 -0
- package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
- package/src/metrics/index.ts +8 -0
- package/src/metrics/types.ts +200 -0
- package/src/rubrics/__tests__/index.test.ts +184 -0
- package/src/rubrics/ass-kisser.ts +85 -0
- package/src/rubrics/degen.ts +80 -0
- package/src/rubrics/goody-twoshoes.ts +84 -0
- package/src/rubrics/index.ts +236 -0
- package/src/rubrics/information-trader.ts +84 -0
- package/src/rubrics/infosec.ts +101 -0
- package/src/rubrics/liar.ts +104 -0
- package/src/rubrics/perps-trader.ts +87 -0
- package/src/rubrics/researcher.ts +81 -0
- package/src/rubrics/scammer.ts +82 -0
- package/src/rubrics/social-butterfly.ts +73 -0
- package/src/rubrics/super-predictor.ts +97 -0
- package/src/rubrics/trader.ts +67 -0
- package/src/scoring/ArchetypeScoringService.ts +486 -0
- package/src/scoring/JudgePromptBuilder.ts +556 -0
- package/src/scoring/LLMJudgeCache.ts +401 -0
- package/src/scoring/index.ts +9 -0
- package/src/training/AutomationPipeline.ts +916 -0
- package/src/training/BenchmarkService.ts +518 -0
- package/src/training/ConfigValidator.ts +220 -0
- package/src/training/MarketOutcomesTracker.ts +187 -0
- package/src/training/ModelDeployer.ts +186 -0
- package/src/training/ModelFetcher.ts +76 -0
- package/src/training/ModelSelectionService.ts +341 -0
- package/src/training/ModelUsageVerifier.ts +160 -0
- package/src/training/MultiModelOrchestrator.ts +580 -0
- package/src/training/RLModelConfig.ts +407 -0
- package/src/training/RewardBackpropagationService.ts +149 -0
- package/src/training/RulerScoringService.ts +666 -0
- package/src/training/TrainingMonitor.ts +166 -0
- package/src/training/TrajectoryRecorder.ts +399 -0
- package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
- package/src/training/index.ts +100 -0
- package/src/training/logRLConfig.ts +34 -0
- package/src/training/pipeline.ts +129 -0
- package/src/training/storage/ModelStorageService.ts +279 -0
- package/src/training/storage/TrainingDataArchiver.ts +197 -0
- package/src/training/storage/index.ts +17 -0
- package/src/training/types.ts +207 -0
- package/src/training/window-utils.ts +138 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +59 -0
- package/src/utils/snowflake.ts +17 -0
- package/src/utils/synthetic-detector.ts +111 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,1344 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Reward Functions for Training
|
|
3
|
+
|
|
4
|
+
Computes various reward signals for RL training:
|
|
5
|
+
- PnL-based: Raw profit/loss performance
|
|
6
|
+
- Risk-adjusted: Sharpe-like reward accounting for variance
|
|
7
|
+
- Efficiency: Reward per action taken
|
|
8
|
+
- Action quality: Based on success rate and correctness
|
|
9
|
+
- Composite: Weighted combination of multiple signals
|
|
10
|
+
- Archetype-aware: Different archetypes have different success criteria
|
|
11
|
+
|
|
12
|
+
Also provides utilities for normalizing and comparing rewards.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import Dict, List, Optional
|
|
17
|
+
import math
|
|
18
|
+
|
|
19
|
+
from .rubric_loader import normalize_archetype, get_priority_metrics
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# =============================================================================
# Archetype Scoring Constants
# =============================================================================
# Thresholds for behavior bonuses. Extracted from behavior functions for clarity.
# Counts (trades, connections, actions) are raw per-episode tallies.
# NOTE(review): variance and position-size thresholds appear to be in the
# simulation's currency units — confirm against the metrics producers.

# Degen thresholds
DEGEN_HIGH_TRADES = 20  # Excellent degen activity
DEGEN_GOOD_TRADES = 10  # Good degen activity
DEGEN_MIN_TRADES = 5  # Minimum for positive bonus
DEGEN_HIGH_VARIANCE = 500  # High P&L variance (bold trades)
DEGEN_MOD_VARIANCE = 100  # Moderate variance
DEGEN_HIGH_POSITION = 500  # Large position size
DEGEN_MOD_POSITION = 200  # Moderate position size

# Social Butterfly thresholds
SOCIAL_EXCELLENT_CONNECTIONS = 15  # Top networking
SOCIAL_GOOD_CONNECTIONS = 8  # Good networking
SOCIAL_MIN_CONNECTIONS = 3  # Minimum for bonus
SOCIAL_HIGH_GROUPS = 5  # Many group chats
SOCIAL_MIN_GROUPS = 2  # Minimum groups
SOCIAL_HIGH_DMS = 10  # High DM activity
SOCIAL_MIN_DMS = 3  # Minimum DMs

# Trader thresholds
TRADER_HIGH_WIN_RATE = 0.60  # Excellent discipline
TRADER_GOOD_WIN_RATE = 0.50  # Good discipline
TRADER_LOW_WIN_RATE = 0.40  # Poor discipline
TRADER_HIGH_DIVERSIFICATION = 4  # Well diversified
TRADER_MIN_DIVERSIFICATION = 2  # Some diversification

# Researcher thresholds
RESEARCHER_HIGH_ACTIONS = 10  # Heavy research
RESEARCHER_MOD_ACTIONS = 5  # Moderate research
RESEARCHER_HIGH_ACCURACY = 0.7  # Excellent accuracy
RESEARCHER_GOOD_ACCURACY = 0.5  # Good accuracy

# Bonus/penalty caps — behavior bonuses are clamped to this band by clamp_bonus()
MAX_BEHAVIOR_BONUS = 0.5  # Maximum behavior bonus
MIN_BEHAVIOR_PENALTY = -0.5  # Maximum behavior penalty

# Archetype-aware scoring multipliers
# Note: Legacy composite_reward uses 0.5, archetype version uses 0.3 (more lenient)
ARCHETYPE_RISK_PENALTY_MULTIPLIER = 0.3  # Per-risky-action penalty for non-degen archetypes

# Bonus amounts (tunable parameters)
BONUS_EXCELLENT = 0.20  # Excellent archetype-aligned behavior
BONUS_GOOD = 0.15  # Good archetype-aligned behavior
BONUS_MODERATE = 0.10  # Moderate archetype-aligned behavior
BONUS_MINOR = 0.05  # Minor positive signal
PENALTY_MODERATE = -0.10  # Moderate archetype violation
PENALTY_SEVERE = -0.15  # Severe archetype violation
PENALTY_CRITICAL = -0.20  # Critical archetype failure
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def clamp_bonus(bonus: float) -> float:
    """Restrict a behavior bonus to the allowed [-0.5, 0.5] band."""
    if bonus > MAX_BEHAVIOR_BONUS:
        return MAX_BEHAVIOR_BONUS
    if bonus < MIN_BEHAVIOR_PENALTY:
        return MIN_BEHAVIOR_PENALTY
    return bonus
|
|
79
|
+
|
|
80
|
+
# =============================================================================
# Archetype-Specific Reward Weights
# =============================================================================
# Each archetype has different success criteria. These weights determine
# how much each component contributes to the final score:
#
# - pnl: Financial performance (P&L-based reward)
# - format: Response format quality (proper structure, valid JSON)
# - reasoning: Quality of reasoning in LLM calls
# - behavior: Archetype-aligned behavioral bonus/penalty
#
# Design principles:
# 1. Weights sum to 1.0 for each archetype (enforced at import time by
#    _validate_archetype_weights below)
# 2. Archetypes that don't focus on profit have lower pnl weight
# 3. Behavior weight is higher for personality-driven archetypes
# 4. Format/reasoning provide baseline quality signals

ARCHETYPE_REWARD_WEIGHTS: Dict[str, Dict[str, float]] = {
    # Traders prioritize P&L and risk management
    "trader": {
        "pnl": 0.55,
        "format": 0.20,
        "reasoning": 0.15,
        "behavior": 0.10,
    },
    # Degens prioritize activity and risk-taking over profitability
    "degen": {
        "pnl": 0.15,  # Reduced - losses are acceptable
        "format": 0.15,
        "reasoning": 0.10,
        "behavior": 0.60,  # High bonus for degen behaviors
    },
    # Social butterflies deprioritize trading entirely
    "social-butterfly": {
        "pnl": 0.10,
        "format": 0.20,
        "reasoning": 0.15,
        "behavior": 0.55,
    },
    # Scammers need to profit through manipulation
    "scammer": {
        "pnl": 0.35,
        "format": 0.15,
        "reasoning": 0.20,
        "behavior": 0.30,
    },
    # Researchers prioritize analysis quality
    "researcher": {
        "pnl": 0.25,
        "format": 0.25,
        "reasoning": 0.30,
        "behavior": 0.20,
    },
    # Information traders balance social intel with trading
    "information-trader": {
        "pnl": 0.35,
        "format": 0.20,
        "reasoning": 0.20,
        "behavior": 0.25,
    },
    # Goody two-shoes prioritize reputation and helpfulness
    "goody-twoshoes": {
        "pnl": 0.15,
        "format": 0.25,
        "reasoning": 0.20,
        "behavior": 0.40,
    },
    # Ass-kissers prioritize reputation gains through flattery
    "ass-kisser": {
        "pnl": 0.10,
        "format": 0.20,
        "reasoning": 0.15,
        "behavior": 0.55,
    },
    # Perps traders prioritize risk-adjusted P&L
    "perps-trader": {
        "pnl": 0.50,
        "format": 0.15,
        "reasoning": 0.20,
        "behavior": 0.15,
    },
    # Super predictors prioritize accuracy
    "super-predictor": {
        "pnl": 0.30,
        "format": 0.20,
        "reasoning": 0.25,
        "behavior": 0.25,
    },
    # Infosec agents prioritize security and caution
    "infosec": {
        "pnl": 0.25,
        "format": 0.25,
        "reasoning": 0.30,
        "behavior": 0.20,
    },
    # Liars prioritize successful deception
    "liar": {
        "pnl": 0.20,
        "format": 0.15,
        "reasoning": 0.25,
        "behavior": 0.40,
    },
    # Default balanced weights, used for any unrecognized archetype
    "default": {
        "pnl": 0.50,
        "format": 0.25,
        "reasoning": 0.15,
        "behavior": 0.10,
    },
}
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _validate_archetype_weights() -> None:
    """
    Ensure every archetype's component weights sum to exactly 1.0.

    Runs once at import time so a misconfigured weight table fails fast
    instead of silently skewing reward computation.
    """
    tolerance = 1e-9
    for name, weight_map in ARCHETYPE_REWARD_WEIGHTS.items():
        weight_sum = sum(weight_map.values())
        if abs(weight_sum - 1.0) > tolerance:
            raise ValueError(
                f"Archetype '{name}' weights sum to {weight_sum}, expected 1.0. "
                f"Weights: {weight_map}"
            )


# Validate weights at module load time
_validate_archetype_weights()
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def get_archetype_weights(archetype: str) -> Dict[str, float]:
    """Look up the reward-component weights for an archetype, falling back to 'default'."""
    key = normalize_archetype(archetype)
    try:
        return ARCHETYPE_REWARD_WEIGHTS[key]
    except KeyError:
        return ARCHETYPE_REWARD_WEIGHTS["default"]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@dataclass
class TrajectoryRewardInputs:
    """Inputs for computing rewards.

    Aggregated statistics for a single trajectory/episode. Every field
    has a neutral default so callers only need to populate the metrics
    their pipeline actually tracks.
    """

    # Financial Metrics
    # end_balance defaults equal to starting_balance; composite_reward
    # falls back to starting_balance + final_pnl when they are unchanged.
    final_pnl: float = 0.0
    starting_balance: float = 10000.0
    end_balance: float = 10000.0
    pnl_variance: float = 0.0
    max_drawdown: float = 0.0

    # Risk Metrics
    max_exposure: float = 0.0
    risky_actions_count: int = 0

    # Quality Scores (from quality_utils) — a value of 0 is treated by
    # composite_reward as "not provided" and triggers the legacy path.
    format_score: float = 0.0
    reasoning_score: float = 0.0

    # Operational Metrics
    num_steps: int = 0
    trades_executed: int = 0
    successful_trades: int = 0
    total_actions: int = 0
    successful_actions: int = 0
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def calculate_pnl_reward(start_balance: float, end_balance: float) -> float:
    """
    Score a trajectory's profit/loss.

    Logic:
    - Bankruptcy (end_balance <= 0): -10.0 hard penalty
    - Positive PnL: up to +1.0 (scaled by % return, capped)
    - Negative PnL: down to -1.0 (scaled by % loss, capped)
    """
    # Going broke is a catastrophic outcome and bypasses normal scaling.
    if end_balance <= 0:
        return -10.0

    # Degenerate starting state: a percentage return is undefined.
    if start_balance <= 0:
        return 0.0

    # A 10% gain maps to the +1.0 cap; losses scale symmetrically.
    return_fraction = (end_balance - start_balance) / start_balance
    reward = return_fraction * 10.0
    return min(1.0, max(-1.0, reward))
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def calculate_risk_reward(exposure: float, action_type: str) -> float:
    """
    Penalize adding long exposure while already heavily exposed.

    Returns:
        Penalty (-0.5) if buying when exposure > 80%, else 0.0
    """
    # No action (or empty string) means nothing to penalize.
    if not action_type:
        return 0.0

    lowered = action_type.lower()
    opens_position = any(kw in lowered for kw in ('buy', 'long', 'open'))

    return -0.5 if opens_position and exposure > 0.80 else 0.0
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def pnl_reward(inputs: TrajectoryRewardInputs) -> float:
    """
    Legacy PnL reward: percentage return clipped to [-1.0, 1.0].
    """
    start = inputs.starting_balance
    # A non-positive starting balance makes percentage return undefined.
    if start <= 0:
        return 0.0
    return min(1.0, max(-1.0, inputs.final_pnl / start))
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def risk_adjusted_reward(inputs: TrajectoryRewardInputs) -> float:
    """
    Sharpe-like reward: PnL reward scaled down by volatility, then
    reduced by a drawdown penalty, clipped to [-1.0, 1.0].
    """
    score = pnl_reward(inputs)

    # Divide by realized volatility whenever a variance signal exists.
    if inputs.pnl_variance > 0:
        score = max(-1.0, min(1.0, score / math.sqrt(inputs.pnl_variance)))

    # Subtract half the drawdown expressed as a fraction of starting capital.
    if inputs.max_drawdown > 0 and inputs.starting_balance > 0:
        score -= (inputs.max_drawdown / inputs.starting_balance) * 0.5

    return max(-1.0, min(1.0, score))
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def efficiency_reward(inputs: TrajectoryRewardInputs) -> float:
    """
    Reward per action: PnL reward damped by the log of the action count.
    """
    score = pnl_reward(inputs)

    # With no actions recorded there is nothing to normalize by.
    if inputs.total_actions <= 0:
        return score

    damped = score / math.log1p(inputs.total_actions)
    return max(-1.0, min(1.0, damped))
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def action_quality_reward(inputs: TrajectoryRewardInputs) -> float:
    """
    Fraction of actions that succeeded; neutral 0.5 when none were taken.
    """
    total = inputs.total_actions
    if total == 0:
        return 0.5
    return inputs.successful_actions / total
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def composite_reward(
    inputs: TrajectoryRewardInputs,
    pnl_weight: float = 0.5,
    format_weight: float = 0.3,
    reasoning_weight: float = 0.2,
    # Legacy weights
    risk_weight: float = 0.0,
    efficiency_weight: float = 0.0,
    quality_weight: float = 0.0,
) -> float:
    """
    Compute weighted composite reward.

    If 'format_score' or 'reasoning_score' are present (non-zero), uses the
    new weighting:
    - PnL: 50%
    - Format: 30%
    - Reasoning: 20%

    Otherwise falls back to the legacy weighting over the pnl, risk,
    efficiency, and action-quality components; when all three legacy
    weights are left at 0, defaults of 0.4/0.3/0.15/0.15 are substituted.

    Args:
        inputs: Trajectory statistics to score.
        pnl_weight: PnL component weight (also reused as the legacy pnl
            weight when explicit legacy weights are supplied).
        format_weight: Response-format score weight (new scheme only).
        reasoning_weight: Reasoning score weight (new scheme only).
        risk_weight: Legacy-only weight for risk_adjusted_reward.
        efficiency_weight: Legacy-only weight for efficiency_reward.
        quality_weight: Legacy-only weight for action_quality_reward.

    Returns:
        Composite reward clamped to [-1.0, 1.0], except the bankruptcy
        penalty (-10.0) which is returned unclamped.
    """

    # 1. Calculate PnL Score
    if inputs.end_balance != inputs.starting_balance:
        pnl_score = calculate_pnl_reward(
            inputs.starting_balance, inputs.end_balance)
    else:
        # Fallback if specific balances aren't tracked separately:
        # reconstruct the end balance from starting balance + final P&L.
        end_bal = inputs.starting_balance + inputs.final_pnl
        pnl_score = calculate_pnl_reward(inputs.starting_balance, end_bal)

    # Bankruptcy override: calculate_pnl_reward yields -10.0 on bankruptcy
    # and values in [-1, 1] otherwise, so <= -5.0 only matches bankruptcy.
    if pnl_score <= -5.0:
        return pnl_score

    # 2. Risk Penalty — flat -0.5 per risky action, applied to the PnL
    # component before weighting (can push pnl_score below -1 pre-clamp).
    if inputs.risky_actions_count > 0:
        pnl_score -= (inputs.risky_actions_count * 0.5)

    # 3. Scoring System — active when either quality score is non-zero.
    if inputs.format_score != 0 or inputs.reasoning_score != 0:
        total_weight = pnl_weight + format_weight + reasoning_weight
        if total_weight == 0:
            return 0.0

        composite = (
            (pnl_score * pnl_weight) +
            (inputs.format_score * format_weight) +
            (inputs.reasoning_score * reasoning_weight)
        ) / total_weight

        return max(-1.0, min(1.0, composite))

    # 4. Legacy Scoring System (Fallback)
    # If using legacy, we need non-zero weights
    if risk_weight == 0 and efficiency_weight == 0 and quality_weight == 0:
        # Defaults for legacy system
        l_pnl = 0.4
        l_risk = 0.3
        l_eff = 0.15
        l_qual = 0.15
    else:
        l_pnl = pnl_weight
        l_risk = risk_weight
        l_eff = efficiency_weight
        l_qual = quality_weight

    total_weight = l_pnl + l_risk + l_eff + l_qual
    if total_weight == 0:
        return 0.0

    # NOTE(review): the legacy path recomputes pnl_reward(inputs) and does
    # not reuse the risk-penalized pnl_score from above — confirm intended.
    composite = (
        l_pnl * pnl_reward(inputs)
        + l_risk * risk_adjusted_reward(inputs)
        + l_eff * efficiency_reward(inputs)
        + l_qual * action_quality_reward(inputs)
    ) / total_weight

    return max(-1.0, min(1.0, composite))
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def relative_scores(rewards: list[float]) -> list[float]:
    """
    Rank-normalize rewards into [0, 1].

    Each reward is replaced by its rank within the group, scaled so the
    worst maps to 0.0 and the best to 1.0 (ties broken by position via
    stable sort). Groups smaller than two have no ordering and map to 0.5.

    Args:
        rewards: List of reward values

    Returns:
        List of relative scores in [0, 1]
    """
    n = len(rewards)
    if n < 2:
        return [0.5] * n

    order = sorted(range(n), key=rewards.__getitem__)
    denom = n - 1
    result = [0.0] * n
    for rank, original_index in enumerate(order):
        result[original_index] = rank / denom
    return result
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def ranking_to_scores(rankings: list[int]) -> list[float]:
    """
    Map 1-based rankings (1 = best) onto [0, 1] scores (1.0 = best).

    Args:
        rankings: List of rankings (1 = best)

    Returns:
        List of scores in [0, 1] where higher = better
    """
    count = len(rankings)
    # A single item (or none) carries no ordering information.
    if count < 2:
        return [0.5] * count

    span = count - 1
    return [(count - rank) / span for rank in rankings]
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def pairwise_preferences_to_scores(
    n_items: int, preferences: list[tuple[int, int]]
) -> list[float]:
    """
    Turn pairwise (winner, loser) judgments into per-item win rates.

    Each item's score is wins / comparisons — the empirical win
    probability. Out-of-range indices in a pair are ignored, and items
    never appearing in a valid pair get the neutral score 0.5.

    Args:
        n_items: Number of items being compared
        preferences: List of (winner, loser) pairs

    Returns:
        List of scores in [0, 1]
    """
    if n_items < 2 or not preferences:
        return [0.5] * n_items

    win_counts = [0] * n_items
    total_counts = [0] * n_items

    for winner_idx, loser_idx in preferences:
        if 0 <= winner_idx < n_items:
            win_counts[winner_idx] += 1
            total_counts[winner_idx] += 1
        if 0 <= loser_idx < n_items:
            total_counts[loser_idx] += 1

    return [
        win_counts[i] / total_counts[i] if total_counts[i] > 0 else 0.5
        for i in range(n_items)
    ]
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
class RewardNormalizer:
    """
    Online reward normalizer using running statistics.

    Maintains a running mean and a Welford M2 accumulator so rewards can
    be standardized to approximately zero mean / unit variance without
    storing the full reward history.

    Attributes:
        mean: Running mean of observed rewards.
        var: Welford M2 accumulator — the running sum of squared
            deviations from the mean. Sample variance is
            ``var / (count - 1)``; despite the name, this is NOT the
            variance itself.
        count: Number of rewards observed so far.
        epsilon: Small constant guarding against division by zero.
    """

    def __init__(self, epsilon: float = 1e-8):
        """
        Initialize normalizer.

        Args:
            epsilon: Small value to prevent division by zero
        """
        self.mean = 0.0
        # Fix: the M2 accumulator must start at 0.0 for Welford's
        # algorithm. The previous init of 1.0 inflated every variance
        # estimate (by 1/(count-1)), systematically under-scaling
        # normalized rewards; `epsilon` already guards the zero-variance
        # case in normalize().
        self.var = 0.0
        self.count = 0
        self.epsilon = epsilon

    def update(self, reward: float) -> None:
        """
        Update statistics with new reward.

        Uses Welford's online algorithm for numerical stability.

        Args:
            reward: New reward value
        """
        self.count += 1
        delta = reward - self.mean
        self.mean += delta / self.count
        delta2 = reward - self.mean
        # M2 update: delta uses the old mean, delta2 the new one.
        self.var += delta * delta2

    def normalize(self, reward: float) -> float:
        """
        Normalize a reward using current statistics.

        Args:
            reward: Reward to normalize

        Returns:
            Normalized reward (approximately zero-mean, unit variance).
            Returned unchanged until at least two samples are observed,
            since sample variance is undefined before then.
        """
        if self.count < 2:
            return reward

        # Sample standard deviation; epsilon keeps this non-zero when
        # all observed rewards are identical.
        std = math.sqrt(self.var / (self.count - 1) + self.epsilon)
        return (reward - self.mean) / std

    def update_batch(self, rewards: list[float]) -> None:
        """
        Update statistics with batch of rewards.

        Args:
            rewards: List of reward values
        """
        for r in rewards:
            self.update(r)

    def normalize_batch(self, rewards: list[float]) -> list[float]:
        """
        Normalize batch of rewards.

        Args:
            rewards: List of rewards to normalize

        Returns:
            List of normalized rewards
        """
        return [self.normalize(r) for r in rewards]
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
# =============================================================================
|
|
571
|
+
# Archetype Behavior Metrics
|
|
572
|
+
# =============================================================================
|
|
573
|
+
|
|
574
|
+
@dataclass
class BehaviorMetrics:
    """Metrics extracted from trajectory for archetype-aware scoring.

    Per-episode tallies consumed by the archetype behavior-bonus logic;
    all fields default to zero so extractors only populate what they
    observe.
    """

    # Trading metrics
    trades_executed: int = 0
    profitable_trades: int = 0
    win_rate: float = 0.0  # fraction in [0, 1]
    total_pnl: float = 0.0
    pnl_variance: float = 0.0
    largest_win: float = 0.0
    largest_loss: float = 0.0
    markets_traded: int = 0  # distinct markets (diversification signal)
    avg_position_size: float = 0.0

    # Social metrics
    unique_users_interacted: int = 0
    group_chats_joined: int = 0
    dms_initiated: int = 0
    posts_created: int = 0
    comments_made: int = 0
    mentions_given: int = 0

    # Influence metrics
    followers_gained: int = 0
    reputation_delta: int = 0  # may be negative
    positive_reactions: int = 0
    information_spread: int = 0

    # Research/information metrics
    research_actions: int = 0
    predictions_made: int = 0
    correct_predictions: int = 0
    prediction_accuracy: float = 0.0  # fraction in [0, 1]
    info_requests_sent: int = 0
    info_shared: int = 0

    # Behavior patterns
    actions_per_tick: float = 0.0
    social_to_trade_ratio: float = 0.0
    episode_length: int = 0  # in ticks/steps
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def calculate_archetype_behavior_bonus(
    archetype: str,
    metrics: BehaviorMetrics,
) -> float:
    """
    Calculate behavior bonus/penalty based on archetype-aligned actions.

    Each archetype has specific behaviors that should be rewarded or
    penalized.  Returns a score from -0.5 to +0.5 that will be weighted
    in the composite.

    Args:
        archetype: Normalized archetype name
        metrics: Extracted behavior metrics from trajectory

    Returns:
        Behavior bonus score in range [-0.5, 0.5]; 0.0 for unknown
        archetypes.
    """
    # Dispatch table instead of a long if/elif chain.
    scorers = {
        "degen": _calculate_degen_bonus,
        "social-butterfly": _calculate_social_butterfly_bonus,
        "scammer": _calculate_scammer_bonus,
        "trader": _calculate_trader_bonus,
        "researcher": _calculate_researcher_bonus,
        "information-trader": _calculate_information_trader_bonus,
        "goody-twoshoes": _calculate_goody_twoshoes_bonus,
        "ass-kisser": _calculate_ass_kisser_bonus,
        "perps-trader": _calculate_perps_trader_bonus,
        "super-predictor": _calculate_super_predictor_bonus,
        "infosec": _calculate_infosec_bonus,
        "liar": _calculate_liar_bonus,
    }
    scorer = scorers.get(normalize_archetype(archetype))
    if scorer is None:
        return 0.0  # Default: no bonus for unrecognized archetypes
    return scorer(metrics)
|
|
662
|
+
|
|
663
|
+
|
|
664
|
+
def _calculate_degen_bonus(metrics: BehaviorMetrics) -> float:
    """
    Degen: reward high activity, risk-taking, and volatility; penalize
    conservative behavior.

    Scoring rationale:
    - Raw trade volume is rewarded regardless of profitability.
    - High P&L variance indicates a bold trading style.
    - Large position sizes show commitment to risk-taking.
    - Inactivity is the antithesis of degen behavior.
    """
    score = 0.0

    # Trade-volume tiers (mutually exclusive).
    trades = metrics.trades_executed
    if trades >= DEGEN_HIGH_TRADES:
        score += 0.20  # Excellent degen activity
    elif trades >= DEGEN_GOOD_TRADES:
        score += 0.15  # Good activity
    elif trades >= DEGEN_MIN_TRADES:
        score += 0.08  # Some activity
    elif trades < 2:
        score -= 0.15  # Penalty for low activity

    # Volatility tiers: big P&L swings are quintessential degen.
    if metrics.pnl_variance > DEGEN_HIGH_VARIANCE:
        score += 0.15  # High volatility trading
    elif metrics.pnl_variance > DEGEN_MOD_VARIANCE:
        score += 0.08  # Moderate volatility

    # Position-sizing tiers.
    if metrics.avg_position_size > DEGEN_HIGH_POSITION:
        score += 0.10  # Bold position sizing
    elif metrics.avg_position_size > DEGEN_MOD_POSITION:
        score += 0.05  # Moderate positions

    # One outsized win or loss is evidence of bold bets.
    if max(abs(metrics.largest_win), abs(metrics.largest_loss)) > 100:
        score += 0.05

    return clamp_bonus(score)
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
def _calculate_social_butterfly_bonus(metrics: BehaviorMetrics) -> float:
    """
    Social Butterfly: reward extensive networking and engagement; penalize
    trading-focused behavior.

    Scoring rationale:
    - Social butterflies prioritize connections over profits.
    - Group chats and DMs indicate networking activity.
    - Posting/commenting shows community engagement.
    - Heavy trading focus contradicts the archetype.
    """
    score = 0.0

    # Networking breadth (tiered, mutually exclusive).
    contacts = metrics.unique_users_interacted
    if contacts >= SOCIAL_EXCELLENT_CONNECTIONS:
        score += 0.20  # Excellent networking
    elif contacts >= SOCIAL_GOOD_CONNECTIONS:
        score += 0.12  # Good networking
    elif contacts >= SOCIAL_MIN_CONNECTIONS:
        score += 0.06  # Some networking
    elif contacts < 2:
        score -= 0.15  # Penalty for isolation

    # Group-chat involvement.
    if metrics.group_chats_joined >= SOCIAL_HIGH_GROUPS:
        score += 0.15  # Heavy group involvement
    elif metrics.group_chats_joined >= SOCIAL_MIN_GROUPS:
        score += 0.08  # Some group activity

    # Direct-message engagement.
    if metrics.dms_initiated >= SOCIAL_HIGH_DMS:
        score += 0.10  # High direct engagement
    elif metrics.dms_initiated >= SOCIAL_MIN_DMS:
        score += 0.05  # Some direct engagement

    # Content creation: posts and comments combined.
    content_count = metrics.posts_created + metrics.comments_made
    if content_count >= 10:
        score += 0.08  # Active poster
    elif content_count >= 3:
        score += 0.04  # Some content creation

    # Too much trading relative to socializing contradicts the archetype.
    if metrics.trades_executed > 5 and metrics.social_to_trade_ratio < 0.5:
        score -= 0.10

    return clamp_bonus(score)
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
def _calculate_scammer_bonus(metrics: BehaviorMetrics) -> float:
    """
    Scammer: reward profit obtained through social manipulation; penalize
    honest trading that lacks a social element.
    """
    score = 0.0

    # A scammer needs marks: some social reach is mandatory.
    if metrics.unique_users_interacted >= 5:
        score += 0.10
    elif metrics.unique_users_interacted < 2:
        score -= 0.20  # Hard penalty for no social manipulation

    # Private channels (DMs) are where the manipulation happens.
    if metrics.dms_initiated >= 5:
        score += 0.10
    elif metrics.dms_initiated >= 2:
        score += 0.05

    # Success is measured in profit; an unprofitable scammer has failed.
    score += 0.15 if metrics.total_pnl > 0 else -0.15

    # Keeping reputation intact means the trust-building worked;
    # a large drop means the agent got caught.
    if metrics.reputation_delta > 0:
        score += 0.10
    elif metrics.reputation_delta < -20:
        score -= 0.10

    return clamp_bonus(score)
|
|
787
|
+
|
|
788
|
+
|
|
789
|
+
def _calculate_trader_bonus(metrics: BehaviorMetrics) -> float:
    """
    Trader: reward disciplined, profitable trading; penalize social
    distractions.
    """
    score = 0.0

    # Win-rate tiers; a low win rate is only penalized once the sample
    # size (trade count) is meaningful.
    if metrics.win_rate >= TRADER_HIGH_WIN_RATE:
        score += BONUS_GOOD
    elif metrics.win_rate >= TRADER_GOOD_WIN_RATE:
        score += 0.08
    elif metrics.trades_executed >= 5 and metrics.win_rate < TRADER_LOW_WIN_RATE:
        score += PENALTY_MODERATE

    # Diversification across markets.
    if metrics.markets_traded >= TRADER_HIGH_DIVERSIFICATION:
        score += BONUS_MODERATE
    elif metrics.markets_traded >= TRADER_MIN_DIVERSIFICATION:
        score += BONUS_MINOR

    # A trader should be trading, not socializing.
    if metrics.social_to_trade_ratio > 1.0:
        score += PENALTY_MODERATE

    # Consistent activity.
    if metrics.trades_executed >= 5:
        score += BONUS_MINOR

    return clamp_bonus(score)
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
def _calculate_researcher_bonus(metrics: BehaviorMetrics) -> float:
    """
    Researcher: reward analysis and research activity, plus the
    correlation between research and accurate predictions.
    """
    score = 0.0

    # Research-volume tiers; doing zero research disqualifies the archetype.
    research = metrics.research_actions
    if research >= RESEARCHER_HIGH_ACTIONS:
        score += BONUS_EXCELLENT
    elif research >= RESEARCHER_MOD_ACTIONS:
        score += 0.12
    elif research >= 2:
        score += 0.06
    elif research == 0:
        score += PENALTY_SEVERE  # Not researching = not a researcher

    # Prediction-accuracy tiers.
    if metrics.prediction_accuracy >= RESEARCHER_HIGH_ACCURACY:
        score += BONUS_EXCELLENT
    elif metrics.prediction_accuracy >= RESEARCHER_GOOD_ACCURACY:
        score += BONUS_MODERATE

    # Quality over quantity: a strong win rate on a modest trade count.
    if metrics.trades_executed <= 10 and metrics.win_rate >= TRADER_HIGH_WIN_RATE:
        score += BONUS_MODERATE

    return clamp_bonus(score)
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
def _calculate_information_trader_bonus(metrics: BehaviorMetrics) -> float:
    """
    Information Trader: reward a balance of social intel gathering and
    trading on that intel.
    """
    score = 0.0

    # The sweet spot is a social-to-trade ratio between 0.5 and 1.5.
    ratio = metrics.social_to_trade_ratio
    if 0.5 <= ratio <= 1.5:
        score += 0.15
    elif ratio > 3.0:
        score -= 0.10  # Too social, not trading on info
    elif ratio < 0.2 and metrics.trades_executed > 3:
        score -= 0.10  # Pure trading, no intel gathering

    # Group chats serve as information sources.
    if metrics.group_chats_joined >= 3:
        score += 0.10

    # DM conversations carry private intel.
    if metrics.dms_initiated >= 3:
        score += 0.08

    # Actively requesting information.
    if metrics.info_requests_sent >= 3:
        score += 0.08

    # The intel still has to pay off.
    if metrics.total_pnl > 0:
        score += 0.10

    return clamp_bonus(score)
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
def _calculate_goody_twoshoes_bonus(metrics: BehaviorMetrics) -> float:
    """
    Goody Two-Shoes: reward helpfulness and reputation building.
    """
    score = 0.0

    # Reputation trajectory is the primary signal.
    rep = metrics.reputation_delta
    if rep >= 30:
        score += 0.25
    elif rep >= 10:
        score += 0.15
    elif rep >= 0:
        score += 0.05
    else:
        score -= 0.15  # Losing reputation = not being good

    # Sharing information helps others.
    if metrics.info_shared >= 5:
        score += 0.12
    elif metrics.info_shared >= 2:
        score += 0.06

    # Community approval.
    if metrics.positive_reactions >= 10:
        score += 0.10
    elif metrics.positive_reactions >= 3:
        score += 0.05

    # Growing an audience.
    if metrics.followers_gained >= 5:
        score += 0.08

    return clamp_bonus(score)
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
def _calculate_ass_kisser_bonus(metrics: BehaviorMetrics) -> float:
    """
    Ass-Kisser: reward reputation and follower gains won through flattery.
    """
    score = 0.0

    # Reputation gains are the whole game.
    rep = metrics.reputation_delta
    if rep >= 50:
        score += 0.30
    elif rep >= 20:
        score += 0.20
    elif rep >= 5:
        score += 0.10
    elif rep < 0:
        score -= 0.20  # The flattery backfired

    # Follower growth.
    if metrics.followers_gained >= 10:
        score += 0.15
    elif metrics.followers_gained >= 3:
        score += 0.08

    # Public flattery via comments.
    if metrics.comments_made >= 10:
        score += 0.08
    elif metrics.comments_made >= 5:
        score += 0.04

    # Personal flattery via DMs.
    if metrics.dms_initiated >= 5:
        score += 0.05

    return clamp_bonus(score)
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
def _calculate_perps_trader_bonus(metrics: BehaviorMetrics) -> float:
    """
    Perps Trader: reward risk-managed leveraged trading; penalize
    over-leverage and blow-ups.
    """
    score = 0.0

    # Direction calling: win rate matters more with leverage; only
    # penalize once the sample size is meaningful.
    if metrics.win_rate >= 0.55:
        score += 0.15
    elif metrics.trades_executed >= 5 and metrics.win_rate < 0.40:
        score -= 0.15  # Wrong direction too often

    # Activity tiers: a perps trader should actually trade perps.
    trades = metrics.trades_executed
    if trades >= 10:
        score += 0.10
    elif trades >= 5:
        score += 0.05
    elif trades < 2:
        score -= 0.10  # Not trading perps

    # With leverage, high P&L variance indicates poor risk management.
    if metrics.pnl_variance > 1000:
        score -= 0.10

    # Must make money with leverage; deep losses mean a blown account.
    if metrics.total_pnl > 0:
        score += 0.10
    elif metrics.total_pnl < -200:
        score -= 0.15

    return clamp_bonus(score)
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
def _calculate_super_predictor_bonus(metrics: BehaviorMetrics) -> float:
    """
    Super Predictor: reward high prediction accuracy; quality over quantity.
    """
    score = 0.0

    # Accuracy tiers; persistent inaccuracy is only punished once the
    # sample size is meaningful.
    accuracy = metrics.prediction_accuracy
    if accuracy >= 0.75:
        score += 0.30
    elif accuracy >= 0.60:
        score += 0.18
    elif accuracy >= 0.50:
        score += 0.08
    elif accuracy < 0.45 and metrics.predictions_made >= 5:
        score -= 0.20  # Wrong too often

    # Should do some analysis before predicting.
    if metrics.research_actions >= 3:
        score += 0.08

    # Has to actually make predictions.
    if metrics.predictions_made >= 5:
        score += 0.08
    elif metrics.predictions_made == 0:
        score -= 0.15  # Not predicting = not a predictor

    # Translating accurate calls into profit.
    if metrics.prediction_accuracy >= 0.55 and metrics.total_pnl > 0:
        score += 0.08

    return clamp_bonus(score)
|
|
1020
|
+
|
|
1021
|
+
|
|
1022
|
+
def _calculate_infosec_bonus(metrics: BehaviorMetrics) -> float:
    """
    Infosec: reward caution, verification, and resistance to manipulation.
    """
    score = 0.0

    # Guarding information is the core trait; oversharing is penalized.
    if metrics.info_shared <= 1:
        score += 0.15
    elif metrics.info_shared >= 5:
        score -= 0.10  # Oversharing

    # Loss profile: a small worst-case loss suggests scams were avoided.
    # (largest_loss is stored as a negative value.)
    if metrics.largest_loss > -50:
        score += 0.15
    elif metrics.largest_loss < -200:
        score -= 0.15  # Big loss = got scammed

    # Verification through research.
    if metrics.research_actions >= 3:
        score += 0.10

    # Steady, low-variance behavior.
    if metrics.pnl_variance < 100:
        score += 0.10

    # Caution with DMs (a common manipulation vector).
    if metrics.dms_initiated < 3:
        score += 0.05

    return clamp_bonus(score)
|
|
1053
|
+
|
|
1054
|
+
|
|
1055
|
+
def _calculate_liar_bonus(metrics: BehaviorMetrics) -> float:
    """
    Liar: reward successful deception and information spread.
    """
    score = 0.0

    # Lies only matter if they propagate.
    if metrics.information_spread >= 10:
        score += 0.20
    elif metrics.information_spread >= 3:
        score += 0.10

    # Audience tiers; with no audience there is no one to deceive.
    audience = metrics.unique_users_interacted
    if audience >= 8:
        score += 0.12
    elif audience >= 3:
        score += 0.06
    elif audience < 2:
        score -= 0.15  # No audience

    # Staying reputable while lying means the deception held up.
    if metrics.reputation_delta >= 0:
        score += 0.15  # Not caught
    elif metrics.reputation_delta < -20:
        score -= 0.15  # Got exposed

    # Posts are the broadcast platform for misinformation.
    if metrics.posts_created >= 5:
        score += 0.08
    elif metrics.posts_created >= 2:
        score += 0.04

    return clamp_bonus(score)
|
|
1088
|
+
|
|
1089
|
+
|
|
1090
|
+
# =============================================================================
|
|
1091
|
+
# Priority Metrics Scoring
|
|
1092
|
+
# =============================================================================
|
|
1093
|
+
|
|
1094
|
+
|
|
1095
|
+
# Maps rubrics.json metric names ("category.metricName") to either the
# BehaviorMetrics attribute backing them, or a fixed fallback value for
# metrics that are not tracked directly.  Built once at import time so
# extract_metric_value() does not rebuild a ~30-entry dict of computed
# values on every call (the previous implementation eagerly evaluated
# every entry per lookup).
_METRIC_SOURCES = {
    # Trading metrics
    "trading.totalPnL": "total_pnl",
    "trading.sharpeRatio": 0.0,  # Not directly available, computed if needed
    "trading.winRate": "win_rate",
    "trading.marketsTraded": "markets_traded",
    "trading.tradesExecuted": "trades_executed",
    "trading.avgPositionSize": "avg_position_size",
    "trading.largestWin": "largest_win",
    "trading.largestLoss": "largest_loss",
    "trading.maxDrawdown": 0.0,  # Not directly available
    # Social metrics
    "social.uniqueUsersInteracted": "unique_users_interacted",
    "social.groupChatsJoined": "group_chats_joined",
    "social.dmsInitiated": "dms_initiated",
    "social.postsCreated": "posts_created",
    "social.commentsMade": "comments_made",
    "social.mentionsGiven": "mentions_given",
    "social.groupMessagesSent": "group_chats_joined",  # Approximation
    "social.dmResponseRate": 0.5,  # Default, not tracked separately
    # Influence metrics
    "influence.reputationDelta": "reputation_delta",
    "influence.followersGained": "followers_gained",
    "influence.positiveReactions": "positive_reactions",
    "influence.informationSpread": "information_spread",
    # Information metrics
    "information.researchActions": "research_actions",
    "information.predictionAccuracy": "prediction_accuracy",
    "information.predictionsMade": "predictions_made",
    "information.correctPredictions": "correct_predictions",
    "information.marketDataQueries": "research_actions",  # Approximation
    "information.newsConsumed": 0.0,  # Not tracked separately
    "information.infoRequestsSent": "info_requests_sent",
    "information.infoShared": "info_shared",
    # Behavior metrics
    "behavior.socialToTradeRatio": "social_to_trade_ratio",
    "behavior.actionsPerTick": "actions_per_tick",
    "behavior.actionSuccessRate": "win_rate",  # Approximation
    "behavior.episodeLength": "episode_length",
    "behavior.consistencyScore": 0.5,  # Default, not tracked separately
}


def extract_metric_value(
    metric_name: str,
    metrics: BehaviorMetrics,
) -> Optional[float]:
    """
    Extract metric value from BehaviorMetrics based on priority metric name.

    Metric names from rubrics.json follow format: category.metricName
    e.g., "trading.totalPnL", "social.uniqueUsersInteracted"

    Args:
        metric_name: Dotted metric identifier from rubrics.json.
        metrics: Extracted behavior metrics for the trajectory.

    Returns:
        The metric value as a float, a fixed fallback for metrics that are
        not tracked directly, or None when the name is unknown.
    """
    source = _METRIC_SOURCES.get(metric_name)
    if source is None:
        return None  # Unknown metric name
    if isinstance(source, str):
        # Lazy lookup: only the requested attribute is read.
        return float(getattr(metrics, source))
    return source  # Fixed fallback value
|
|
1153
|
+
|
|
1154
|
+
|
|
1155
|
+
# Expected (min, max) ranges used to squash raw metric values into [0, 1].
# These are reasonable defaults that can be tuned.  Hoisted to module level
# so normalize_metric_value() does not rebuild the table on every call.
_NORMALIZATION_RANGES = {
    # Trading (can be negative)
    "trading.totalPnL": (-1000, 5000),
    "trading.sharpeRatio": (-1.0, 3.0),
    "trading.winRate": (0.0, 1.0),
    "trading.marketsTraded": (0, 10),
    "trading.tradesExecuted": (0, 50),
    "trading.avgPositionSize": (0, 1000),
    "trading.largestWin": (0, 2000),
    "trading.largestLoss": (-2000, 0),
    "trading.maxDrawdown": (0, 1000),
    # Social (always positive)
    "social.uniqueUsersInteracted": (0, 30),
    "social.groupChatsJoined": (0, 10),
    "social.dmsInitiated": (0, 20),
    "social.postsCreated": (0, 20),
    "social.commentsMade": (0, 30),
    "social.mentionsGiven": (0, 20),
    "social.groupMessagesSent": (0, 50),
    "social.dmResponseRate": (0.0, 1.0),
    # Influence (can be negative)
    "influence.reputationDelta": (-50, 100),
    "influence.followersGained": (-10, 30),
    "influence.positiveReactions": (0, 50),
    "influence.informationSpread": (0, 20),
    # Information (always positive)
    "information.researchActions": (0, 20),
    "information.predictionAccuracy": (0.0, 1.0),
    "information.predictionsMade": (0, 20),
    "information.correctPredictions": (0, 15),
    "information.marketDataQueries": (0, 20),
    "information.newsConsumed": (0, 10),
    "information.infoRequestsSent": (0, 15),
    "information.infoShared": (0, 15),
    # Behavior
    "behavior.socialToTradeRatio": (0.0, 5.0),
    "behavior.actionsPerTick": (0.0, 3.0),
    "behavior.actionSuccessRate": (0.0, 1.0),
    "behavior.episodeLength": (0, 50),
    "behavior.consistencyScore": (0.0, 1.0),
}

# Fallback range for metrics without an explicit entry above.
_DEFAULT_RANGE = (0, 100)


def normalize_metric_value(
    metric_name: str,
    value: float,
) -> float:
    """
    Normalize a metric value to 0-1 range based on expected ranges.

    Different metrics have different expected ranges; values outside the
    expected range are clamped.

    Args:
        metric_name: Dotted metric identifier (see _NORMALIZATION_RANGES).
        value: Raw metric value.

    Returns:
        Normalized value in [0.0, 1.0]; 0.5 for a degenerate range.
    """
    min_val, max_val = _NORMALIZATION_RANGES.get(metric_name, _DEFAULT_RANGE)

    if max_val == min_val:
        # Degenerate range: avoid division by zero, return the midpoint.
        return 0.5

    # Linear rescale, clamped to [0, 1].
    normalized = (value - min_val) / (max_val - min_val)
    return max(0.0, min(1.0, normalized))
|
|
1221
|
+
|
|
1222
|
+
|
|
1223
|
+
def calculate_priority_weighted_score(
    archetype: str,
    metrics: BehaviorMetrics,
) -> float:
    """
    Calculate score based on archetype's priority metrics from rubrics.json.

    Uses a weighted sum where the first-listed priority metric carries the
    highest weight.
    """
    priority_metrics = get_priority_metrics(normalize_archetype(archetype))
    if not priority_metrics:
        return 0.5  # No priority metrics defined: neutral score

    # Harmonic weights (1, 1/2, 1/3, ...) normalized to sum to 1, so the
    # first listed metric dominates.
    raw_weights = [1.0 / (rank + 1) for rank in range(len(priority_metrics))]
    weight_total = sum(raw_weights)

    score = 0.0
    for raw_weight, metric_name in zip(raw_weights, priority_metrics):
        value = extract_metric_value(metric_name, metrics)
        if value is None:
            continue  # Unknown metric: its weight contribution is dropped
        normalized_value = normalize_metric_value(metric_name, value)
        score += (raw_weight / weight_total) * normalized_value

    return score
|
|
1259
|
+
|
|
1260
|
+
|
|
1261
|
+
# =============================================================================
|
|
1262
|
+
# Archetype Composite Reward
|
|
1263
|
+
# =============================================================================
|
|
1264
|
+
|
|
1265
|
+
def archetype_composite_reward(
    inputs: TrajectoryRewardInputs,
    archetype: str,
    behavior_metrics: Optional[BehaviorMetrics] = None,
) -> float:
    """
    Compute archetype-aware composite reward.

    Different archetypes have different success criteria: PnL, format,
    reasoning, and behavior components are blended with archetype-specific
    weights, and priority metrics from rubrics.json feed into the behavior
    component.

    Args:
        inputs: Standard trajectory reward inputs (PnL, format, reasoning scores)
        archetype: Agent archetype (e.g., "degen", "trader", "social-butterfly")
        behavior_metrics: Optional extracted behavior metrics for behavior bonus

    Returns:
        Composite reward score in range [-1.0, 1.0]
    """
    arch = normalize_archetype(archetype)
    arch_weights = get_archetype_weights(arch)

    # --- 1. PnL component -------------------------------------------------
    # Prefer the recorded balance delta; when balances are equal (no recorded
    # movement), reconstruct the end balance from final_pnl instead.
    if inputs.end_balance == inputs.starting_balance:
        effective_end = inputs.starting_balance + inputs.final_pnl
    else:
        effective_end = inputs.end_balance
    pnl = calculate_pnl_reward(inputs.starting_balance, effective_end)

    # Archetype-specific loss dampening: degens embrace risk, and social
    # butterflies mostly don't care about trading losses.
    if arch == "degen" and pnl < 0:
        pnl = pnl * 0.3
    if arch == "social-butterfly" and pnl < 0:
        pnl = pnl * 0.5

    # Bankruptcy short-circuits for archetypes that should care about it.
    if pnl <= -5.0 and arch not in ("degen", "social-butterfly"):
        return max(-1.0, pnl)

    # --- 2. Risk penalty (degens are exempt) ------------------------------
    if arch != "degen" and inputs.risky_actions_count > 0:
        pnl -= inputs.risky_actions_count * ARCHETYPE_RISK_PENALTY_MULTIPLIER

    # --- 3./4. Behavior component -----------------------------------------
    behavior = 0.0
    if behavior_metrics is not None:
        bonus = calculate_archetype_behavior_bonus(arch, behavior_metrics)
        # Priority metrics from rubrics.json contribute 30% of the behavior
        # weight, centered at 0.5 so a neutral priority score adds nothing.
        priority = calculate_priority_weighted_score(arch, behavior_metrics)
        behavior = bonus * 0.7 + (priority - 0.5) * 0.3

    # --- 5. Weighted blend, clamped to [-1, 1] ----------------------------
    components = (
        (pnl, arch_weights["pnl"]),
        (inputs.format_score, arch_weights["format"]),
        (inputs.reasoning_score, arch_weights["reasoning"]),
        (behavior, arch_weights["behavior"]),
    )
    denom = (
        components[0][1]
        + components[1][1]
        + components[2][1]
        + components[3][1]
    )
    composite = (
        components[0][0] * components[0][1]
        + components[1][0] * components[1][1]
        + components[2][0] * components[2][1]
        + components[3][0] * components[3][1]
    ) / denom

    return max(-1.0, min(1.0, composite))
|