@elizaos/training 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/Dockerfile +75 -0
  2. package/Makefile +374 -0
  3. package/README.md +346 -0
  4. package/config/rubrics.json +137 -0
  5. package/data/.gitkeep +0 -0
  6. package/data/degen/.gitkeep +2 -0
  7. package/data/trader/.gitkeep +2 -0
  8. package/docker-compose.test.yml +57 -0
  9. package/package.json +58 -0
  10. package/python/config/babylon_atropos.yaml +90 -0
  11. package/python/config/profiles/12gb.json +11 -0
  12. package/python/config/profiles/16gb.json +10 -0
  13. package/python/config/profiles/24gb.json +10 -0
  14. package/python/config/profiles/48gb.json +10 -0
  15. package/python/config/profiles/cpu.json +11 -0
  16. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  17. package/python/config/profiles/l40-2gpu.json +22 -0
  18. package/python/config/profiles/l40-4gpu.json +21 -0
  19. package/python/config/profiles/l40.json +17 -0
  20. package/python/config/tinker_training.yaml +143 -0
  21. package/python/curriculum_state.json +165 -0
  22. package/python/env.template +86 -0
  23. package/python/env.training.template +46 -0
  24. package/python/pyproject.toml +41 -0
  25. package/python/requirements-ci.txt +31 -0
  26. package/python/requirements.txt +87 -0
  27. package/python/scripts/__init__.py +4 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/run_ab_test.py +143 -0
  36. package/python/scripts/run_full_pipeline.py +544 -0
  37. package/python/scripts/run_tinker_training.py +192 -0
  38. package/python/scripts/run_training.py +914 -0
  39. package/python/scripts/test_judge.py +155 -0
  40. package/python/scripts/test_pipeline.py +356 -0
  41. package/python/scripts/test_trained_model.py +380 -0
  42. package/python/scripts/train_local.py +528 -0
  43. package/python/setup.py +20 -0
  44. package/python/src/__init__.py +190 -0
  45. package/python/src/data_bridge/__init__.py +24 -0
  46. package/python/src/data_bridge/converter.py +435 -0
  47. package/python/src/data_bridge/reader.py +393 -0
  48. package/python/src/models.py +283 -0
  49. package/python/src/training/__init__.py +605 -0
  50. package/python/src/training/ab_testing.py +404 -0
  51. package/python/src/training/action_executor.py +621 -0
  52. package/python/src/training/archetype_trainer.py +347 -0
  53. package/python/src/training/atropos_trainer.py +980 -0
  54. package/python/src/training/babylon_env.py +1254 -0
  55. package/python/src/training/error_recovery.py +647 -0
  56. package/python/src/training/evaluation.py +856 -0
  57. package/python/src/training/fast_simulator.py +880 -0
  58. package/python/src/training/format_validator.py +584 -0
  59. package/python/src/training/hybrid_env.py +522 -0
  60. package/python/src/training/kl_controller.py +628 -0
  61. package/python/src/training/multi_prompt_dataset.py +883 -0
  62. package/python/src/training/multi_turn.py +656 -0
  63. package/python/src/training/online_env.py +1084 -0
  64. package/python/src/training/quality_scorer.py +391 -0
  65. package/python/src/training/quality_utils.py +633 -0
  66. package/python/src/training/rewards.py +1344 -0
  67. package/python/src/training/rlaif_env.py +17 -0
  68. package/python/src/training/rollout_generator.py +502 -0
  69. package/python/src/training/rubric_loader.py +198 -0
  70. package/python/src/training/scenario_pool.py +1072 -0
  71. package/python/src/training/schemas.py +481 -0
  72. package/python/src/training/service_manager.py +552 -0
  73. package/python/src/training/simulation_bridge.py +535 -0
  74. package/python/src/training/tick_reward_attribution.py +399 -0
  75. package/python/src/training/tinker_client.py +575 -0
  76. package/python/src/training/tinker_trainer.py +646 -0
  77. package/python/src/training/tokenization_utils.py +402 -0
  78. package/python/tests/e2e/__init__.py +13 -0
  79. package/python/tests/e2e/conftest.py +258 -0
  80. package/python/tests/e2e/test_full_pipeline.py +643 -0
  81. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  82. package/python/tests/integration/__init__.py +12 -0
  83. package/python/tests/integration/conftest.py +383 -0
  84. package/python/tests/integration/test_db_integration.py +649 -0
  85. package/python/tests/integration/test_json_mode_integration.py +554 -0
  86. package/python/tests/test_action_executor.py +594 -0
  87. package/python/tests/test_archetype_scoring.py +1027 -0
  88. package/python/tests/test_atropos_integration.py +360 -0
  89. package/python/tests/test_evaluation.py +727 -0
  90. package/python/tests/test_format_validator.py +486 -0
  91. package/python/tests/test_kl_controller.py +432 -0
  92. package/python/tests/test_lr_scheduler.py +579 -0
  93. package/python/tests/test_multi_turn.py +590 -0
  94. package/python/tests/test_online_env.py +519 -0
  95. package/python/tests/test_quality_scorer.py +474 -0
  96. package/python/tests/test_scenario_pool.py +735 -0
  97. package/python/tests/test_service_manager.py +585 -0
  98. package/python/tests/test_simulation_rollout.py +581 -0
  99. package/python/tests/test_tokenization_utils.py +501 -0
  100. package/python/tests/test_training_orchestrator.py +497 -0
  101. package/python/tests/test_training_output_structure.py +661 -0
  102. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  103. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  104. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  105. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  106. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  107. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  108. package/scripts/assess-training-data.ts +422 -0
  109. package/scripts/e2e-training-test.ts +550 -0
  110. package/scripts/export-rubrics.ts +64 -0
  111. package/scripts/generate-research-report.ts +1523 -0
  112. package/scripts/generate_dataset.sh +173 -0
  113. package/scripts/json-mode-benchmark.ts +399 -0
  114. package/scripts/real-archetype-benchmark.ts +210 -0
  115. package/scripts/run-baseline-comparison.ts +116 -0
  116. package/scripts/run-full-pipeline.ts +272 -0
  117. package/scripts/runpod_setup.sh +137 -0
  118. package/scripts/runpod_validate.sh +147 -0
  119. package/scripts/test-model-in-game.ts +955 -0
  120. package/scripts/test-scoring.ts +73 -0
  121. package/scripts/test-trained-model.ts +209 -0
  122. package/scripts/train-and-test.ts +824 -0
  123. package/scripts/verify-final.ts +118 -0
  124. package/src/adapter.ts +516 -0
  125. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  126. package/src/archetypes/derive-archetype.ts +249 -0
  127. package/src/archetypes/index.ts +22 -0
  128. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  129. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  130. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  131. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  132. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  133. package/src/benchmark/BenchmarkRunner.ts +685 -0
  134. package/src/benchmark/BenchmarkValidator.ts +206 -0
  135. package/src/benchmark/FastEvalRunner.ts +225 -0
  136. package/src/benchmark/MetricsValidator.ts +165 -0
  137. package/src/benchmark/MetricsVisualizer.ts +909 -0
  138. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  139. package/src/benchmark/ModelRegistry.ts +158 -0
  140. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  141. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  142. package/src/benchmark/SimulationEngine.ts +832 -0
  143. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  144. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  145. package/src/benchmark/index.ts +89 -0
  146. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  147. package/src/benchmark/simulation-types.ts +78 -0
  148. package/src/dependencies.ts +439 -0
  149. package/src/generation/TrajectoryGenerator.ts +387 -0
  150. package/src/generation/index.ts +12 -0
  151. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  152. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  153. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  154. package/src/huggingface/index.ts +27 -0
  155. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  156. package/src/index.ts +102 -0
  157. package/src/init-training.ts +53 -0
  158. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  159. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  160. package/src/metrics/index.ts +8 -0
  161. package/src/metrics/types.ts +200 -0
  162. package/src/rubrics/__tests__/index.test.ts +184 -0
  163. package/src/rubrics/ass-kisser.ts +85 -0
  164. package/src/rubrics/degen.ts +80 -0
  165. package/src/rubrics/goody-twoshoes.ts +84 -0
  166. package/src/rubrics/index.ts +236 -0
  167. package/src/rubrics/information-trader.ts +84 -0
  168. package/src/rubrics/infosec.ts +101 -0
  169. package/src/rubrics/liar.ts +104 -0
  170. package/src/rubrics/perps-trader.ts +87 -0
  171. package/src/rubrics/researcher.ts +81 -0
  172. package/src/rubrics/scammer.ts +82 -0
  173. package/src/rubrics/social-butterfly.ts +73 -0
  174. package/src/rubrics/super-predictor.ts +97 -0
  175. package/src/rubrics/trader.ts +67 -0
  176. package/src/scoring/ArchetypeScoringService.ts +486 -0
  177. package/src/scoring/JudgePromptBuilder.ts +556 -0
  178. package/src/scoring/LLMJudgeCache.ts +401 -0
  179. package/src/scoring/index.ts +9 -0
  180. package/src/training/AutomationPipeline.ts +916 -0
  181. package/src/training/BenchmarkService.ts +518 -0
  182. package/src/training/ConfigValidator.ts +220 -0
  183. package/src/training/MarketOutcomesTracker.ts +187 -0
  184. package/src/training/ModelDeployer.ts +186 -0
  185. package/src/training/ModelFetcher.ts +76 -0
  186. package/src/training/ModelSelectionService.ts +341 -0
  187. package/src/training/ModelUsageVerifier.ts +160 -0
  188. package/src/training/MultiModelOrchestrator.ts +580 -0
  189. package/src/training/RLModelConfig.ts +407 -0
  190. package/src/training/RewardBackpropagationService.ts +149 -0
  191. package/src/training/RulerScoringService.ts +666 -0
  192. package/src/training/TrainingMonitor.ts +166 -0
  193. package/src/training/TrajectoryRecorder.ts +399 -0
  194. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  195. package/src/training/index.ts +100 -0
  196. package/src/training/logRLConfig.ts +34 -0
  197. package/src/training/pipeline.ts +129 -0
  198. package/src/training/storage/ModelStorageService.ts +279 -0
  199. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  200. package/src/training/storage/index.ts +17 -0
  201. package/src/training/types.ts +207 -0
  202. package/src/training/window-utils.ts +138 -0
  203. package/src/utils/index.ts +101 -0
  204. package/src/utils/logger.ts +59 -0
  205. package/src/utils/snowflake.ts +17 -0
  206. package/src/utils/synthetic-detector.ts +111 -0
  207. package/tsconfig.json +20 -0
@@ -0,0 +1,1344 @@
1
+ """
2
+ Reward Functions for Training
3
+
4
+ Computes various reward signals for RL training:
5
+ - PnL-based: Raw profit/loss performance
6
+ - Risk-adjusted: Sharpe-like reward accounting for variance
7
+ - Efficiency: Reward per action taken
8
+ - Action quality: Based on success rate and correctness
9
+ - Composite: Weighted combination of multiple signals
10
+ - Archetype-aware: Different archetypes have different success criteria
11
+
12
+ Also provides utilities for normalizing and comparing rewards.
13
+ """
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Dict, List, Optional
17
+ import math
18
+
19
+ from .rubric_loader import normalize_archetype, get_priority_metrics
20
+
21
+
22
# =============================================================================
# Archetype Scoring Constants
# =============================================================================
# Thresholds for behavior bonuses. Extracted from the behavior functions below
# so tuning happens in one place. Units: trade counts are raw counts, variance
# and position sizes are in balance currency units, rates are fractions.

# Degen thresholds
DEGEN_HIGH_TRADES = 20  # Excellent degen activity
DEGEN_GOOD_TRADES = 10  # Good degen activity
DEGEN_MIN_TRADES = 5  # Minimum for positive bonus
DEGEN_HIGH_VARIANCE = 500  # High P&L variance (bold trades)
DEGEN_MOD_VARIANCE = 100  # Moderate variance
DEGEN_HIGH_POSITION = 500  # Large position size
DEGEN_MOD_POSITION = 200  # Moderate position size

# Social Butterfly thresholds
SOCIAL_EXCELLENT_CONNECTIONS = 15  # Top networking
SOCIAL_GOOD_CONNECTIONS = 8  # Good networking
SOCIAL_MIN_CONNECTIONS = 3  # Minimum for bonus
SOCIAL_HIGH_GROUPS = 5  # Many group chats
SOCIAL_MIN_GROUPS = 2  # Minimum groups
SOCIAL_HIGH_DMS = 10  # High DM activity
SOCIAL_MIN_DMS = 3  # Minimum DMs

# Trader thresholds
TRADER_HIGH_WIN_RATE = 0.60  # Excellent discipline
TRADER_GOOD_WIN_RATE = 0.50  # Good discipline
TRADER_LOW_WIN_RATE = 0.40  # Poor discipline
TRADER_HIGH_DIVERSIFICATION = 4  # Well diversified
TRADER_MIN_DIVERSIFICATION = 2  # Some diversification

# Researcher thresholds
RESEARCHER_HIGH_ACTIONS = 10  # Heavy research
RESEARCHER_MOD_ACTIONS = 5  # Moderate research
RESEARCHER_HIGH_ACCURACY = 0.7  # Excellent accuracy
RESEARCHER_GOOD_ACCURACY = 0.5  # Good accuracy

# Bonus/penalty caps — clamp_bonus() enforces this range on every scorer.
MAX_BEHAVIOR_BONUS = 0.5  # Maximum behavior bonus
MIN_BEHAVIOR_PENALTY = -0.5  # Maximum behavior penalty

# Archetype-aware scoring multipliers
# Note: Legacy composite_reward uses 0.5, archetype version uses 0.3 (more lenient)
ARCHETYPE_RISK_PENALTY_MULTIPLIER = 0.3  # Per-risky-action penalty for non-degen archetypes

# Bonus amounts (tunable parameters)
BONUS_EXCELLENT = 0.20  # Excellent archetype-aligned behavior
BONUS_GOOD = 0.15  # Good archetype-aligned behavior
BONUS_MODERATE = 0.10  # Moderate archetype-aligned behavior
BONUS_MINOR = 0.05  # Minor positive signal
PENALTY_MODERATE = -0.10  # Moderate archetype violation
PENALTY_SEVERE = -0.15  # Severe archetype violation
PENALTY_CRITICAL = -0.20  # Critical archetype failure
74
+
75
+
76
def clamp_bonus(bonus: float) -> float:
    """Restrict a behavior bonus to the allowed [-0.5, 0.5] window."""
    if bonus > MAX_BEHAVIOR_BONUS:
        return MAX_BEHAVIOR_BONUS
    if bonus < MIN_BEHAVIOR_PENALTY:
        return MIN_BEHAVIOR_PENALTY
    return bonus
79
+
80
# =============================================================================
# Archetype-Specific Reward Weights
# =============================================================================
# Each archetype has different success criteria. These weights determine
# how much each component contributes to the final score:
#
# - pnl: Financial performance (P&L-based reward)
# - format: Response format quality (proper structure, valid JSON)
# - reasoning: Quality of reasoning in LLM calls
# - behavior: Archetype-aligned behavioral bonus/penalty
#
# Design principles:
# 1. Weights sum to 1.0 for each archetype (enforced at import time by
#    _validate_archetype_weights)
# 2. Archetypes that don't focus on profit have lower pnl weight
# 3. Behavior weight is higher for personality-driven archetypes
# 4. Format/reasoning provide baseline quality signals

ARCHETYPE_REWARD_WEIGHTS: Dict[str, Dict[str, float]] = {
    # Traders prioritize P&L and risk management
    "trader": {
        "pnl": 0.55,
        "format": 0.20,
        "reasoning": 0.15,
        "behavior": 0.10,
    },
    # Degens prioritize activity and risk-taking over profitability
    "degen": {
        "pnl": 0.15,  # Reduced - losses are acceptable
        "format": 0.15,
        "reasoning": 0.10,
        "behavior": 0.60,  # High bonus for degen behaviors
    },
    # Social butterflies deprioritize trading entirely
    "social-butterfly": {
        "pnl": 0.10,
        "format": 0.20,
        "reasoning": 0.15,
        "behavior": 0.55,
    },
    # Scammers need to profit through manipulation
    "scammer": {
        "pnl": 0.35,
        "format": 0.15,
        "reasoning": 0.20,
        "behavior": 0.30,
    },
    # Researchers prioritize analysis quality
    "researcher": {
        "pnl": 0.25,
        "format": 0.25,
        "reasoning": 0.30,
        "behavior": 0.20,
    },
    # Information traders balance social intel with trading
    "information-trader": {
        "pnl": 0.35,
        "format": 0.20,
        "reasoning": 0.20,
        "behavior": 0.25,
    },
    # Goody two-shoes prioritize reputation and helpfulness
    "goody-twoshoes": {
        "pnl": 0.15,
        "format": 0.25,
        "reasoning": 0.20,
        "behavior": 0.40,
    },
    # Ass-kissers prioritize reputation gains through flattery
    "ass-kisser": {
        "pnl": 0.10,
        "format": 0.20,
        "reasoning": 0.15,
        "behavior": 0.55,
    },
    # Perps traders prioritize risk-adjusted P&L
    "perps-trader": {
        "pnl": 0.50,
        "format": 0.15,
        "reasoning": 0.20,
        "behavior": 0.15,
    },
    # Super predictors prioritize accuracy
    "super-predictor": {
        "pnl": 0.30,
        "format": 0.20,
        "reasoning": 0.25,
        "behavior": 0.25,
    },
    # Infosec agents prioritize security and caution
    "infosec": {
        "pnl": 0.25,
        "format": 0.25,
        "reasoning": 0.30,
        "behavior": 0.20,
    },
    # Liars prioritize successful deception
    "liar": {
        "pnl": 0.20,
        "format": 0.15,
        "reasoning": 0.25,
        "behavior": 0.40,
    },
    # Default balanced weights (used for any unrecognized archetype)
    "default": {
        "pnl": 0.50,
        "format": 0.25,
        "reasoning": 0.15,
        "behavior": 0.10,
    },
}
190
+
191
+
192
def _validate_archetype_weights() -> None:
    """
    Sanity-check the weight table: each archetype's components must sum to 1.0.

    Runs once at import time so a misconfigured table fails immediately
    rather than silently skewing reward computation during training.
    """
    tolerance = 1e-9
    for archetype, weights in ARCHETYPE_REWARD_WEIGHTS.items():
        total = sum(weights.values())
        if abs(total - 1.0) <= tolerance:
            continue
        raise ValueError(
            f"Archetype '{archetype}' weights sum to {total}, expected 1.0. "
            f"Weights: {weights}"
        )


# Fail fast on a bad weight table the moment this module is imported.
_validate_archetype_weights()
209
+
210
+
211
def get_archetype_weights(archetype: str) -> Dict[str, float]:
    """Look up the component weights for *archetype*, defaulting for unknowns."""
    key = normalize_archetype(archetype)
    if key in ARCHETYPE_REWARD_WEIGHTS:
        return ARCHETYPE_REWARD_WEIGHTS[key]
    return ARCHETYPE_REWARD_WEIGHTS["default"]
215
+
216
+
217
@dataclass
class TrajectoryRewardInputs:
    """
    Inputs for computing rewards.

    Aggregated per-trajectory metrics consumed by the reward functions in
    this module (pnl_reward, risk_adjusted_reward, efficiency_reward,
    action_quality_reward, composite_reward). All fields default to neutral
    values so a partially populated instance still scores safely.
    """

    # Financial metrics
    final_pnl: float = 0.0  # Net P&L over the episode
    starting_balance: float = 10000.0
    end_balance: float = 10000.0
    pnl_variance: float = 0.0  # Variance of P&L (units follow the caller's convention)
    max_drawdown: float = 0.0  # Largest balance drawdown, in currency units

    # Risk metrics
    max_exposure: float = 0.0  # Peak exposure — presumably a fraction in [0, 1]; verify against caller
    risky_actions_count: int = 0  # Count of actions flagged risky upstream

    # Quality scores (from quality_utils)
    format_score: float = 0.0
    reasoning_score: float = 0.0

    # Operational metrics
    num_steps: int = 0
    trades_executed: int = 0
    successful_trades: int = 0
    total_actions: int = 0
    successful_actions: int = 0
242
+
243
+
244
def calculate_pnl_reward(start_balance: float, end_balance: float) -> float:
    """
    Score an episode's financial performance.

    Behavior:
    - Bankruptcy (end_balance <= 0): hard -10.0 penalty.
    - Unknown/invalid starting balance (<= 0): neutral 0.0.
    - Otherwise the percentage return is scaled so a 10% gain maps to
      +1.0 (a 10% loss to -1.0) and the result is clipped to [-1, 1].
    """
    if end_balance <= 0:
        # Going broke dominates every other signal.
        return -10.0
    if start_balance <= 0:
        return 0.0

    pct_return = (end_balance - start_balance) / start_balance
    # Scale: 10% return == 1.0 reward, then clip.
    return min(1.0, max(-1.0, pct_return * 10.0))
266
+
267
+
268
def calculate_risk_reward(exposure: float, action_type: str) -> float:
    """
    Penalize adding exposure while already over-extended.

    Returns -0.5 when the action opens or increases a position (its name
    contains 'buy', 'long', or 'open', case-insensitive) while exposure
    exceeds 80%; otherwise 0.0. Empty/None action types are neutral.
    """
    if not action_type:
        return 0.0

    lowered = action_type.lower()
    opens_position = (
        'buy' in lowered or 'long' in lowered or 'open' in lowered
    )
    return -0.5 if (opens_position and exposure > 0.80) else 0.0
285
+
286
+
287
def pnl_reward(inputs: TrajectoryRewardInputs) -> float:
    """
    Legacy PnL reward: raw percentage return, clipped to [-1, 1].

    Returns 0.0 when the starting balance is unknown or non-positive.
    """
    start = inputs.starting_balance
    if start <= 0:
        return 0.0
    return min(1.0, max(-1.0, inputs.final_pnl / start))
296
+
297
+
298
def risk_adjusted_reward(inputs: TrajectoryRewardInputs) -> float:
    """
    Sharpe-like reward: PnL reward discounted for variance and drawdown.

    When P&L variance is available the base reward is divided by its
    square root (volatility) and re-clipped; half of the drawdown
    fraction of the starting balance is then subtracted. Output is
    clipped to [-1, 1].
    """
    score = pnl_reward(inputs)

    if inputs.pnl_variance > 0:
        # Normalize by volatility, keeping the ratio inside [-1, 1].
        score = min(1.0, max(-1.0, score / math.sqrt(inputs.pnl_variance)))

    if inputs.max_drawdown > 0 and inputs.starting_balance > 0:
        score -= (inputs.max_drawdown / inputs.starting_balance) * 0.5

    return min(1.0, max(-1.0, score))
313
+
314
+
315
def efficiency_reward(inputs: TrajectoryRewardInputs) -> float:
    """
    Reward per unit of activity: PnL reward discounted by action count.

    Dividing by log1p(total_actions) favors agents that reach the same
    return with fewer actions. With no recorded actions the raw PnL
    reward passes through unchanged.
    """
    base = pnl_reward(inputs)
    if inputs.total_actions <= 0:
        return base
    per_action = base / math.log1p(inputs.total_actions)
    return min(1.0, max(-1.0, per_action))
326
+
327
+
328
def action_quality_reward(inputs: TrajectoryRewardInputs) -> float:
    """
    Fraction of actions that succeeded; neutral 0.5 when no actions ran.
    """
    if inputs.total_actions == 0:
        return 0.5
    return inputs.successful_actions / inputs.total_actions
337
+
338
+
339
def composite_reward(
    inputs: TrajectoryRewardInputs,
    pnl_weight: float = 0.5,
    format_weight: float = 0.3,
    reasoning_weight: float = 0.2,
    # Legacy weights
    risk_weight: float = 0.0,
    efficiency_weight: float = 0.0,
    quality_weight: float = 0.0,
) -> float:
    """
    Compute a weighted composite reward.

    Two scoring paths:
    - New path (used when format_score or reasoning_score is non-zero):
      weighted average of PnL, format, and reasoning scores, clipped to
      [-1, 1].
    - Legacy path (fallback): weighted average of pnl/risk/efficiency/
      action-quality rewards. If none of the legacy weights were supplied,
      built-in defaults (0.4/0.3/0.15/0.15) are used.

    Special cases:
    - Bankruptcy short-circuits and returns the raw -10.0 penalty,
      deliberately bypassing the [-1, 1] clamp.
    - A risky-action penalty (0.5 per risky action) is subtracted from the
      PnL score before the new-path weighting.
    - A zero total weight yields 0.0.
    """

    # 1. Calculate PnL Score
    if inputs.end_balance != inputs.starting_balance:
        pnl_score = calculate_pnl_reward(
            inputs.starting_balance, inputs.end_balance)
    else:
        # Fallback if specific balances aren't tracked separately:
        # reconstruct the end balance from starting balance + final P&L.
        end_bal = inputs.starting_balance + inputs.final_pnl
        pnl_score = calculate_pnl_reward(inputs.starting_balance, end_bal)

    # Bankruptcy override: calculate_pnl_reward returns -10.0 on bankruptcy
    # and values in [-1, 1] otherwise, so <= -5.0 can only mean bankruptcy.
    if pnl_score <= -5.0:
        return pnl_score

    # 2. Risk Penalty — applied to the PnL component before weighting.
    if inputs.risky_actions_count > 0:
        pnl_score -= (inputs.risky_actions_count * 0.5)

    # 3. Scoring System (new path, triggered by any quality signal)
    if inputs.format_score != 0 or inputs.reasoning_score != 0:
        total_weight = pnl_weight + format_weight + reasoning_weight
        if total_weight == 0:
            return 0.0

        composite = (
            (pnl_score * pnl_weight) +
            (inputs.format_score * format_weight) +
            (inputs.reasoning_score * reasoning_weight)
        ) / total_weight

        return max(-1.0, min(1.0, composite))

    # 4. Legacy Scoring System (Fallback)
    # If using legacy, we need non-zero weights
    if risk_weight == 0 and efficiency_weight == 0 and quality_weight == 0:
        # Defaults for legacy system
        l_pnl = 0.4
        l_risk = 0.3
        l_eff = 0.15
        l_qual = 0.15
    else:
        l_pnl = pnl_weight
        l_risk = risk_weight
        l_eff = efficiency_weight
        l_qual = quality_weight

    total_weight = l_pnl + l_risk + l_eff + l_qual
    if total_weight == 0:
        return 0.0

    composite = (
        l_pnl * pnl_reward(inputs)
        + l_risk * risk_adjusted_reward(inputs)
        + l_eff * efficiency_reward(inputs)
        + l_qual * action_quality_reward(inputs)
    ) / total_weight

    return max(-1.0, min(1.0, composite))
417
+
418
+
419
def relative_scores(rewards: list[float]) -> list[float]:
    """
    Rank-normalize rewards within a group to [0, 1].

    The worst reward maps to 0.0, the best to 1.0, and the rest are
    spaced evenly by rank. Ties keep their original order (stable sort).
    Groups with fewer than two entries carry no ordering information, so
    every entry gets 0.5.

    Args:
        rewards: List of reward values

    Returns:
        List of relative scores in [0, 1]
    """
    n = len(rewards)
    if n < 2:
        return [0.5] * n

    order = sorted(range(n), key=rewards.__getitem__)
    denom = n - 1
    result = [0.0] * n
    for rank, original_index in enumerate(order):
        result[original_index] = rank / denom
    return result
442
+
443
+
444
def ranking_to_scores(rankings: list[int]) -> list[float]:
    """
    Map rankings (1 = best) onto evenly spaced scores in [0, 1].

    Rank 1 becomes 1.0 and rank n becomes 0.0. With fewer than two
    entries there is nothing to compare, so each gets a neutral 0.5.

    Args:
        rankings: List of rankings (1 = best)

    Returns:
        List of scores in [0, 1] where higher = better
    """
    n = len(rankings)
    if n < 2:
        return [0.5] * n

    span = n - 1
    scores = []
    for rank in rankings:
        scores.append((n - rank) / span)
    return scores
459
+
460
+
461
def pairwise_preferences_to_scores(
    n_items: int, preferences: list[tuple[int, int]]
) -> list[float]:
    """
    Convert pairwise preferences into per-item scores.

    Each item's score is its empirical win rate (wins / comparisons).
    Note: the original docstring claimed a Bradley-Terry model; this is
    only the closed-form win-rate estimate, not the iterative
    Bradley-Terry fit — the behavior is unchanged, the description is
    corrected.

    Args:
        n_items: Number of items being compared
        preferences: List of (winner, loser) index pairs; an index outside
            [0, n_items) is ignored for that side of the pair

    Returns:
        List of n_items scores in [0, 1]; items never compared (or any
        degenerate input: fewer than two items, or no preferences) score
        a neutral 0.5.
    """
    if n_items < 2 or not preferences:
        return [0.5] * n_items

    wins = [0] * n_items
    comparisons = [0] * n_items

    for winner, loser in preferences:
        if 0 <= winner < n_items:
            wins[winner] += 1
            comparisons[winner] += 1
        if 0 <= loser < n_items:
            comparisons[loser] += 1

    # Win rate where data exists; neutral score for never-compared items.
    return [
        wins[i] / comparisons[i] if comparisons[i] > 0 else 0.5
        for i in range(n_items)
    ]
495
+
496
+
497
class RewardNormalizer:
    """
    Online reward normalizer using running statistics.

    Maintains a running mean and sum of squared deviations (Welford's
    online algorithm) so rewards can be normalized to approximately
    zero mean and unit variance without storing history.

    Fix: the M2 accumulator (``var``) previously started at 1.0, which
    biased every variance estimate upward by 1/(count - 1); Welford's
    algorithm requires it to start at 0.
    """

    def __init__(self, epsilon: float = 1e-8):
        """
        Initialize normalizer.

        Args:
            epsilon: Small value added to the variance to prevent
                division by zero on constant reward streams
        """
        self.mean = 0.0
        # Running sum of squared deviations (Welford's M2), not the
        # variance itself; sample variance = var / (count - 1).
        self.var = 0.0
        self.count = 0
        self.epsilon = epsilon

    def update(self, reward: float) -> None:
        """
        Update statistics with a new reward.

        Uses Welford's online algorithm for numerical stability.

        Args:
            reward: New reward value
        """
        self.count += 1
        delta = reward - self.mean
        self.mean += delta / self.count
        delta2 = reward - self.mean
        self.var += delta * delta2

    def normalize(self, reward: float) -> float:
        """
        Normalize a reward using current statistics.

        Args:
            reward: Reward to normalize

        Returns:
            Normalized reward (approximately zero-mean, unit variance).
            Returned unchanged until at least two samples have been seen.
        """
        if self.count < 2:
            return reward

        # Sample standard deviation from M2; epsilon guards zero variance.
        std = math.sqrt(self.var / (self.count - 1) + self.epsilon)
        return (reward - self.mean) / std

    def update_batch(self, rewards: list[float]) -> None:
        """
        Update statistics with a batch of rewards.

        Args:
            rewards: List of reward values
        """
        for r in rewards:
            self.update(r)

    def normalize_batch(self, rewards: list[float]) -> list[float]:
        """
        Normalize a batch of rewards using current statistics.

        Args:
            rewards: List of rewards to normalize

        Returns:
            List of normalized rewards
        """
        return [self.normalize(r) for r in rewards]
568
+
569
+
570
+ # =============================================================================
571
+ # Archetype Behavior Metrics
572
+ # =============================================================================
573
+
574
@dataclass
class BehaviorMetrics:
    """
    Metrics extracted from a trajectory for archetype-aware scoring.

    Consumed by calculate_archetype_behavior_bonus and the per-archetype
    scorers (_calculate_degen_bonus etc.). All fields default to zero so
    missing instrumentation simply contributes no bonus.
    """

    # Trading metrics
    trades_executed: int = 0
    profitable_trades: int = 0
    win_rate: float = 0.0  # presumably profitable_trades / trades_executed — verify against extractor
    total_pnl: float = 0.0
    pnl_variance: float = 0.0
    largest_win: float = 0.0
    largest_loss: float = 0.0
    markets_traded: int = 0
    avg_position_size: float = 0.0

    # Social metrics
    unique_users_interacted: int = 0
    group_chats_joined: int = 0
    dms_initiated: int = 0
    posts_created: int = 0
    comments_made: int = 0
    mentions_given: int = 0

    # Influence metrics
    followers_gained: int = 0
    reputation_delta: int = 0
    positive_reactions: int = 0
    information_spread: int = 0

    # Research/information metrics
    research_actions: int = 0
    predictions_made: int = 0
    correct_predictions: int = 0
    prediction_accuracy: float = 0.0
    info_requests_sent: int = 0
    info_shared: int = 0

    # Behavior patterns
    actions_per_tick: float = 0.0
    social_to_trade_ratio: float = 0.0
    episode_length: int = 0
615
+
616
+
617
def calculate_archetype_behavior_bonus(
    archetype: str,
    metrics: BehaviorMetrics,
) -> float:
    """
    Score how well a trajectory matches its archetype's expected behavior.

    Dispatches to the archetype-specific scorer; each scorer returns a
    bonus or penalty in [-0.5, 0.5] that is later weighted into the
    composite reward. Unknown archetypes receive a neutral 0.0.

    Args:
        archetype: Archetype name (normalized internally)
        metrics: Extracted behavior metrics from trajectory

    Returns:
        Behavior bonus score in range [-0.5, 0.5]
    """
    scorers = {
        "degen": _calculate_degen_bonus,
        "social-butterfly": _calculate_social_butterfly_bonus,
        "scammer": _calculate_scammer_bonus,
        "trader": _calculate_trader_bonus,
        "researcher": _calculate_researcher_bonus,
        "information-trader": _calculate_information_trader_bonus,
        "goody-twoshoes": _calculate_goody_twoshoes_bonus,
        "ass-kisser": _calculate_ass_kisser_bonus,
        "perps-trader": _calculate_perps_trader_bonus,
        "super-predictor": _calculate_super_predictor_bonus,
        "infosec": _calculate_infosec_bonus,
        "liar": _calculate_liar_bonus,
    }
    scorer = scorers.get(normalize_archetype(archetype))
    if scorer is None:
        return 0.0  # Default: no archetype-specific behavior model
    return scorer(metrics)
662
+
663
+
664
def _calculate_degen_bonus(metrics: BehaviorMetrics) -> float:
    """
    Degen archetype: high activity, big swings, bold sizing.

    Rewards raw trade volume regardless of profitability, high P&L
    variance, and large average positions; penalizes near-inactivity,
    which is the antithesis of degen behavior.
    """
    score = 0.0

    # Trade-volume tiers, best to worst; fewer than 2 trades is penalized.
    trades = metrics.trades_executed
    if trades >= DEGEN_HIGH_TRADES:
        score += 0.20
    elif trades >= DEGEN_GOOD_TRADES:
        score += 0.15
    elif trades >= DEGEN_MIN_TRADES:
        score += 0.08
    elif trades < 2:
        score -= 0.15

    # Big P&L swings are the degen signature.
    variance = metrics.pnl_variance
    if variance > DEGEN_HIGH_VARIANCE:
        score += 0.15
    elif variance > DEGEN_MOD_VARIANCE:
        score += 0.08

    # Bold position sizing shows commitment to risk-taking.
    position = metrics.avg_position_size
    if position > DEGEN_HIGH_POSITION:
        score += 0.10
    elif position > DEGEN_MOD_POSITION:
        score += 0.05

    # A single outsized win or loss also signals bold trades.
    if abs(metrics.largest_win) > 100 or abs(metrics.largest_loss) > 100:
        score += 0.05

    return clamp_bonus(score)
704
+
705
+
706
def _calculate_social_butterfly_bonus(metrics: BehaviorMetrics) -> float:
    """
    Social Butterfly archetype: networking and engagement over profits.

    Rewards breadth of connections, group-chat and DM activity, and
    content creation; penalizes isolation and trading-dominated episodes.
    """
    score = 0.0

    # Breadth of unique connections, tiered; isolation is penalized.
    connections = metrics.unique_users_interacted
    if connections >= SOCIAL_EXCELLENT_CONNECTIONS:
        score += 0.20
    elif connections >= SOCIAL_GOOD_CONNECTIONS:
        score += 0.12
    elif connections >= SOCIAL_MIN_CONNECTIONS:
        score += 0.06
    elif connections < 2:
        score -= 0.15

    # Group-chat involvement.
    if metrics.group_chats_joined >= SOCIAL_HIGH_GROUPS:
        score += 0.15
    elif metrics.group_chats_joined >= SOCIAL_MIN_GROUPS:
        score += 0.08

    # Direct-message engagement.
    if metrics.dms_initiated >= SOCIAL_HIGH_DMS:
        score += 0.10
    elif metrics.dms_initiated >= SOCIAL_MIN_DMS:
        score += 0.05

    # Content creation: posts and comments combined.
    content_count = metrics.posts_created + metrics.comments_made
    if content_count >= 10:
        score += 0.08
    elif content_count >= 3:
        score += 0.04

    # A trading-heavy episode contradicts the archetype.
    if metrics.social_to_trade_ratio < 0.5 and metrics.trades_executed > 5:
        score -= 0.10

    return clamp_bonus(score)
753
+
754
+
755
def _calculate_scammer_bonus(metrics: BehaviorMetrics) -> float:
    """
    Scammer archetype: profit extracted through social manipulation.

    Needs marks to work (social reach), private channels (DMs), actual
    profit, and an intact reputation (i.e. not getting caught).
    """
    score = 0.0

    # Social reach: no marks, no scam.
    if metrics.unique_users_interacted >= 5:
        score += 0.10
    elif metrics.unique_users_interacted < 2:
        score -= 0.20

    # Private manipulation channels via DMs.
    if metrics.dms_initiated >= 5:
        score += 0.10
    elif metrics.dms_initiated >= 2:
        score += 0.05

    # A scammer who doesn't profit has failed.
    score += 0.15 if metrics.total_pnl > 0 else -0.15

    # Trust intact means the con works; a big drop means getting caught.
    if metrics.reputation_delta > 0:
        score += 0.10
    elif metrics.reputation_delta < -20:
        score -= 0.10

    return clamp_bonus(score)
787
+
788
+
789
def _calculate_trader_bonus(metrics: BehaviorMetrics) -> float:
    """
    Trader archetype: disciplined, diversified, profitable trading.

    Rewards win rate and market diversification; penalizes episodes
    dominated by socializing instead of trading.
    """
    score = 0.0

    # Win-rate tiers; a poor rate only counts against an active trader.
    if metrics.win_rate >= TRADER_HIGH_WIN_RATE:
        score += BONUS_GOOD
    elif metrics.win_rate >= TRADER_GOOD_WIN_RATE:
        score += 0.08
    elif metrics.win_rate < TRADER_LOW_WIN_RATE and metrics.trades_executed >= 5:
        score += PENALTY_MODERATE

    # Diversification across markets.
    if metrics.markets_traded >= TRADER_HIGH_DIVERSIFICATION:
        score += BONUS_MODERATE
    elif metrics.markets_traded >= TRADER_MIN_DIVERSIFICATION:
        score += BONUS_MINOR

    # More social actions than trades: a trader should be trading.
    if metrics.social_to_trade_ratio > 1.0:
        score += PENALTY_MODERATE

    # Steady activity.
    if metrics.trades_executed >= 5:
        score += BONUS_MINOR

    return clamp_bonus(score)
819
+
820
+
821
def _calculate_researcher_bonus(metrics: BehaviorMetrics) -> float:
    """
    Researcher archetype: analysis first, then accurate calls.

    Rewards research volume, prediction accuracy, and selective but
    successful trading; heavily penalizes doing no research at all.
    """
    score = 0.0

    # Research volume tiers; zero research disqualifies the archetype.
    actions = metrics.research_actions
    if actions >= RESEARCHER_HIGH_ACTIONS:
        score += BONUS_EXCELLENT
    elif actions >= RESEARCHER_MOD_ACTIONS:
        score += 0.12
    elif actions >= 2:
        score += 0.06
    elif actions == 0:
        score += PENALTY_SEVERE

    # Prediction accuracy.
    if metrics.prediction_accuracy >= RESEARCHER_HIGH_ACCURACY:
        score += BONUS_EXCELLENT
    elif metrics.prediction_accuracy >= RESEARCHER_GOOD_ACCURACY:
        score += BONUS_MODERATE

    # Selectivity: high win rate with a modest trade count.
    if metrics.win_rate >= TRADER_HIGH_WIN_RATE and metrics.trades_executed <= 10:
        score += BONUS_MODERATE

    return clamp_bonus(score)
849
+
850
+
851
def _calculate_information_trader_bonus(metrics: BehaviorMetrics) -> float:
    """
    Information Trader archetype: gather intel socially, trade on it.

    Rewards a balanced social-to-trade ratio plus active intel channels
    (group chats, DMs, info requests), and requires the intel to pay off.
    """
    score = 0.0

    # Ratio sweet spot is [0.5, 1.5]; extremes in either direction hurt.
    ratio = metrics.social_to_trade_ratio
    if 0.5 <= ratio <= 1.5:
        score += 0.15
    elif ratio > 3.0:
        score -= 0.10  # All talk, never trading on the intel
    elif ratio < 0.2 and metrics.trades_executed > 3:
        score -= 0.10  # Pure trading, no intel gathering

    # Group chats as information sources.
    if metrics.group_chats_joined >= 3:
        score += 0.10

    # Private intel via DM conversations.
    if metrics.dms_initiated >= 3:
        score += 0.08

    # Actively soliciting information.
    if metrics.info_requests_sent >= 3:
        score += 0.08

    # The intel still has to translate into profit.
    if metrics.total_pnl > 0:
        score += 0.10

    return clamp_bonus(score)
882
+
883
+
884
def _calculate_goody_twoshoes_bonus(metrics: BehaviorMetrics) -> float:
    """
    Goody Two-Shoes archetype: helpfulness and reputation building.

    Reputation growth dominates; information sharing, positive reactions,
    and follower gains add smaller rewards.
    """
    score = 0.0

    # Reputation tiers (highest matching threshold wins); any net loss
    # of reputation is penalized.
    for threshold, gain in ((30, 0.25), (10, 0.15), (0, 0.05)):
        if metrics.reputation_delta >= threshold:
            score += gain
            break
    else:
        score -= 0.15

    # Sharing information with others.
    if metrics.info_shared >= 5:
        score += 0.12
    elif metrics.info_shared >= 2:
        score += 0.06

    # Positive reactions received.
    if metrics.positive_reactions >= 10:
        score += 0.10
    elif metrics.positive_reactions >= 3:
        score += 0.05

    # Growing a following.
    if metrics.followers_gained >= 5:
        score += 0.08

    return clamp_bonus(score)
917
+
918
+
919
def _calculate_ass_kisser_bonus(metrics: BehaviorMetrics) -> float:
    """
    Ass-Kisser archetype: reputation and followers earned via flattery.

    Reputation gains dominate the score; comments (public flattery) and
    DMs (private flattery) add smaller rewards.
    """
    score = 0.0

    # Reputation tiers; a net loss means the flattery backfired.
    reputation = metrics.reputation_delta
    if reputation >= 50:
        score += 0.30
    elif reputation >= 20:
        score += 0.20
    elif reputation >= 5:
        score += 0.10
    elif reputation < 0:
        score -= 0.20

    # Follower growth.
    if metrics.followers_gained >= 10:
        score += 0.15
    elif metrics.followers_gained >= 3:
        score += 0.08

    # Public flattery via comments.
    if metrics.comments_made >= 10:
        score += 0.08
    elif metrics.comments_made >= 5:
        score += 0.04

    # Personal flattery via DMs.
    if metrics.dms_initiated >= 5:
        score += 0.05

    return clamp_bonus(score)
952
+
953
+
954
def _calculate_perps_trader_bonus(metrics: BehaviorMetrics) -> float:
    """
    Perps Trader archetype: leveraged trading with risk control.

    Rewards direction calling (win rate), steady activity, and profit;
    penalizes wild variance and blow-up-sized losses.
    """
    score = 0.0

    # Direction calling; a bad rate only counts once there is a sample.
    if metrics.win_rate >= 0.55:
        score += 0.15
    elif metrics.win_rate < 0.40 and metrics.trades_executed >= 5:
        score -= 0.15

    # Activity tiers; barely touching perps is penalized.
    trades = metrics.trades_executed
    if trades >= 10:
        score += 0.10
    elif trades >= 5:
        score += 0.05
    elif trades < 2:
        score -= 0.10

    # With leverage, huge variance signals poor risk management.
    if metrics.pnl_variance > 1000:
        score -= 0.10

    # Leverage must pay; deep losses look like a blow-up.
    if metrics.total_pnl > 0:
        score += 0.10
    elif metrics.total_pnl < -200:
        score -= 0.15

    return clamp_bonus(score)
986
+
987
+
988
def _calculate_super_predictor_bonus(metrics: BehaviorMetrics) -> float:
    """
    Super Predictor archetype: accuracy above all, quality over quantity.

    Accuracy tiers dominate; research, an active prediction count, and
    converting accurate calls into profit each add a little more.
    """
    score = 0.0

    # Accuracy tiers; consistently wrong predictors are penalized.
    accuracy = metrics.prediction_accuracy
    if accuracy >= 0.75:
        score += 0.30
    elif accuracy >= 0.60:
        score += 0.18
    elif accuracy >= 0.50:
        score += 0.08
    elif metrics.predictions_made >= 5 and accuracy < 0.45:
        score -= 0.20

    # Analysis should precede predictions.
    if metrics.research_actions >= 3:
        score += 0.08

    # Must actually make predictions to be a predictor.
    if metrics.predictions_made >= 5:
        score += 0.08
    elif metrics.predictions_made == 0:
        score -= 0.15

    # Accurate calls that also turn a profit.
    if metrics.total_pnl > 0 and accuracy >= 0.55:
        score += 0.08

    return clamp_bonus(score)
1020
+
1021
+
1022
def _calculate_infosec_bonus(metrics: BehaviorMetrics) -> float:
    """
    Infosec archetype: caution, verification, resistance to manipulation.

    Rewards guarding information, avoiding scam-sized losses, verifying
    through research, low P&L variance, and restraint with DMs.
    """
    score = 0.0

    # Guarding information: minimal sharing is protective, lots is a leak.
    if metrics.info_shared <= 1:
        score += 0.15
    elif metrics.info_shared >= 5:
        score -= 0.10

    # Loss profile: a small worst-loss suggests scams were avoided.
    if metrics.largest_loss > -50:
        score += 0.15
    elif metrics.largest_loss < -200:
        score -= 0.15  # A big hit suggests falling for a scam

    # Verification through research.
    if metrics.research_actions >= 3:
        score += 0.10

    # Steady, low-variance behavior.
    if metrics.pnl_variance < 100:
        score += 0.10

    # Restraint with DMs, a common manipulation vector.
    if metrics.dms_initiated < 3:
        score += 0.05

    return clamp_bonus(score)
1053
+
1054
+
1055
def _calculate_liar_bonus(metrics: BehaviorMetrics) -> float:
    """
    Liar archetype: successful deception and misinformation spread.

    Rewards lies propagating, having an audience, posting platforms, and
    keeping reputation intact (i.e. not getting exposed).
    """
    score = 0.0

    # Propagation of (mis)information.
    if metrics.information_spread >= 10:
        score += 0.20
    elif metrics.information_spread >= 3:
        score += 0.10

    # Audience size; no audience means nobody to deceive.
    audience = metrics.unique_users_interacted
    if audience >= 8:
        score += 0.12
    elif audience >= 3:
        score += 0.06
    elif audience < 2:
        score -= 0.15

    # Reputation intact means not caught; a big drop means exposed.
    if metrics.reputation_delta >= 0:
        score += 0.15
    elif metrics.reputation_delta < -20:
        score -= 0.15

    # Posting provides a platform for misinformation.
    if metrics.posts_created >= 5:
        score += 0.08
    elif metrics.posts_created >= 2:
        score += 0.04

    return clamp_bonus(score)
1088
+
1089
+
1090
# =============================================================================
# Priority Metrics Scoring
# =============================================================================
1093
+
1094
+
1095
# Maps rubrics.json metric names ("category.metricName") to accessors on
# BehaviorMetrics: either an attribute name (read and cast to float at lookup
# time) or a fixed float fallback for metrics that are not tracked directly.
# Built once at import time so each lookup avoids rebuilding a ~35-entry dict
# and eagerly reading every attribute.
_METRIC_ACCESSORS = {
    # Trading metrics
    "trading.totalPnL": "total_pnl",
    "trading.sharpeRatio": 0.0,  # Not directly available, computed if needed
    "trading.winRate": "win_rate",
    "trading.marketsTraded": "markets_traded",
    "trading.tradesExecuted": "trades_executed",
    "trading.avgPositionSize": "avg_position_size",
    "trading.largestWin": "largest_win",
    "trading.largestLoss": "largest_loss",
    "trading.maxDrawdown": 0.0,  # Not directly available

    # Social metrics
    "social.uniqueUsersInteracted": "unique_users_interacted",
    "social.groupChatsJoined": "group_chats_joined",
    "social.dmsInitiated": "dms_initiated",
    "social.postsCreated": "posts_created",
    "social.commentsMade": "comments_made",
    "social.mentionsGiven": "mentions_given",
    "social.groupMessagesSent": "group_chats_joined",  # Approximation
    "social.dmResponseRate": 0.5,  # Default, not tracked separately

    # Influence metrics
    "influence.reputationDelta": "reputation_delta",
    "influence.followersGained": "followers_gained",
    "influence.positiveReactions": "positive_reactions",
    "influence.informationSpread": "information_spread",

    # Information metrics
    "information.researchActions": "research_actions",
    "information.predictionAccuracy": "prediction_accuracy",
    "information.predictionsMade": "predictions_made",
    "information.correctPredictions": "correct_predictions",
    "information.marketDataQueries": "research_actions",  # Approximation
    "information.newsConsumed": 0.0,  # Not tracked separately
    "information.infoRequestsSent": "info_requests_sent",
    "information.infoShared": "info_shared",

    # Behavior metrics
    "behavior.socialToTradeRatio": "social_to_trade_ratio",
    "behavior.actionsPerTick": "actions_per_tick",
    "behavior.actionSuccessRate": "win_rate",  # Approximation
    "behavior.episodeLength": "episode_length",
    "behavior.consistencyScore": 0.5,  # Default, not tracked separately
}


def extract_metric_value(
    metric_name: str,
    metrics: BehaviorMetrics,
) -> Optional[float]:
    """
    Extract a metric value from BehaviorMetrics by priority metric name.

    Metric names from rubrics.json follow the format category.metricName,
    e.g. "trading.totalPnL", "social.uniqueUsersInteracted".

    Args:
        metric_name: Dotted rubrics.json metric name
        metrics: Extracted behavior metrics for the trajectory

    Returns:
        The metric value as a float, or None for unknown metric names.
    """
    accessor = _METRIC_ACCESSORS.get(metric_name)
    if accessor is None:
        return None
    if isinstance(accessor, str):
        # Attribute-backed metric: int counters are cast to float here.
        return float(getattr(metrics, accessor))
    # Fixed fallback value for metrics not tracked directly.
    return accessor
1153
+
1154
+
1155
# Expected (min, max) ranges used to normalize raw metric values into [0, 1].
# Reasonable defaults that can be tuned. Hoisted to module level so the table
# is built once instead of on every call.
_NORMALIZATION_RANGES = {
    # Trading (can be negative)
    "trading.totalPnL": (-1000, 5000),
    "trading.sharpeRatio": (-1.0, 3.0),
    "trading.winRate": (0.0, 1.0),
    "trading.marketsTraded": (0, 10),
    "trading.tradesExecuted": (0, 50),
    "trading.avgPositionSize": (0, 1000),
    "trading.largestWin": (0, 2000),
    "trading.largestLoss": (-2000, 0),
    "trading.maxDrawdown": (0, 1000),

    # Social (always positive)
    "social.uniqueUsersInteracted": (0, 30),
    "social.groupChatsJoined": (0, 10),
    "social.dmsInitiated": (0, 20),
    "social.postsCreated": (0, 20),
    "social.commentsMade": (0, 30),
    "social.mentionsGiven": (0, 20),
    "social.groupMessagesSent": (0, 50),
    "social.dmResponseRate": (0.0, 1.0),

    # Influence (can be negative)
    "influence.reputationDelta": (-50, 100),
    "influence.followersGained": (-10, 30),
    "influence.positiveReactions": (0, 50),
    "influence.informationSpread": (0, 20),

    # Information (always positive)
    "information.researchActions": (0, 20),
    "information.predictionAccuracy": (0.0, 1.0),
    "information.predictionsMade": (0, 20),
    "information.correctPredictions": (0, 15),
    "information.marketDataQueries": (0, 20),
    "information.newsConsumed": (0, 10),
    "information.infoRequestsSent": (0, 15),
    "information.infoShared": (0, 15),

    # Behavior
    "behavior.socialToTradeRatio": (0.0, 5.0),
    "behavior.actionsPerTick": (0.0, 3.0),
    "behavior.actionSuccessRate": (0.0, 1.0),
    "behavior.episodeLength": (0, 50),
    "behavior.consistencyScore": (0.0, 1.0),
}

# Fallback range for metrics with no configured expectation.
_DEFAULT_METRIC_RANGE = (0, 100)


def normalize_metric_value(
    metric_name: str,
    value: float,
) -> float:
    """
    Normalize a metric value to the 0-1 range based on its expected range.

    Values outside the expected range are clamped to the endpoints;
    metrics without a configured range use the default (0, 100) span.

    Args:
        metric_name: Dotted rubrics.json metric name
        value: Raw metric value

    Returns:
        Normalized value in [0.0, 1.0] (0.5 for a degenerate range).
    """
    min_val, max_val = _NORMALIZATION_RANGES.get(metric_name, _DEFAULT_METRIC_RANGE)

    if max_val == min_val:
        # Degenerate range: fall back to a neutral midpoint.
        return 0.5

    # Linear rescale to 0-1, clamped at both ends.
    normalized = (value - min_val) / (max_val - min_val)
    return max(0.0, min(1.0, normalized))
1221
+
1222
+
1223
def calculate_priority_weighted_score(
    archetype: str,
    metrics: BehaviorMetrics,
) -> float:
    """
    Score a trajectory against the archetype's priority metrics (rubrics.json).

    Priority metrics are combined with harmonic weights (1, 1/2, 1/3, ...)
    normalized to sum to one, so earlier entries dominate the result.
    Unknown metric names contribute nothing.
    """
    priorities = get_priority_metrics(normalize_archetype(archetype))

    if not priorities:
        # No configured priorities for this archetype: neutral score.
        return 0.5

    # Harmonic weights, normalized so they sum to 1.0.
    raw_weights = [1.0 / (position + 1) for position in range(len(priorities))]
    weight_total = sum(raw_weights)
    weights = [raw / weight_total for raw in raw_weights]

    # Weighted sum of normalized metric values.
    score = 0.0
    for weight, name in zip(weights, priorities):
        raw_value = extract_metric_value(name, metrics)
        if raw_value is not None:
            score += weight * normalize_metric_value(name, raw_value)

    return score
1259
+
1260
+
1261
# =============================================================================
# Archetype Composite Reward
# =============================================================================
1264
+
1265
def archetype_composite_reward(
    inputs: TrajectoryRewardInputs,
    archetype: str,
    behavior_metrics: Optional[BehaviorMetrics] = None,
) -> float:
    """
    Compute archetype-aware composite reward.

    Different archetypes have different success criteria. This function
    combines PnL, format, reasoning, and behavior scores using weights
    specific to the archetype, and blends in the priority-metrics score
    from rubrics.json when behavior metrics are available.

    Args:
        inputs: Standard trajectory reward inputs (PnL, format, reasoning scores)
        archetype: Agent archetype (e.g., "degen", "trader", "social-butterfly")
        behavior_metrics: Optional extracted behavior metrics for behavior bonus

    Returns:
        Composite reward score in range [-1.0, 1.0]
    """
    archetype_norm = normalize_archetype(archetype)
    # weights must provide "pnl", "format", "reasoning", and "behavior" keys;
    # a missing key raises KeyError below.
    weights = get_archetype_weights(archetype_norm)

    # 1. PnL score: prefer the explicit end balance; when it equals the start
    # (presumably meaning it was not populated — TODO confirm against callers),
    # reconstruct it from final_pnl instead.
    if inputs.end_balance != inputs.starting_balance:
        pnl_score = calculate_pnl_reward(inputs.starting_balance, inputs.end_balance)
    else:
        end_bal = inputs.starting_balance + inputs.final_pnl
        pnl_score = calculate_pnl_reward(inputs.starting_balance, end_bal)

    # Archetype-specific PnL adjustments: loss-tolerant archetypes get
    # negative PnL scaled down rather than removed entirely.
    if archetype_norm == "degen" and pnl_score < 0:
        # Degens shouldn't be heavily penalized for losses
        pnl_score = pnl_score * 0.3

    if archetype_norm == "social-butterfly" and pnl_score < 0:
        # Social butterflies shouldn't care much about trading losses
        pnl_score = pnl_score * 0.5

    # Bankruptcy-level PnL short-circuits the composite for most archetypes.
    # NOTE(review): the -5.0 threshold presumably matches a floor inside
    # calculate_pnl_reward — confirm against that function.
    if pnl_score <= -5.0 and archetype_norm not in ("degen", "social-butterfly"):
        return max(-1.0, pnl_score)

    # 2. Risk penalty for risky actions (degens embrace risk, so exempt).
    if inputs.risky_actions_count > 0 and archetype_norm != "degen":
        pnl_score -= (inputs.risky_actions_count * ARCHETYPE_RISK_PENALTY_MULTIPLIER)

    # 3. Format and reasoning scores pass through unchanged.
    format_score = inputs.format_score
    reasoning_score = inputs.reasoning_score

    # 4. Behavior bonus from archetype-specific behaviors; stays 0.0 when no
    # behavior metrics were extracted for this trajectory.
    behavior_bonus = 0.0
    if behavior_metrics is not None:
        behavior_bonus = calculate_archetype_behavior_bonus(archetype_norm, behavior_metrics)

        # Also incorporate priority metrics score from rubrics.json
        priority_score = calculate_priority_weighted_score(archetype_norm, behavior_metrics)

        # Blend: 70% archetype behavior bonus, 30% priority metrics. The
        # priority score lives in [0, 1]; subtracting 0.5 recenters it to
        # [-0.5, 0.5] to match the behavior-bonus scale.
        behavior_bonus = behavior_bonus * 0.7 + (priority_score - 0.5) * 0.3

    # 5. Weighted composite, divided by the total weight so the result stays
    # on the same scale as the individual component scores.
    total_weight = (
        weights["pnl"]
        + weights["format"]
        + weights["reasoning"]
        + weights["behavior"]
    )

    composite = (
        pnl_score * weights["pnl"]
        + format_score * weights["format"]
        + reasoning_score * weights["reasoning"]
        + behavior_bonus * weights["behavior"]
    ) / total_weight

    # Final clamp to the documented [-1.0, 1.0] range.
    return max(-1.0, min(1.0, composite))