@elizaos/training 2.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. package/Dockerfile +75 -0
  2. package/LICENSE +21 -0
  3. package/Makefile +374 -0
  4. package/README.md +346 -0
  5. package/config/rubrics.json +137 -0
  6. package/docker-compose.test.yml +57 -0
  7. package/package.json +57 -0
  8. package/python/config/babylon_atropos.yaml +90 -0
  9. package/python/config/profiles/12gb.json +11 -0
  10. package/python/config/profiles/16gb.json +10 -0
  11. package/python/config/profiles/24gb.json +10 -0
  12. package/python/config/profiles/48gb.json +10 -0
  13. package/python/config/profiles/cpu.json +11 -0
  14. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  15. package/python/config/profiles/l40-2gpu.json +22 -0
  16. package/python/config/profiles/l40-4gpu.json +21 -0
  17. package/python/config/profiles/l40.json +17 -0
  18. package/python/config/tinker_training.yaml +143 -0
  19. package/python/curriculum_state.json +165 -0
  20. package/python/env.template +86 -0
  21. package/python/env.training.template +46 -0
  22. package/python/pyproject.toml +41 -0
  23. package/python/requirements-ci.txt +31 -0
  24. package/python/requirements.txt +87 -0
  25. package/python/scripts/__init__.py +4 -0
  26. package/python/scripts/benchmark_should_respond.py +190 -0
  27. package/python/scripts/debug_inference.py +62 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/optimize_prompt_grpo.py +269 -0
  36. package/python/scripts/run_ab_test.py +143 -0
  37. package/python/scripts/run_full_pipeline.py +544 -0
  38. package/python/scripts/run_tinker_training.py +192 -0
  39. package/python/scripts/run_training.py +914 -0
  40. package/python/scripts/test_generation.py +29 -0
  41. package/python/scripts/test_judge.py +155 -0
  42. package/python/scripts/test_pipeline.py +356 -0
  43. package/python/scripts/test_trained_model.py +380 -0
  44. package/python/scripts/train_grpo.py +360 -0
  45. package/python/scripts/train_jsonl.py +223 -0
  46. package/python/scripts/train_local.py +528 -0
  47. package/python/setup.py +20 -0
  48. package/python/src/__init__.py +190 -0
  49. package/python/src/data_bridge/__init__.py +24 -0
  50. package/python/src/data_bridge/converter.py +435 -0
  51. package/python/src/data_bridge/reader.py +393 -0
  52. package/python/src/models.py +283 -0
  53. package/python/src/training/__init__.py +605 -0
  54. package/python/src/training/ab_testing.py +404 -0
  55. package/python/src/training/action_executor.py +621 -0
  56. package/python/src/training/archetype_trainer.py +347 -0
  57. package/python/src/training/atropos_trainer.py +980 -0
  58. package/python/src/training/babylon_env.py +1254 -0
  59. package/python/src/training/error_recovery.py +647 -0
  60. package/python/src/training/evaluation.py +856 -0
  61. package/python/src/training/fast_simulator.py +880 -0
  62. package/python/src/training/format_validator.py +584 -0
  63. package/python/src/training/hybrid_env.py +522 -0
  64. package/python/src/training/kl_controller.py +628 -0
  65. package/python/src/training/multi_prompt_dataset.py +883 -0
  66. package/python/src/training/multi_turn.py +656 -0
  67. package/python/src/training/online_env.py +1084 -0
  68. package/python/src/training/quality_scorer.py +391 -0
  69. package/python/src/training/quality_utils.py +633 -0
  70. package/python/src/training/rewards.py +1344 -0
  71. package/python/src/training/rlaif_env.py +17 -0
  72. package/python/src/training/rollout_generator.py +502 -0
  73. package/python/src/training/rubric_loader.py +198 -0
  74. package/python/src/training/scenario_pool.py +1072 -0
  75. package/python/src/training/schemas.py +481 -0
  76. package/python/src/training/service_manager.py +552 -0
  77. package/python/src/training/simulation_bridge.py +535 -0
  78. package/python/src/training/tick_reward_attribution.py +399 -0
  79. package/python/src/training/tinker_client.py +575 -0
  80. package/python/src/training/tinker_trainer.py +646 -0
  81. package/python/src/training/tokenization_utils.py +402 -0
  82. package/python/tests/e2e/__init__.py +13 -0
  83. package/python/tests/e2e/conftest.py +258 -0
  84. package/python/tests/e2e/test_full_pipeline.py +643 -0
  85. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  86. package/python/tests/integration/__init__.py +12 -0
  87. package/python/tests/integration/conftest.py +383 -0
  88. package/python/tests/integration/test_db_integration.py +649 -0
  89. package/python/tests/integration/test_json_mode_integration.py +554 -0
  90. package/python/tests/test_action_executor.py +594 -0
  91. package/python/tests/test_archetype_scoring.py +1027 -0
  92. package/python/tests/test_atropos_integration.py +360 -0
  93. package/python/tests/test_evaluation.py +727 -0
  94. package/python/tests/test_format_validator.py +486 -0
  95. package/python/tests/test_kl_controller.py +432 -0
  96. package/python/tests/test_lr_scheduler.py +579 -0
  97. package/python/tests/test_multi_turn.py +590 -0
  98. package/python/tests/test_online_env.py +519 -0
  99. package/python/tests/test_quality_scorer.py +474 -0
  100. package/python/tests/test_scenario_pool.py +735 -0
  101. package/python/tests/test_service_manager.py +585 -0
  102. package/python/tests/test_simulation_rollout.py +581 -0
  103. package/python/tests/test_tokenization_utils.py +501 -0
  104. package/python/tests/test_training_orchestrator.py +497 -0
  105. package/python/tests/test_training_output_structure.py +661 -0
  106. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  107. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  108. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  109. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  110. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  111. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  112. package/research-output/training-runs/training-run-1771276293257.json +38 -0
  113. package/research-output/training-runs/training-run-1771276389280.json +38 -0
  114. package/research-output/training-runs/training-run-1771276502776.json +38 -0
  115. package/research-output/training-runs/training-run-1771277340748.json +38 -0
  116. package/research-output/training-runs/training-run-1773013658993.json +38 -0
  117. package/research-output/training-runs/training-run-1773013861014.json +38 -0
  118. package/research-output/training-runs/training-run-1773014215983.json +38 -0
  119. package/scripts/assess-training-data.ts +422 -0
  120. package/scripts/e2e-training-test.ts +550 -0
  121. package/scripts/export-rubrics.ts +64 -0
  122. package/scripts/generate-research-report.ts +1523 -0
  123. package/scripts/generate_dataset.sh +173 -0
  124. package/scripts/generate_should_respond.ts +267 -0
  125. package/scripts/generate_should_respond_dataset.ts +162 -0
  126. package/scripts/json-mode-benchmark.ts +399 -0
  127. package/scripts/rank_trajectories.ts +207 -0
  128. package/scripts/real-archetype-benchmark.ts +210 -0
  129. package/scripts/run-baseline-comparison.ts +116 -0
  130. package/scripts/run-full-pipeline.ts +272 -0
  131. package/scripts/run_rlaif_loop.ts +78 -0
  132. package/scripts/run_task_benchmark.ts +247 -0
  133. package/scripts/runpod_setup.sh +137 -0
  134. package/scripts/runpod_validate.sh +147 -0
  135. package/scripts/test-model-in-game.ts +955 -0
  136. package/scripts/test-scoring.ts +73 -0
  137. package/scripts/test-trained-model.ts +209 -0
  138. package/scripts/train-and-test.ts +824 -0
  139. package/scripts/verify-final.ts +118 -0
  140. package/src/adapter.ts +516 -0
  141. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  142. package/src/archetypes/derive-archetype.ts +249 -0
  143. package/src/archetypes/index.ts +22 -0
  144. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  145. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  146. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  147. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  148. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  149. package/src/benchmark/BenchmarkRunner.ts +685 -0
  150. package/src/benchmark/BenchmarkValidator.ts +204 -0
  151. package/src/benchmark/FastEvalRunner.ts +225 -0
  152. package/src/benchmark/MetricsValidator.ts +165 -0
  153. package/src/benchmark/MetricsVisualizer.ts +909 -0
  154. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  155. package/src/benchmark/ModelRegistry.ts +158 -0
  156. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  157. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  158. package/src/benchmark/SimulationEngine.ts +832 -0
  159. package/src/benchmark/TaskRunner.ts +94 -0
  160. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  161. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  162. package/src/benchmark/index.ts +91 -0
  163. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  164. package/src/benchmark/simulation-types.ts +78 -0
  165. package/src/dependencies.ts +475 -0
  166. package/src/generation/TrajectoryGenerator.ts +387 -0
  167. package/src/generation/index.ts +12 -0
  168. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  169. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  170. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  171. package/src/huggingface/index.ts +27 -0
  172. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  173. package/src/index.ts +102 -0
  174. package/src/init-training.ts +53 -0
  175. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  176. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  177. package/src/metrics/index.ts +8 -0
  178. package/src/metrics/types.ts +200 -0
  179. package/src/rubrics/__tests__/index.test.ts +184 -0
  180. package/src/rubrics/ass-kisser.ts +85 -0
  181. package/src/rubrics/degen.ts +80 -0
  182. package/src/rubrics/goody-twoshoes.ts +84 -0
  183. package/src/rubrics/index.ts +236 -0
  184. package/src/rubrics/information-trader.ts +84 -0
  185. package/src/rubrics/infosec.ts +101 -0
  186. package/src/rubrics/liar.ts +104 -0
  187. package/src/rubrics/perps-trader.ts +87 -0
  188. package/src/rubrics/researcher.ts +81 -0
  189. package/src/rubrics/scammer.ts +82 -0
  190. package/src/rubrics/social-butterfly.ts +73 -0
  191. package/src/rubrics/super-predictor.ts +97 -0
  192. package/src/rubrics/trader.ts +67 -0
  193. package/src/scoring/ArchetypeScoringService.ts +486 -0
  194. package/src/scoring/JudgePromptBuilder.ts +556 -0
  195. package/src/scoring/LLMJudgeCache.ts +401 -0
  196. package/src/scoring/index.ts +9 -0
  197. package/src/training/AutomationPipeline.ts +916 -0
  198. package/src/training/BenchmarkService.ts +518 -0
  199. package/src/training/ConfigValidator.ts +220 -0
  200. package/src/training/MarketOutcomesTracker.ts +187 -0
  201. package/src/training/ModelDeployer.ts +186 -0
  202. package/src/training/ModelFetcher.ts +76 -0
  203. package/src/training/ModelSelectionService.ts +341 -0
  204. package/src/training/ModelUsageVerifier.ts +160 -0
  205. package/src/training/MultiModelOrchestrator.ts +580 -0
  206. package/src/training/RLModelConfig.ts +407 -0
  207. package/src/training/RewardBackpropagationService.ts +149 -0
  208. package/src/training/RulerScoringService.ts +666 -0
  209. package/src/training/TrainingMonitor.ts +166 -0
  210. package/src/training/TrajectoryRecorder.ts +399 -0
  211. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  212. package/src/training/index.ts +100 -0
  213. package/src/training/logRLConfig.ts +34 -0
  214. package/src/training/pipeline.ts +129 -0
  215. package/src/training/storage/ModelStorageService.ts +279 -0
  216. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  217. package/src/training/storage/index.ts +17 -0
  218. package/src/training/types.ts +207 -0
  219. package/src/training/window-utils.ts +138 -0
  220. package/src/utils/index.ts +101 -0
  221. package/src/utils/logger.ts +59 -0
  222. package/src/utils/snowflake.ts +17 -0
  223. package/src/utils/synthetic-detector.ts +111 -0
  224. package/tsconfig.json +20 -0
@@ -0,0 +1,474 @@
1
+ """
2
+ Tests for Quality Scorer
3
+
4
+ Tests cover:
5
+ - Length penalty calculations
6
+ - Quality score calculations
7
+ - Archetype-specific bonuses
8
+ - Integration with format validator
9
+ """
10
+
11
+ import pytest
12
+
13
+ from src.training.quality_scorer import (
14
+ QualityScore,
15
+ calculate_thinking_length_penalty,
16
+ calculate_response_length_penalty,
17
+ calculate_combined_length_penalty,
18
+ score_response,
19
+ score_response_for_reward,
20
+ get_quality_bonus_for_archetype,
21
+ score_response_batch,
22
+ get_relative_quality_scores,
23
+ )
24
+ from src.training.scenario_pool import (
25
+ Scenario,
26
+ MarketState,
27
+ PerpetualState,
28
+ PortfolioState,
29
+ )
30
+
31
+
32
+ # =============================================================================
33
+ # Test Fixtures
34
+ # =============================================================================
35
+
36
+
37
def create_test_scenario() -> Scenario:
    """Build the shared fixture scenario used by the scenario-aware tests.

    Returns:
        A ``Scenario`` (id ``"test-scenario"``, source ``"synthetic"``) holding
        one yes/no BTC market, one BTC perpetual, and a portfolio whose balance
        is 50000.0.
    """
    btc_market = MarketState(
        market_id="btc-100k",
        question="Will BTC hit $100K?",
        yes_price=0.65,
        no_price=0.35,
        volume_24h=500000.0,
        liquidity=1000000.0,
        expires_at=1735689600000,
    )
    btc_perp = PerpetualState(
        ticker="BTC",
        mark_price=100000.0,
        index_price=99990.0,
        funding_rate=0.0001,
        open_interest=50000000.0,
        volume_24h=500000000.0,
        change_24h=0.02,
        high_24h=102000.0,
        low_24h=98000.0,
    )
    wallet = PortfolioState(balance=50000.0)

    return Scenario(
        id="test-scenario",
        source="synthetic",
        markets=[btc_market],
        perpetuals=[btc_perp],
        portfolio=wallet,
    )
68
+
69
+
70
+ # =============================================================================
71
+ # QualityScore Tests
72
+ # =============================================================================
73
+
74
+
75
class TestQualityScore:
    """Checks on QualityScore: stored fields, derived totals, and to_dict output."""

    def test_creation(self):
        """Values passed to the constructor can be read back from the instance."""
        quality = QualityScore(
            format_score=0.8,
            reasoning_score=0.7,
            execution_score=0.6,
            length_penalty=-0.1,
        )

        assert (quality.format_score, quality.reasoning_score) == (0.8, 0.7)
        assert quality.length_penalty == -0.1

    def test_total_score(self):
        """All three component scores at 1.0 with zero penalty total about 0.90."""
        maxed = QualityScore(
            format_score=1.0,
            reasoning_score=1.0,
            execution_score=1.0,
            length_penalty=0.0,
        )

        # Per the original note: 40 + 30 + 20 = 90%, so a perfect score is 0.90.
        expected_total = pytest.approx(0.90, rel=0.01)
        assert maxed.total_score == expected_total

    def test_total_score_with_penalty(self):
        """With identical components, a negative length penalty lowers the total."""
        components = {
            "format_score": 0.8,
            "reasoning_score": 0.6,
            "execution_score": 0.5,
        }
        unpenalised = QualityScore(**components, length_penalty=0.0)
        penalised = QualityScore(**components, length_penalty=-0.5)

        assert unpenalised.total_score > penalised.total_score

    def test_combined_format_score(self):
        """A negative length penalty pulls combined_format_score below format_score."""
        quality = QualityScore(format_score=0.8, length_penalty=-0.2)

        assert quality.combined_format_score < quality.format_score

    def test_to_dict(self):
        """to_dict() exposes the score keys and carries the flag/action values through."""
        quality = QualityScore(
            format_score=0.8,
            reasoning_score=0.7,
            has_thinking=True,
            has_valid_action=True,
            action_type="buy",
        )

        as_dict = quality.to_dict()

        for required_key in ("total_score", "format_score"):
            assert required_key in as_dict
        assert as_dict["has_thinking"] is True
        assert as_dict["action_type"] == "buy"
143
+
144
+
145
+ # =============================================================================
146
+ # Length Penalty Tests
147
+ # =============================================================================
148
+
149
+
150
class TestThinkingLengthPenalty:
    """Pins calculate_thinking_length_penalty at one sample length per band.

    The unit of the length argument is not visible from this file.
    """

    def test_very_short_penalty(self):
        """Length 10 is penalised by -0.5."""
        assert calculate_thinking_length_penalty(10) == -0.5

    def test_short_penalty(self):
        """Length 50 is penalised by -0.3."""
        assert calculate_thinking_length_penalty(50) == -0.3

    def test_minimal_penalty(self):
        """Length 120 is penalised by -0.1."""
        assert calculate_thinking_length_penalty(120) == -0.1

    def test_ideal_no_penalty(self):
        """Length 250 carries no penalty."""
        assert calculate_thinking_length_penalty(250) == 0.0

    def test_still_good_no_penalty(self):
        """Length 500 still carries no penalty."""
        assert calculate_thinking_length_penalty(500) == 0.0

    def test_verbose_penalty(self):
        """Length 800 is penalised by -0.1."""
        assert calculate_thinking_length_penalty(800) == -0.1

    def test_too_long_penalty(self):
        """Length 1500 is penalised by -0.2."""
        assert calculate_thinking_length_penalty(1500) == -0.2
180
+
181
+
182
class TestResponseLengthPenalty:
    """Pins calculate_response_length_penalty at one sample length per band.

    The unit of the length argument is not visible from this file.
    """

    def test_very_short_penalty(self):
        """Length 20 is penalised by -0.4."""
        assert calculate_response_length_penalty(20) == -0.4

    def test_short_penalty(self):
        """Length 100 is penalised by -0.2."""
        assert calculate_response_length_penalty(100) == -0.2

    def test_ideal_no_penalty(self):
        """Length 300 carries no penalty."""
        assert calculate_response_length_penalty(300) == 0.0

    def test_still_good_no_penalty(self):
        """Length 800 still carries no penalty."""
        assert calculate_response_length_penalty(800) == 0.0

    def test_verbose_penalty(self):
        """Length 1500 is penalised by -0.1."""
        assert calculate_response_length_penalty(1500) == -0.1

    def test_too_long_penalty(self):
        """Length 3000 is penalised by -0.2."""
        assert calculate_response_length_penalty(3000) == -0.2
208
+
209
+
210
class TestCombinedLengthPenalty:
    """Checks calculate_combined_length_penalty(thinking_length, response_length)."""

    def test_both_ideal(self):
        """Lengths of 250 (thinking) and 400 (response) give exactly zero penalty."""
        assert calculate_combined_length_penalty(250, 400) == 0.0

    def test_thinking_too_short(self):
        """A thinking length of 10 makes the combined penalty negative."""
        combined = calculate_combined_length_penalty(10, 400)
        assert combined < 0

    def test_both_too_long(self):
        """With both lengths oversized the combined penalty falls below -0.1."""
        combined = calculate_combined_length_penalty(1500, 3000)
        assert combined < -0.1
224
+
225
+
226
+ # =============================================================================
227
+ # Score Response Tests
228
+ # =============================================================================
229
+
230
+
231
class TestScoreResponse:
    """Exercises score_response on complete, partial, and over-long replies."""

    # NOTE(review): multi-line literals are kept flush-left exactly as they
    # appear in the diff view; confirm their whitespace against the packaged file.

    def test_excellent_response(self):
        """Multi-sentence thinking plus an open_perp action clears every threshold."""
        reply = """<think>
The market shows strong bullish momentum with BTC trading at $100,000.
Because the volume is high and funding rates are neutral, I expect
continued upward movement. The risk is limited given the strong trend.
</think>

{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}"""

        result = score_response(reply)

        assert result.has_thinking is True
        assert result.has_valid_action is True
        assert result.format_score > 0.6
        assert result.reasoning_score > 0.4
        assert result.total_score > 0.5

    def test_minimal_response(self):
        """A two-word think block is valid but draws a negative length penalty."""
        reply = """<think>Quick check</think>
{"action": "wait"}"""

        result = score_response(reply)

        assert result.has_thinking is True
        assert result.has_valid_action is True
        # Too short, so the penalty must be below zero.
        assert result.length_penalty < 0

    def test_no_thinking(self):
        """A bare JSON action has no thinking and a format score under 0.5."""
        reply = '{"action": "buy", "market": "btc", "amount": 100}'

        result = score_response(reply)

        assert result.has_thinking is False
        assert result.format_score < 0.5

    def test_no_action(self):
        """Thinking followed by plain prose is reported as having no valid action."""
        reply = "<think>Long analysis here</think>\nNo action decided."

        result = score_response(reply)

        assert result.has_thinking is True
        assert result.has_valid_action is False

    def test_verbose_penalty(self):
        """A 1200-character think block produces a negative length penalty."""
        padding = "x" * 1200
        reply = f"<think>{padding}</think>{{\"action\": \"wait\"}}"

        result = score_response(reply)

        assert result.length_penalty < 0

    def test_with_scenario(self):
        """Passing scenario= still recognises a well-formed open_perp action."""
        fixture = create_test_scenario()
        reply = """<think>Analyzing BTC market</think>
{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}"""

        result = score_response(reply, scenario=fixture)

        assert result.has_valid_action is True
293
+
294
+
295
+ # =============================================================================
296
+ # Score Response for Reward Tests
297
+ # =============================================================================
298
+
299
+
300
class TestScoreResponseForReward:
    """Checks the three-value result of score_response_for_reward."""

    def test_returns_tuple(self):
        """The result unpacks into two scores within [0, 1] and a metrics dict."""
        reply = "<think>Analysis</think>{\"action\": \"wait\"}"

        fmt_value, reasoning_value, extras = score_response_for_reward(reply)

        assert 0.0 <= fmt_value <= 1.0
        assert 0.0 <= reasoning_value <= 1.0
        assert isinstance(extras, dict)

    def test_with_scenario(self):
        """When a scenario is supplied the metrics include an "action_pnl" entry."""
        fixture = create_test_scenario()
        # NOTE(review): literal kept flush-left as shown in the diff view.
        reply = """<think>Market analysis</think>
{"action": "buy", "market": "btc-100k", "amount": 100}"""

        _fmt_value, _reasoning_value, extras = score_response_for_reward(
            reply, scenario=fixture
        )

        assert "action_pnl" in extras
322
+
323
+
324
+ # =============================================================================
325
+ # Archetype Bonus Tests
326
+ # =============================================================================
327
+
328
+
329
class TestArchetypeBonus:
    """Checks get_quality_bonus_for_archetype for "degen", "analyst", and "trader"."""

    def test_degen_prefers_action(self):
        """For "degen", a buy without thinking out-scores a wait with thinking."""
        trading = QualityScore(
            has_valid_action=True,
            action_type="buy",
            has_thinking=False,
        )
        waiting = QualityScore(
            has_valid_action=True,
            action_type="wait",
            has_thinking=True,
        )

        bonus_when_trading = get_quality_bonus_for_archetype(trading, "degen")
        bonus_when_waiting = get_quality_bonus_for_archetype(waiting, "degen")

        # Degen is expected to favour active trading over sitting out.
        assert bonus_when_trading > bonus_when_waiting

    def test_analyst_prefers_reasoning(self):
        """For "analyst", higher reasoning score and longer thinking earn more bonus."""
        thorough = QualityScore(
            reasoning_score=0.9,
            thinking_length=300,
            has_valid_action=True,
        )
        cursory = QualityScore(
            reasoning_score=0.3,
            thinking_length=50,
            has_valid_action=True,
        )

        thorough_bonus = get_quality_bonus_for_archetype(thorough, "analyst")
        cursory_bonus = get_quality_bonus_for_archetype(cursory, "analyst")

        assert thorough_bonus > cursory_bonus

    def test_trader_balanced(self):
        """For "trader", a mid-range score with thinking and action gets a positive bonus."""
        even_handed = QualityScore(
            format_score=0.7,
            reasoning_score=0.6,
            execution_score=0.5,
            has_valid_action=True,
            has_thinking=True,
        )

        trader_bonus = get_quality_bonus_for_archetype(even_handed, "trader")

        # A balanced response should earn at least some bonus.
        assert trader_bonus > 0
382
+
383
+
384
+ # =============================================================================
385
+ # Batch Scoring Tests
386
+ # =============================================================================
387
+
388
+
389
class TestBatchScoring:
    """Checks score_response_batch and get_relative_quality_scores."""

    def test_score_response_batch(self):
        """Three input strings yield three QualityScore objects."""
        batch = [
            "<think>Good analysis</think>{\"action\": \"wait\"}",
            "<think>Brief</think>{\"action\": \"buy\", \"market\": \"x\", \"amount\": 1}",
            "{\"action\": \"wait\"}",
        ]

        results = score_response_batch(batch)

        assert len(results) == 3
        for item in results:
            assert isinstance(item, QualityScore)

    def test_get_relative_quality_scores(self):
        """Relative scores are centred near zero, positive for the best, negative for the worst."""
        # (format, reasoning, execution) triples, ordered from best to worst.
        component_triples = [
            (0.9, 0.8, 0.7),
            (0.5, 0.4, 0.5),
            (0.3, 0.2, 0.3),
        ]
        ranked = [
            QualityScore(format_score=fmt, reasoning_score=reason, execution_score=execu)
            for fmt, reason, execu in component_triples
        ]

        centred = get_relative_quality_scores(ranked)

        assert len(centred) == 3
        # The values should sum to approximately zero (centred).
        assert abs(sum(centred)) < 0.01
        # The first entry sits above the centre and the last sits below it.
        assert centred[0] > 0
        assert centred[2] < 0
420
+
421
+
422
+ # =============================================================================
423
+ # Integration Tests
424
+ # =============================================================================
425
+
426
+
427
class TestIntegration:
    """End-to-end comparisons of score_response / score_response_batch outputs."""

    # NOTE(review): multi-line literals are kept flush-left exactly as they
    # appear in the diff view; confirm their whitespace against the packaged file.

    def test_full_scoring_flow(self):
        """A reasoned open_perp reply beats a bare wait on total, format, and reasoning."""
        fixture = create_test_scenario()

        strong_reply = """<think>
Comprehensive market analysis: BTC is trading at $100,000 with strong
bullish momentum. The funding rate is neutral, suggesting room for
continued upside. Because the risk/reward is favorable and volume
supports the move, I'll take a long position with careful sizing.
</think>

{"action": "open_perp", "ticker": "BTC", "size": 0.05, "direction": "long"}"""
        weak_reply = '{"action": "wait"}'

        strong = score_response(strong_reply, fixture)
        weak = score_response(weak_reply, fixture)

        assert strong.total_score > weak.total_score
        assert strong.format_score > weak.format_score
        assert strong.reasoning_score > weak.reasoning_score

    def test_score_ordering(self):
        """Batch totals fall strictly from detailed, to terse, to thinking-free replies."""
        detailed_reply = """<think>
Detailed analysis with market price, volume, and risk consideration.
Because the momentum is strong and risk is managed, I'll trade.
</think>
{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}"""
        terse_reply = """<think>Quick check</think>
{"action": "wait"}"""
        bare_reply = '{"action": "wait"}'

        results = score_response_batch([detailed_reply, terse_reply, bare_reply])
        totals = [item.total_score for item in results]

        # Totals must be in strictly descending order.
        assert totals[0] > totals[1] > totals[2]
473
+
474
+