@elizaos/training 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. package/Dockerfile +75 -0
  2. package/Makefile +374 -0
  3. package/README.md +346 -0
  4. package/config/rubrics.json +137 -0
  5. package/data/.gitkeep +0 -0
  6. package/data/degen/.gitkeep +2 -0
  7. package/data/trader/.gitkeep +2 -0
  8. package/docker-compose.test.yml +57 -0
  9. package/package.json +58 -0
  10. package/python/config/babylon_atropos.yaml +90 -0
  11. package/python/config/profiles/12gb.json +11 -0
  12. package/python/config/profiles/16gb.json +10 -0
  13. package/python/config/profiles/24gb.json +10 -0
  14. package/python/config/profiles/48gb.json +10 -0
  15. package/python/config/profiles/cpu.json +11 -0
  16. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  17. package/python/config/profiles/l40-2gpu.json +22 -0
  18. package/python/config/profiles/l40-4gpu.json +21 -0
  19. package/python/config/profiles/l40.json +17 -0
  20. package/python/config/tinker_training.yaml +143 -0
  21. package/python/curriculum_state.json +165 -0
  22. package/python/env.template +86 -0
  23. package/python/env.training.template +46 -0
  24. package/python/pyproject.toml +41 -0
  25. package/python/requirements-ci.txt +31 -0
  26. package/python/requirements.txt +87 -0
  27. package/python/scripts/__init__.py +4 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/run_ab_test.py +143 -0
  36. package/python/scripts/run_full_pipeline.py +544 -0
  37. package/python/scripts/run_tinker_training.py +192 -0
  38. package/python/scripts/run_training.py +914 -0
  39. package/python/scripts/test_judge.py +155 -0
  40. package/python/scripts/test_pipeline.py +356 -0
  41. package/python/scripts/test_trained_model.py +380 -0
  42. package/python/scripts/train_local.py +528 -0
  43. package/python/setup.py +20 -0
  44. package/python/src/__init__.py +190 -0
  45. package/python/src/data_bridge/__init__.py +24 -0
  46. package/python/src/data_bridge/converter.py +435 -0
  47. package/python/src/data_bridge/reader.py +393 -0
  48. package/python/src/models.py +283 -0
  49. package/python/src/training/__init__.py +605 -0
  50. package/python/src/training/ab_testing.py +404 -0
  51. package/python/src/training/action_executor.py +621 -0
  52. package/python/src/training/archetype_trainer.py +347 -0
  53. package/python/src/training/atropos_trainer.py +980 -0
  54. package/python/src/training/babylon_env.py +1254 -0
  55. package/python/src/training/error_recovery.py +647 -0
  56. package/python/src/training/evaluation.py +856 -0
  57. package/python/src/training/fast_simulator.py +880 -0
  58. package/python/src/training/format_validator.py +584 -0
  59. package/python/src/training/hybrid_env.py +522 -0
  60. package/python/src/training/kl_controller.py +628 -0
  61. package/python/src/training/multi_prompt_dataset.py +883 -0
  62. package/python/src/training/multi_turn.py +656 -0
  63. package/python/src/training/online_env.py +1084 -0
  64. package/python/src/training/quality_scorer.py +391 -0
  65. package/python/src/training/quality_utils.py +633 -0
  66. package/python/src/training/rewards.py +1344 -0
  67. package/python/src/training/rlaif_env.py +17 -0
  68. package/python/src/training/rollout_generator.py +502 -0
  69. package/python/src/training/rubric_loader.py +198 -0
  70. package/python/src/training/scenario_pool.py +1072 -0
  71. package/python/src/training/schemas.py +481 -0
  72. package/python/src/training/service_manager.py +552 -0
  73. package/python/src/training/simulation_bridge.py +535 -0
  74. package/python/src/training/tick_reward_attribution.py +399 -0
  75. package/python/src/training/tinker_client.py +575 -0
  76. package/python/src/training/tinker_trainer.py +646 -0
  77. package/python/src/training/tokenization_utils.py +402 -0
  78. package/python/tests/e2e/__init__.py +13 -0
  79. package/python/tests/e2e/conftest.py +258 -0
  80. package/python/tests/e2e/test_full_pipeline.py +643 -0
  81. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  82. package/python/tests/integration/__init__.py +12 -0
  83. package/python/tests/integration/conftest.py +383 -0
  84. package/python/tests/integration/test_db_integration.py +649 -0
  85. package/python/tests/integration/test_json_mode_integration.py +554 -0
  86. package/python/tests/test_action_executor.py +594 -0
  87. package/python/tests/test_archetype_scoring.py +1027 -0
  88. package/python/tests/test_atropos_integration.py +360 -0
  89. package/python/tests/test_evaluation.py +727 -0
  90. package/python/tests/test_format_validator.py +486 -0
  91. package/python/tests/test_kl_controller.py +432 -0
  92. package/python/tests/test_lr_scheduler.py +579 -0
  93. package/python/tests/test_multi_turn.py +590 -0
  94. package/python/tests/test_online_env.py +519 -0
  95. package/python/tests/test_quality_scorer.py +474 -0
  96. package/python/tests/test_scenario_pool.py +735 -0
  97. package/python/tests/test_service_manager.py +585 -0
  98. package/python/tests/test_simulation_rollout.py +581 -0
  99. package/python/tests/test_tokenization_utils.py +501 -0
  100. package/python/tests/test_training_orchestrator.py +497 -0
  101. package/python/tests/test_training_output_structure.py +661 -0
  102. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  103. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  104. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  105. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  106. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  107. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  108. package/scripts/assess-training-data.ts +422 -0
  109. package/scripts/e2e-training-test.ts +550 -0
  110. package/scripts/export-rubrics.ts +64 -0
  111. package/scripts/generate-research-report.ts +1523 -0
  112. package/scripts/generate_dataset.sh +173 -0
  113. package/scripts/json-mode-benchmark.ts +399 -0
  114. package/scripts/real-archetype-benchmark.ts +210 -0
  115. package/scripts/run-baseline-comparison.ts +116 -0
  116. package/scripts/run-full-pipeline.ts +272 -0
  117. package/scripts/runpod_setup.sh +137 -0
  118. package/scripts/runpod_validate.sh +147 -0
  119. package/scripts/test-model-in-game.ts +955 -0
  120. package/scripts/test-scoring.ts +73 -0
  121. package/scripts/test-trained-model.ts +209 -0
  122. package/scripts/train-and-test.ts +824 -0
  123. package/scripts/verify-final.ts +118 -0
  124. package/src/adapter.ts +516 -0
  125. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  126. package/src/archetypes/derive-archetype.ts +249 -0
  127. package/src/archetypes/index.ts +22 -0
  128. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  129. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  130. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  131. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  132. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  133. package/src/benchmark/BenchmarkRunner.ts +685 -0
  134. package/src/benchmark/BenchmarkValidator.ts +206 -0
  135. package/src/benchmark/FastEvalRunner.ts +225 -0
  136. package/src/benchmark/MetricsValidator.ts +165 -0
  137. package/src/benchmark/MetricsVisualizer.ts +909 -0
  138. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  139. package/src/benchmark/ModelRegistry.ts +158 -0
  140. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  141. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  142. package/src/benchmark/SimulationEngine.ts +832 -0
  143. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  144. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  145. package/src/benchmark/index.ts +89 -0
  146. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  147. package/src/benchmark/simulation-types.ts +78 -0
  148. package/src/dependencies.ts +439 -0
  149. package/src/generation/TrajectoryGenerator.ts +387 -0
  150. package/src/generation/index.ts +12 -0
  151. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  152. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  153. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  154. package/src/huggingface/index.ts +27 -0
  155. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  156. package/src/index.ts +102 -0
  157. package/src/init-training.ts +53 -0
  158. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  159. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  160. package/src/metrics/index.ts +8 -0
  161. package/src/metrics/types.ts +200 -0
  162. package/src/rubrics/__tests__/index.test.ts +184 -0
  163. package/src/rubrics/ass-kisser.ts +85 -0
  164. package/src/rubrics/degen.ts +80 -0
  165. package/src/rubrics/goody-twoshoes.ts +84 -0
  166. package/src/rubrics/index.ts +236 -0
  167. package/src/rubrics/information-trader.ts +84 -0
  168. package/src/rubrics/infosec.ts +101 -0
  169. package/src/rubrics/liar.ts +104 -0
  170. package/src/rubrics/perps-trader.ts +87 -0
  171. package/src/rubrics/researcher.ts +81 -0
  172. package/src/rubrics/scammer.ts +82 -0
  173. package/src/rubrics/social-butterfly.ts +73 -0
  174. package/src/rubrics/super-predictor.ts +97 -0
  175. package/src/rubrics/trader.ts +67 -0
  176. package/src/scoring/ArchetypeScoringService.ts +486 -0
  177. package/src/scoring/JudgePromptBuilder.ts +556 -0
  178. package/src/scoring/LLMJudgeCache.ts +401 -0
  179. package/src/scoring/index.ts +9 -0
  180. package/src/training/AutomationPipeline.ts +916 -0
  181. package/src/training/BenchmarkService.ts +518 -0
  182. package/src/training/ConfigValidator.ts +220 -0
  183. package/src/training/MarketOutcomesTracker.ts +187 -0
  184. package/src/training/ModelDeployer.ts +186 -0
  185. package/src/training/ModelFetcher.ts +76 -0
  186. package/src/training/ModelSelectionService.ts +341 -0
  187. package/src/training/ModelUsageVerifier.ts +160 -0
  188. package/src/training/MultiModelOrchestrator.ts +580 -0
  189. package/src/training/RLModelConfig.ts +407 -0
  190. package/src/training/RewardBackpropagationService.ts +149 -0
  191. package/src/training/RulerScoringService.ts +666 -0
  192. package/src/training/TrainingMonitor.ts +166 -0
  193. package/src/training/TrajectoryRecorder.ts +399 -0
  194. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  195. package/src/training/index.ts +100 -0
  196. package/src/training/logRLConfig.ts +34 -0
  197. package/src/training/pipeline.ts +129 -0
  198. package/src/training/storage/ModelStorageService.ts +279 -0
  199. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  200. package/src/training/storage/index.ts +17 -0
  201. package/src/training/types.ts +207 -0
  202. package/src/training/window-utils.ts +138 -0
  203. package/src/utils/index.ts +101 -0
  204. package/src/utils/logger.ts +59 -0
  205. package/src/utils/snowflake.ts +17 -0
  206. package/src/utils/synthetic-detector.ts +111 -0
  207. package/tsconfig.json +20 -0
@@ -0,0 +1,519 @@
1
+ """
2
+ Tests for BabylonOnlineEnv
3
+
4
+ Tests cover:
5
+ - Prompt building
6
+ - Action parsing
7
+ - Response scoring
8
+ - Integration with scenario pool
9
+ """
10
+
11
+ import pytest
12
+ from unittest.mock import AsyncMock, MagicMock, patch, Mock
13
+
14
+ from src.training.online_env import (
15
+ build_trading_system_prompt,
16
+ build_observation_prompt,
17
+ parse_action_from_response,
18
+ extract_thinking,
19
+ score_response,
20
+ BabylonOnlineEnv,
21
+ BabylonOnlineEnvConfig,
22
+ )
23
+ from src.training.scenario_pool import (
24
+ Scenario,
25
+ ScenarioPoolConfig,
26
+ MarketState,
27
+ PerpetualState,
28
+ NewsItem,
29
+ PortfolioState,
30
+ )
31
+
32
+
33
+ # =============================================================================
34
+ # Prompt Building Tests
35
+ # =============================================================================
36
+
37
+
38
class TestBuildTradingSystemPrompt:
    """Checks on the system prompt returned by build_trading_system_prompt."""

    def test_default_trader(self):
        """The "trader" prompt names the role and shows think tags/actions."""
        system_prompt = build_trading_system_prompt("trader")
        lowered = system_prompt.lower()

        # Wording checks are case-insensitive.
        for phrase in ("trading agent", "trader", "action"):
            assert phrase in lowered
        # The think tags must appear verbatim.
        for tag in ("<think>", "</think>"):
            assert tag in system_prompt

    def test_degen_archetype(self):
        """The "degen" prompt mentions high-frequency trading or volume."""
        lowered = build_trading_system_prompt("degen").lower()

        assert any(term in lowered for term in ("high-frequency", "volume"))

    def test_analyst_archetype(self):
        """The "analyst" prompt mentions research or analysis."""
        lowered = build_trading_system_prompt("analyst").lower()

        assert any(term in lowered for term in ("research", "analysis"))

    def test_unknown_archetype_defaults(self):
        """An unrecognised archetype still yields a trading-agent prompt."""
        system_prompt = build_trading_system_prompt("unknown")

        # Expected to fall back to the default (trader) behaviour.
        assert "trading agent" in system_prompt.lower()

    def test_contains_action_examples(self):
        """Called without arguments, the prompt lists buy/sell/wait."""
        system_prompt = build_trading_system_prompt()

        for verb in ("buy", "sell", "wait"):
            assert verb in system_prompt
72
+
73
+
74
class TestBuildObservationPrompt:
    """Tests for build_observation_prompt.

    Each test builds a synthetic Scenario populated with one kind of data
    (portfolio, prediction markets, perpetuals, news) and checks that the
    rendered prompt contains the matching section header and values.
    """

    def test_basic_scenario(self):
        """Balance and P&L both show up under the MARKET UPDATE header."""
        # The P&L value is deliberately NOT a substring of the balance.
        # With a P&L of 500 the check `"500" in prompt` is already satisfied
        # by a balance rendered as "15000", so it could pass even if the
        # P&L were missing from the prompt entirely.
        scenario = Scenario(
            id="test-1",
            source="synthetic",
            portfolio=PortfolioState(balance=15000.0, total_pnl=750.0),
        )

        prompt = build_observation_prompt(scenario)

        # Accept the balance with or without a thousands separator.
        assert "15000" in prompt or "15,000" in prompt
        assert "750" in prompt
        assert "MARKET UPDATE" in prompt

    def test_with_markets(self):
        """A prediction market is rendered with its question and YES price."""
        scenario = Scenario(
            id="test-markets",
            source="synthetic",
            markets=[
                MarketState(
                    market_id="m1",
                    question="Will BTC hit $100K?",
                    yes_price=0.65,
                    no_price=0.35,
                    volume_24h=100000.0,
                    liquidity=500000.0,
                    expires_at=1735689600000,
                )
            ],
        )

        prompt = build_observation_prompt(scenario)

        assert "PREDICTION MARKETS" in prompt
        assert "BTC" in prompt
        assert "0.65" in prompt

    def test_with_perpetuals(self):
        """A perpetual is rendered with its ticker and mark price."""
        scenario = Scenario(
            id="test-perps",
            source="synthetic",
            perpetuals=[
                PerpetualState(
                    ticker="ETH",
                    mark_price=3500.0,
                    index_price=3495.0,
                    funding_rate=0.0001,
                    open_interest=25000000.0,
                    volume_24h=50000000.0,
                    change_24h=0.02,
                    high_24h=3600.0,
                    low_24h=3400.0,
                )
            ],
        )

        prompt = build_observation_prompt(scenario)

        assert "PERPETUAL MARKETS" in prompt
        assert "ETH" in prompt
        # Accept the mark price with or without a thousands separator.
        assert "3,500" in prompt or "3500" in prompt

    def test_with_news(self):
        """A news item is rendered with its headline and source."""
        scenario = Scenario(
            id="test-news",
            source="synthetic",
            news=[
                NewsItem(
                    headline="Bitcoin Rally Continues",
                    sentiment="bullish",
                    impact="high",
                    source="CryptoNews",
                    timestamp=1735689600000,
                )
            ],
        )

        prompt = build_observation_prompt(scenario)

        assert "RECENT NEWS" in prompt
        assert "Bitcoin Rally" in prompt
        assert "CryptoNews" in prompt
158
+
159
+
160
+ # =============================================================================
161
+ # Action Parsing Tests
162
+ # =============================================================================
163
+
164
+
165
class TestParseActionFromResponse:
    """Checks on parse_action_from_response for valid and invalid replies."""

    def test_simple_json(self):
        """A bare JSON object with an "action" key is parsed into a dict."""
        parsed = parse_action_from_response(
            '{"action": "buy", "market": "m1", "amount": 100}'
        )

        assert parsed is not None
        assert parsed["action"] == "buy"
        assert parsed["market"] == "m1"
        assert parsed["amount"] == 100

    def test_with_think_tags(self):
        """The action JSON is found after a leading <think> block."""
        reply = (
            "<think>\n"
            "I should analyze the market carefully.\n"
            "BTC is showing bullish momentum.\n"
            "</think>\n"
            "\n"
            '{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}'
        )

        parsed = parse_action_from_response(reply)

        assert parsed is not None
        assert parsed["action"] == "open_perp"
        assert parsed["ticker"] == "BTC"

    def test_wait_action(self):
        """A "wait" action carrying a reason parses like any other action."""
        parsed = parse_action_from_response(
            '{"action": "wait", "reason": "Need more data"}'
        )

        assert parsed is not None
        assert parsed["action"] == "wait"

    def test_invalid_json(self):
        """Plain text without any JSON yields None."""
        parsed = parse_action_from_response("This is not JSON at all")

        assert parsed is None

    def test_json_without_action_key(self):
        """Valid JSON that lacks an "action" key yields None."""
        parsed = parse_action_from_response('{"type": "buy", "amount": 100}')

        assert parsed is None

    def test_nested_json_in_text(self):
        """The action JSON is found when surrounded by free-form prose."""
        reply = (
            "Here's my analysis and decision:\n"
            "\n"
            "Based on the data, I'll buy.\n"
            "\n"
            '{"action": "buy", "market": "market-1", "amount": 50, "side": "yes"}\n'
            "\n"
            "This should be profitable."
        )

        parsed = parse_action_from_response(reply)

        assert parsed is not None
        assert parsed["action"] == "buy"

    def test_multiple_json_objects_takes_first_valid(self):
        """The action JSON following a line of prose is returned."""
        # Despite the test name, this fixture holds a single JSON object
        # preceded by text. The original author's note says the parser looks
        # for action JSON in specific patterns and may not reach a second
        # object when an earlier one lacks an "action" key, so only this
        # simpler case is pinned here.
        reply = (
            "Some text here\n"
            "\n"
            '{"action": "sell", "market": "m2", "amount": 25}'
        )

        parsed = parse_action_from_response(reply)

        assert parsed is not None
        assert parsed["action"] == "sell"
240
+
241
+
242
class TestExtractThinking:
    """Checks on extract_thinking for present, absent and empty think tags."""

    def test_valid_think_tags(self):
        """Text between the think tags is returned exactly."""
        reply = '<think>This is my analysis</think>\n{"action": "wait"}'

        assert extract_thinking(reply) == "This is my analysis"

    def test_multiline_thinking(self):
        """A think block spanning several lines keeps its first and last line."""
        reply = (
            "<think>\n"
            "Line 1\n"
            "Line 2\n"
            "Line 3\n"
            "</think>\n"
            "\n"
            '{"action": "buy"}'
        )

        extracted = extract_thinking(reply)

        for expected_line in ("Line 1", "Line 3"):
            assert expected_line in extracted

    def test_no_think_tags(self):
        """Without think tags the result is the empty string."""
        extracted = extract_thinking('{"action": "wait"}')

        assert extracted == ""

    def test_empty_think_tags(self):
        """An empty think block also gives the empty string."""
        extracted = extract_thinking("<think></think>action")

        assert extracted == ""
279
+
280
+
281
+ # =============================================================================
282
+ # Scoring Tests
283
+ # =============================================================================
284
+
285
+
286
class TestScoreResponse:
    """Checks on the metrics and score returned by score_response."""

    @staticmethod
    def _trader_metrics(response):
        """Score *response* as a "trader" on a fresh scenario; return metrics."""
        scenario = Scenario(id="test", source="synthetic")
        _, metrics = score_response(response, scenario, "trader")
        return metrics

    def test_well_formatted_response(self):
        """Thinking plus a valid action gives good format/reasoning scores."""
        reply = (
            "<think>\n"
            "The market is showing bullish momentum with BTC trading at $100,000.\n"
            "I should consider opening a long position because the funding rate is low.\n"
            "Looking at the risk, a small position of 0.1 BTC seems reasonable.\n"
            "</think>\n"
            "\n"
            '{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}'
        )

        metrics = self._trader_metrics(reply)

        assert metrics["has_thinking"] is True
        assert metrics["has_valid_action"] is True
        assert metrics["action_type"] == "open_perp"
        assert metrics["format_score"] > 0.5
        assert metrics["reasoning_score"] > 0.3

    def test_no_thinking_tags(self):
        """A valid action without think tags gets a low format score."""
        metrics = self._trader_metrics('{"action": "wait", "reason": "unclear"}')

        assert metrics["has_thinking"] is False
        assert metrics["has_valid_action"] is True
        assert metrics["format_score"] < 0.5

    def test_invalid_action(self):
        """Thinking followed by prose (no JSON) is flagged as having no action."""
        metrics = self._trader_metrics(
            "<think>Analysis here</think>\nI'll wait for now."
        )

        assert metrics["has_thinking"] is True
        assert metrics["has_valid_action"] is False
        assert metrics["action_type"] is None

    def test_very_short_response_penalized(self):
        """A bare minimal action is capped at a low format score."""
        metrics = self._trader_metrics('{"action": "wait"}')

        # Short responses should have lower format scores.
        assert metrics["format_score"] <= 0.3

    def test_reasoning_with_analysis_terms(self):
        """Reasoning that uses analysis vocabulary scores above 0.4."""
        reply = (
            "<think>\n"
            "The price is showing strong momentum with high volume.\n"
            "The trend is bullish and the market sentiment is positive.\n"
            "Given the probability of success and managing risk, I'll proceed.\n"
            "</think>\n"
            "\n"
            '{"action": "buy", "market": "m1", "amount": 100, "side": "yes"}'
        )

        metrics = self._trader_metrics(reply)

        # The analysis terms in the thinking should lift the reasoning score.
        assert metrics["reasoning_score"] > 0.4

    def test_different_archetypes_affect_score(self):
        """The same reply can be scored for both "trader" and "degen"."""
        shared_scenario = Scenario(id="test", source="synthetic")
        # A trade-heavy response.
        reply = (
            "<think>Quick analysis - buying now.</think>\n"
            '{"action": "buy", "market": "m1", "amount": 1000, "side": "yes"}'
        )

        score_as_trader, _ = score_response(reply, shared_scenario, "trader")
        score_as_degen, _ = score_response(reply, shared_scenario, "degen")

        # Only checks that a score is produced for each archetype; the
        # actual values depend on the reward weights.
        assert score_as_trader is not None
        assert score_as_degen is not None
363
+
364
+
365
+ # =============================================================================
366
+ # Environment Tests
367
+ # =============================================================================
368
+
369
+
370
class TestBabylonOnlineEnvConfig:
    """Checks on the default values of BabylonOnlineEnvConfig."""

    def test_default_config(self):
        """A default-constructed config carries the expected field values."""
        cfg = BabylonOnlineEnvConfig()
        expected_defaults = {
            "group_size": 4,
            "max_response_tokens": 512,
            "temperature": 0.8,
            "default_archetype": "trader",
        }

        for field_name, expected in expected_defaults.items():
            assert getattr(cfg, field_name) == expected

    def test_archetype_distribution(self):
        """The default distribution covers trader/degen and sums to ~1.0."""
        distribution = BabylonOnlineEnvConfig().archetype_distribution

        for archetype in ("trader", "degen"):
            assert archetype in distribution
        total_weight = sum(distribution.values())
        assert total_weight == pytest.approx(1.0)
387
+
388
+
389
class TestBabylonOnlineEnv:
    """Tests for BabylonOnlineEnv (mock-based)"""

    def test_config_init(self):
        """config_init yields an env config plus a non-empty server config list."""
        env_config, server_configs = BabylonOnlineEnv.config_init()

        assert isinstance(env_config, BabylonOnlineEnvConfig)
        assert len(server_configs) > 0
        assert env_config.group_size >= 2

    # NOTE(review): in the two async tests below, the extent of each `with`
    # body could not be recovered from the flattened source. Everything after
    # the `with` line is kept inside it so the patch stays active for the
    # whole test - confirm against the original file.

    @pytest.mark.asyncio
    async def test_setup_initializes_pool(self):
        """Test that setup creates and initializes scenario pool"""
        # Use config_init to get proper configs that include server_configs
        config, server_configs = BabylonOnlineEnv.config_init()

        # Mock the server manager to avoid actual vLLM calls
        with patch('atroposlib.envs.base.ServerManager'):
            env = BabylonOnlineEnv(config, server_configs, testing=True)

            await env.setup()

            assert env.scenario_pool is not None
            assert len(env.scenario_pool.scenarios) > 0

    @pytest.mark.asyncio
    async def test_get_next_item_returns_scenario(self):
        """Test get_next_item returns a scenario and archetype"""
        config, server_configs = BabylonOnlineEnv.config_init()

        # Same ServerManager patch as in the setup test above.
        with patch('atroposlib.envs.base.ServerManager'):
            env = BabylonOnlineEnv(config, server_configs, testing=True)

            await env.setup()

            item = await env.get_next_item()

            assert item is not None
            # The item unpacks into a (scenario, archetype) pair.
            scenario, archetype = item
            assert isinstance(scenario, Scenario)
            assert archetype in config.archetype_distribution
430
+
431
+
432
class TestIntegration:
    """Integration tests for online environment components."""

    def test_full_prompt_building_flow(self):
        """Test building prompts from scenario to final messages.

        Builds one scenario containing a prediction market, a perpetual, a
        news item and a portfolio, then checks that a value from each of
        them shows up in the rendered observation prompt.
        """
        # The balance is deliberately NOT a substring of any other number in
        # this fixture. With a balance of 50000 the check `"50000" in
        # user_prompt` is already satisfied by the market volume 500000 when
        # numbers are rendered without separators, so the balance assertion
        # could pass even if the portfolio were missing from the prompt.
        scenario = Scenario(
            id="integration-test",
            source="synthetic",
            markets=[
                MarketState(
                    market_id="btc-100k",
                    question="Will BTC exceed $100K by EOY?",
                    yes_price=0.72,
                    no_price=0.28,
                    volume_24h=500000.0,
                    liquidity=1000000.0,
                    expires_at=1735689600000,
                )
            ],
            perpetuals=[
                PerpetualState(
                    ticker="BTC",
                    mark_price=98000.0,
                    index_price=97950.0,
                    funding_rate=0.0002,
                    open_interest=100000000.0,
                    volume_24h=500000000.0,
                    change_24h=0.03,
                    high_24h=99000.0,
                    low_24h=95000.0,
                )
            ],
            news=[
                NewsItem(
                    headline="Institutional Buying Accelerates",
                    sentiment="bullish",
                    impact="high",
                    source="Bloomberg",
                    timestamp=1735689600000,
                )
            ],
            portfolio=PortfolioState(balance=42000.0, total_pnl=2500.0),
            difficulty="hard",
        )

        system_prompt = build_trading_system_prompt("trader")
        user_prompt = build_observation_prompt(scenario)

        # Verify prompts are valid
        assert len(system_prompt) > 100
        assert len(user_prompt) > 100

        # Verify key content is present
        assert "98,000" in user_prompt or "98000" in user_prompt  # BTC price
        assert "100K" in user_prompt  # Question
        assert "Institutional" in user_prompt  # News
        assert "42,000" in user_prompt or "42000" in user_prompt  # Balance

    def test_scoring_pipeline(self):
        """Test full scoring pipeline.

        Scores three replies of decreasing quality and checks that each one
        produces a float score.
        """
        scenario = Scenario(id="scoring-test", source="synthetic")

        # Simulate different quality responses
        responses = [
            # High quality
            """<think>
The market shows strong bullish momentum. BTC is at $98K with positive funding.
The prediction market for $100K is at 72% YES which seems fair given momentum.
I'll take a small long position because the risk/reward is favorable.
</think>

{"action": "open_perp", "ticker": "BTC", "size": 0.05, "direction": "long"}""",
            # Medium quality
            """<think>Buying BTC looks good.</think>
{"action": "buy", "market": "btc-100k", "amount": 100, "side": "yes"}""",
            # Low quality
            '{"action": "wait"}',
        ]

        # Keep only the scalar score from each (score, metrics) result.
        scores = [
            score_response(resp, scenario, "trader")[0] for resp in responses
        ]

        # Higher quality responses should generally score higher, but the
        # exact ordering depends on reward weights, so only the type of each
        # score is pinned here.
        assert all(isinstance(s, float) for s in scores)
519
+