@elizaos/training 2.0.0-alpha.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +75 -0
- package/LICENSE +21 -0
- package/Makefile +374 -0
- package/README.md +346 -0
- package/config/rubrics.json +137 -0
- package/docker-compose.test.yml +57 -0
- package/package.json +57 -0
- package/python/config/babylon_atropos.yaml +90 -0
- package/python/config/profiles/12gb.json +11 -0
- package/python/config/profiles/16gb.json +10 -0
- package/python/config/profiles/24gb.json +10 -0
- package/python/config/profiles/48gb.json +10 -0
- package/python/config/profiles/cpu.json +11 -0
- package/python/config/profiles/l40-2gpu-safe.json +20 -0
- package/python/config/profiles/l40-2gpu.json +22 -0
- package/python/config/profiles/l40-4gpu.json +21 -0
- package/python/config/profiles/l40.json +17 -0
- package/python/config/tinker_training.yaml +143 -0
- package/python/curriculum_state.json +165 -0
- package/python/env.template +86 -0
- package/python/env.training.template +46 -0
- package/python/pyproject.toml +41 -0
- package/python/requirements-ci.txt +31 -0
- package/python/requirements.txt +87 -0
- package/python/scripts/__init__.py +4 -0
- package/python/scripts/benchmark_should_respond.py +190 -0
- package/python/scripts/debug_inference.py +62 -0
- package/python/scripts/import_json_trajectories.py +412 -0
- package/python/scripts/local-finetune/README.md +63 -0
- package/python/scripts/local-finetune/ingest_and_score.py +139 -0
- package/python/scripts/local-finetune/merge_model.py +32 -0
- package/python/scripts/local-finetune/test_adapter.py +91 -0
- package/python/scripts/local-finetune/train_from_csv.py +132 -0
- package/python/scripts/merge_trajectories.py +318 -0
- package/python/scripts/optimize_prompt_grpo.py +269 -0
- package/python/scripts/run_ab_test.py +143 -0
- package/python/scripts/run_full_pipeline.py +544 -0
- package/python/scripts/run_tinker_training.py +192 -0
- package/python/scripts/run_training.py +914 -0
- package/python/scripts/test_generation.py +29 -0
- package/python/scripts/test_judge.py +155 -0
- package/python/scripts/test_pipeline.py +356 -0
- package/python/scripts/test_trained_model.py +380 -0
- package/python/scripts/train_grpo.py +360 -0
- package/python/scripts/train_jsonl.py +223 -0
- package/python/scripts/train_local.py +528 -0
- package/python/setup.py +20 -0
- package/python/src/__init__.py +190 -0
- package/python/src/data_bridge/__init__.py +24 -0
- package/python/src/data_bridge/converter.py +435 -0
- package/python/src/data_bridge/reader.py +393 -0
- package/python/src/models.py +283 -0
- package/python/src/training/__init__.py +605 -0
- package/python/src/training/ab_testing.py +404 -0
- package/python/src/training/action_executor.py +621 -0
- package/python/src/training/archetype_trainer.py +347 -0
- package/python/src/training/atropos_trainer.py +980 -0
- package/python/src/training/babylon_env.py +1254 -0
- package/python/src/training/error_recovery.py +647 -0
- package/python/src/training/evaluation.py +856 -0
- package/python/src/training/fast_simulator.py +880 -0
- package/python/src/training/format_validator.py +584 -0
- package/python/src/training/hybrid_env.py +522 -0
- package/python/src/training/kl_controller.py +628 -0
- package/python/src/training/multi_prompt_dataset.py +883 -0
- package/python/src/training/multi_turn.py +656 -0
- package/python/src/training/online_env.py +1084 -0
- package/python/src/training/quality_scorer.py +391 -0
- package/python/src/training/quality_utils.py +633 -0
- package/python/src/training/rewards.py +1344 -0
- package/python/src/training/rlaif_env.py +17 -0
- package/python/src/training/rollout_generator.py +502 -0
- package/python/src/training/rubric_loader.py +198 -0
- package/python/src/training/scenario_pool.py +1072 -0
- package/python/src/training/schemas.py +481 -0
- package/python/src/training/service_manager.py +552 -0
- package/python/src/training/simulation_bridge.py +535 -0
- package/python/src/training/tick_reward_attribution.py +399 -0
- package/python/src/training/tinker_client.py +575 -0
- package/python/src/training/tinker_trainer.py +646 -0
- package/python/src/training/tokenization_utils.py +402 -0
- package/python/tests/e2e/__init__.py +13 -0
- package/python/tests/e2e/conftest.py +258 -0
- package/python/tests/e2e/test_full_pipeline.py +643 -0
- package/python/tests/e2e/test_online_training_e2e.py +365 -0
- package/python/tests/integration/__init__.py +12 -0
- package/python/tests/integration/conftest.py +383 -0
- package/python/tests/integration/test_db_integration.py +649 -0
- package/python/tests/integration/test_json_mode_integration.py +554 -0
- package/python/tests/test_action_executor.py +594 -0
- package/python/tests/test_archetype_scoring.py +1027 -0
- package/python/tests/test_atropos_integration.py +360 -0
- package/python/tests/test_evaluation.py +727 -0
- package/python/tests/test_format_validator.py +486 -0
- package/python/tests/test_kl_controller.py +432 -0
- package/python/tests/test_lr_scheduler.py +579 -0
- package/python/tests/test_multi_turn.py +590 -0
- package/python/tests/test_online_env.py +519 -0
- package/python/tests/test_quality_scorer.py +474 -0
- package/python/tests/test_scenario_pool.py +735 -0
- package/python/tests/test_service_manager.py +585 -0
- package/python/tests/test_simulation_rollout.py +581 -0
- package/python/tests/test_tokenization_utils.py +501 -0
- package/python/tests/test_training_orchestrator.py +497 -0
- package/python/tests/test_training_output_structure.py +661 -0
- package/research-output/training-runs/training-run-1770772042899.json +26 -0
- package/research-output/training-runs/training-run-1770930079670.json +32 -0
- package/research-output/training-runs/training-run-1770930143700.json +44 -0
- package/research-output/training-runs/training-run-1770930183638.json +38 -0
- package/research-output/training-runs/training-run-1770930442049.json +38 -0
- package/research-output/training-runs/training-run-1770930793243.json +38 -0
- package/research-output/training-runs/training-run-1771276293257.json +38 -0
- package/research-output/training-runs/training-run-1771276389280.json +38 -0
- package/research-output/training-runs/training-run-1771276502776.json +38 -0
- package/research-output/training-runs/training-run-1771277340748.json +38 -0
- package/research-output/training-runs/training-run-1773013658993.json +38 -0
- package/research-output/training-runs/training-run-1773013861014.json +38 -0
- package/research-output/training-runs/training-run-1773014215983.json +38 -0
- package/scripts/assess-training-data.ts +422 -0
- package/scripts/e2e-training-test.ts +550 -0
- package/scripts/export-rubrics.ts +64 -0
- package/scripts/generate-research-report.ts +1523 -0
- package/scripts/generate_dataset.sh +173 -0
- package/scripts/generate_should_respond.ts +267 -0
- package/scripts/generate_should_respond_dataset.ts +162 -0
- package/scripts/json-mode-benchmark.ts +399 -0
- package/scripts/rank_trajectories.ts +207 -0
- package/scripts/real-archetype-benchmark.ts +210 -0
- package/scripts/run-baseline-comparison.ts +116 -0
- package/scripts/run-full-pipeline.ts +272 -0
- package/scripts/run_rlaif_loop.ts +78 -0
- package/scripts/run_task_benchmark.ts +247 -0
- package/scripts/runpod_setup.sh +137 -0
- package/scripts/runpod_validate.sh +147 -0
- package/scripts/test-model-in-game.ts +955 -0
- package/scripts/test-scoring.ts +73 -0
- package/scripts/test-trained-model.ts +209 -0
- package/scripts/train-and-test.ts +824 -0
- package/scripts/verify-final.ts +118 -0
- package/src/adapter.ts +516 -0
- package/src/archetypes/ArchetypeConfigService.ts +626 -0
- package/src/archetypes/derive-archetype.ts +249 -0
- package/src/archetypes/index.ts +22 -0
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
- package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
- package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
- package/src/benchmark/BenchmarkDataViewer.ts +324 -0
- package/src/benchmark/BenchmarkHistoryService.ts +221 -0
- package/src/benchmark/BenchmarkRunner.ts +685 -0
- package/src/benchmark/BenchmarkValidator.ts +204 -0
- package/src/benchmark/FastEvalRunner.ts +225 -0
- package/src/benchmark/MetricsValidator.ts +165 -0
- package/src/benchmark/MetricsVisualizer.ts +909 -0
- package/src/benchmark/ModelBenchmarkService.ts +611 -0
- package/src/benchmark/ModelRegistry.ts +158 -0
- package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
- package/src/benchmark/SimulationA2AInterface.ts +1169 -0
- package/src/benchmark/SimulationEngine.ts +832 -0
- package/src/benchmark/TaskRunner.ts +94 -0
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
- package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
- package/src/benchmark/index.ts +91 -0
- package/src/benchmark/parseSimulationMetrics.ts +124 -0
- package/src/benchmark/simulation-types.ts +78 -0
- package/src/dependencies.ts +475 -0
- package/src/generation/TrajectoryGenerator.ts +387 -0
- package/src/generation/index.ts +12 -0
- package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
- package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
- package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
- package/src/huggingface/index.ts +27 -0
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
- package/src/index.ts +102 -0
- package/src/init-training.ts +53 -0
- package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
- package/src/metrics/index.ts +8 -0
- package/src/metrics/types.ts +200 -0
- package/src/rubrics/__tests__/index.test.ts +184 -0
- package/src/rubrics/ass-kisser.ts +85 -0
- package/src/rubrics/degen.ts +80 -0
- package/src/rubrics/goody-twoshoes.ts +84 -0
- package/src/rubrics/index.ts +236 -0
- package/src/rubrics/information-trader.ts +84 -0
- package/src/rubrics/infosec.ts +101 -0
- package/src/rubrics/liar.ts +104 -0
- package/src/rubrics/perps-trader.ts +87 -0
- package/src/rubrics/researcher.ts +81 -0
- package/src/rubrics/scammer.ts +82 -0
- package/src/rubrics/social-butterfly.ts +73 -0
- package/src/rubrics/super-predictor.ts +97 -0
- package/src/rubrics/trader.ts +67 -0
- package/src/scoring/ArchetypeScoringService.ts +486 -0
- package/src/scoring/JudgePromptBuilder.ts +556 -0
- package/src/scoring/LLMJudgeCache.ts +401 -0
- package/src/scoring/index.ts +9 -0
- package/src/training/AutomationPipeline.ts +916 -0
- package/src/training/BenchmarkService.ts +518 -0
- package/src/training/ConfigValidator.ts +220 -0
- package/src/training/MarketOutcomesTracker.ts +187 -0
- package/src/training/ModelDeployer.ts +186 -0
- package/src/training/ModelFetcher.ts +76 -0
- package/src/training/ModelSelectionService.ts +341 -0
- package/src/training/ModelUsageVerifier.ts +160 -0
- package/src/training/MultiModelOrchestrator.ts +580 -0
- package/src/training/RLModelConfig.ts +407 -0
- package/src/training/RewardBackpropagationService.ts +149 -0
- package/src/training/RulerScoringService.ts +666 -0
- package/src/training/TrainingMonitor.ts +166 -0
- package/src/training/TrajectoryRecorder.ts +399 -0
- package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
- package/src/training/index.ts +100 -0
- package/src/training/logRLConfig.ts +34 -0
- package/src/training/pipeline.ts +129 -0
- package/src/training/storage/ModelStorageService.ts +279 -0
- package/src/training/storage/TrainingDataArchiver.ts +197 -0
- package/src/training/storage/index.ts +17 -0
- package/src/training/types.ts +207 -0
- package/src/training/window-utils.ts +138 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +59 -0
- package/src/utils/snowflake.ts +17 -0
- package/src/utils/synthetic-detector.ts +111 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for Quality Scorer
|
|
3
|
+
|
|
4
|
+
Tests cover:
|
|
5
|
+
- Length penalty calculations
|
|
6
|
+
- Quality score calculations
|
|
7
|
+
- Archetype-specific bonuses
|
|
8
|
+
- Integration with format validator
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import pytest
|
|
12
|
+
|
|
13
|
+
from src.training.quality_scorer import (
|
|
14
|
+
QualityScore,
|
|
15
|
+
calculate_thinking_length_penalty,
|
|
16
|
+
calculate_response_length_penalty,
|
|
17
|
+
calculate_combined_length_penalty,
|
|
18
|
+
score_response,
|
|
19
|
+
score_response_for_reward,
|
|
20
|
+
get_quality_bonus_for_archetype,
|
|
21
|
+
score_response_batch,
|
|
22
|
+
get_relative_quality_scores,
|
|
23
|
+
)
|
|
24
|
+
from src.training.scenario_pool import (
|
|
25
|
+
Scenario,
|
|
26
|
+
MarketState,
|
|
27
|
+
PerpetualState,
|
|
28
|
+
PortfolioState,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# =============================================================================
|
|
33
|
+
# Test Fixtures
|
|
34
|
+
# =============================================================================
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def create_test_scenario() -> Scenario:
    """Build a fixed one-market, one-perp scenario used by the scorer tests."""
    btc_market = MarketState(
        market_id="btc-100k",
        question="Will BTC hit $100K?",
        yes_price=0.65,
        no_price=0.35,
        volume_24h=500000.0,
        liquidity=1000000.0,
        expires_at=1735689600000,
    )
    btc_perp = PerpetualState(
        ticker="BTC",
        mark_price=100000.0,
        index_price=99990.0,
        funding_rate=0.0001,
        open_interest=50000000.0,
        volume_24h=500000000.0,
        change_24h=0.02,
        high_24h=102000.0,
        low_24h=98000.0,
    )
    return Scenario(
        id="test-scenario",
        source="synthetic",
        markets=[btc_market],
        perpetuals=[btc_perp],
        portfolio=PortfolioState(balance=50000.0),
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# =============================================================================
|
|
71
|
+
# QualityScore Tests
|
|
72
|
+
# =============================================================================
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class TestQualityScore:
    """Behavioural checks for the QualityScore dataclass."""

    def test_creation(self):
        qs = QualityScore(
            format_score=0.8,
            reasoning_score=0.7,
            execution_score=0.6,
            length_penalty=-0.1,
        )

        assert qs.format_score == 0.8
        assert qs.reasoning_score == 0.7
        assert qs.length_penalty == -0.1

    def test_total_score(self):
        perfect = QualityScore(
            format_score=1.0,
            reasoning_score=1.0,
            execution_score=1.0,
            length_penalty=0.0,
        )

        # Component weights are 40% + 30% + 20%, so a flawless response
        # tops out at 0.90 rather than 1.0.
        assert perfect.total_score == pytest.approx(0.90, rel=0.01)

    def test_total_score_with_penalty(self):
        shared = dict(format_score=0.8, reasoning_score=0.6, execution_score=0.5)
        no_penalty = QualityScore(length_penalty=0.0, **shared)
        penalized = QualityScore(length_penalty=-0.5, **shared)

        # Identical components, so the penalty must strictly lower the total.
        assert no_penalty.total_score > penalized.total_score

    def test_combined_format_score(self):
        qs = QualityScore(
            format_score=0.8,
            length_penalty=-0.2,
        )

        # The combined figure folds the penalty in, landing below format alone.
        assert qs.combined_format_score < qs.format_score

    def test_to_dict(self):
        qs = QualityScore(
            format_score=0.8,
            reasoning_score=0.7,
            has_thinking=True,
            has_valid_action=True,
            action_type="buy",
        )

        payload = qs.to_dict()

        assert "total_score" in payload
        assert "format_score" in payload
        assert payload["has_thinking"] is True
        assert payload["action_type"] == "buy"
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# =============================================================================
|
|
146
|
+
# Length Penalty Tests
|
|
147
|
+
# =============================================================================
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class TestThinkingLengthPenalty:
    """Penalty curve over the character length of the <think> section."""

    def test_very_short_penalty(self):
        # 10 chars: essentially no reasoning, heaviest penalty.
        assert calculate_thinking_length_penalty(10) == -0.5

    def test_short_penalty(self):
        assert calculate_thinking_length_penalty(50) == -0.3

    def test_minimal_penalty(self):
        assert calculate_thinking_length_penalty(120) == -0.1

    def test_ideal_no_penalty(self):
        # The ideal band carries no penalty at all.
        assert calculate_thinking_length_penalty(250) == 0.0

    def test_still_good_no_penalty(self):
        assert calculate_thinking_length_penalty(500) == 0.0

    def test_verbose_penalty(self):
        # Past the ideal band the penalty ramps back up.
        assert calculate_thinking_length_penalty(800) == -0.1

    def test_too_long_penalty(self):
        assert calculate_thinking_length_penalty(1500) == -0.2
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class TestResponseLengthPenalty:
    """Penalty curve over the character length of the full response."""

    def test_very_short_penalty(self):
        assert calculate_response_length_penalty(20) == -0.4

    def test_short_penalty(self):
        assert calculate_response_length_penalty(100) == -0.2

    def test_ideal_no_penalty(self):
        # Mid-range responses incur no penalty.
        assert calculate_response_length_penalty(300) == 0.0

    def test_still_good_no_penalty(self):
        assert calculate_response_length_penalty(800) == 0.0

    def test_verbose_penalty(self):
        assert calculate_response_length_penalty(1500) == -0.1

    def test_too_long_penalty(self):
        assert calculate_response_length_penalty(3000) == -0.2
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class TestCombinedLengthPenalty:
    """Combined penalty over thinking length and response length together."""

    def test_both_ideal(self):
        # Both lengths in their ideal bands: no penalty.
        assert calculate_combined_length_penalty(250, 400) == 0.0

    def test_thinking_too_short(self):
        # A degenerate thinking section alone is enough to go negative.
        assert calculate_combined_length_penalty(10, 400) < 0

    def test_both_too_long(self):
        # Two simultaneous violations compound beyond a single mild penalty.
        assert calculate_combined_length_penalty(1500, 3000) < -0.1
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
# =============================================================================
|
|
227
|
+
# Score Response Tests
|
|
228
|
+
# =============================================================================
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class TestScoreResponse:
    """End-to-end checks for score_response on raw model output."""

    def test_excellent_response(self):
        response = """<think>
The market shows strong bullish momentum with BTC trading at $100,000.
Because the volume is high and funding rates are neutral, I expect
continued upward movement. The risk is limited given the strong trend.
</think>

{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}"""

        result = score_response(response)

        # Substantive thinking plus a valid JSON action should clear every floor.
        assert result.has_thinking is True
        assert result.has_valid_action is True
        assert result.format_score > 0.6
        assert result.reasoning_score > 0.4
        assert result.total_score > 0.5

    def test_minimal_response(self):
        result = score_response('<think>Quick check</think>\n{"action": "wait"}')

        assert result.has_thinking is True
        assert result.has_valid_action is True
        # Terse thinking is surfaced through a negative length penalty.
        assert result.length_penalty < 0

    def test_no_thinking(self):
        result = score_response('{"action": "buy", "market": "btc", "amount": 100}')

        # A bare action without a <think> block caps the format score.
        assert result.has_thinking is False
        assert result.format_score < 0.5

    def test_no_action(self):
        result = score_response("<think>Long analysis here</think>\nNo action decided.")

        assert result.has_thinking is True
        assert result.has_valid_action is False

    def test_verbose_penalty(self):
        # 1200 chars of filler thinking, same final string as the f-string build.
        response = "<think>" + "x" * 1200 + '</think>{"action": "wait"}'

        assert score_response(response).length_penalty < 0

    def test_with_scenario(self):
        response = (
            "<think>Analyzing BTC market</think>\n"
            '{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}'
        )

        result = score_response(response, scenario=create_test_scenario())

        assert result.has_valid_action is True
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
# =============================================================================
|
|
296
|
+
# Score Response for Reward Tests
|
|
297
|
+
# =============================================================================
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class TestScoreResponseForReward:
    """Checks for the reward-oriented (format, reasoning, metrics) wrapper."""

    def test_returns_tuple(self):
        fmt, reasoning, metrics = score_response_for_reward(
            '<think>Analysis</think>{"action": "wait"}'
        )

        # Both scores are normalised into [0, 1]; metrics is a plain dict.
        assert 0.0 <= fmt <= 1.0
        assert 0.0 <= reasoning <= 1.0
        assert isinstance(metrics, dict)

    def test_with_scenario(self):
        response = (
            "<think>Market analysis</think>\n"
            '{"action": "buy", "market": "btc-100k", "amount": 100}'
        )

        _, _, metrics = score_response_for_reward(
            response, scenario=create_test_scenario()
        )

        # Scenario-aware scoring reports the simulated action PnL.
        assert "action_pnl" in metrics
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
# =============================================================================
|
|
325
|
+
# Archetype Bonus Tests
|
|
326
|
+
# =============================================================================
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
class TestArchetypeBonus:
    """Archetype-specific quality bonus behaviour."""

    def test_degen_prefers_action(self):
        trade_now = QualityScore(
            has_valid_action=True,
            action_type="buy",
            has_thinking=False,
        )
        sit_out = QualityScore(
            has_valid_action=True,
            action_type="wait",
            has_thinking=True,
        )

        # The degen archetype rewards taking a position over waiting,
        # even without any thinking attached.
        assert (
            get_quality_bonus_for_archetype(trade_now, "degen")
            > get_quality_bonus_for_archetype(sit_out, "degen")
        )

    def test_analyst_prefers_reasoning(self):
        thorough = QualityScore(
            reasoning_score=0.9,
            thinking_length=300,
            has_valid_action=True,
        )
        hasty = QualityScore(
            reasoning_score=0.3,
            thinking_length=50,
            has_valid_action=True,
        )

        # Analysts are scored on reasoning depth.
        assert (
            get_quality_bonus_for_archetype(thorough, "analyst")
            > get_quality_bonus_for_archetype(hasty, "analyst")
        )

    def test_trader_balanced(self):
        rounded = QualityScore(
            format_score=0.7,
            reasoning_score=0.6,
            execution_score=0.5,
            has_valid_action=True,
            has_thinking=True,
        )

        # A solid all-round response earns a positive trader bonus.
        assert get_quality_bonus_for_archetype(rounded, "trader") > 0
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
# =============================================================================
|
|
385
|
+
# Batch Scoring Tests
|
|
386
|
+
# =============================================================================
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
class TestBatchScoring:
    """Batch and group-relative scoring helpers."""

    def test_score_response_batch(self):
        batch = [
            '<think>Good analysis</think>{"action": "wait"}',
            '<think>Brief</think>{"action": "buy", "market": "x", "amount": 1}',
            '{"action": "wait"}',
        ]

        results = score_response_batch(batch)

        # One QualityScore per input, in order.
        assert len(results) == 3
        assert all(isinstance(item, QualityScore) for item in results)

    def test_get_relative_quality_scores(self):
        # Three scores of strictly decreasing quality.
        graded = [
            QualityScore(format_score=0.9, reasoning_score=0.8, execution_score=0.7),
            QualityScore(format_score=0.5, reasoning_score=0.4, execution_score=0.5),
            QualityScore(format_score=0.3, reasoning_score=0.2, execution_score=0.3),
        ]

        centered = get_relative_quality_scores(graded)

        assert len(centered) == 3
        # Mean-centering: the relative scores sum to (approximately) zero...
        assert abs(sum(centered)) < 0.01
        # ...with the best response above the mean and the worst below it.
        assert centered[0] > 0
        assert centered[2] < 0
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
# =============================================================================
|
|
423
|
+
# Integration Tests
|
|
424
|
+
# =============================================================================
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
class TestIntegration:
    """Cross-component quality scoring flows."""

    def test_full_scoring_flow(self):
        """Scenario-aware scoring ranks a rich response above a bare one."""
        scenario = create_test_scenario()

        excellent_response = """<think>
Comprehensive market analysis: BTC is trading at $100,000 with strong
bullish momentum. The funding rate is neutral, suggesting room for
continued upside. Because the risk/reward is favorable and volume
supports the move, I'll take a long position with careful sizing.
</think>

{"action": "open_perp", "ticker": "BTC", "size": 0.05, "direction": "long"}"""
        poor_response = '{"action": "wait"}'

        strong = score_response(excellent_response, scenario)
        weak = score_response(poor_response, scenario)

        # The rich response must dominate on every component it exercises.
        assert strong.total_score > weak.total_score
        assert strong.format_score > weak.format_score
        assert strong.reasoning_score > weak.reasoning_score

    def test_score_ordering(self):
        """Batch scores rank responses rich > minimal > bare."""
        responses = [
            """<think>
Detailed analysis with market price, volume, and risk consideration.
Because the momentum is strong and risk is managed, I'll trade.
</think>
{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}""",
            '<think>Quick check</think>\n{"action": "wait"}',
            '{"action": "wait"}',
        ]

        totals = [item.total_score for item in score_response_batch(responses)]

        # Strictly descending ordering across the three tiers.
        assert totals[0] > totals[1]
        assert totals[1] > totals[2]
|
|
473
|
+
|
|
474
|
+
|