@elizaos/training 2.0.0-alpha.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +75 -0
- package/Makefile +374 -0
- package/README.md +346 -0
- package/config/rubrics.json +137 -0
- package/data/.gitkeep +0 -0
- package/data/degen/.gitkeep +2 -0
- package/data/trader/.gitkeep +2 -0
- package/docker-compose.test.yml +57 -0
- package/package.json +58 -0
- package/python/config/babylon_atropos.yaml +90 -0
- package/python/config/profiles/12gb.json +11 -0
- package/python/config/profiles/16gb.json +10 -0
- package/python/config/profiles/24gb.json +10 -0
- package/python/config/profiles/48gb.json +10 -0
- package/python/config/profiles/cpu.json +11 -0
- package/python/config/profiles/l40-2gpu-safe.json +20 -0
- package/python/config/profiles/l40-2gpu.json +22 -0
- package/python/config/profiles/l40-4gpu.json +21 -0
- package/python/config/profiles/l40.json +17 -0
- package/python/config/tinker_training.yaml +143 -0
- package/python/curriculum_state.json +165 -0
- package/python/env.template +86 -0
- package/python/env.training.template +46 -0
- package/python/pyproject.toml +41 -0
- package/python/requirements-ci.txt +31 -0
- package/python/requirements.txt +87 -0
- package/python/scripts/__init__.py +4 -0
- package/python/scripts/import_json_trajectories.py +412 -0
- package/python/scripts/local-finetune/README.md +63 -0
- package/python/scripts/local-finetune/ingest_and_score.py +139 -0
- package/python/scripts/local-finetune/merge_model.py +32 -0
- package/python/scripts/local-finetune/test_adapter.py +91 -0
- package/python/scripts/local-finetune/train_from_csv.py +132 -0
- package/python/scripts/merge_trajectories.py +318 -0
- package/python/scripts/run_ab_test.py +143 -0
- package/python/scripts/run_full_pipeline.py +544 -0
- package/python/scripts/run_tinker_training.py +192 -0
- package/python/scripts/run_training.py +914 -0
- package/python/scripts/test_judge.py +155 -0
- package/python/scripts/test_pipeline.py +356 -0
- package/python/scripts/test_trained_model.py +380 -0
- package/python/scripts/train_local.py +528 -0
- package/python/setup.py +20 -0
- package/python/src/__init__.py +190 -0
- package/python/src/data_bridge/__init__.py +24 -0
- package/python/src/data_bridge/converter.py +435 -0
- package/python/src/data_bridge/reader.py +393 -0
- package/python/src/models.py +283 -0
- package/python/src/training/__init__.py +605 -0
- package/python/src/training/ab_testing.py +404 -0
- package/python/src/training/action_executor.py +621 -0
- package/python/src/training/archetype_trainer.py +347 -0
- package/python/src/training/atropos_trainer.py +980 -0
- package/python/src/training/babylon_env.py +1254 -0
- package/python/src/training/error_recovery.py +647 -0
- package/python/src/training/evaluation.py +856 -0
- package/python/src/training/fast_simulator.py +880 -0
- package/python/src/training/format_validator.py +584 -0
- package/python/src/training/hybrid_env.py +522 -0
- package/python/src/training/kl_controller.py +628 -0
- package/python/src/training/multi_prompt_dataset.py +883 -0
- package/python/src/training/multi_turn.py +656 -0
- package/python/src/training/online_env.py +1084 -0
- package/python/src/training/quality_scorer.py +391 -0
- package/python/src/training/quality_utils.py +633 -0
- package/python/src/training/rewards.py +1344 -0
- package/python/src/training/rlaif_env.py +17 -0
- package/python/src/training/rollout_generator.py +502 -0
- package/python/src/training/rubric_loader.py +198 -0
- package/python/src/training/scenario_pool.py +1072 -0
- package/python/src/training/schemas.py +481 -0
- package/python/src/training/service_manager.py +552 -0
- package/python/src/training/simulation_bridge.py +535 -0
- package/python/src/training/tick_reward_attribution.py +399 -0
- package/python/src/training/tinker_client.py +575 -0
- package/python/src/training/tinker_trainer.py +646 -0
- package/python/src/training/tokenization_utils.py +402 -0
- package/python/tests/e2e/__init__.py +13 -0
- package/python/tests/e2e/conftest.py +258 -0
- package/python/tests/e2e/test_full_pipeline.py +643 -0
- package/python/tests/e2e/test_online_training_e2e.py +365 -0
- package/python/tests/integration/__init__.py +12 -0
- package/python/tests/integration/conftest.py +383 -0
- package/python/tests/integration/test_db_integration.py +649 -0
- package/python/tests/integration/test_json_mode_integration.py +554 -0
- package/python/tests/test_action_executor.py +594 -0
- package/python/tests/test_archetype_scoring.py +1027 -0
- package/python/tests/test_atropos_integration.py +360 -0
- package/python/tests/test_evaluation.py +727 -0
- package/python/tests/test_format_validator.py +486 -0
- package/python/tests/test_kl_controller.py +432 -0
- package/python/tests/test_lr_scheduler.py +579 -0
- package/python/tests/test_multi_turn.py +590 -0
- package/python/tests/test_online_env.py +519 -0
- package/python/tests/test_quality_scorer.py +474 -0
- package/python/tests/test_scenario_pool.py +735 -0
- package/python/tests/test_service_manager.py +585 -0
- package/python/tests/test_simulation_rollout.py +581 -0
- package/python/tests/test_tokenization_utils.py +501 -0
- package/python/tests/test_training_orchestrator.py +497 -0
- package/python/tests/test_training_output_structure.py +661 -0
- package/research-output/training-runs/training-run-1770772042899.json +26 -0
- package/research-output/training-runs/training-run-1770930079670.json +32 -0
- package/research-output/training-runs/training-run-1770930143700.json +44 -0
- package/research-output/training-runs/training-run-1770930183638.json +38 -0
- package/research-output/training-runs/training-run-1770930442049.json +38 -0
- package/research-output/training-runs/training-run-1770930793243.json +38 -0
- package/scripts/assess-training-data.ts +422 -0
- package/scripts/e2e-training-test.ts +550 -0
- package/scripts/export-rubrics.ts +64 -0
- package/scripts/generate-research-report.ts +1523 -0
- package/scripts/generate_dataset.sh +173 -0
- package/scripts/json-mode-benchmark.ts +399 -0
- package/scripts/real-archetype-benchmark.ts +210 -0
- package/scripts/run-baseline-comparison.ts +116 -0
- package/scripts/run-full-pipeline.ts +272 -0
- package/scripts/runpod_setup.sh +137 -0
- package/scripts/runpod_validate.sh +147 -0
- package/scripts/test-model-in-game.ts +955 -0
- package/scripts/test-scoring.ts +73 -0
- package/scripts/test-trained-model.ts +209 -0
- package/scripts/train-and-test.ts +824 -0
- package/scripts/verify-final.ts +118 -0
- package/src/adapter.ts +516 -0
- package/src/archetypes/ArchetypeConfigService.ts +626 -0
- package/src/archetypes/derive-archetype.ts +249 -0
- package/src/archetypes/index.ts +22 -0
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
- package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
- package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
- package/src/benchmark/BenchmarkDataViewer.ts +324 -0
- package/src/benchmark/BenchmarkHistoryService.ts +221 -0
- package/src/benchmark/BenchmarkRunner.ts +685 -0
- package/src/benchmark/BenchmarkValidator.ts +206 -0
- package/src/benchmark/FastEvalRunner.ts +225 -0
- package/src/benchmark/MetricsValidator.ts +165 -0
- package/src/benchmark/MetricsVisualizer.ts +909 -0
- package/src/benchmark/ModelBenchmarkService.ts +611 -0
- package/src/benchmark/ModelRegistry.ts +158 -0
- package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
- package/src/benchmark/SimulationA2AInterface.ts +1169 -0
- package/src/benchmark/SimulationEngine.ts +832 -0
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
- package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
- package/src/benchmark/index.ts +89 -0
- package/src/benchmark/parseSimulationMetrics.ts +124 -0
- package/src/benchmark/simulation-types.ts +78 -0
- package/src/dependencies.ts +439 -0
- package/src/generation/TrajectoryGenerator.ts +387 -0
- package/src/generation/index.ts +12 -0
- package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
- package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
- package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
- package/src/huggingface/index.ts +27 -0
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
- package/src/index.ts +102 -0
- package/src/init-training.ts +53 -0
- package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
- package/src/metrics/index.ts +8 -0
- package/src/metrics/types.ts +200 -0
- package/src/rubrics/__tests__/index.test.ts +184 -0
- package/src/rubrics/ass-kisser.ts +85 -0
- package/src/rubrics/degen.ts +80 -0
- package/src/rubrics/goody-twoshoes.ts +84 -0
- package/src/rubrics/index.ts +236 -0
- package/src/rubrics/information-trader.ts +84 -0
- package/src/rubrics/infosec.ts +101 -0
- package/src/rubrics/liar.ts +104 -0
- package/src/rubrics/perps-trader.ts +87 -0
- package/src/rubrics/researcher.ts +81 -0
- package/src/rubrics/scammer.ts +82 -0
- package/src/rubrics/social-butterfly.ts +73 -0
- package/src/rubrics/super-predictor.ts +97 -0
- package/src/rubrics/trader.ts +67 -0
- package/src/scoring/ArchetypeScoringService.ts +486 -0
- package/src/scoring/JudgePromptBuilder.ts +556 -0
- package/src/scoring/LLMJudgeCache.ts +401 -0
- package/src/scoring/index.ts +9 -0
- package/src/training/AutomationPipeline.ts +916 -0
- package/src/training/BenchmarkService.ts +518 -0
- package/src/training/ConfigValidator.ts +220 -0
- package/src/training/MarketOutcomesTracker.ts +187 -0
- package/src/training/ModelDeployer.ts +186 -0
- package/src/training/ModelFetcher.ts +76 -0
- package/src/training/ModelSelectionService.ts +341 -0
- package/src/training/ModelUsageVerifier.ts +160 -0
- package/src/training/MultiModelOrchestrator.ts +580 -0
- package/src/training/RLModelConfig.ts +407 -0
- package/src/training/RewardBackpropagationService.ts +149 -0
- package/src/training/RulerScoringService.ts +666 -0
- package/src/training/TrainingMonitor.ts +166 -0
- package/src/training/TrajectoryRecorder.ts +399 -0
- package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
- package/src/training/index.ts +100 -0
- package/src/training/logRLConfig.ts +34 -0
- package/src/training/pipeline.ts +129 -0
- package/src/training/storage/ModelStorageService.ts +279 -0
- package/src/training/storage/TrainingDataArchiver.ts +197 -0
- package/src/training/storage/index.ts +17 -0
- package/src/training/types.ts +207 -0
- package/src/training/window-utils.ts +138 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +59 -0
- package/src/utils/snowflake.ts +17 -0
- package/src/utils/synthetic-detector.ts +111 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,584 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Format Validator for LLM Responses
|
|
3
|
+
|
|
4
|
+
Validates and scores response format quality for GRPO training.
|
|
5
|
+
|
|
6
|
+
Scoring dimensions:
|
|
7
|
+
1. Think Tag Validation - Proper use of <think>...</think> tags
|
|
8
|
+
2. Action JSON Validation - Valid JSON with required fields
|
|
9
|
+
3. Length Analysis - Appropriate response/thinking lengths
|
|
10
|
+
4. Structure Quality - Overall response organization
|
|
11
|
+
|
|
12
|
+
The scores feed into the reward function to encourage:
|
|
13
|
+
- Structured reasoning before action
|
|
14
|
+
- Valid, executable action format
|
|
15
|
+
- Appropriate verbosity (not too short, not too long)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import json
import logging
import re

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# =============================================================================
# Constants
# =============================================================================


# Think tag patterns.  IGNORECASE tolerates <THINK>/<Think> variants; DOTALL
# lets THINK_TAG_FULL capture multi-line reasoning between the tags.
THINK_TAG_OPEN = re.compile(r'<think>', re.IGNORECASE)
THINK_TAG_CLOSE = re.compile(r'</think>', re.IGNORECASE)
THINK_TAG_FULL = re.compile(r'<think>(.*?)</think>', re.IGNORECASE | re.DOTALL)

# Length thresholds (characters).  Consumed by ThinkTagResult.score and
# analyze_length() to reward appropriately-sized output.
MIN_THINKING_LENGTH = 50  # Minimum chars for meaningful reasoning
IDEAL_THINKING_MIN = 100  # Ideal minimum
IDEAL_THINKING_MAX = 500  # Ideal maximum
MAX_THINKING_LENGTH = 1000  # Maximum before penalty

MIN_RESPONSE_LENGTH = 30  # Minimum viable response
IDEAL_RESPONSE_MIN = 100  # Ideal minimum
IDEAL_RESPONSE_MAX = 800  # Ideal maximum
MAX_RESPONSE_LENGTH = 2000  # Maximum before penalty

# Action validation: the action types validate_action_json() accepts as known.
VALID_ACTION_TYPES = {
    "buy", "sell",
    "open_perp", "close_perp",
    "wait",
    "trade", "predict",
    "post", "create_post",
    "send_dm", "dm",
    "research", "analyze",
}

# Reasoning quality terms, matched case-insensitively by
# analyze_reasoning_quality() against the thinking content.
ANALYSIS_TERMS = {
    "price", "volume", "trend", "momentum", "bullish", "bearish",
    "risk", "position", "market", "funding", "probability", "sentiment",
    "support", "resistance", "breakout", "consolidation",
}

# Phrases signalling that a decision is being justified.
DECISION_TERMS = {
    "because", "therefore", "since", "given", "considering",
    "based on", "due to", "hence", "thus", "consequently",
}

# Terms signalling risk awareness.
RISK_TERMS = {
    "risk", "downside", "stop", "loss", "careful", "conservative",
    "exposure", "hedge", "limit", "protect", "cautious",
}

# Integers, decimals, and percentages (e.g. "42", "3.5", "10%").
NUMERICAL_PATTERN = re.compile(r'\d+\.?\d*%?')
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# =============================================================================
|
|
80
|
+
# Validation Results
|
|
81
|
+
# =============================================================================
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class ThinkTagResult:
    """Result of <think> tag validation for a single model response.

    Populated by ``validate_think_tags``; ``score`` converts the raw
    observations into a 0-1 format-reward component.
    """

    has_open_tag: bool = False        # at least one <think> found
    has_close_tag: bool = False       # at least one </think> found
    is_properly_paired: bool = False  # at least one complete <think>...</think> pair
    thinking_content: str = ""        # concatenated content of all complete pairs
    thinking_length: int = 0          # stripped length of thinking_content
    tag_count: int = 0                # total open + close tags seen
    # default_factory gives each instance its own list; the previous
    # `= None` + __post_init__ dance worked but mistyped the field.
    issues: List[str] = field(default_factory=list)

    @property
    def is_valid(self) -> bool:
        """True when tags are properly paired and no issues were recorded."""
        return self.is_properly_paired and len(self.issues) == 0

    @property
    def score(self) -> float:
        """Calculate format score for think tags (0-1).

        0.0 when no tags at all, 0.2 when tags exist but are malformed;
        otherwise 0.5 base plus length bonuses, minus 0.1 per issue.
        """
        if not self.has_open_tag and not self.has_close_tag:
            return 0.0  # No thinking at all

        if not self.is_properly_paired:
            return 0.2  # Has tags but malformed

        # Base score for proper tags
        score = 0.5

        # Length-based adjustments
        if self.thinking_length >= MIN_THINKING_LENGTH:
            score += 0.2
        if self.thinking_length >= IDEAL_THINKING_MIN:
            score += 0.15
        if self.thinking_length > MAX_THINKING_LENGTH:
            score -= 0.1  # Too verbose

        # Penalty for issues
        score -= len(self.issues) * 0.1

        return max(0.0, min(1.0, score))
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@dataclass
class ActionValidationResult:
    """Result of action JSON validation.

    Populated by ``validate_action_json``; ``score`` maps the observations
    onto a 0-1 format-reward component.
    """

    has_action: bool = False              # a brace span was found at all
    is_valid_json: bool = False           # the span parsed as JSON
    action_type: Optional[str] = None     # lower-cased "action" field value
    is_known_action: bool = False         # action_type is in VALID_ACTION_TYPES
    has_required_fields: bool = False     # required fields for action_type present
    raw_json: str = ""                    # extracted JSON text
    parsed_action: Optional[Dict] = None  # decoded JSON object
    # default_factory gives each instance its own list; the previous
    # `= None` + __post_init__ dance worked but mistyped the field.
    issues: List[str] = field(default_factory=list)

    @property
    def is_valid(self) -> bool:
        """True when an action was found, parsed, and is a known type."""
        return self.has_action and self.is_valid_json and self.is_known_action

    @property
    def score(self) -> float:
        """Calculate format score for action (0-1).

        0.0 with no action, 0.2 for an unparseable attempt; otherwise
        0.4 base plus bonuses for known type and required fields, minus
        0.1 per issue.
        """
        if not self.has_action:
            return 0.0

        if not self.is_valid_json:
            return 0.2  # Attempted but failed

        score = 0.4  # Base for valid JSON

        if self.is_known_action:
            score += 0.3

        if self.has_required_fields:
            score += 0.2

        # Penalty for issues
        score -= len(self.issues) * 0.1

        return max(0.0, min(1.0, score))
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@dataclass
class ReasoningQualityResult:
    """Result of reasoning quality analysis.

    Populated by ``analyze_reasoning_quality``; ``score`` aggregates the
    vocabulary counts into a 0-1 reasoning-reward component.
    """

    analysis_term_count: int = 0              # ANALYSIS_TERMS hits
    decision_term_count: int = 0              # DECISION_TERMS hits
    risk_term_count: int = 0                  # RISK_TERMS hits
    numerical_count: int = 0                  # numeric references found
    has_market_analysis: bool = False         # mentions a concrete market/asset
    has_decision_justification: bool = False  # decision_term_count > 0
    has_risk_consideration: bool = False      # risk_term_count > 0
    # default_factory gives each instance its own list; the previous
    # `= None` + __post_init__ dance worked but mistyped the field.
    issues: List[str] = field(default_factory=list)

    @property
    def score(self) -> float:
        """Calculate reasoning quality score (0-1).

        Additive: up to 0.3 for analysis vocabulary, 0.2 each for decision
        justification and risk awareness, up to 0.15 for numeric references,
        0.15 for market-specific analysis.
        """
        score = 0.0

        # Analysis terms (0.03 each, capped at 0.3)
        score += min(0.3, self.analysis_term_count * 0.03)

        # Decision justification
        if self.has_decision_justification:
            score += 0.2

        # Risk consideration
        if self.has_risk_consideration:
            score += 0.2

        # Numerical analysis
        if self.numerical_count > 2:
            score += 0.15
        elif self.numerical_count > 0:
            score += 0.1

        # Market-specific analysis
        if self.has_market_analysis:
            score += 0.15

        return max(0.0, min(1.0, score))
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@dataclass
class LengthAnalysisResult:
    """Length characteristics of a response, produced by ``analyze_length``."""

    total_length: int = 0            # len(response)
    thinking_length: int = 0         # len(thinking content)
    action_length: int = 0           # len(action JSON text)
    is_too_short: bool = False       # total below MIN_RESPONSE_LENGTH
    is_too_long: bool = False        # total above MAX_RESPONSE_LENGTH
    thinking_is_too_short: bool = False
    thinking_is_too_long: bool = False

    @property
    def score(self) -> float:
        """Length appropriateness score (0-1): 1.0 minus a penalty per flag."""
        score = 1.0
        # Total-length problems are penalised more heavily than
        # thinking-length problems; subtraction order mirrors flag order.
        for flagged, penalty in (
            (self.is_too_short, 0.4),
            (self.is_too_long, 0.2),
            (self.thinking_is_too_short, 0.2),
            (self.thinking_is_too_long, 0.1),
        ):
            if flagged:
                score -= penalty
        return max(0.0, score)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@dataclass
class FormatValidationResult:
    """Aggregate of all per-dimension validation results for one response."""

    think_tags: ThinkTagResult
    action: ActionValidationResult
    reasoning: ReasoningQualityResult
    length: LengthAnalysisResult

    @property
    def format_score(self) -> float:
        """Overall format score (0-1).

        Weighted combination: think tags 35%, action 35%, length 15%,
        reasoning structure 15%.
        """
        components = (
            (self.think_tags.score, 0.35),
            (self.action.score, 0.35),
            (self.length.score, 0.15),
            (self.reasoning.score, 0.15),
        )
        total = 0.0
        for value, weight in components:
            total += value * weight
        return total

    @property
    def reasoning_score(self) -> float:
        """Reasoning quality score (0-1), taken from the thinking analysis."""
        return self.reasoning.score

    @property
    def is_valid(self) -> bool:
        """True when thinking and action validate and length is acceptable."""
        if not self.think_tags.is_valid:
            return False
        if not self.action.is_valid:
            return False
        return not self.length.is_too_short

    def get_summary(self) -> Dict:
        """Get summary of validation results"""
        combined_issues = (
            self.think_tags.issues +
            self.action.issues +
            self.reasoning.issues
        )
        return {
            "format_score": round(self.format_score, 3),
            "reasoning_score": round(self.reasoning_score, 3),
            "think_tag_score": round(self.think_tags.score, 3),
            "action_score": round(self.action.score, 3),
            "length_score": round(self.length.score, 3),
            "has_thinking": self.think_tags.is_properly_paired,
            "has_valid_action": self.action.is_valid,
            "action_type": self.action.action_type,
            "thinking_length": self.think_tags.thinking_length,
            "total_length": self.length.total_length,
            "issues": combined_issues,
        }
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# =============================================================================
|
|
311
|
+
# Validators
|
|
312
|
+
# =============================================================================
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def validate_think_tags(response: str) -> ThinkTagResult:
    """
    Validate think tag usage in response.

    Checks:
    - Presence of opening and closing tags
    - Proper pairing and nesting
    - Content between tags
    """
    outcome = ThinkTagResult()

    opens = list(THINK_TAG_OPEN.finditer(response))
    closes = list(THINK_TAG_CLOSE.finditer(response))

    outcome.has_open_tag = bool(opens)
    outcome.has_close_tag = bool(closes)
    outcome.tag_count = len(opens) + len(closes)

    # Unequal counts mean the tags cannot all be paired.
    if len(opens) != len(closes):
        outcome.issues.append(f"Mismatched tags: {len(opens)} open, {len(closes)} close")

    # Capture the content of every complete <think>...</think> pair.
    captured = THINK_TAG_FULL.findall(response)
    if captured:
        outcome.is_properly_paired = True
        outcome.thinking_content = "\n".join(captured)
        outcome.thinking_length = len(outcome.thinking_content.strip())
        # Near-empty thinking is flagged even when the tags are well-formed.
        if outcome.thinking_length < 10:
            outcome.issues.append("Thinking content is too short")
    elif outcome.has_open_tag and outcome.has_close_tag:
        # Both tag kinds exist, yet no complete pair matched.
        outcome.issues.append("Tags found but content extraction failed")

    # Only a single thinking block is supported.
    if len(opens) > 1:
        outcome.issues.append("Multiple think tag pairs detected")

    # A closing tag appearing before any opening tag is malformed.
    if opens and closes and closes[0].start() < opens[0].start():
        outcome.issues.append("Closing tag before opening tag")

    return outcome
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _find_json_object(text: str) -> Tuple[Optional[str], Optional[Dict]]:
    """Locate the first decodable JSON object in *text*.

    Scans forward from each ``{`` with ``json.JSONDecoder.raw_decode`` so
    nested objects (e.g. ``{"action": "buy", "meta": {...}}``) are captured
    whole.  The previous flat regex ``\\{[^{}]*\\}`` matched only the
    innermost brace span and therefore extracted the wrong object from any
    nested action JSON.

    Returns:
        (raw, parsed) when a valid object is found;
        (raw, None) when only an undecodable brace span exists;
        (None, None) when nothing brace-like is present.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, end = decoder.raw_decode(text, start)
            return text[start:end], parsed
        except json.JSONDecodeError:
            start = text.find("{", start + 1)

    # Legacy fallback: recognise a malformed attempt so it can still be
    # scored as "attempted but failed".
    match = re.search(r'\{[^{}]*\}', text)
    if match:
        return match.group(), None
    return None, None


def validate_action_json(response: str) -> ActionValidationResult:
    """
    Validate action JSON in response.

    Extracts JSON (preferring text after the last </think> tag) and
    validates its structure against VALID_ACTION_TYPES and the per-action
    required fields.
    """
    result = ActionValidationResult()

    # Prefer text after the last </think> tag.  str.lower() preserves
    # length, so indices computed on the lowered copy are valid in the
    # original string.
    json_text = response
    lowered = response.lower()
    if "</think>" in lowered:
        think_end = lowered.rfind("</think>") + len("</think>")
        json_text = response[think_end:].strip()

    raw_json, parsed = _find_json_object(json_text)
    if raw_json is None and json_text is not response:
        # Nothing after the think block — try the full response.
        raw_json, parsed = _find_json_object(response)

    if raw_json is None:
        result.issues.append("No JSON object found in response")
        return result

    result.raw_json = raw_json
    result.has_action = True

    if parsed is None:
        # A brace span exists but is not valid JSON; re-parse to surface
        # the decoder's error message in the issue list.
        try:
            json.loads(raw_json)
        except json.JSONDecodeError as e:
            result.issues.append(f"JSON parse error: {str(e)[:50]}")
        return result

    result.is_valid_json = True
    result.parsed_action = parsed

    # Check for action field
    action_type = parsed.get("action")
    if action_type:
        result.action_type = str(action_type).lower()
        result.is_known_action = result.action_type in VALID_ACTION_TYPES

        if not result.is_known_action:
            result.issues.append(f"Unknown action type: {result.action_type}")

        # Check required fields
        result.has_required_fields = _check_action_fields(result.action_type, parsed)

        if not result.has_required_fields:
            result.issues.append(f"Missing required fields for {result.action_type}")
    else:
        result.issues.append("JSON missing 'action' field")

    return result
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def _check_action_fields(action_type: str, parsed: Dict) -> bool:
    """Return True when *parsed* carries every required field for *action_type*.

    Unknown action types require no fields and therefore pass.
    """
    schema = {
        "buy": ("market", "amount"),
        "sell": ("market", "amount"),
        "open_perp": ("ticker", "size", "direction"),
        "close_perp": ("ticker", "size"),
        "wait": (),
        "trade": ("market",),
        "predict": ("market",),
        "post": ("content",),
        "create_post": ("content",),
        "send_dm": ("recipient",),
        "dm": ("recipient",),
        "research": (),
        "analyze": (),
    }

    needed = schema.get(action_type, ())
    missing = [name for name in needed if name not in parsed]
    return not missing
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def analyze_reasoning_quality(thinking_content: str) -> ReasoningQualityResult:
    """
    Analyze quality of reasoning in thinking content.

    Checks for presence of analysis terms, justifications, and risk awareness.
    """
    outcome = ReasoningQualityResult()

    if not thinking_content:
        return outcome

    lowered = thinking_content.lower()

    # Vocabulary hits: how many distinct terms from each set appear.
    outcome.analysis_term_count = sum(1 for term in ANALYSIS_TERMS if term in lowered)

    outcome.decision_term_count = sum(1 for term in DECISION_TERMS if term in lowered)
    outcome.has_decision_justification = outcome.decision_term_count > 0

    outcome.risk_term_count = sum(1 for term in RISK_TERMS if term in lowered)
    outcome.has_risk_consideration = outcome.risk_term_count > 0

    # Numeric references (integers, decimals, percentages).
    outcome.numerical_count = len(NUMERICAL_PATTERN.findall(thinking_content))

    # Mentions of concrete markets/assets.
    market_terms = {"btc", "eth", "bitcoin", "ethereum", "crypto", "stock", "market"}
    outcome.has_market_analysis = any(term in lowered for term in market_terms)

    # Quality issues
    if outcome.analysis_term_count < 2:
        outcome.issues.append("Limited market analysis vocabulary")

    if not outcome.has_decision_justification:
        outcome.issues.append("No decision justification phrases")

    return outcome
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def analyze_length(
    response: str,
    thinking_content: str,
    action_json: str,
) -> LengthAnalysisResult:
    """
    Analyze response length characteristics.

    Measures total/thinking/action lengths and flags values outside the
    configured thresholds.
    """
    total = len(response)
    thinking = len(thinking_content)

    return LengthAnalysisResult(
        total_length=total,
        thinking_length=thinking,
        action_length=len(action_json),
        is_too_short=total < MIN_RESPONSE_LENGTH,
        is_too_long=total > MAX_RESPONSE_LENGTH,
        thinking_is_too_short=thinking < MIN_THINKING_LENGTH,
        thinking_is_too_long=thinking > MAX_THINKING_LENGTH,
    )
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
# =============================================================================
|
|
520
|
+
# Main Validation Function
|
|
521
|
+
# =============================================================================
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def validate_response_format(response: str) -> FormatValidationResult:
    """
    Validate complete response format.

    Runs every validator (think tags, action JSON, reasoning quality,
    length) and bundles the results into one FormatValidationResult.
    """
    think_tags = validate_think_tags(response)
    action = validate_action_json(response)

    return FormatValidationResult(
        think_tags=think_tags,
        action=action,
        # Reasoning quality is judged only on the extracted thinking text.
        reasoning=analyze_reasoning_quality(think_tags.thinking_content),
        length=analyze_length(
            response,
            think_tags.thinking_content,
            action.raw_json,
        ),
    )
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def get_format_and_reasoning_scores(response: str) -> Tuple[float, float]:
    """
    Convenience function to get format and reasoning scores.

    Returns:
        (format_score, reasoning_score) both in range [0, 1]
    """
    validated = validate_response_format(response)
    return (validated.format_score, validated.reasoning_score)
|
|
563
|
+
|
|
564
|
+
|
|
565
|
+
def validate_for_training(response: str) -> Dict:
    """
    Validate response format for training reward calculation.

    Returns dict compatible with reward function inputs.
    """
    summary = validate_response_format(response).get_summary()

    # Project only the keys the reward function consumes, in the same order.
    wanted = (
        "format_score",
        "reasoning_score",
        "has_thinking",
        "has_valid_action",
        "action_type",
        "thinking_length",
        "issues",
    )
    return {key: summary[key] for key in wanted}
|
|
583
|
+
|
|
584
|
+
|