@elizaos/training 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207):
  1. package/Dockerfile +75 -0
  2. package/Makefile +374 -0
  3. package/README.md +346 -0
  4. package/config/rubrics.json +137 -0
  5. package/data/.gitkeep +0 -0
  6. package/data/degen/.gitkeep +2 -0
  7. package/data/trader/.gitkeep +2 -0
  8. package/docker-compose.test.yml +57 -0
  9. package/package.json +58 -0
  10. package/python/config/babylon_atropos.yaml +90 -0
  11. package/python/config/profiles/12gb.json +11 -0
  12. package/python/config/profiles/16gb.json +10 -0
  13. package/python/config/profiles/24gb.json +10 -0
  14. package/python/config/profiles/48gb.json +10 -0
  15. package/python/config/profiles/cpu.json +11 -0
  16. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  17. package/python/config/profiles/l40-2gpu.json +22 -0
  18. package/python/config/profiles/l40-4gpu.json +21 -0
  19. package/python/config/profiles/l40.json +17 -0
  20. package/python/config/tinker_training.yaml +143 -0
  21. package/python/curriculum_state.json +165 -0
  22. package/python/env.template +86 -0
  23. package/python/env.training.template +46 -0
  24. package/python/pyproject.toml +41 -0
  25. package/python/requirements-ci.txt +31 -0
  26. package/python/requirements.txt +87 -0
  27. package/python/scripts/__init__.py +4 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/run_ab_test.py +143 -0
  36. package/python/scripts/run_full_pipeline.py +544 -0
  37. package/python/scripts/run_tinker_training.py +192 -0
  38. package/python/scripts/run_training.py +914 -0
  39. package/python/scripts/test_judge.py +155 -0
  40. package/python/scripts/test_pipeline.py +356 -0
  41. package/python/scripts/test_trained_model.py +380 -0
  42. package/python/scripts/train_local.py +528 -0
  43. package/python/setup.py +20 -0
  44. package/python/src/__init__.py +190 -0
  45. package/python/src/data_bridge/__init__.py +24 -0
  46. package/python/src/data_bridge/converter.py +435 -0
  47. package/python/src/data_bridge/reader.py +393 -0
  48. package/python/src/models.py +283 -0
  49. package/python/src/training/__init__.py +605 -0
  50. package/python/src/training/ab_testing.py +404 -0
  51. package/python/src/training/action_executor.py +621 -0
  52. package/python/src/training/archetype_trainer.py +347 -0
  53. package/python/src/training/atropos_trainer.py +980 -0
  54. package/python/src/training/babylon_env.py +1254 -0
  55. package/python/src/training/error_recovery.py +647 -0
  56. package/python/src/training/evaluation.py +856 -0
  57. package/python/src/training/fast_simulator.py +880 -0
  58. package/python/src/training/format_validator.py +584 -0
  59. package/python/src/training/hybrid_env.py +522 -0
  60. package/python/src/training/kl_controller.py +628 -0
  61. package/python/src/training/multi_prompt_dataset.py +883 -0
  62. package/python/src/training/multi_turn.py +656 -0
  63. package/python/src/training/online_env.py +1084 -0
  64. package/python/src/training/quality_scorer.py +391 -0
  65. package/python/src/training/quality_utils.py +633 -0
  66. package/python/src/training/rewards.py +1344 -0
  67. package/python/src/training/rlaif_env.py +17 -0
  68. package/python/src/training/rollout_generator.py +502 -0
  69. package/python/src/training/rubric_loader.py +198 -0
  70. package/python/src/training/scenario_pool.py +1072 -0
  71. package/python/src/training/schemas.py +481 -0
  72. package/python/src/training/service_manager.py +552 -0
  73. package/python/src/training/simulation_bridge.py +535 -0
  74. package/python/src/training/tick_reward_attribution.py +399 -0
  75. package/python/src/training/tinker_client.py +575 -0
  76. package/python/src/training/tinker_trainer.py +646 -0
  77. package/python/src/training/tokenization_utils.py +402 -0
  78. package/python/tests/e2e/__init__.py +13 -0
  79. package/python/tests/e2e/conftest.py +258 -0
  80. package/python/tests/e2e/test_full_pipeline.py +643 -0
  81. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  82. package/python/tests/integration/__init__.py +12 -0
  83. package/python/tests/integration/conftest.py +383 -0
  84. package/python/tests/integration/test_db_integration.py +649 -0
  85. package/python/tests/integration/test_json_mode_integration.py +554 -0
  86. package/python/tests/test_action_executor.py +594 -0
  87. package/python/tests/test_archetype_scoring.py +1027 -0
  88. package/python/tests/test_atropos_integration.py +360 -0
  89. package/python/tests/test_evaluation.py +727 -0
  90. package/python/tests/test_format_validator.py +486 -0
  91. package/python/tests/test_kl_controller.py +432 -0
  92. package/python/tests/test_lr_scheduler.py +579 -0
  93. package/python/tests/test_multi_turn.py +590 -0
  94. package/python/tests/test_online_env.py +519 -0
  95. package/python/tests/test_quality_scorer.py +474 -0
  96. package/python/tests/test_scenario_pool.py +735 -0
  97. package/python/tests/test_service_manager.py +585 -0
  98. package/python/tests/test_simulation_rollout.py +581 -0
  99. package/python/tests/test_tokenization_utils.py +501 -0
  100. package/python/tests/test_training_orchestrator.py +497 -0
  101. package/python/tests/test_training_output_structure.py +661 -0
  102. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  103. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  104. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  105. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  106. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  107. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  108. package/scripts/assess-training-data.ts +422 -0
  109. package/scripts/e2e-training-test.ts +550 -0
  110. package/scripts/export-rubrics.ts +64 -0
  111. package/scripts/generate-research-report.ts +1523 -0
  112. package/scripts/generate_dataset.sh +173 -0
  113. package/scripts/json-mode-benchmark.ts +399 -0
  114. package/scripts/real-archetype-benchmark.ts +210 -0
  115. package/scripts/run-baseline-comparison.ts +116 -0
  116. package/scripts/run-full-pipeline.ts +272 -0
  117. package/scripts/runpod_setup.sh +137 -0
  118. package/scripts/runpod_validate.sh +147 -0
  119. package/scripts/test-model-in-game.ts +955 -0
  120. package/scripts/test-scoring.ts +73 -0
  121. package/scripts/test-trained-model.ts +209 -0
  122. package/scripts/train-and-test.ts +824 -0
  123. package/scripts/verify-final.ts +118 -0
  124. package/src/adapter.ts +516 -0
  125. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  126. package/src/archetypes/derive-archetype.ts +249 -0
  127. package/src/archetypes/index.ts +22 -0
  128. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  129. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  130. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  131. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  132. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  133. package/src/benchmark/BenchmarkRunner.ts +685 -0
  134. package/src/benchmark/BenchmarkValidator.ts +206 -0
  135. package/src/benchmark/FastEvalRunner.ts +225 -0
  136. package/src/benchmark/MetricsValidator.ts +165 -0
  137. package/src/benchmark/MetricsVisualizer.ts +909 -0
  138. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  139. package/src/benchmark/ModelRegistry.ts +158 -0
  140. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  141. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  142. package/src/benchmark/SimulationEngine.ts +832 -0
  143. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  144. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  145. package/src/benchmark/index.ts +89 -0
  146. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  147. package/src/benchmark/simulation-types.ts +78 -0
  148. package/src/dependencies.ts +439 -0
  149. package/src/generation/TrajectoryGenerator.ts +387 -0
  150. package/src/generation/index.ts +12 -0
  151. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  152. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  153. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  154. package/src/huggingface/index.ts +27 -0
  155. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  156. package/src/index.ts +102 -0
  157. package/src/init-training.ts +53 -0
  158. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  159. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  160. package/src/metrics/index.ts +8 -0
  161. package/src/metrics/types.ts +200 -0
  162. package/src/rubrics/__tests__/index.test.ts +184 -0
  163. package/src/rubrics/ass-kisser.ts +85 -0
  164. package/src/rubrics/degen.ts +80 -0
  165. package/src/rubrics/goody-twoshoes.ts +84 -0
  166. package/src/rubrics/index.ts +236 -0
  167. package/src/rubrics/information-trader.ts +84 -0
  168. package/src/rubrics/infosec.ts +101 -0
  169. package/src/rubrics/liar.ts +104 -0
  170. package/src/rubrics/perps-trader.ts +87 -0
  171. package/src/rubrics/researcher.ts +81 -0
  172. package/src/rubrics/scammer.ts +82 -0
  173. package/src/rubrics/social-butterfly.ts +73 -0
  174. package/src/rubrics/super-predictor.ts +97 -0
  175. package/src/rubrics/trader.ts +67 -0
  176. package/src/scoring/ArchetypeScoringService.ts +486 -0
  177. package/src/scoring/JudgePromptBuilder.ts +556 -0
  178. package/src/scoring/LLMJudgeCache.ts +401 -0
  179. package/src/scoring/index.ts +9 -0
  180. package/src/training/AutomationPipeline.ts +916 -0
  181. package/src/training/BenchmarkService.ts +518 -0
  182. package/src/training/ConfigValidator.ts +220 -0
  183. package/src/training/MarketOutcomesTracker.ts +187 -0
  184. package/src/training/ModelDeployer.ts +186 -0
  185. package/src/training/ModelFetcher.ts +76 -0
  186. package/src/training/ModelSelectionService.ts +341 -0
  187. package/src/training/ModelUsageVerifier.ts +160 -0
  188. package/src/training/MultiModelOrchestrator.ts +580 -0
  189. package/src/training/RLModelConfig.ts +407 -0
  190. package/src/training/RewardBackpropagationService.ts +149 -0
  191. package/src/training/RulerScoringService.ts +666 -0
  192. package/src/training/TrainingMonitor.ts +166 -0
  193. package/src/training/TrajectoryRecorder.ts +399 -0
  194. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  195. package/src/training/index.ts +100 -0
  196. package/src/training/logRLConfig.ts +34 -0
  197. package/src/training/pipeline.ts +129 -0
  198. package/src/training/storage/ModelStorageService.ts +279 -0
  199. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  200. package/src/training/storage/index.ts +17 -0
  201. package/src/training/types.ts +207 -0
  202. package/src/training/window-utils.ts +138 -0
  203. package/src/utils/index.ts +101 -0
  204. package/src/utils/logger.ts +59 -0
  205. package/src/utils/snowflake.ts +17 -0
  206. package/src/utils/synthetic-detector.ts +111 -0
  207. package/tsconfig.json +20 -0
@@ -0,0 +1,584 @@
1
+ """
2
+ Format Validator for LLM Responses
3
+
4
+ Validates and scores response format quality for GRPO training.
5
+
6
+ Scoring dimensions:
7
+ 1. Think Tag Validation - Proper use of <think>...</think> tags
8
+ 2. Action JSON Validation - Valid JSON with required fields
9
+ 3. Length Analysis - Appropriate response/thinking lengths
10
+ 4. Structure Quality - Overall response organization
11
+
12
+ The scores feed into the reward function to encourage:
13
+ - Structured reasoning before action
14
+ - Valid, executable action format
15
+ - Appropriate verbosity (not too short, not too long)
16
+ """
17
+
18
+ import json
19
+ import re
20
+ import logging
21
+ from dataclasses import dataclass
22
+ from typing import Dict, List, Optional, Tuple
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ # =============================================================================
28
+ # Constants
29
+ # =============================================================================
30
+
31
+
32
# Regexes locating <think> markers. Matching ignores case; the full-pair
# pattern also lets "." span newlines so multi-line reasoning is captured,
# and is non-greedy so each pair is matched separately.
THINK_TAG_OPEN = re.compile(r"<think>", re.I)
THINK_TAG_CLOSE = re.compile(r"</think>", re.I)
THINK_TAG_FULL = re.compile(r"<think>(.*?)</think>", re.I | re.S)

# Character-count thresholds for the text inside the think tags.
MIN_THINKING_LENGTH = 50      # below this the reasoning is not meaningful
IDEAL_THINKING_MIN = 100      # lower edge of the ideal band
IDEAL_THINKING_MAX = 500      # upper edge of the ideal band
MAX_THINKING_LENGTH = 1000    # above this a verbosity penalty applies

# Character-count thresholds for the response as a whole.
MIN_RESPONSE_LENGTH = 30      # smallest viable response
IDEAL_RESPONSE_MIN = 100      # lower edge of the ideal band
IDEAL_RESPONSE_MAX = 800      # upper edge of the ideal band
MAX_RESPONSE_LENGTH = 2000    # above this a verbosity penalty applies

# Action names accepted by the validator (compared after lower-casing).
VALID_ACTION_TYPES = {
    "buy",
    "sell",
    "open_perp",
    "close_perp",
    "wait",
    "trade",
    "predict",
    "post",
    "create_post",
    "send_dm",
    "dm",
    "research",
    "analyze",
}

# Vocabulary looked for (as lower-case substrings) when rating reasoning.
ANALYSIS_TERMS = {
    "price",
    "volume",
    "trend",
    "momentum",
    "bullish",
    "bearish",
    "risk",
    "position",
    "market",
    "funding",
    "probability",
    "sentiment",
    "support",
    "resistance",
    "breakout",
    "consolidation",
}

DECISION_TERMS = {
    "because",
    "therefore",
    "since",
    "given",
    "considering",
    "based on",
    "due to",
    "hence",
    "thus",
    "consequently",
}

RISK_TERMS = {
    "risk",
    "downside",
    "stop",
    "loss",
    "careful",
    "conservative",
    "exposure",
    "hedge",
    "limit",
    "protect",
    "cautious",
}

# Integers or decimals, optionally followed by a percent sign.
NUMERICAL_PATTERN = re.compile(r"\d+\.?\d*%?")
77
+
78
+
79
+ # =============================================================================
80
+ # Validation Results
81
+ # =============================================================================
82
+
83
+
84
@dataclass
class ThinkTagResult:
    """Result of think tag validation.

    Holds what was found about ``<think>...</think>`` usage in a response and
    derives a validity flag and a 0-1 score from it.
    """
    has_open_tag: bool = False        # at least one <think> was seen
    has_close_tag: bool = False       # at least one </think> was seen
    is_properly_paired: bool = False  # an open/close pair with content matched
    thinking_content: str = ""        # text captured between the tags
    thinking_length: int = 0          # length used for the length-based scoring
    tag_count: int = 0                # total number of tags counted
    # None is only a placeholder so every instance gets its own list in
    # __post_init__; the annotation is Optional to match that default.
    issues: Optional[List[str]] = None

    def __post_init__(self):
        """Replace the ``None`` placeholder with a fresh, per-instance list."""
        if self.issues is None:
            self.issues = []

    @property
    def is_valid(self) -> bool:
        """True when the tags are properly paired and no issue was recorded."""
        return self.is_properly_paired and not self.issues

    @property
    def score(self) -> float:
        """Calculate format score for think tags (0-1).

        Returns 0.0 when no tag is present at all and 0.2 when tags exist but
        are not properly paired. Otherwise starts at 0.5, adds length bonuses,
        subtracts a verbosity penalty and 0.1 per recorded issue, and clamps
        the result to [0, 1].
        """
        if not self.has_open_tag and not self.has_close_tag:
            return 0.0  # No thinking at all

        if not self.is_properly_paired:
            return 0.2  # Has tags but malformed

        # Base score for proper tags
        score = 0.5

        # Length-based adjustments (the bonuses stack)
        if self.thinking_length >= MIN_THINKING_LENGTH:
            score += 0.2
        if self.thinking_length >= IDEAL_THINKING_MIN:
            score += 0.15
        if self.thinking_length > MAX_THINKING_LENGTH:
            score -= 0.1  # Too verbose

        # Penalty for issues
        score -= len(self.issues) * 0.1

        return max(0.0, min(1.0, score))
127
+
128
+
129
@dataclass
class ActionValidationResult:
    """Result of action JSON validation.

    Records whether an action object was found, whether it parsed, what action
    it names, and derives a validity flag and a 0-1 score from those facts.
    """
    has_action: bool = False            # a JSON-looking object was located
    is_valid_json: bool = False         # the located text parsed as JSON
    action_type: Optional[str] = None   # value of the "action" field, if any
    is_known_action: bool = False       # action_type is an accepted action
    has_required_fields: bool = False   # fields required for the action exist
    raw_json: str = ""                  # the extracted text, unparsed
    parsed_action: Optional[Dict] = None
    # None is only a placeholder so every instance gets its own list in
    # __post_init__; the annotation is Optional to match that default.
    issues: Optional[List[str]] = None

    def __post_init__(self):
        """Replace the ``None`` placeholder with a fresh, per-instance list."""
        if self.issues is None:
            self.issues = []

    @property
    def is_valid(self) -> bool:
        """True when an action was found, parsed, and names a known action."""
        return self.has_action and self.is_valid_json and self.is_known_action

    @property
    def score(self) -> float:
        """Calculate format score for action (0-1).

        Returns 0.0 without an action and 0.2 when the JSON did not parse.
        Otherwise starts at 0.4, adds 0.3 for a known action and 0.2 for
        having the required fields, subtracts 0.1 per recorded issue, and
        clamps the result to [0, 1].
        """
        if not self.has_action:
            return 0.0

        if not self.is_valid_json:
            return 0.2  # Attempted but failed

        score = 0.4  # Base for valid JSON

        if self.is_known_action:
            score += 0.3

        if self.has_required_fields:
            score += 0.2

        # Penalty for issues
        score -= len(self.issues) * 0.1

        return max(0.0, min(1.0, score))
170
+
171
+
172
@dataclass
class ReasoningQualityResult:
    """Result of reasoning quality analysis.

    Stores vocabulary/number counts and the flags derived from them, and turns
    them into a 0-1 reasoning score.
    """
    analysis_term_count: int = 0
    decision_term_count: int = 0
    risk_term_count: int = 0
    numerical_count: int = 0
    has_market_analysis: bool = False
    has_decision_justification: bool = False
    has_risk_consideration: bool = False
    # None is only a placeholder so every instance gets its own list in
    # __post_init__; the annotation is Optional to match that default.
    issues: Optional[List[str]] = None

    def __post_init__(self):
        """Replace the ``None`` placeholder with a fresh, per-instance list."""
        if self.issues is None:
            self.issues = []

    @property
    def score(self) -> float:
        """Calculate reasoning quality score (0-1).

        Components: 0.03 per analysis term (capped at 0.3), 0.2 for a decision
        justification, 0.2 for risk consideration, 0.15 for more than two
        numbers (0.1 for one or two), and 0.15 for market-specific analysis.
        Recorded issues do not affect this score.
        """
        score = 0.0

        # Analysis terms
        score += min(0.3, self.analysis_term_count * 0.03)

        # Decision justification
        if self.has_decision_justification:
            score += 0.2

        # Risk consideration
        if self.has_risk_consideration:
            score += 0.2

        # Numerical analysis
        if self.numerical_count > 2:
            score += 0.15
        elif self.numerical_count > 0:
            score += 0.1

        # Market-specific analysis
        if self.has_market_analysis:
            score += 0.15

        return max(0.0, min(1.0, score))
215
+
216
+
217
@dataclass
class LengthAnalysisResult:
    """Length measurements for a response and the flags derived from them."""
    total_length: int = 0
    thinking_length: int = 0
    action_length: int = 0
    is_too_short: bool = False
    is_too_long: bool = False
    thinking_is_too_short: bool = False
    thinking_is_too_long: bool = False

    @property
    def score(self) -> float:
        """Return a 0-1 length score: 1.0 minus a deduction per raised flag.

        The result is floored at 0.0.
        """
        deductions = (
            (self.is_too_short, 0.4),
            (self.is_too_long, 0.2),
            (self.thinking_is_too_short, 0.2),
            (self.thinking_is_too_long, 0.1),
        )

        remaining = 1.0
        for flagged, penalty in deductions:
            if flagged:
                remaining -= penalty

        return max(0.0, remaining)
243
+
244
+
245
@dataclass
class FormatValidationResult:
    """Bundle of the four per-dimension results for one response."""
    think_tags: ThinkTagResult
    action: ActionValidationResult
    reasoning: ReasoningQualityResult
    length: LengthAnalysisResult

    @property
    def format_score(self) -> float:
        """
        Overall format score (0-1) as a weighted sum of the component scores.

        Weights:
        - Think tags: 35%
        - Action: 35%
        - Length: 15%
        - Reasoning structure: 15%
        """
        weighted = self.think_tags.score * 0.35
        weighted = weighted + self.action.score * 0.35
        weighted = weighted + self.length.score * 0.15
        weighted = weighted + self.reasoning.score * 0.15
        return weighted

    @property
    def reasoning_score(self) -> float:
        """Reasoning quality score (0-1), taken directly from ``reasoning``."""
        return self.reasoning.score

    @property
    def is_valid(self) -> bool:
        """True when think tags and action are valid and the response is not too short."""
        if not self.think_tags.is_valid:
            return False
        if not self.action.is_valid:
            return False
        return not self.length.is_too_short

    def get_summary(self) -> Dict:
        """Return a flat dict of rounded scores, key facts, and all issues.

        Scores are rounded to three decimals. ``issues`` concatenates the
        think-tag, action, and reasoning issues, in that order.
        """
        tags = self.think_tags
        act = self.action
        collected_issues = [*tags.issues, *act.issues, *self.reasoning.issues]

        summary = {}
        summary["format_score"] = round(self.format_score, 3)
        summary["reasoning_score"] = round(self.reasoning_score, 3)
        summary["think_tag_score"] = round(tags.score, 3)
        summary["action_score"] = round(act.score, 3)
        summary["length_score"] = round(self.length.score, 3)
        summary["has_thinking"] = tags.is_properly_paired
        summary["has_valid_action"] = act.is_valid
        summary["action_type"] = act.action_type
        summary["thinking_length"] = tags.thinking_length
        summary["total_length"] = self.length.total_length
        summary["issues"] = collected_issues
        return summary
308
+
309
+
310
+ # =============================================================================
311
+ # Validators
312
+ # =============================================================================
313
+
314
+
315
def validate_think_tags(response: str) -> ThinkTagResult:
    """
    Inspect ``response`` for <think>...</think> usage.

    Records whether opening/closing tags exist, how many there are, the text
    captured between matched pairs (joined with newlines when there are
    several), and a list of issues: mismatched counts, near-empty content,
    tags without extractable content, more than one opening tag, and a
    closing tag that appears before the first opening tag.
    """
    outcome = ThinkTagResult()

    open_positions = [m.start() for m in THINK_TAG_OPEN.finditer(response)]
    close_positions = [m.start() for m in THINK_TAG_CLOSE.finditer(response)]
    n_open = len(open_positions)
    n_close = len(close_positions)

    outcome.has_open_tag = n_open > 0
    outcome.has_close_tag = n_close > 0
    outcome.tag_count = n_open + n_close

    if n_open != n_close:
        outcome.issues.append(f"Mismatched tags: {n_open} open, {n_close} close")

    # Text captured between each matched open/close pair.
    captured = THINK_TAG_FULL.findall(response)
    if captured:
        outcome.is_properly_paired = True
        outcome.thinking_content = "\n".join(captured)
        # Length is measured on the stripped text; the stored content is not stripped.
        outcome.thinking_length = len(outcome.thinking_content.strip())
        if outcome.thinking_length < 10:
            outcome.issues.append("Thinking content is too short")
    elif n_open and n_close:
        # Both kinds of tag exist, yet no open...close span could be matched.
        outcome.issues.append("Tags found but content extraction failed")

    # Any second opening tag is flagged, whether nested or sequential.
    if n_open > 1:
        outcome.issues.append("Multiple think tag pairs detected")

    # Only the first tag of each kind is compared for ordering.
    if n_open and n_close and close_positions[0] < open_positions[0]:
        outcome.issues.append("Closing tag before opening tag")

    return outcome
365
+
366
+
367
def _find_json_candidate(text: str) -> Optional[str]:
    """Return the text of the first JSON object embedded in ``text``.

    Every ``{`` is tried as the start of an object with
    ``json.JSONDecoder.raw_decode``, so objects containing nested objects are
    returned whole. If nothing decodes, the first flat ``{...}`` span is
    returned so the caller can still report a parse error for it. Returns
    ``None`` when ``text`` contains no brace-delimited span at all.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            # Not a decodable object at this brace; try the next one.
            pass
        else:
            if isinstance(obj, dict):
                return text[start:end]
        start = text.find("{", start + 1)

    flat = re.search(r'\{[^{}]*\}', text)
    return flat.group() if flat else None


def validate_action_json(response: str) -> ActionValidationResult:
    """
    Validate action JSON in response.

    The text after the last ``</think>`` tag is searched first; if it holds
    no JSON-looking object the whole response is searched. The object is then
    parsed, its ``action`` field is lower-cased and checked against
    ``VALID_ACTION_TYPES``, and the fields required for that action are
    checked. Problems are recorded in ``issues`` rather than raised.

    Nested objects (e.g. ``{"action": "buy", "params": {...}}``) are extracted
    whole, and a decodable object is preferred over an earlier malformed
    brace span.
    """
    result = ActionValidationResult()

    # Locate the last closing tag on the original string. Lower-casing first
    # can change the string's length for some Unicode characters, which would
    # misalign the slice offset.
    json_text = response
    close_tags = list(THINK_TAG_CLOSE.finditer(response))
    if close_tags:
        json_text = response[close_tags[-1].end():].strip()

    raw_json = _find_json_candidate(json_text)
    if raw_json is None:
        # Nothing after the think block; fall back to the full response.
        raw_json = _find_json_candidate(response)

    if raw_json is None:
        result.issues.append("No JSON object found in response")
        return result

    result.raw_json = raw_json
    result.has_action = True

    try:
        parsed = json.loads(raw_json)
    except json.JSONDecodeError as e:
        result.issues.append(f"JSON parse error: {str(e)[:50]}")
        return result

    result.is_valid_json = True
    result.parsed_action = parsed

    # Check for action field
    action_type = parsed.get("action")
    if not action_type:
        result.issues.append("JSON missing 'action' field")
        return result

    result.action_type = str(action_type).lower()
    result.is_known_action = result.action_type in VALID_ACTION_TYPES
    if not result.is_known_action:
        result.issues.append(f"Unknown action type: {result.action_type}")

    # Check required fields
    result.has_required_fields = _check_action_fields(result.action_type, parsed)
    if not result.has_required_fields:
        result.issues.append(f"Missing required fields for {result.action_type}")

    return result
422
+
423
+
424
def _check_action_fields(action_type: str, parsed: Dict) -> bool:
    """Return True when ``parsed`` has every key required for ``action_type``.

    Only key presence is checked, not the values. Action types with no
    requirements, and action types not listed here, always pass.
    """
    requirements = (
        (("buy", "sell"), ("market", "amount")),
        (("open_perp",), ("ticker", "size", "direction")),
        (("close_perp",), ("ticker", "size")),
        (("trade", "predict"), ("market",)),
        (("post", "create_post"), ("content",)),
        (("send_dm", "dm"), ("recipient",)),
    )

    for action_names, needed_keys in requirements:
        if action_type not in action_names:
            continue
        for key in needed_keys:
            if key not in parsed:
                return False
        return True

    # "wait", "research", "analyze" and any unlisted type need no fields.
    return True
444
+
445
+
446
def analyze_reasoning_quality(thinking_content: str) -> ReasoningQualityResult:
    """
    Rate the reasoning text by vocabulary and number usage.

    Counts how many analysis, decision, and risk terms occur in the
    lower-cased text, counts numeric tokens, and flags market-specific words.
    Matching is by substring. Empty input yields an untouched default result
    with no issues.
    """
    report = ReasoningQualityResult()
    if not thinking_content:
        return report

    lowered = thinking_content.lower()

    def distinct_hits(vocabulary) -> int:
        # Each term counts once, however many times it occurs in the text.
        return sum(1 for term in vocabulary if term in lowered)

    report.analysis_term_count += distinct_hits(ANALYSIS_TERMS)

    report.decision_term_count += distinct_hits(DECISION_TERMS)
    report.has_decision_justification = report.decision_term_count > 0

    report.risk_term_count += distinct_hits(RISK_TERMS)
    report.has_risk_consideration = report.risk_term_count > 0

    # Numbers are searched in the original text, not the lower-cased copy.
    report.numerical_count = len(NUMERICAL_PATTERN.findall(thinking_content))

    market_words = {"btc", "eth", "bitcoin", "ethereum", "crypto", "stock", "market"}
    report.has_market_analysis = any(word in lowered for word in market_words)

    # Quality issues
    if report.analysis_term_count < 2:
        report.issues.append("Limited market analysis vocabulary")
    if not report.has_decision_justification:
        report.issues.append("No decision justification phrases")

    return report
492
+
493
+
494
def analyze_length(
    response: str,
    thinking_content: str,
    action_json: str,
) -> LengthAnalysisResult:
    """
    Measure the response, thinking text, and action JSON in characters.

    Also flags the response and the thinking text as too short or too long
    against the module-level MIN_*/MAX_* thresholds.
    """
    total = len(response)
    # NOTE(review): measured without stripping, whereas validate_think_tags
    # records the stripped length - confirm the difference is intended.
    thinking = len(thinking_content)

    return LengthAnalysisResult(
        total_length=total,
        thinking_length=thinking,
        action_length=len(action_json),
        is_too_short=total < MIN_RESPONSE_LENGTH,
        is_too_long=total > MAX_RESPONSE_LENGTH,
        thinking_is_too_short=thinking < MIN_THINKING_LENGTH,
        thinking_is_too_long=thinking > MAX_THINKING_LENGTH,
    )
517
+
518
+
519
+ # =============================================================================
520
+ # Main Validation Function
521
+ # =============================================================================
522
+
523
+
524
def validate_response_format(response: str) -> FormatValidationResult:
    """
    Run every validator on ``response`` and bundle the results.

    Think tags and the action JSON are validated on the full response.
    Reasoning quality is rated on the extracted thinking text, and the length
    analysis uses the response, the thinking text, and the raw action JSON.
    """
    tags = validate_think_tags(response)
    action = validate_action_json(response)

    thinking_text = tags.thinking_content
    reasoning = analyze_reasoning_quality(thinking_text)
    lengths = analyze_length(response, thinking_text, action.raw_json)

    return FormatValidationResult(
        think_tags=tags,
        action=action,
        reasoning=reasoning,
        length=lengths,
    )
552
+
553
+
554
def get_format_and_reasoning_scores(response: str) -> Tuple[float, float]:
    """
    Validate ``response`` and return just its two headline scores.

    Returns:
        (format_score, reasoning_score) both in range [0, 1]
    """
    validation = validate_response_format(response)
    format_score = validation.format_score
    reasoning_score = validation.reasoning_score
    return format_score, reasoning_score
563
+
564
+
565
def validate_for_training(response: str) -> Dict:
    """
    Validate ``response`` and return the summary fields used for rewards.

    The result is a subset of ``FormatValidationResult.get_summary()``:
    format and reasoning scores, thinking/action flags, action type,
    thinking length, and the collected issues.
    """
    summary = validate_response_format(response).get_summary()

    wanted_keys = (
        "format_score",
        "reasoning_score",
        "has_thinking",
        "has_valid_action",
        "action_type",
        "thinking_length",
        "issues",
    )
    return {key: summary[key] for key in wanted_keys}
583
+
584
+