ml4t-diagnostic 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. ml4t/diagnostic/AGENT.md +25 -0
  2. ml4t/diagnostic/__init__.py +166 -0
  3. ml4t/diagnostic/backends/__init__.py +10 -0
  4. ml4t/diagnostic/backends/adapter.py +192 -0
  5. ml4t/diagnostic/backends/polars_backend.py +899 -0
  6. ml4t/diagnostic/caching/__init__.py +40 -0
  7. ml4t/diagnostic/caching/cache.py +331 -0
  8. ml4t/diagnostic/caching/decorators.py +131 -0
  9. ml4t/diagnostic/caching/smart_cache.py +339 -0
  10. ml4t/diagnostic/config/AGENT.md +24 -0
  11. ml4t/diagnostic/config/README.md +267 -0
  12. ml4t/diagnostic/config/__init__.py +219 -0
  13. ml4t/diagnostic/config/barrier_config.py +277 -0
  14. ml4t/diagnostic/config/base.py +301 -0
  15. ml4t/diagnostic/config/event_config.py +148 -0
  16. ml4t/diagnostic/config/feature_config.py +404 -0
  17. ml4t/diagnostic/config/multi_signal_config.py +55 -0
  18. ml4t/diagnostic/config/portfolio_config.py +215 -0
  19. ml4t/diagnostic/config/report_config.py +391 -0
  20. ml4t/diagnostic/config/sharpe_config.py +202 -0
  21. ml4t/diagnostic/config/signal_config.py +206 -0
  22. ml4t/diagnostic/config/trade_analysis_config.py +310 -0
  23. ml4t/diagnostic/config/validation.py +279 -0
  24. ml4t/diagnostic/core/__init__.py +29 -0
  25. ml4t/diagnostic/core/numba_utils.py +315 -0
  26. ml4t/diagnostic/core/purging.py +372 -0
  27. ml4t/diagnostic/core/sampling.py +471 -0
  28. ml4t/diagnostic/errors/__init__.py +205 -0
  29. ml4t/diagnostic/evaluation/AGENT.md +26 -0
  30. ml4t/diagnostic/evaluation/__init__.py +437 -0
  31. ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
  32. ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
  33. ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
  34. ml4t/diagnostic/evaluation/dashboard.py +715 -0
  35. ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
  36. ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
  37. ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
  38. ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
  39. ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
  40. ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
  41. ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
  42. ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
  43. ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
  44. ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
  45. ml4t/diagnostic/evaluation/event_analysis.py +647 -0
  46. ml4t/diagnostic/evaluation/excursion.py +390 -0
  47. ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
  48. ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
  49. ml4t/diagnostic/evaluation/framework.py +935 -0
  50. ml4t/diagnostic/evaluation/metric_registry.py +255 -0
  51. ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
  52. ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
  53. ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
  54. ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
  55. ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
  56. ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
  57. ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
  58. ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
  59. ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
  60. ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
  61. ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
  62. ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
  63. ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
  64. ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
  65. ml4t/diagnostic/evaluation/multi_signal.py +550 -0
  66. ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
  67. ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
  68. ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
  69. ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
  70. ml4t/diagnostic/evaluation/report_generation.py +824 -0
  71. ml4t/diagnostic/evaluation/signal_selector.py +452 -0
  72. ml4t/diagnostic/evaluation/stat_registry.py +139 -0
  73. ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
  74. ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
  75. ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
  76. ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
  77. ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
  78. ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
  79. ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
  80. ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
  81. ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
  82. ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
  83. ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
  84. ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
  85. ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
  86. ml4t/diagnostic/evaluation/stats/moments.py +164 -0
  87. ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
  88. ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
  89. ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
  90. ml4t/diagnostic/evaluation/themes.py +330 -0
  91. ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
  92. ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
  93. ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
  94. ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
  95. ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
  96. ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
  97. ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
  98. ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
  99. ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
  100. ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
  101. ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
  102. ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
  103. ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
  104. ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
  105. ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
  106. ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
  107. ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
  108. ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
  109. ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
  110. ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
  111. ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
  112. ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
  113. ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
  114. ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
  115. ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
  116. ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
  117. ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
  118. ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
  119. ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
  120. ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
  121. ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
  122. ml4t/diagnostic/evaluation/validated_cv.py +535 -0
  123. ml4t/diagnostic/evaluation/visualization.py +1050 -0
  124. ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
  125. ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
  126. ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
  127. ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
  128. ml4t/diagnostic/integration/__init__.py +48 -0
  129. ml4t/diagnostic/integration/backtest_contract.py +671 -0
  130. ml4t/diagnostic/integration/data_contract.py +316 -0
  131. ml4t/diagnostic/integration/engineer_contract.py +226 -0
  132. ml4t/diagnostic/logging/__init__.py +77 -0
  133. ml4t/diagnostic/logging/logger.py +245 -0
  134. ml4t/diagnostic/logging/performance.py +234 -0
  135. ml4t/diagnostic/logging/progress.py +234 -0
  136. ml4t/diagnostic/logging/wandb.py +412 -0
  137. ml4t/diagnostic/metrics/__init__.py +9 -0
  138. ml4t/diagnostic/metrics/percentiles.py +128 -0
  139. ml4t/diagnostic/py.typed +1 -0
  140. ml4t/diagnostic/reporting/__init__.py +43 -0
  141. ml4t/diagnostic/reporting/base.py +130 -0
  142. ml4t/diagnostic/reporting/html_renderer.py +275 -0
  143. ml4t/diagnostic/reporting/json_renderer.py +51 -0
  144. ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
  145. ml4t/diagnostic/results/AGENT.md +24 -0
  146. ml4t/diagnostic/results/__init__.py +105 -0
  147. ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
  148. ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
  149. ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
  150. ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
  151. ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
  152. ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
  153. ml4t/diagnostic/results/barrier_results/validation.py +38 -0
  154. ml4t/diagnostic/results/base.py +177 -0
  155. ml4t/diagnostic/results/event_results.py +349 -0
  156. ml4t/diagnostic/results/feature_results.py +787 -0
  157. ml4t/diagnostic/results/multi_signal_results.py +431 -0
  158. ml4t/diagnostic/results/portfolio_results.py +281 -0
  159. ml4t/diagnostic/results/sharpe_results.py +448 -0
  160. ml4t/diagnostic/results/signal_results/__init__.py +74 -0
  161. ml4t/diagnostic/results/signal_results/ic.py +581 -0
  162. ml4t/diagnostic/results/signal_results/irtc.py +110 -0
  163. ml4t/diagnostic/results/signal_results/quantile.py +392 -0
  164. ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
  165. ml4t/diagnostic/results/signal_results/turnover.py +213 -0
  166. ml4t/diagnostic/results/signal_results/validation.py +147 -0
  167. ml4t/diagnostic/signal/AGENT.md +17 -0
  168. ml4t/diagnostic/signal/__init__.py +69 -0
  169. ml4t/diagnostic/signal/_report.py +152 -0
  170. ml4t/diagnostic/signal/_utils.py +261 -0
  171. ml4t/diagnostic/signal/core.py +275 -0
  172. ml4t/diagnostic/signal/quantile.py +148 -0
  173. ml4t/diagnostic/signal/result.py +214 -0
  174. ml4t/diagnostic/signal/signal_ic.py +129 -0
  175. ml4t/diagnostic/signal/turnover.py +182 -0
  176. ml4t/diagnostic/splitters/AGENT.md +19 -0
  177. ml4t/diagnostic/splitters/__init__.py +36 -0
  178. ml4t/diagnostic/splitters/base.py +501 -0
  179. ml4t/diagnostic/splitters/calendar.py +421 -0
  180. ml4t/diagnostic/splitters/calendar_config.py +91 -0
  181. ml4t/diagnostic/splitters/combinatorial.py +1064 -0
  182. ml4t/diagnostic/splitters/config.py +322 -0
  183. ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
  184. ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
  185. ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
  186. ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
  187. ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
  188. ml4t/diagnostic/splitters/group_isolation.py +329 -0
  189. ml4t/diagnostic/splitters/persistence.py +316 -0
  190. ml4t/diagnostic/splitters/utils.py +207 -0
  191. ml4t/diagnostic/splitters/walk_forward.py +757 -0
  192. ml4t/diagnostic/utils/__init__.py +42 -0
  193. ml4t/diagnostic/utils/config.py +542 -0
  194. ml4t/diagnostic/utils/dependencies.py +318 -0
  195. ml4t/diagnostic/utils/sessions.py +127 -0
  196. ml4t/diagnostic/validation/__init__.py +54 -0
  197. ml4t/diagnostic/validation/dataframe.py +274 -0
  198. ml4t/diagnostic/validation/returns.py +280 -0
  199. ml4t/diagnostic/validation/timeseries.py +299 -0
  200. ml4t/diagnostic/visualization/AGENT.md +19 -0
  201. ml4t/diagnostic/visualization/__init__.py +223 -0
  202. ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
  203. ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
  204. ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
  205. ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
  206. ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
  207. ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
  208. ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
  209. ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
  210. ml4t/diagnostic/visualization/barrier_plots.py +782 -0
  211. ml4t/diagnostic/visualization/core.py +1060 -0
  212. ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
  213. ml4t/diagnostic/visualization/dashboards/base.py +582 -0
  214. ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
  215. ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
  216. ml4t/diagnostic/visualization/dashboards.py +43 -0
  217. ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
  218. ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
  219. ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
  220. ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
  221. ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
  222. ml4t/diagnostic/visualization/feature_plots.py +888 -0
  223. ml4t/diagnostic/visualization/interaction_plots.py +618 -0
  224. ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
  225. ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
  226. ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
  227. ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
  228. ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
  229. ml4t/diagnostic/visualization/report_generation.py +1343 -0
  230. ml4t/diagnostic/visualization/signal/__init__.py +103 -0
  231. ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
  232. ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
  233. ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
  234. ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
  235. ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
  236. ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
  237. ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
  238. ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
  239. ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
  240. ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
  241. ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
  242. ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,671 @@
1
+ """ML4T Backtest integration contract for backtest evaluation and comparison.
2
+
3
+ This module defines the API contract between ML4T Diagnostic and ML4T Backtest for:
4
+ 1. Exporting evaluation results to backtest storage
5
+ 2. Comparing live vs backtest performance (Bayesian comparison)
6
+ 3. Supporting paper vs live promotion workflows
7
+
8
+ Example workflow - Backtest evaluation export:
9
+ >>> from ml4t.diagnostic.evaluation import PortfolioEvaluator
10
+ >>> from ml4t.diagnostic.integration import EvaluationExport
11
+ >>>
12
+ >>> # 1. Evaluate backtest results
13
+ >>> evaluator = PortfolioEvaluator(config)
14
+ >>> results = evaluator.evaluate(returns_df)
15
+ >>>
16
+ >>> # 2. Export for ML4T Backtest storage
17
+ >>> export = results.to_backtest_export(
18
+ ... strategy_id="momentum_v1",
19
+ ... environment="backtest"
20
+ ... )
21
+ >>>
22
+ >>> # 3. Store in ML4T Backtest database
23
+ >>> # backtest_engine.store_evaluation(export.to_dict())
24
+
25
+ Example workflow - Live vs Backtest comparison:
26
+ >>> from ml4t.diagnostic.integration import ComparisonRequest
27
+ >>>
28
+ >>> # 1. Create comparison request
29
+ >>> request = ComparisonRequest(
30
+ ... strategy_id="momentum_v1",
31
+ ... backtest_export=backtest_export,  # EvaluationExport for the backtest run
31
+ ... live_export=live_export,  # EvaluationExport for the live/paper run
33
+ ... comparison_type="bayesian"
34
+ ... )
35
+ >>>
36
+ >>> # 2. Run Bayesian comparison
37
+ >>> from ml4t.diagnostic.evaluation import BayesianComparison
38
+ >>> comparison = BayesianComparison.from_request(request)
39
+ >>> result = comparison.compare()
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ from datetime import UTC, datetime, timedelta
45
+ from enum import Enum
46
+ from typing import Any, Literal
47
+
48
+ from pydantic import BaseModel, Field, field_validator, model_validator
49
+
50
+
51
class EnvironmentType(str, Enum):
    """Environment in which a strategy was executed.

    Members are ``str`` subclasses, so each one compares equal to its
    lowercase string value.
    """

    # Historical simulation
    BACKTEST = "backtest"
    # Forward testing with simulated execution
    PAPER = "paper"
    # Real trading with real capital
    LIVE = "live"
62
+
63
+
64
class ComparisonType(str, Enum):
    """Statistical method used for a performance comparison.

    Members are ``str`` subclasses, so each one compares equal to its
    lowercase string value.
    """

    # Bayesian hypothesis testing (recommended)
    BAYESIAN = "bayesian"
    # Bootstrap confidence intervals
    BOOTSTRAP = "bootstrap"
    # T-test and F-test (assumes normality)
    PARAMETRIC = "parametric"
    # CUSUM drift detection
    CUSUM = "cusum"
77
+
78
+
79
class TradeRecord(BaseModel):
    """Individual trade record for trade-level SHAP diagnostics.

    This schema represents a single completed trade from a backtest or live
    trading. Used by ml4t-diagnostic for trade-level analysis, SHAP
    attribution, and error pattern clustering.

    Required Fields:
        timestamp: Trade exit timestamp (when position was closed)
        symbol: Asset symbol (e.g., "AAPL", "BTC-USD"); must be non-empty
        entry_price: Average entry price (> 0)
        exit_price: Average exit price (> 0)
        pnl: Realized profit/loss (in quote currency, may be negative)
        duration: Time between entry and exit (> 0)

    Optional Fields:
        direction: Trade direction ("long" or "short")
        metadata: Arbitrary metadata (e.g., entry signals, regime info)
        regime_info: Market regime at time of trade
        quantity: Position size (> 0)
        entry_timestamp: When position was opened
        fees: Total transaction fees (>= 0)
        slippage: Estimated or actual slippage (>= 0)

    Validation:
        - Duration is strictly positive.
        - Prices and quantity are strictly positive; fees/slippage are
          non-negative.
        - When both ``direction`` and ``quantity`` are given, ``pnl`` must
          match the price move times quantity, *net of* ``fees`` and
          ``slippage``, within 1% of the expected value plus 0.01.
        - When ``entry_timestamp`` is given, it must precede ``timestamp``
          and ``duration`` must agree with their difference to within
          one second.

    Example - Simple long trade:
        >>> from datetime import datetime, timedelta
        >>> trade = TradeRecord(
        ...     timestamp=datetime(2024, 1, 15, 10, 30),
        ...     symbol="AAPL",
        ...     entry_price=150.00,
        ...     exit_price=155.00,
        ...     pnl=500.00,  # (155-150) * 100 shares
        ...     duration=timedelta(days=5),
        ...     direction="long",
        ...     quantity=100
        ... )

    Example - Short trade with metadata:
        >>> trade = TradeRecord(
        ...     timestamp=datetime(2024, 2, 1, 14, 0),
        ...     symbol="BTC-USD",
        ...     entry_price=45000.0,
        ...     exit_price=44000.0,
        ...     pnl=930.0,  # (45000-44000) * 1 BTC - 50 fees - 20 slippage
        ...     duration=timedelta(hours=6),
        ...     direction="short",
        ...     quantity=1.0,
        ...     metadata={
        ...         "entry_signal": "momentum_reversal",
        ...         "volatility_regime": "high",
        ...         "market_regime": "trending_down"
        ...     },
        ...     fees=50.0,
        ...     slippage=20.0
        ... )

    Example - For SHAP diagnostics workflow:
        >>> # 1. Extract worst trades from backtest
        >>> worst_trades = [t for t in all_trades if t.pnl < threshold]
        >>>
        >>> # 2. Analyze with SHAP
        >>> from ml4t.diagnostic.evaluation import TradeShapAnalyzer
        >>> analyzer = TradeShapAnalyzer(model, features)
        >>> patterns = analyzer.explain_worst_trades(worst_trades)
        >>>
        >>> # 3. Get actionable hypotheses
        >>> for pattern in patterns:
        ...     print(pattern.hypothesis)
        ...     print(pattern.actions)
    """

    # Required fields
    timestamp: datetime = Field(
        ...,
        description="Trade exit timestamp (when position was closed)",
    )
    symbol: str = Field(
        ...,
        min_length=1,
        description="Asset symbol (e.g., 'AAPL', 'BTC-USD', 'ES_F')",
    )
    entry_price: float = Field(
        ...,
        gt=0.0,
        description="Average entry price (must be positive)",
    )
    exit_price: float = Field(
        ...,
        gt=0.0,
        description="Average exit price (must be positive)",
    )
    pnl: float = Field(
        ...,
        description="Realized profit/loss in quote currency (can be negative)",
    )
    duration: timedelta = Field(
        ...,
        description="Time between entry and exit (must be positive)",
    )

    # Optional fields
    direction: Literal["long", "short"] | None = Field(
        None,
        description="Trade direction (long=buy then sell, short=sell then buy)",
    )
    metadata: dict[str, Any] | None = Field(
        None,
        description="Arbitrary metadata (signals, regime info, stop loss triggers, etc.)",
    )
    regime_info: dict[str, str] | None = Field(
        None,
        description="Market regime at trade time (e.g., {'volatility': 'high', 'trend': 'up'})",
    )
    quantity: float | None = Field(
        None,
        gt=0.0,
        description="Position size (number of shares/contracts/coins)",
    )
    entry_timestamp: datetime | None = Field(
        None,
        description="Position entry timestamp (if available)",
    )
    fees: float | None = Field(
        None,
        ge=0.0,
        description="Total transaction fees (commissions + exchange fees)",
    )
    slippage: float | None = Field(
        None,
        ge=0.0,
        description="Estimated or actual slippage cost",
    )

    @field_validator("duration")
    @classmethod
    def validate_duration_positive(cls, v: timedelta) -> timedelta:
        """Ensure duration is strictly positive.

        Raises:
            ValueError: If ``v`` is zero or negative.
        """
        if v.total_seconds() <= 0:
            raise ValueError(f"Duration must be positive, got {v}")
        return v

    @model_validator(mode="after")
    def validate_pnl_consistency(self) -> TradeRecord:
        """Validate PnL is consistent with prices, direction and costs.

        The check only runs when both ``direction`` and ``quantity`` are
        set. The expected PnL is the signed price move times quantity,
        minus ``fees`` and ``slippage`` (each treated as 0 when missing).

        Raises:
            ValueError: If ``pnl`` differs from the expected value by more
                than 1% of the expected value plus 0.01.
        """
        if self.direction is None or self.quantity is None:
            # Cannot validate without direction and quantity
            return self

        # Calculate expected PnL from price difference
        price_diff = self.exit_price - self.entry_price

        if self.direction == "long":
            expected_pnl = price_diff * self.quantity
        else:  # short
            expected_pnl = -price_diff * self.quantity

        # pnl is expected to be net of costs
        total_costs = (self.fees or 0.0) + (self.slippage or 0.0)
        expected_pnl -= total_costs

        # Allow 1% tolerance for rounding and other small discrepancies
        tolerance = abs(expected_pnl) * 0.01 + 0.01  # Minimum 1 cent tolerance

        actual_diff = abs(self.pnl - expected_pnl)
        if actual_diff > tolerance:
            raise ValueError(
                f"PnL inconsistent with prices. "
                f"Expected ~{expected_pnl:.2f} (from prices), got {self.pnl:.2f}. "
                f"Difference: {actual_diff:.2f}, tolerance: {tolerance:.2f}. "
                f"Check direction, quantity, fees, or slippage."
            )

        return self

    @model_validator(mode="after")
    def validate_timestamps(self) -> TradeRecord:
        """Validate timestamp ordering if entry_timestamp is provided.

        Raises:
            ValueError: If the entry timestamp is not strictly before the
                exit timestamp, or if ``duration`` differs from
                ``timestamp - entry_timestamp`` by more than one second.
        """
        if self.entry_timestamp is not None:
            # NOTE(review): ordering a naive datetime against an aware one
            # raises TypeError in Python; both timestamps are assumed to
            # share the same awareness - confirm against callers.
            if self.entry_timestamp >= self.timestamp:
                raise ValueError(
                    f"Entry timestamp ({self.entry_timestamp}) must be before exit timestamp ({self.timestamp})"
                )

            # Verify duration matches timestamps
            calculated_duration = self.timestamp - self.entry_timestamp
            # Allow 1 second tolerance for rounding
            if abs((calculated_duration - self.duration).total_seconds()) > 1.0:
                raise ValueError(
                    f"Duration ({self.duration}) inconsistent with timestamps. "
                    f"Calculated: {calculated_duration} from entry/exit timestamps."
                )

        return self

    def to_dict(self) -> dict[str, Any]:
        """Export to dictionary format for storage.

        Returns:
            Dictionary produced by ``model_dump(mode="json")`` with
            ``duration`` replaced by its length in seconds (float).

        Example:
            >>> trade.to_dict()
            {
                'timestamp': '2024-01-15T10:30:00',
                'symbol': 'AAPL',
                'entry_price': 150.0,
                'exit_price': 155.0,
                'pnl': 500.0,
                'duration': 432000.0,  # seconds
                'direction': 'long',
                ...
            }
        """
        data = self.model_dump(mode="json")
        # Convert timedelta to total seconds for JSON compatibility
        if "duration" in data:
            data["duration"] = self.duration.total_seconds()
        return data

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TradeRecord:
        """Create TradeRecord from dictionary.

        The input mapping is not modified; a shallow copy is made before
        the ``duration`` value is converted.

        Args:
            data: Dictionary with trade data (from to_dict() or ML4T Backtest).
                A numeric ``duration`` is interpreted as seconds.

        Returns:
            TradeRecord instance

        Example:
            >>> data = {
            ...     'timestamp': '2024-01-15T10:30:00',
            ...     'symbol': 'AAPL',
            ...     'entry_price': 150.0,
            ...     'exit_price': 155.0,
            ...     'pnl': 500.0,
            ...     'duration': 432000.0  # seconds
            ... }
            >>> trade = TradeRecord.from_dict(data)
        """
        # Work on a copy so the caller's dict keeps its original values
        # (the original code overwrote data["duration"] in place).
        payload = dict(data)
        duration = payload.get("duration")
        # Convert duration from seconds if needed
        if isinstance(duration, int | float):
            payload["duration"] = timedelta(seconds=duration)
        return cls(**payload)
338
+
339
+
340
class StrategyMetadata(BaseModel):
    """Descriptive metadata for the strategy under evaluation.

    Gives ML4T Backtest the context needed to track evaluations across
    versions, environments, and time periods.

    Attributes:
        strategy_id: Unique strategy identifier (e.g., "momentum_v1"); required
        version: Strategy version (e.g., "1.2.3"); optional
        environment: Execution environment (backtest/paper/live); required
        start_date: Evaluation period start; required
        end_date: Evaluation period end; required
        config_hash: Hash of strategy configuration for reproducibility; optional
        description: Human-readable description; optional
        tags: String-to-string tags (e.g., {'asset_class': 'crypto'}); optional

    Example:
        >>> metadata = StrategyMetadata(
        ...     strategy_id="momentum_rsi",
        ...     version="1.0.0",
        ...     environment=EnvironmentType.BACKTEST,
        ...     start_date=datetime(2020, 1, 1),
        ...     end_date=datetime(2023, 12, 31)
        ... )
    """

    # Required fields (no default supplied)
    strategy_id: str = Field(description="Unique strategy identifier")
    version: str | None = Field(default=None, description="Strategy version (semver)")
    environment: EnvironmentType = Field(description="Execution environment")
    start_date: datetime = Field(description="Evaluation period start")
    end_date: datetime = Field(description="Evaluation period end")
    config_hash: str | None = Field(
        default=None,
        description="Strategy config hash for reproducibility",
    )
    description: str | None = Field(default=None, description="Human-readable description")
    tags: dict[str, str] | None = Field(
        default=None,
        description="Optional tags (e.g., {'asset_class': 'crypto'})",
    )
375
+
376
+
377
class EvaluationExport(BaseModel):
    """Complete evaluation results for ML4T Backtest storage.

    This is the primary export format for storing ML4T Diagnostic results in
    ML4T Backtest's database. Contains all metrics, metadata, and diagnostics.

    Attributes:
        metadata: Strategy metadata (ID, version, environment)
        metrics: Core performance metrics (Sharpe, CAGR, drawdown, etc.)
        diagnostics: Optional diagnostic results (stationarity, correlation, etc.)
        sharpe_framework: Optional enhanced Sharpe results (PSR, DSR, etc.)
        timestamp: Evaluation timestamp; defaults to the current
            timezone-aware UTC time when omitted
        diagnostic_version: ML4T Diagnostic library version for compatibility tracking

    Example:
        >>> export = EvaluationExport(
        ...     metadata=metadata,
        ...     metrics={
        ...         "sharpe_ratio": 1.85,
        ...         "cagr": 0.24,
        ...         "max_drawdown": -0.18
        ...     },
        ...     timestamp=datetime.now(tz=UTC)
        ... )
        >>> backtest_engine.store_evaluation(export.to_dict())
    """

    metadata: StrategyMetadata = Field(..., description="Strategy metadata")
    metrics: dict[str, float] = Field(..., description="Core performance metrics")
    diagnostics: dict[str, dict] | None = Field(None, description="Optional diagnostic results")
    sharpe_framework: dict[str, float] | None = Field(
        None, description="Enhanced Sharpe results (PSR, DSR, MinTRL)"
    )
    # Timezone-aware default; a naive datetime.utcnow() would lose the UTC marker.
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(tz=UTC),
        description="Evaluation timestamp (UTC)",
    )
    diagnostic_version: str | None = Field(
        None, description="ML4T Diagnostic version for compatibility"
    )

    def to_dict(self) -> dict[str, Any]:
        """Export to ML4T Backtest-compatible dictionary format.

        Returns:
            The result of ``model_dump(mode="json")`` for this model.

        Example:
            >>> export.to_dict()
            {
                'metadata': {
                    'strategy_id': 'momentum_v1',
                    'environment': 'backtest',
                    ...
                },
                'metrics': {...},
                'timestamp': '2024-11-03T12:00:00Z'
            }
        """
        return self.model_dump(mode="json")

    def to_json(self) -> str:
        """Export to JSON string for storage.

        Returns:
            JSON string representation, indented by two spaces

        Example:
            >>> json_str = export.to_json()
            >>> # Store in database or file
            >>> with open('evaluation.json', 'w') as f:
            ...     f.write(json_str)
        """
        return self.model_dump_json(indent=2)
454
+
455
+
456
class ComparisonRequest(BaseModel):
    """Input bundle for comparing one strategy's results across environments.

    Pairs a backtest evaluation with a live/paper evaluation so that a
    statistical comparison (Bayesian by default) can be run on them.

    Attributes:
        strategy_id: Strategy being compared; required
        backtest_export: Backtest evaluation results; required
        live_export: Live/paper evaluation results; required
        comparison_type: Type of statistical comparison
            (default: ComparisonType.BAYESIAN)
        confidence_level: Confidence level for tests, between 0.5 and 0.99
            inclusive (default: 0.95)
        hypothesis: Optional free-text hypothesis being tested

    Example:
        >>> request = ComparisonRequest(
        ...     strategy_id="momentum_v1",
        ...     backtest_export=backtest_results,
        ...     live_export=live_results,
        ...     comparison_type=ComparisonType.BAYESIAN,
        ...     hypothesis="live >= backtest"
        ... )
    """

    strategy_id: str = Field(description="Strategy identifier")
    backtest_export: EvaluationExport = Field(description="Backtest evaluation")
    live_export: EvaluationExport = Field(description="Live/paper evaluation")
    comparison_type: ComparisonType = Field(
        default=ComparisonType.BAYESIAN,
        description="Type of comparison",
    )
    confidence_level: float = Field(
        default=0.95,
        ge=0.5,
        le=0.99,
        description="Confidence level",
    )
    hypothesis: str | None = Field(
        default=None,
        description="Hypothesis (e.g., 'live >= backtest')",
    )
488
+
489
+
490
class ComparisonResult(BaseModel):
    """Outcome of comparing live (or paper) results against a backtest.

    Holds the decision, its confidence, and the supporting per-metric and
    statistical evidence. Intended as the input to promotion decisions.

    Attributes:
        strategy_id: Identifier of the strategy that was compared.
        comparison_type: Kind of comparison that produced this result.
        decision: Decision label (PROMOTE, REJECT, UNCERTAIN, MONITOR).
        confidence: Confidence in the decision, within [0.0, 1.0].
        metrics_comparison: Per-metric mapping of named float values;
            ``summary()`` reads the ``"backtest"``, ``"live"`` and
            ``"diff"`` keys of each entry.
        statistical_tests: Results of the statistical tests, keyed by name.
        bayesian_evidence: Optional Bayesian evidence values.
        recommendation: Human-readable recommendation text.
        timestamp: When the comparison was made; defaults to the current
            UTC time at construction.
        warnings: Optional list of warning messages.

    Example:
        >>> result = ComparisonResult(
        ...     strategy_id="momentum_v1",
        ...     comparison_type=ComparisonType.BAYESIAN,
        ...     decision="PROMOTE",
        ...     confidence=0.92,
        ...     metrics_comparison={
        ...         "sharpe_ratio": {"backtest": 1.85, "live": 1.72, "diff": -0.13}
        ...     },
        ...     statistical_tests={},
        ...     recommendation="Live performance consistent with backtest",
        ... )
    """

    strategy_id: str = Field(default=..., description="Strategy identifier")
    comparison_type: ComparisonType = Field(default=..., description="Comparison type")
    decision: str = Field(
        default=...,
        description="Decision (PROMOTE, REJECT, UNCERTAIN, MONITOR)",
    )
    confidence: float = Field(
        default=...,
        ge=0.0,
        le=1.0,
        description="Confidence in decision",
    )
    metrics_comparison: dict[str, dict[str, float]] = Field(
        default=...,
        description="Comparison of metrics (backtest vs live)",
    )
    statistical_tests: dict[str, dict] = Field(
        default=...,
        description="Statistical test results",
    )
    bayesian_evidence: dict[str, float] | None = Field(
        default=None,
        description="Bayesian evidence (BF, posterior prob)",
    )
    recommendation: str = Field(default=..., description="Human-readable recommendation")
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(tz=UTC),
        description="Comparison timestamp",
    )
    warnings: list[str] | None = Field(default=None, description="Optional warnings")

    def to_dict(self) -> dict:
        """Dump the result as a dictionary of JSON-compatible values.

        Returns:
            Dictionary holding every field of the comparison result.

        Example:
            >>> result.to_dict()
            {
                'strategy_id': 'momentum_v1',
                'decision': 'PROMOTE',
                'confidence': 0.92,
                ...
            }
        """
        as_mapping = self.model_dump(mode="json")
        return as_mapping

    def summary(self) -> str:
        """Render the comparison as a multi-line, human-readable report.

        The report contains a title and rule, the decision with its
        confidence, one row per compared metric, the recommendation, and a
        warnings section when any warnings are present. Metric entries that
        lack a ``backtest``, ``live`` or ``diff`` value are shown as 0.

        Returns:
            The report as a single newline-joined string.

        Example:
            >>> print(result.summary())
            Strategy Comparison: momentum_v1
            ==================================================
            Decision: PROMOTE (confidence: 0.92)
            <BLANKLINE>
            Metrics Comparison:
             Sharpe Ratio: 1.850 (BT) → 1.720 (Live) [Δ=-0.130]
            <BLANKLINE>
            Recommendation: Live performance consistent with backtest
        """
        header = [
            f"Strategy Comparison: {self.strategy_id}",
            "=" * 50,
            f"Decision: {self.decision} (confidence: {self.confidence:.2f})",
            "",
            "Metrics Comparison:",
        ]

        # One row per metric, with the key turned into a title-cased label.
        metric_rows = []
        for key, entry in self.metrics_comparison.items():
            backtest_value = entry.get("backtest", 0)
            live_value = entry.get("live", 0)
            delta = entry.get("diff", 0)
            label = key.replace("_", " ").title()
            metric_rows.append(
                f" {label}: {backtest_value:.3f} (BT) → {live_value:.3f} (Live) "
                f"[Δ={delta:+.3f}]"
            )

        footer = ["", f"Recommendation: {self.recommendation}"]

        # The warnings section appears only for a non-empty list.
        if self.warnings:
            footer += ["", "Warnings:"]
            footer += [f" ⚠️ {note}" for note in self.warnings]

        return "\n".join(header + metric_rows + footer)
595
+
596
+
597
class PromotionWorkflow(BaseModel):
    """Paper-to-live promotion workflow configuration.

    Describes the conditions under which a strategy may move from paper
    trading to live trading, and offers ``evaluate_promotion`` to check a
    comparison result against those conditions.

    Attributes:
        strategy_id: Identifier of the strategy being promoted.
        paper_duration_days: Minimum paper trading duration in days (>= 1).
        promotion_criteria: Named thresholds. ``evaluate_promotion`` reads
            ``"bayesian_confidence"`` plus any key prefixed with ``min_``
            or ``max_``.
        approval_required: Whether human approval is needed (default True).
        risk_limits: Optional risk limits for live trading.

    Example:
        >>> workflow = PromotionWorkflow(
        ...     strategy_id="momentum_v1",
        ...     paper_duration_days=30,
        ...     promotion_criteria={
        ...         "min_sharpe": 1.5,
        ...         "max_drawdown": -0.15,
        ...         "min_trades": 100,
        ...         "bayesian_confidence": 0.90,
        ...     },
        ...     approval_required=True,
        ... )
    """

    strategy_id: str = Field(..., description="Strategy identifier")
    paper_duration_days: int = Field(..., ge=1, description="Minimum paper trading days")
    promotion_criteria: dict[str, float] = Field(
        ..., description="Required conditions for promotion"
    )
    approval_required: bool = Field(True, description="Whether human approval needed")
    risk_limits: dict[str, float] | None = Field(None, description="Risk limits for live trading")

    def evaluate_promotion(self, comparison_result: ComparisonResult) -> bool:
        """Evaluate whether a comparison result satisfies the promotion criteria.

        Checks are applied in this order, and the first failure returns False:

        1. ``comparison_result.decision`` must equal ``"PROMOTE"``.
        2. ``comparison_result.confidence`` must be at least the
           ``"bayesian_confidence"`` criterion (0.9 when not configured).
        3. For each ``min_<name>`` / ``max_<name>`` criterion whose
           ``<name>`` is a key of ``comparison_result.metrics_comparison``,
           the metric's ``"live"`` value must be >= / <= the threshold.

        A criterion whose metric name is absent from the comparison is
        skipped. Names must match exactly after the prefix is removed, so
        ``"min_sharpe"`` looks up ``"sharpe"``, not ``"sharpe_ratio"``.
        A compared metric with no ``"live"`` value counts as a failure.

        ``paper_duration_days``, ``approval_required`` and ``risk_limits``
        are not checked by this method.

        Args:
            comparison_result: Result of the paper vs backtest comparison.

        Returns:
            True if every applicable criterion is satisfied, else False.

        Example:
            >>> workflow.evaluate_promotion(comparison_result)
            True  # Ready for promotion
        """
        if comparison_result.decision != "PROMOTE":
            return False

        min_confidence = self.promotion_criteria.get("bayesian_confidence", 0.9)
        if comparison_result.confidence < min_confidence:
            return False

        compared = comparison_result.metrics_comparison
        for criterion, threshold in self.promotion_criteria.items():
            is_floor = criterion.startswith("min_")
            is_ceiling = criterion.startswith("max_")
            if not (is_floor or is_ceiling):
                # Unprefixed keys (e.g. "bayesian_confidence") carry no bound.
                continue

            # Both prefixes are four characters long.
            entry = compared.get(criterion[4:])
            if entry is None:
                # Metric was not part of the comparison; nothing to enforce.
                continue

            live_value = entry.get("live")
            if live_value is None:
                # A bound cannot be verified without a live value, so do
                # not promote rather than raising KeyError.
                return False

            if is_floor and live_value < threshold:
                return False
            if is_ceiling and live_value > threshold:
                return False

        return True