ml4t-diagnostic 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. ml4t/diagnostic/AGENT.md +25 -0
  2. ml4t/diagnostic/__init__.py +166 -0
  3. ml4t/diagnostic/backends/__init__.py +10 -0
  4. ml4t/diagnostic/backends/adapter.py +192 -0
  5. ml4t/diagnostic/backends/polars_backend.py +899 -0
  6. ml4t/diagnostic/caching/__init__.py +40 -0
  7. ml4t/diagnostic/caching/cache.py +331 -0
  8. ml4t/diagnostic/caching/decorators.py +131 -0
  9. ml4t/diagnostic/caching/smart_cache.py +339 -0
  10. ml4t/diagnostic/config/AGENT.md +24 -0
  11. ml4t/diagnostic/config/README.md +267 -0
  12. ml4t/diagnostic/config/__init__.py +219 -0
  13. ml4t/diagnostic/config/barrier_config.py +277 -0
  14. ml4t/diagnostic/config/base.py +301 -0
  15. ml4t/diagnostic/config/event_config.py +148 -0
  16. ml4t/diagnostic/config/feature_config.py +404 -0
  17. ml4t/diagnostic/config/multi_signal_config.py +55 -0
  18. ml4t/diagnostic/config/portfolio_config.py +215 -0
  19. ml4t/diagnostic/config/report_config.py +391 -0
  20. ml4t/diagnostic/config/sharpe_config.py +202 -0
  21. ml4t/diagnostic/config/signal_config.py +206 -0
  22. ml4t/diagnostic/config/trade_analysis_config.py +310 -0
  23. ml4t/diagnostic/config/validation.py +279 -0
  24. ml4t/diagnostic/core/__init__.py +29 -0
  25. ml4t/diagnostic/core/numba_utils.py +315 -0
  26. ml4t/diagnostic/core/purging.py +372 -0
  27. ml4t/diagnostic/core/sampling.py +471 -0
  28. ml4t/diagnostic/errors/__init__.py +205 -0
  29. ml4t/diagnostic/evaluation/AGENT.md +26 -0
  30. ml4t/diagnostic/evaluation/__init__.py +437 -0
  31. ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
  32. ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
  33. ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
  34. ml4t/diagnostic/evaluation/dashboard.py +715 -0
  35. ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
  36. ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
  37. ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
  38. ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
  39. ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
  40. ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
  41. ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
  42. ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
  43. ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
  44. ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
  45. ml4t/diagnostic/evaluation/event_analysis.py +647 -0
  46. ml4t/diagnostic/evaluation/excursion.py +390 -0
  47. ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
  48. ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
  49. ml4t/diagnostic/evaluation/framework.py +935 -0
  50. ml4t/diagnostic/evaluation/metric_registry.py +255 -0
  51. ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
  52. ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
  53. ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
  54. ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
  55. ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
  56. ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
  57. ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
  58. ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
  59. ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
  60. ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
  61. ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
  62. ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
  63. ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
  64. ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
  65. ml4t/diagnostic/evaluation/multi_signal.py +550 -0
  66. ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
  67. ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
  68. ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
  69. ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
  70. ml4t/diagnostic/evaluation/report_generation.py +824 -0
  71. ml4t/diagnostic/evaluation/signal_selector.py +452 -0
  72. ml4t/diagnostic/evaluation/stat_registry.py +139 -0
  73. ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
  74. ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
  75. ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
  76. ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
  77. ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
  78. ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
  79. ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
  80. ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
  81. ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
  82. ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
  83. ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
  84. ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
  85. ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
  86. ml4t/diagnostic/evaluation/stats/moments.py +164 -0
  87. ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
  88. ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
  89. ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
  90. ml4t/diagnostic/evaluation/themes.py +330 -0
  91. ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
  92. ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
  93. ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
  94. ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
  95. ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
  96. ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
  97. ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
  98. ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
  99. ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
  100. ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
  101. ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
  102. ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
  103. ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
  104. ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
  105. ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
  106. ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
  107. ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
  108. ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
  109. ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
  110. ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
  111. ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
  112. ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
  113. ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
  114. ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
  115. ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
  116. ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
  117. ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
  118. ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
  119. ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
  120. ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
  121. ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
  122. ml4t/diagnostic/evaluation/validated_cv.py +535 -0
  123. ml4t/diagnostic/evaluation/visualization.py +1050 -0
  124. ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
  125. ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
  126. ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
  127. ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
  128. ml4t/diagnostic/integration/__init__.py +48 -0
  129. ml4t/diagnostic/integration/backtest_contract.py +671 -0
  130. ml4t/diagnostic/integration/data_contract.py +316 -0
  131. ml4t/diagnostic/integration/engineer_contract.py +226 -0
  132. ml4t/diagnostic/logging/__init__.py +77 -0
  133. ml4t/diagnostic/logging/logger.py +245 -0
  134. ml4t/diagnostic/logging/performance.py +234 -0
  135. ml4t/diagnostic/logging/progress.py +234 -0
  136. ml4t/diagnostic/logging/wandb.py +412 -0
  137. ml4t/diagnostic/metrics/__init__.py +9 -0
  138. ml4t/diagnostic/metrics/percentiles.py +128 -0
  139. ml4t/diagnostic/py.typed +1 -0
  140. ml4t/diagnostic/reporting/__init__.py +43 -0
  141. ml4t/diagnostic/reporting/base.py +130 -0
  142. ml4t/diagnostic/reporting/html_renderer.py +275 -0
  143. ml4t/diagnostic/reporting/json_renderer.py +51 -0
  144. ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
  145. ml4t/diagnostic/results/AGENT.md +24 -0
  146. ml4t/diagnostic/results/__init__.py +105 -0
  147. ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
  148. ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
  149. ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
  150. ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
  151. ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
  152. ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
  153. ml4t/diagnostic/results/barrier_results/validation.py +38 -0
  154. ml4t/diagnostic/results/base.py +177 -0
  155. ml4t/diagnostic/results/event_results.py +349 -0
  156. ml4t/diagnostic/results/feature_results.py +787 -0
  157. ml4t/diagnostic/results/multi_signal_results.py +431 -0
  158. ml4t/diagnostic/results/portfolio_results.py +281 -0
  159. ml4t/diagnostic/results/sharpe_results.py +448 -0
  160. ml4t/diagnostic/results/signal_results/__init__.py +74 -0
  161. ml4t/diagnostic/results/signal_results/ic.py +581 -0
  162. ml4t/diagnostic/results/signal_results/irtc.py +110 -0
  163. ml4t/diagnostic/results/signal_results/quantile.py +392 -0
  164. ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
  165. ml4t/diagnostic/results/signal_results/turnover.py +213 -0
  166. ml4t/diagnostic/results/signal_results/validation.py +147 -0
  167. ml4t/diagnostic/signal/AGENT.md +17 -0
  168. ml4t/diagnostic/signal/__init__.py +69 -0
  169. ml4t/diagnostic/signal/_report.py +152 -0
  170. ml4t/diagnostic/signal/_utils.py +261 -0
  171. ml4t/diagnostic/signal/core.py +275 -0
  172. ml4t/diagnostic/signal/quantile.py +148 -0
  173. ml4t/diagnostic/signal/result.py +214 -0
  174. ml4t/diagnostic/signal/signal_ic.py +129 -0
  175. ml4t/diagnostic/signal/turnover.py +182 -0
  176. ml4t/diagnostic/splitters/AGENT.md +19 -0
  177. ml4t/diagnostic/splitters/__init__.py +36 -0
  178. ml4t/diagnostic/splitters/base.py +501 -0
  179. ml4t/diagnostic/splitters/calendar.py +421 -0
  180. ml4t/diagnostic/splitters/calendar_config.py +91 -0
  181. ml4t/diagnostic/splitters/combinatorial.py +1064 -0
  182. ml4t/diagnostic/splitters/config.py +322 -0
  183. ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
  184. ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
  185. ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
  186. ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
  187. ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
  188. ml4t/diagnostic/splitters/group_isolation.py +329 -0
  189. ml4t/diagnostic/splitters/persistence.py +316 -0
  190. ml4t/diagnostic/splitters/utils.py +207 -0
  191. ml4t/diagnostic/splitters/walk_forward.py +757 -0
  192. ml4t/diagnostic/utils/__init__.py +42 -0
  193. ml4t/diagnostic/utils/config.py +542 -0
  194. ml4t/diagnostic/utils/dependencies.py +318 -0
  195. ml4t/diagnostic/utils/sessions.py +127 -0
  196. ml4t/diagnostic/validation/__init__.py +54 -0
  197. ml4t/diagnostic/validation/dataframe.py +274 -0
  198. ml4t/diagnostic/validation/returns.py +280 -0
  199. ml4t/diagnostic/validation/timeseries.py +299 -0
  200. ml4t/diagnostic/visualization/AGENT.md +19 -0
  201. ml4t/diagnostic/visualization/__init__.py +223 -0
  202. ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
  203. ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
  204. ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
  205. ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
  206. ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
  207. ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
  208. ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
  209. ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
  210. ml4t/diagnostic/visualization/barrier_plots.py +782 -0
  211. ml4t/diagnostic/visualization/core.py +1060 -0
  212. ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
  213. ml4t/diagnostic/visualization/dashboards/base.py +582 -0
  214. ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
  215. ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
  216. ml4t/diagnostic/visualization/dashboards.py +43 -0
  217. ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
  218. ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
  219. ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
  220. ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
  221. ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
  222. ml4t/diagnostic/visualization/feature_plots.py +888 -0
  223. ml4t/diagnostic/visualization/interaction_plots.py +618 -0
  224. ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
  225. ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
  226. ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
  227. ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
  228. ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
  229. ml4t/diagnostic/visualization/report_generation.py +1343 -0
  230. ml4t/diagnostic/visualization/signal/__init__.py +103 -0
  231. ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
  232. ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
  233. ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
  234. ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
  235. ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
  236. ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
  237. ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
  238. ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
  239. ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
  240. ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
  241. ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
  242. ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,219 @@
1
+ """Probability of Backtest Overfitting (PBO).
2
+
3
+ PBO measures the probability that a strategy selected as best in-sample
4
+ performs below median out-of-sample. A high PBO indicates overfitting.
5
+
6
+ This module is intentionally separate from DSR/Sharpe inference because
7
+ PBO is a model selection diagnostic, not a statistical inference tool.
8
+
9
+ References
10
+ ----------
11
+ Bailey, D. H., & López de Prado, M. (2014). "The Probability of Backtest
12
+ Overfitting." Journal of Computational Finance, 20(4), 39-69.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass
18
+ from typing import Any
19
+
20
+ import numpy as np
21
+
22
+
23
@dataclass(frozen=True)
class PBOResult:
    """Container for a Probability of Backtest Overfitting calculation.

    Attributes
    ----------
    pbo : float
        Probability of Backtest Overfitting (0 to 1).
    pbo_pct : float
        PBO expressed as a percentage (0 to 100).
    n_combinations : int
        Number of IS/OOS combinations evaluated.
    n_strategies : int
        Number of strategies compared.
    is_best_rank_oos_median : float
        Median OOS rank of the IS-best strategy.
    is_best_rank_oos_mean : float
        Mean OOS rank of the IS-best strategy.
    degradation_mean : float
        Average OOS performance degradation vs IS.
    degradation_std : float
        Standard deviation of the degradation.
    """

    pbo: float
    pbo_pct: float
    n_combinations: int
    n_strategies: int
    is_best_rank_oos_median: float
    is_best_rank_oos_mean: float
    degradation_mean: float
    degradation_std: float

    def interpret(self) -> str:
        """Generate human-readable interpretation."""
        # Ordered severity bands: first upper bound exceeding self.pbo wins.
        # The infinite sentinel makes the last band the fall-through case
        # (also reached for NaN, where every comparison is False).
        severity_bands = (
            (0.10, "LOW", "Strategy selection appears robust"),
            (0.30, "MODERATE", "Some overfitting risk - consider out-of-sample validation"),
            (0.50, "HIGH", "Significant overfitting risk - results may not generalize"),
            (
                float("inf"),
                "SEVERE",
                "IS selection is counterproductive - consider alternative methods",
            ),
        )
        for upper_bound, risk_level, assessment in severity_bands:
            if self.pbo < upper_bound:
                break

        return (
            f"Probability of Backtest Overfitting (PBO)\n"
            f"  PBO: {self.pbo_pct:.1f}%\n"
            f"  Risk Level: {risk_level}\n"
            f"  Assessment: {assessment}\n"
            f"\n"
            f"  Combinations: {self.n_combinations}\n"
            f"  Strategies: {self.n_strategies}\n"
            f"  IS-Best OOS Rank: {self.is_best_rank_oos_median:.1f} (median), "
            f"{self.is_best_rank_oos_mean:.1f} (mean)\n"
            f"  Performance Degradation: {self.degradation_mean:.4f} +/- {self.degradation_std:.4f}"
        )

    def to_dict(self) -> dict[str, float]:
        """Convert to dictionary."""
        # Keys mirror the dataclass fields one-to-one.
        field_names = (
            "pbo",
            "pbo_pct",
            "n_combinations",
            "n_strategies",
            "is_best_rank_oos_median",
            "is_best_rank_oos_mean",
            "degradation_mean",
            "degradation_std",
        )
        return {name: getattr(self, name) for name in field_names}
96
+
97
+
98
def compute_pbo(
    is_performance: np.ndarray[Any, np.dtype[Any]],
    oos_performance: np.ndarray[Any, np.dtype[Any]],
) -> PBOResult:
    """Compute Probability of Backtest Overfitting (PBO).

    PBO measures the probability that a strategy selected as best in-sample
    (IS) ranks in the bottom half out-of-sample (OOS). A high PBO indicates
    that the IS selection procedure is overfitting.

    Definition
    ----------
    From Bailey & López de Prado (2014):

    .. math::

        PBO = P(rank_{OOS}(\\arg\\max_{IS}) > N/2)

    Interpretation
    --------------
    - PBO = 0%: No overfitting (best IS is also best OOS)
    - PBO = 50%: Random selection (IS performance uncorrelated with OOS)
    - PBO > 50%: Severe overfitting (IS selection is counterproductive)

    Parameters
    ----------
    is_performance : np.ndarray, shape (n_combinations, n_strategies) or (n_strategies,)
        In-sample performance metrics (Sharpe, IC, returns) for each strategy.
    oos_performance : np.ndarray, shape (n_combinations, n_strategies) or (n_strategies,)
        Out-of-sample performance metrics (same structure as is_performance).

    Returns
    -------
    PBOResult
        Result object with PBO and diagnostic metrics.
        Call .interpret() for human-readable assessment.

    Raises
    ------
    ValueError
        If arrays have different shapes, are not 1-D/2-D, or fewer than
        2 strategies are provided.

    Examples
    --------
    >>> import numpy as np
    >>> # 10 CV folds, 5 strategies
    >>> is_perf = np.random.randn(10, 5)
    >>> oos_perf = np.random.randn(10, 5)
    >>> result = compute_pbo(is_perf, oos_perf)
    >>> print(result.interpret())

    References
    ----------
    Bailey, D. H., & López de Prado, M. (2014). "The Probability of Backtest
    Overfitting." Journal of Computational Finance, 20(4), 39-69.
    """
    is_performance = np.asarray(is_performance)
    oos_performance = np.asarray(oos_performance)

    if is_performance.shape != oos_performance.shape:
        raise ValueError(
            f"is_performance and oos_performance must have same shape. "
            f"Got {is_performance.shape} vs {oos_performance.shape}"
        )

    # Promote 1D input (single combination with multiple strategies) to 2D.
    if is_performance.ndim == 1:
        is_performance = is_performance.reshape(1, -1)
        oos_performance = oos_performance.reshape(1, -1)

    if is_performance.ndim != 2:
        raise ValueError(
            f"Expected 1D or 2D performance arrays, got {is_performance.ndim}D"
        )

    n_combinations, n_strategies = is_performance.shape

    if n_strategies < 2:
        raise ValueError(f"Need at least 2 strategies, got {n_strategies}")

    # Vectorized over combinations (no Python-level loop).
    # IS-best strategy per combination; argmax resolves ties to the lowest index.
    rows = np.arange(n_combinations)
    best_idx = np.argmax(is_performance, axis=1)

    # OOS rank of every strategy per combination: 1 = best, N = worst.
    # Double argsort converts values to 0-based ascending ranks.
    oos_ranks = n_strategies - np.argsort(np.argsort(oos_performance, axis=1), axis=1)
    ranks_arr = oos_ranks[rows, best_idx]

    # Degradation of the IS-best strategy: IS performance minus OOS performance.
    degrad_arr = is_performance[rows, best_idx] - oos_performance[rows, best_idx]

    # PBO = fraction of combinations where the IS-best ranks in the bottom half OOS.
    median_rank = (n_strategies + 1) / 2
    pbo = np.sum(ranks_arr > median_rank) / n_combinations

    return PBOResult(
        pbo=float(pbo),
        pbo_pct=float(pbo * 100),
        n_combinations=int(n_combinations),
        n_strategies=int(n_strategies),
        is_best_rank_oos_median=float(np.median(ranks_arr)),
        is_best_rank_oos_mean=float(np.mean(ranks_arr)),
        degradation_mean=float(np.mean(degrad_arr)),
        degradation_std=float(np.std(degrad_arr)),
    )
214
+
215
+
216
# Explicit public API: the result container and the PBO entry point.
__all__ = [
    "PBOResult",
    "compute_pbo",
]
@@ -0,0 +1,228 @@
1
+ """Bootstrap methods for statistical inference on time series data.
2
+
3
+ This module implements bootstrap methods that preserve temporal dependence
4
+ structure, which is critical for financial time series:
5
+ - Stationary bootstrap (Politis & Romano, 1994)
6
+ - Block bootstrap variants
7
+
8
+ These methods are essential for valid statistical inference when data
9
+ exhibits autocorrelation, which is common in financial returns.
10
+ """
11
+
12
+ import warnings
13
+ from typing import TYPE_CHECKING, Any, Union
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+ import polars as pl
18
+ from scipy.stats import spearmanr
19
+
20
+ from ml4t.diagnostic.backends.adapter import DataFrameAdapter
21
+
22
+ if TYPE_CHECKING:
23
+ from numpy.typing import NDArray
24
+
25
+
26
def stationary_bootstrap_ic(
    predictions: Union[pl.Series, pd.Series, "NDArray[Any]"],
    returns: Union[pl.Series, pd.Series, "NDArray[Any]"],
    n_samples: int = 1000,
    block_size: float | None = None,
    confidence_level: float = 0.95,
    return_details: bool = True,
) -> float | dict[str, Any]:
    """Bootstrap p-value and confidence interval for the information coefficient.

    Uses the stationary bootstrap of Politis & Romano (1994), which resamples
    blocks of geometrically distributed length so that the weak temporal
    dependence of the series is preserved. Relative to a HAC approximation,
    this avoids asymptotic arguments for the rank correlation and yields
    finite-sample confidence intervals.

    Parameters
    ----------
    predictions : array-like
        Model predictions or signals.
    returns : array-like
        Actual returns or target values.
    n_samples : int, default=1000
        Number of bootstrap replications.
    block_size : float, optional
        Expected block length for the stationary bootstrap; estimated from
        the autocorrelation of ``returns`` when omitted.
    confidence_level : float, default=0.95
        Coverage of the percentile confidence interval.
    return_details : bool, default=True
        When True, return a dict with IC, p-value, CI and bootstrap moments;
        otherwise return only the p-value.

    Returns
    -------
    float or dict
        p-value for the null hypothesis IC = 0, or the detail dictionary.

    References
    ----------
    Politis, D. N., & Romano, J. P. (1994). The stationary bootstrap.
    Journal of the American Statistical Association, 89(428), 1303-1313.
    """
    pred_vals = DataFrameAdapter.to_numpy(predictions).flatten()
    ret_vals = DataFrameAdapter.to_numpy(returns).flatten()

    if len(pred_vals) != len(ret_vals):
        raise ValueError("Predictions and returns must have the same length")

    # Drop observations where either side is NaN.
    keep = ~(np.isnan(pred_vals) | np.isnan(ret_vals))
    pred_vals = pred_vals[keep]
    ret_vals = ret_vals[keep]

    n = len(pred_vals)
    if n < 30:
        warnings.warn(
            f"Sample size ({n}) may be too small for reliable bootstrap inference",
            stacklevel=2,
        )

    observed_ic, _ = spearmanr(pred_vals, ret_vals)

    if np.isnan(observed_ic):
        if not return_details:
            return np.nan
        return {
            "ic": np.nan,
            "p_value": np.nan,
            "ci_lower": np.nan,
            "ci_upper": np.nan,
            "bootstrap_mean": np.nan,
            "bootstrap_std": np.nan,
        }

    if block_size is None:
        block_size = _optimal_block_size(ret_vals)

    # Null distribution: resample returns and predictions with independent
    # index draws so the cross-sectional link is broken while the serial
    # dependence within each series is kept.
    null_ics = np.zeros(n_samples)
    for k in range(n_samples):
        ret_idx = _stationary_bootstrap_indices(n, block_size)
        pred_idx = _stationary_bootstrap_indices(n, block_size)
        ic_k, _ = spearmanr(pred_vals[pred_idx], ret_vals[ret_idx])
        null_ics[k] = 0.0 if np.isnan(ic_k) else ic_k

    # Two-tailed p-value against the null distribution.
    p_value = np.mean(np.abs(null_ics) >= np.abs(observed_ic))

    # Percentile confidence interval from paired resamples (same indices for
    # both series, so the observed relationship is retained).
    paired_ics = np.zeros(n_samples)
    for k in range(n_samples):
        idx = _stationary_bootstrap_indices(n, block_size)
        ic_k, _ = spearmanr(pred_vals[idx], ret_vals[idx])
        paired_ics[k] = observed_ic if np.isnan(ic_k) else ic_k

    tail = (1 - confidence_level) / 2
    ci_lower = np.percentile(paired_ics, 100 * tail)
    ci_upper = np.percentile(paired_ics, 100 * (1 - tail))

    if not return_details:
        return float(p_value)

    return {
        "ic": float(observed_ic),
        "p_value": float(p_value),
        "ci_lower": float(ci_lower),
        "ci_upper": float(ci_upper),
        "bootstrap_mean": float(np.mean(paired_ics)),
        "bootstrap_std": float(np.std(paired_ics)),
    }
152
+
153
+
154
+ def _stationary_bootstrap_indices(n: int, block_size: float) -> "NDArray[np.int_]":
155
+ """Generate indices for one stationary bootstrap sample.
156
+
157
+ Parameters
158
+ ----------
159
+ n : int
160
+ Sample size
161
+ block_size : float
162
+ Expected block size (1/p where p is the probability of ending a block)
163
+
164
+ Returns
165
+ -------
166
+ np.ndarray
167
+ Bootstrap indices of length n
168
+ """
169
+ p = 1.0 / block_size # Probability of ending a block
170
+ indices: list[int] = []
171
+
172
+ while len(indices) < n:
173
+ # Start a new block at a random position
174
+ start_idx = np.random.randint(0, n)
175
+ # Generate block length from geometric distribution
176
+ block_length = np.random.geometric(p)
177
+ # Add indices from this block (with wrapping)
178
+ for j in range(block_length):
179
+ if len(indices) >= n:
180
+ break
181
+ indices.append((start_idx + j) % n)
182
+
183
+ return np.array(indices[:n], dtype=np.int_)
184
+
185
+
186
+ def _optimal_block_size(data: "NDArray[Any]") -> float:
187
+ """Estimate optimal block size for stationary bootstrap using autocorrelation.
188
+
189
+ Uses a simple rule based on lag-1 autocorrelation to determine block size.
190
+ Higher autocorrelation requires larger blocks to preserve dependence structure.
191
+
192
+ Parameters
193
+ ----------
194
+ data : np.ndarray
195
+ Time series data
196
+
197
+ Returns
198
+ -------
199
+ float
200
+ Optimal block size
201
+ """
202
+ n = len(data)
203
+
204
+ if n < 10:
205
+ return max(1, n // 3)
206
+
207
+ # Standardize the data
208
+ data_std = (data - np.mean(data)) / (np.std(data) + 1e-10)
209
+
210
+ # Calculate lag-1 autocorrelation
211
+ acf_1 = np.corrcoef(data_std[:-1], data_std[1:])[0, 1]
212
+
213
+ # Simple rule: block size increases with autocorrelation
214
+ if np.isnan(acf_1) or acf_1 < 0:
215
+ block_size = max(1, int(n ** (1 / 3)))
216
+ else:
217
+ # Positive autocorrelation: larger blocks needed
218
+ block_size = max(1, int(n ** (1 / 3) * (1 + 2 * acf_1)))
219
+
220
+ # Cap at n/3 to ensure reasonable variation
221
+ return min(block_size, n // 3)
222
+
223
+
224
# Module export list. The underscore-prefixed helpers are exported alongside
# the public function, presumably so tests can exercise them directly --
# NOTE(review): confirm that exposing private helpers here is intended.
__all__ = [
    "stationary_bootstrap_ic",
    "_stationary_bootstrap_indices",
    "_optimal_block_size",
]