ml4t-diagnostic 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml4t/diagnostic/AGENT.md +25 -0
- ml4t/diagnostic/__init__.py +166 -0
- ml4t/diagnostic/backends/__init__.py +10 -0
- ml4t/diagnostic/backends/adapter.py +192 -0
- ml4t/diagnostic/backends/polars_backend.py +899 -0
- ml4t/diagnostic/caching/__init__.py +40 -0
- ml4t/diagnostic/caching/cache.py +331 -0
- ml4t/diagnostic/caching/decorators.py +131 -0
- ml4t/diagnostic/caching/smart_cache.py +339 -0
- ml4t/diagnostic/config/AGENT.md +24 -0
- ml4t/diagnostic/config/README.md +267 -0
- ml4t/diagnostic/config/__init__.py +219 -0
- ml4t/diagnostic/config/barrier_config.py +277 -0
- ml4t/diagnostic/config/base.py +301 -0
- ml4t/diagnostic/config/event_config.py +148 -0
- ml4t/diagnostic/config/feature_config.py +404 -0
- ml4t/diagnostic/config/multi_signal_config.py +55 -0
- ml4t/diagnostic/config/portfolio_config.py +215 -0
- ml4t/diagnostic/config/report_config.py +391 -0
- ml4t/diagnostic/config/sharpe_config.py +202 -0
- ml4t/diagnostic/config/signal_config.py +206 -0
- ml4t/diagnostic/config/trade_analysis_config.py +310 -0
- ml4t/diagnostic/config/validation.py +279 -0
- ml4t/diagnostic/core/__init__.py +29 -0
- ml4t/diagnostic/core/numba_utils.py +315 -0
- ml4t/diagnostic/core/purging.py +372 -0
- ml4t/diagnostic/core/sampling.py +471 -0
- ml4t/diagnostic/errors/__init__.py +205 -0
- ml4t/diagnostic/evaluation/AGENT.md +26 -0
- ml4t/diagnostic/evaluation/__init__.py +437 -0
- ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
- ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
- ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
- ml4t/diagnostic/evaluation/dashboard.py +715 -0
- ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
- ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
- ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
- ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
- ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
- ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
- ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
- ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
- ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
- ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
- ml4t/diagnostic/evaluation/event_analysis.py +647 -0
- ml4t/diagnostic/evaluation/excursion.py +390 -0
- ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
- ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
- ml4t/diagnostic/evaluation/framework.py +935 -0
- ml4t/diagnostic/evaluation/metric_registry.py +255 -0
- ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
- ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
- ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
- ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
- ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
- ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
- ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
- ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
- ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
- ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
- ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
- ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
- ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
- ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
- ml4t/diagnostic/evaluation/multi_signal.py +550 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
- ml4t/diagnostic/evaluation/report_generation.py +824 -0
- ml4t/diagnostic/evaluation/signal_selector.py +452 -0
- ml4t/diagnostic/evaluation/stat_registry.py +139 -0
- ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
- ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
- ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
- ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
- ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
- ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
- ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
- ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
- ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
- ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
- ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
- ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
- ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
- ml4t/diagnostic/evaluation/stats/moments.py +164 -0
- ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
- ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
- ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
- ml4t/diagnostic/evaluation/themes.py +330 -0
- ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
- ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
- ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
- ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
- ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
- ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
- ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
- ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
- ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
- ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
- ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
- ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
- ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
- ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
- ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
- ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
- ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
- ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
- ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
- ml4t/diagnostic/evaluation/validated_cv.py +535 -0
- ml4t/diagnostic/evaluation/visualization.py +1050 -0
- ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
- ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
- ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
- ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
- ml4t/diagnostic/integration/__init__.py +48 -0
- ml4t/diagnostic/integration/backtest_contract.py +671 -0
- ml4t/diagnostic/integration/data_contract.py +316 -0
- ml4t/diagnostic/integration/engineer_contract.py +226 -0
- ml4t/diagnostic/logging/__init__.py +77 -0
- ml4t/diagnostic/logging/logger.py +245 -0
- ml4t/diagnostic/logging/performance.py +234 -0
- ml4t/diagnostic/logging/progress.py +234 -0
- ml4t/diagnostic/logging/wandb.py +412 -0
- ml4t/diagnostic/metrics/__init__.py +9 -0
- ml4t/diagnostic/metrics/percentiles.py +128 -0
- ml4t/diagnostic/py.typed +1 -0
- ml4t/diagnostic/reporting/__init__.py +43 -0
- ml4t/diagnostic/reporting/base.py +130 -0
- ml4t/diagnostic/reporting/html_renderer.py +275 -0
- ml4t/diagnostic/reporting/json_renderer.py +51 -0
- ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
- ml4t/diagnostic/results/AGENT.md +24 -0
- ml4t/diagnostic/results/__init__.py +105 -0
- ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
- ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
- ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
- ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
- ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
- ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
- ml4t/diagnostic/results/barrier_results/validation.py +38 -0
- ml4t/diagnostic/results/base.py +177 -0
- ml4t/diagnostic/results/event_results.py +349 -0
- ml4t/diagnostic/results/feature_results.py +787 -0
- ml4t/diagnostic/results/multi_signal_results.py +431 -0
- ml4t/diagnostic/results/portfolio_results.py +281 -0
- ml4t/diagnostic/results/sharpe_results.py +448 -0
- ml4t/diagnostic/results/signal_results/__init__.py +74 -0
- ml4t/diagnostic/results/signal_results/ic.py +581 -0
- ml4t/diagnostic/results/signal_results/irtc.py +110 -0
- ml4t/diagnostic/results/signal_results/quantile.py +392 -0
- ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
- ml4t/diagnostic/results/signal_results/turnover.py +213 -0
- ml4t/diagnostic/results/signal_results/validation.py +147 -0
- ml4t/diagnostic/signal/AGENT.md +17 -0
- ml4t/diagnostic/signal/__init__.py +69 -0
- ml4t/diagnostic/signal/_report.py +152 -0
- ml4t/diagnostic/signal/_utils.py +261 -0
- ml4t/diagnostic/signal/core.py +275 -0
- ml4t/diagnostic/signal/quantile.py +148 -0
- ml4t/diagnostic/signal/result.py +214 -0
- ml4t/diagnostic/signal/signal_ic.py +129 -0
- ml4t/diagnostic/signal/turnover.py +182 -0
- ml4t/diagnostic/splitters/AGENT.md +19 -0
- ml4t/diagnostic/splitters/__init__.py +36 -0
- ml4t/diagnostic/splitters/base.py +501 -0
- ml4t/diagnostic/splitters/calendar.py +421 -0
- ml4t/diagnostic/splitters/calendar_config.py +91 -0
- ml4t/diagnostic/splitters/combinatorial.py +1064 -0
- ml4t/diagnostic/splitters/config.py +322 -0
- ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
- ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
- ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
- ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
- ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
- ml4t/diagnostic/splitters/group_isolation.py +329 -0
- ml4t/diagnostic/splitters/persistence.py +316 -0
- ml4t/diagnostic/splitters/utils.py +207 -0
- ml4t/diagnostic/splitters/walk_forward.py +757 -0
- ml4t/diagnostic/utils/__init__.py +42 -0
- ml4t/diagnostic/utils/config.py +542 -0
- ml4t/diagnostic/utils/dependencies.py +318 -0
- ml4t/diagnostic/utils/sessions.py +127 -0
- ml4t/diagnostic/validation/__init__.py +54 -0
- ml4t/diagnostic/validation/dataframe.py +274 -0
- ml4t/diagnostic/validation/returns.py +280 -0
- ml4t/diagnostic/validation/timeseries.py +299 -0
- ml4t/diagnostic/visualization/AGENT.md +19 -0
- ml4t/diagnostic/visualization/__init__.py +223 -0
- ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
- ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
- ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
- ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
- ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
- ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
- ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
- ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
- ml4t/diagnostic/visualization/barrier_plots.py +782 -0
- ml4t/diagnostic/visualization/core.py +1060 -0
- ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
- ml4t/diagnostic/visualization/dashboards/base.py +582 -0
- ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
- ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
- ml4t/diagnostic/visualization/dashboards.py +43 -0
- ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
- ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
- ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
- ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
- ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
- ml4t/diagnostic/visualization/feature_plots.py +888 -0
- ml4t/diagnostic/visualization/interaction_plots.py +618 -0
- ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
- ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
- ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
- ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
- ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
- ml4t/diagnostic/visualization/report_generation.py +1343 -0
- ml4t/diagnostic/visualization/signal/__init__.py +103 -0
- ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
- ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
- ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
- ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
- ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
- ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
- ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
- ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
- ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
- ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
- ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
- ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""Probability of Backtest Overfitting (PBO).
|
|
2
|
+
|
|
3
|
+
PBO measures the probability that a strategy selected as best in-sample
|
|
4
|
+
performs below median out-of-sample. A high PBO indicates overfitting.
|
|
5
|
+
|
|
6
|
+
This module is intentionally separate from DSR/Sharpe inference because
|
|
7
|
+
PBO is a model selection diagnostic, not a statistical inference tool.
|
|
8
|
+
|
|
9
|
+
References
|
|
10
|
+
----------
|
|
11
|
+
Bailey, D. H., & López de Prado, M. (2014). "The Probability of Backtest
|
|
12
|
+
Overfitting." Journal of Computational Finance, 20(4), 39-69.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True)
class PBOResult:
    """Outcome of a Probability of Backtest Overfitting (PBO) computation.

    Attributes
    ----------
    pbo : float
        Probability of Backtest Overfitting on the [0, 1] scale.
    pbo_pct : float
        Same probability expressed as a percentage (0 to 100).
    n_combinations : int
        Number of IS/OOS combinations that were evaluated.
    n_strategies : int
        Number of competing strategies.
    is_best_rank_oos_median : float
        Median OOS rank achieved by the IS-best strategy.
    is_best_rank_oos_mean : float
        Mean OOS rank achieved by the IS-best strategy.
    degradation_mean : float
        Average IS-minus-OOS performance drop of the IS-best strategy.
    degradation_std : float
        Standard deviation of that degradation.
    """

    pbo: float
    pbo_pct: float
    n_combinations: int
    n_strategies: int
    is_best_rank_oos_median: float
    is_best_rank_oos_mean: float
    degradation_mean: float
    degradation_std: float

    def interpret(self) -> str:
        """Generate human-readable interpretation."""
        # Ordered (upper bound, level, note) bands; first match wins.
        bands = (
            (0.10, "LOW", "Strategy selection appears robust"),
            (0.30, "MODERATE", "Some overfitting risk - consider out-of-sample validation"),
            (0.50, "HIGH", "Significant overfitting risk - results may not generalize"),
        )
        risk_level = "SEVERE"
        assessment = "IS selection is counterproductive - consider alternative methods"
        for upper, level, note in bands:
            if self.pbo < upper:
                risk_level, assessment = level, note
                break

        report_lines = [
            "Probability of Backtest Overfitting (PBO)",
            f"  PBO: {self.pbo_pct:.1f}%",
            f"  Risk Level: {risk_level}",
            f"  Assessment: {assessment}",
            "",
            f"  Combinations: {self.n_combinations}",
            f"  Strategies: {self.n_strategies}",
            (
                f"  IS-Best OOS Rank: {self.is_best_rank_oos_median:.1f} (median), "
                f"{self.is_best_rank_oos_mean:.1f} (mean)"
            ),
            f"  Performance Degradation: {self.degradation_mean:.4f} +/- {self.degradation_std:.4f}",
        ]
        return "\n".join(report_lines)

    def to_dict(self) -> dict[str, float]:
        """Convert to dictionary."""
        field_names = (
            "pbo",
            "pbo_pct",
            "n_combinations",
            "n_strategies",
            "is_best_rank_oos_median",
            "is_best_rank_oos_mean",
            "degradation_mean",
            "degradation_std",
        )
        return {name: getattr(self, name) for name in field_names}
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def compute_pbo(
    is_performance: np.ndarray[Any, np.dtype[Any]],
    oos_performance: np.ndarray[Any, np.dtype[Any]],
) -> PBOResult:
    """Compute Probability of Backtest Overfitting (PBO).

    PBO measures the probability that a strategy selected as best in-sample
    performs below median out-of-sample. A high PBO indicates overfitting.

    Definition
    ----------
    From Bailey & López de Prado (2014):

    .. math::

        PBO = P(rank_{OOS}(\\arg\\max_{IS}) > N/2)

    In plain English: what's the probability that the best in-sample strategy
    ranks in the bottom half out-of-sample?

    Interpretation
    --------------
    - PBO = 0%: No overfitting (best IS is also best OOS)
    - PBO = 50%: Random selection (IS performance uncorrelated with OOS)
    - PBO > 50%: Severe overfitting (IS selection is counterproductive)

    Parameters
    ----------
    is_performance : np.ndarray, shape (n_folds, n_strategies) or (n_strategies,)
        In-sample performance metrics (Sharpe, IC, returns) for each strategy.
    oos_performance : np.ndarray, shape (n_folds, n_strategies) or (n_strategies,)
        Out-of-sample performance metrics (same structure as is_performance).

    Returns
    -------
    PBOResult
        Result object with PBO and diagnostic metrics.
        Call .interpret() for human-readable assessment.

    Raises
    ------
    ValueError
        If arrays have different shapes, are not 1D or 2D, or contain fewer
        than 2 strategies.

    Examples
    --------
    >>> import numpy as np
    >>> # 10 CV folds, 5 strategies
    >>> is_perf = np.random.randn(10, 5)
    >>> oos_perf = np.random.randn(10, 5)
    >>> result = compute_pbo(is_perf, oos_perf)
    >>> print(result.interpret())

    References
    ----------
    Bailey, D. H., & López de Prado, M. (2014). "The Probability of Backtest
    Overfitting." Journal of Computational Finance, 20(4), 39-69.
    """
    is_performance = np.asarray(is_performance)
    oos_performance = np.asarray(oos_performance)

    if is_performance.shape != oos_performance.shape:
        raise ValueError(
            f"is_performance and oos_performance must have same shape. "
            f"Got {is_performance.shape} vs {oos_performance.shape}"
        )

    # Handle 1D input (single combination with multiple strategies)
    if is_performance.ndim == 1:
        is_performance = is_performance.reshape(1, -1)
        oos_performance = oos_performance.reshape(1, -1)

    # Fail with a clear message instead of an opaque unpacking error below.
    if is_performance.ndim != 2:
        raise ValueError(
            f"Expected 1D or 2D performance arrays, got {is_performance.ndim}D"
        )

    n_combinations, n_strategies = is_performance.shape

    if n_strategies < 2:
        raise ValueError(f"Need at least 2 strategies, got {n_strategies}")

    rows = np.arange(n_combinations)

    # IS-best strategy index per combination (ties -> first occurrence,
    # matching np.argmax semantics).
    best_idx = np.argmax(is_performance, axis=1)

    # Per-row OOS ranks where 1 = best, n_strategies = worst.
    # argsort(argsort(x)) maps each value to its 0-based ascending rank.
    oos_rank_matrix = n_strategies - np.argsort(np.argsort(oos_performance, axis=1), axis=1)
    ranks_arr = oos_rank_matrix[rows, best_idx].astype(float)

    # Degradation of the IS-best strategy: IS performance minus its OOS performance.
    degrad_arr = is_performance[rows, best_idx] - oos_performance[rows, best_idx]

    # PBO = P(IS-best ranks in bottom half OOS)
    median_rank = (n_strategies + 1) / 2
    pbo = float(np.sum(ranks_arr > median_rank)) / n_combinations

    return PBOResult(
        pbo=float(pbo),
        pbo_pct=float(pbo * 100),
        n_combinations=int(n_combinations),
        n_strategies=int(n_strategies),
        is_best_rank_oos_median=float(np.median(ranks_arr)),
        is_best_rank_oos_mean=float(np.mean(ranks_arr)),
        degradation_mean=float(np.mean(degrad_arr)),
        degradation_std=float(np.std(degrad_arr)),
    )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# Explicit public API for this module.
__all__ = [
    "PBOResult",
    "compute_pbo",
]
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""Bootstrap methods for statistical inference on time series data.
|
|
2
|
+
|
|
3
|
+
This module implements bootstrap methods that preserve temporal dependence
|
|
4
|
+
structure, which is critical for financial time series:
|
|
5
|
+
- Stationary bootstrap (Politis & Romano, 1994)
|
|
6
|
+
- Block bootstrap variants
|
|
7
|
+
|
|
8
|
+
These methods are essential for valid statistical inference when data
|
|
9
|
+
exhibits autocorrelation, which is common in financial returns.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import warnings
|
|
13
|
+
from typing import TYPE_CHECKING, Any, Union
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import polars as pl
|
|
18
|
+
from scipy.stats import spearmanr
|
|
19
|
+
|
|
20
|
+
from ml4t.diagnostic.backends.adapter import DataFrameAdapter
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from numpy.typing import NDArray
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def stationary_bootstrap_ic(
    predictions: Union[pl.Series, pd.Series, "NDArray[Any]"],
    returns: Union[pl.Series, pd.Series, "NDArray[Any]"],
    n_samples: int = 1000,
    block_size: float | None = None,
    confidence_level: float = 0.95,
    return_details: bool = True,
) -> float | dict[str, Any]:
    """Calculate p-value and confidence intervals for IC using stationary bootstrap.

    This method is more rigorous than the HAC approximation as it:
    1. Preserves the temporal dependence structure of the data
    2. Does not rely on asymptotic approximations for rank correlation
    3. Provides accurate confidence intervals for finite samples

    The stationary bootstrap (Politis & Romano, 1994) generates bootstrap samples
    by resampling blocks of random length from the original data, preserving the
    weak dependence structure of the time series.

    Parameters
    ----------
    predictions : array-like
        Model predictions or signals
    returns : array-like
        Actual returns or target values
    n_samples : int, default=1000
        Number of bootstrap samples to generate
    block_size : float, optional
        Expected block size for the stationary bootstrap.
        If None, uses optimal block size based on data autocorrelation.
    confidence_level : float, default=0.95
        Confidence level for the confidence interval
    return_details : bool, default=True
        If True, returns detailed results including CI and p-value

    Returns
    -------
    float or dict
        If return_details=False: p-value for the null hypothesis (IC=0)
        If return_details=True: Dictionary containing IC, p_value, CI, etc.

    Raises
    ------
    ValueError
        If inputs have different lengths, ``n_samples < 1``, or
        ``confidence_level`` is not strictly between 0 and 1.

    References
    ----------
    Politis, D. N., & Romano, J. P. (1994). The stationary bootstrap.
    Journal of the American Statistical Association, 89(428), 1303-1313.
    """
    # Validate scalar parameters up front: bad values would otherwise produce
    # nonsense percentiles or an empty bootstrap distribution silently.
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")
    if not 0.0 < confidence_level < 1.0:
        raise ValueError(f"confidence_level must be in (0, 1), got {confidence_level}")

    # Convert to flat numpy arrays regardless of input container type.
    pred_array = DataFrameAdapter.to_numpy(predictions).flatten()
    ret_array = DataFrameAdapter.to_numpy(returns).flatten()

    if len(pred_array) != len(ret_array):
        raise ValueError("Predictions and returns must have the same length")

    # Drop any pair where either side is NaN.
    valid_mask = ~(np.isnan(pred_array) | np.isnan(ret_array))
    pred_clean = pred_array[valid_mask]
    ret_clean = ret_array[valid_mask]

    n = len(pred_clean)
    if n < 30:
        warnings.warn(
            f"Sample size ({n}) may be too small for reliable bootstrap inference",
            stacklevel=2,
        )

    # Observed (full-sample) Spearman IC.
    observed_ic, _ = spearmanr(pred_clean, ret_clean)

    if np.isnan(observed_ic):
        # Degenerate input (e.g. constant series): propagate NaN rather than fail.
        if return_details:
            return {
                "ic": np.nan,
                "p_value": np.nan,
                "ci_lower": np.nan,
                "ci_upper": np.nan,
                "bootstrap_mean": np.nan,
                "bootstrap_std": np.nan,
            }
        return np.nan

    # Determine optimal block size if not provided
    if block_size is None:
        block_size = _optimal_block_size(ret_clean)

    # Null distribution: draw *independent* block indices for predictions and
    # returns, which breaks their cross-sectional link while preserving each
    # series' own autocorrelation structure.
    bootstrap_ics_null = np.zeros(n_samples)
    for i in range(n_samples):
        boot_indices = _stationary_bootstrap_indices(n, block_size)
        boot_pred_null = pred_clean[_stationary_bootstrap_indices(n, block_size)]
        boot_ret = ret_clean[boot_indices]

        ic_boot, _ = spearmanr(boot_pred_null, boot_ret)
        bootstrap_ics_null[i] = ic_boot if not np.isnan(ic_boot) else 0.0

    # Two-tailed p-value: how often the null distribution is as extreme as observed.
    p_value = np.mean(np.abs(bootstrap_ics_null) >= np.abs(observed_ic))

    # Confidence interval: resample pred/ret *jointly* (same block indices) and
    # take percentiles of the resulting IC distribution.
    bootstrap_ics_actual = np.zeros(n_samples)
    for i in range(n_samples):
        boot_indices = _stationary_bootstrap_indices(n, block_size)
        boot_pred = pred_clean[boot_indices]
        boot_ret = ret_clean[boot_indices]
        ic_boot, _ = spearmanr(boot_pred, boot_ret)
        bootstrap_ics_actual[i] = ic_boot if not np.isnan(ic_boot) else observed_ic

    alpha = 1 - confidence_level
    ci_lower = np.percentile(bootstrap_ics_actual, 100 * alpha / 2)
    ci_upper = np.percentile(bootstrap_ics_actual, 100 * (1 - alpha / 2))

    if not return_details:
        return float(p_value)

    return {
        "ic": float(observed_ic),
        "p_value": float(p_value),
        "ci_lower": float(ci_lower),
        "ci_upper": float(ci_upper),
        "bootstrap_mean": float(np.mean(bootstrap_ics_actual)),
        "bootstrap_std": float(np.std(bootstrap_ics_actual)),
    }
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _stationary_bootstrap_indices(n: int, block_size: float) -> "NDArray[np.int_]":
|
|
155
|
+
"""Generate indices for one stationary bootstrap sample.
|
|
156
|
+
|
|
157
|
+
Parameters
|
|
158
|
+
----------
|
|
159
|
+
n : int
|
|
160
|
+
Sample size
|
|
161
|
+
block_size : float
|
|
162
|
+
Expected block size (1/p where p is the probability of ending a block)
|
|
163
|
+
|
|
164
|
+
Returns
|
|
165
|
+
-------
|
|
166
|
+
np.ndarray
|
|
167
|
+
Bootstrap indices of length n
|
|
168
|
+
"""
|
|
169
|
+
p = 1.0 / block_size # Probability of ending a block
|
|
170
|
+
indices: list[int] = []
|
|
171
|
+
|
|
172
|
+
while len(indices) < n:
|
|
173
|
+
# Start a new block at a random position
|
|
174
|
+
start_idx = np.random.randint(0, n)
|
|
175
|
+
# Generate block length from geometric distribution
|
|
176
|
+
block_length = np.random.geometric(p)
|
|
177
|
+
# Add indices from this block (with wrapping)
|
|
178
|
+
for j in range(block_length):
|
|
179
|
+
if len(indices) >= n:
|
|
180
|
+
break
|
|
181
|
+
indices.append((start_idx + j) % n)
|
|
182
|
+
|
|
183
|
+
return np.array(indices[:n], dtype=np.int_)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _optimal_block_size(data: "NDArray[Any]") -> float:
|
|
187
|
+
"""Estimate optimal block size for stationary bootstrap using autocorrelation.
|
|
188
|
+
|
|
189
|
+
Uses a simple rule based on lag-1 autocorrelation to determine block size.
|
|
190
|
+
Higher autocorrelation requires larger blocks to preserve dependence structure.
|
|
191
|
+
|
|
192
|
+
Parameters
|
|
193
|
+
----------
|
|
194
|
+
data : np.ndarray
|
|
195
|
+
Time series data
|
|
196
|
+
|
|
197
|
+
Returns
|
|
198
|
+
-------
|
|
199
|
+
float
|
|
200
|
+
Optimal block size
|
|
201
|
+
"""
|
|
202
|
+
n = len(data)
|
|
203
|
+
|
|
204
|
+
if n < 10:
|
|
205
|
+
return max(1, n // 3)
|
|
206
|
+
|
|
207
|
+
# Standardize the data
|
|
208
|
+
data_std = (data - np.mean(data)) / (np.std(data) + 1e-10)
|
|
209
|
+
|
|
210
|
+
# Calculate lag-1 autocorrelation
|
|
211
|
+
acf_1 = np.corrcoef(data_std[:-1], data_std[1:])[0, 1]
|
|
212
|
+
|
|
213
|
+
# Simple rule: block size increases with autocorrelation
|
|
214
|
+
if np.isnan(acf_1) or acf_1 < 0:
|
|
215
|
+
block_size = max(1, int(n ** (1 / 3)))
|
|
216
|
+
else:
|
|
217
|
+
# Positive autocorrelation: larger blocks needed
|
|
218
|
+
block_size = max(1, int(n ** (1 / 3) * (1 + 2 * acf_1)))
|
|
219
|
+
|
|
220
|
+
# Cap at n/3 to ensure reasonable variation
|
|
221
|
+
return min(block_size, n // 3)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# Explicit public API; the leading-underscore helpers are exported
# deliberately (e.g. for direct testing).
__all__ = [
    "stationary_bootstrap_ic",
    "_stationary_bootstrap_indices",
    "_optimal_block_size",
]
|