ml4t-diagnostic 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. ml4t/diagnostic/AGENT.md +25 -0
  2. ml4t/diagnostic/__init__.py +166 -0
  3. ml4t/diagnostic/backends/__init__.py +10 -0
  4. ml4t/diagnostic/backends/adapter.py +192 -0
  5. ml4t/diagnostic/backends/polars_backend.py +899 -0
  6. ml4t/diagnostic/caching/__init__.py +40 -0
  7. ml4t/diagnostic/caching/cache.py +331 -0
  8. ml4t/diagnostic/caching/decorators.py +131 -0
  9. ml4t/diagnostic/caching/smart_cache.py +339 -0
  10. ml4t/diagnostic/config/AGENT.md +24 -0
  11. ml4t/diagnostic/config/README.md +267 -0
  12. ml4t/diagnostic/config/__init__.py +219 -0
  13. ml4t/diagnostic/config/barrier_config.py +277 -0
  14. ml4t/diagnostic/config/base.py +301 -0
  15. ml4t/diagnostic/config/event_config.py +148 -0
  16. ml4t/diagnostic/config/feature_config.py +404 -0
  17. ml4t/diagnostic/config/multi_signal_config.py +55 -0
  18. ml4t/diagnostic/config/portfolio_config.py +215 -0
  19. ml4t/diagnostic/config/report_config.py +391 -0
  20. ml4t/diagnostic/config/sharpe_config.py +202 -0
  21. ml4t/diagnostic/config/signal_config.py +206 -0
  22. ml4t/diagnostic/config/trade_analysis_config.py +310 -0
  23. ml4t/diagnostic/config/validation.py +279 -0
  24. ml4t/diagnostic/core/__init__.py +29 -0
  25. ml4t/diagnostic/core/numba_utils.py +315 -0
  26. ml4t/diagnostic/core/purging.py +372 -0
  27. ml4t/diagnostic/core/sampling.py +471 -0
  28. ml4t/diagnostic/errors/__init__.py +205 -0
  29. ml4t/diagnostic/evaluation/AGENT.md +26 -0
  30. ml4t/diagnostic/evaluation/__init__.py +437 -0
  31. ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
  32. ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
  33. ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
  34. ml4t/diagnostic/evaluation/dashboard.py +715 -0
  35. ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
  36. ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
  37. ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
  38. ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
  39. ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
  40. ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
  41. ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
  42. ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
  43. ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
  44. ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
  45. ml4t/diagnostic/evaluation/event_analysis.py +647 -0
  46. ml4t/diagnostic/evaluation/excursion.py +390 -0
  47. ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
  48. ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
  49. ml4t/diagnostic/evaluation/framework.py +935 -0
  50. ml4t/diagnostic/evaluation/metric_registry.py +255 -0
  51. ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
  52. ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
  53. ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
  54. ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
  55. ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
  56. ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
  57. ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
  58. ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
  59. ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
  60. ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
  61. ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
  62. ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
  63. ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
  64. ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
  65. ml4t/diagnostic/evaluation/multi_signal.py +550 -0
  66. ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
  67. ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
  68. ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
  69. ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
  70. ml4t/diagnostic/evaluation/report_generation.py +824 -0
  71. ml4t/diagnostic/evaluation/signal_selector.py +452 -0
  72. ml4t/diagnostic/evaluation/stat_registry.py +139 -0
  73. ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
  74. ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
  75. ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
  76. ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
  77. ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
  78. ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
  79. ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
  80. ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
  81. ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
  82. ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
  83. ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
  84. ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
  85. ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
  86. ml4t/diagnostic/evaluation/stats/moments.py +164 -0
  87. ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
  88. ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
  89. ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
  90. ml4t/diagnostic/evaluation/themes.py +330 -0
  91. ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
  92. ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
  93. ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
  94. ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
  95. ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
  96. ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
  97. ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
  98. ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
  99. ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
  100. ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
  101. ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
  102. ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
  103. ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
  104. ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
  105. ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
  106. ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
  107. ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
  108. ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
  109. ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
  110. ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
  111. ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
  112. ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
  113. ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
  114. ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
  115. ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
  116. ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
  117. ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
  118. ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
  119. ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
  120. ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
  121. ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
  122. ml4t/diagnostic/evaluation/validated_cv.py +535 -0
  123. ml4t/diagnostic/evaluation/visualization.py +1050 -0
  124. ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
  125. ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
  126. ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
  127. ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
  128. ml4t/diagnostic/integration/__init__.py +48 -0
  129. ml4t/diagnostic/integration/backtest_contract.py +671 -0
  130. ml4t/diagnostic/integration/data_contract.py +316 -0
  131. ml4t/diagnostic/integration/engineer_contract.py +226 -0
  132. ml4t/diagnostic/logging/__init__.py +77 -0
  133. ml4t/diagnostic/logging/logger.py +245 -0
  134. ml4t/diagnostic/logging/performance.py +234 -0
  135. ml4t/diagnostic/logging/progress.py +234 -0
  136. ml4t/diagnostic/logging/wandb.py +412 -0
  137. ml4t/diagnostic/metrics/__init__.py +9 -0
  138. ml4t/diagnostic/metrics/percentiles.py +128 -0
  139. ml4t/diagnostic/py.typed +1 -0
  140. ml4t/diagnostic/reporting/__init__.py +43 -0
  141. ml4t/diagnostic/reporting/base.py +130 -0
  142. ml4t/diagnostic/reporting/html_renderer.py +275 -0
  143. ml4t/diagnostic/reporting/json_renderer.py +51 -0
  144. ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
  145. ml4t/diagnostic/results/AGENT.md +24 -0
  146. ml4t/diagnostic/results/__init__.py +105 -0
  147. ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
  148. ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
  149. ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
  150. ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
  151. ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
  152. ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
  153. ml4t/diagnostic/results/barrier_results/validation.py +38 -0
  154. ml4t/diagnostic/results/base.py +177 -0
  155. ml4t/diagnostic/results/event_results.py +349 -0
  156. ml4t/diagnostic/results/feature_results.py +787 -0
  157. ml4t/diagnostic/results/multi_signal_results.py +431 -0
  158. ml4t/diagnostic/results/portfolio_results.py +281 -0
  159. ml4t/diagnostic/results/sharpe_results.py +448 -0
  160. ml4t/diagnostic/results/signal_results/__init__.py +74 -0
  161. ml4t/diagnostic/results/signal_results/ic.py +581 -0
  162. ml4t/diagnostic/results/signal_results/irtc.py +110 -0
  163. ml4t/diagnostic/results/signal_results/quantile.py +392 -0
  164. ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
  165. ml4t/diagnostic/results/signal_results/turnover.py +213 -0
  166. ml4t/diagnostic/results/signal_results/validation.py +147 -0
  167. ml4t/diagnostic/signal/AGENT.md +17 -0
  168. ml4t/diagnostic/signal/__init__.py +69 -0
  169. ml4t/diagnostic/signal/_report.py +152 -0
  170. ml4t/diagnostic/signal/_utils.py +261 -0
  171. ml4t/diagnostic/signal/core.py +275 -0
  172. ml4t/diagnostic/signal/quantile.py +148 -0
  173. ml4t/diagnostic/signal/result.py +214 -0
  174. ml4t/diagnostic/signal/signal_ic.py +129 -0
  175. ml4t/diagnostic/signal/turnover.py +182 -0
  176. ml4t/diagnostic/splitters/AGENT.md +19 -0
  177. ml4t/diagnostic/splitters/__init__.py +36 -0
  178. ml4t/diagnostic/splitters/base.py +501 -0
  179. ml4t/diagnostic/splitters/calendar.py +421 -0
  180. ml4t/diagnostic/splitters/calendar_config.py +91 -0
  181. ml4t/diagnostic/splitters/combinatorial.py +1064 -0
  182. ml4t/diagnostic/splitters/config.py +322 -0
  183. ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
  184. ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
  185. ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
  186. ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
  187. ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
  188. ml4t/diagnostic/splitters/group_isolation.py +329 -0
  189. ml4t/diagnostic/splitters/persistence.py +316 -0
  190. ml4t/diagnostic/splitters/utils.py +207 -0
  191. ml4t/diagnostic/splitters/walk_forward.py +757 -0
  192. ml4t/diagnostic/utils/__init__.py +42 -0
  193. ml4t/diagnostic/utils/config.py +542 -0
  194. ml4t/diagnostic/utils/dependencies.py +318 -0
  195. ml4t/diagnostic/utils/sessions.py +127 -0
  196. ml4t/diagnostic/validation/__init__.py +54 -0
  197. ml4t/diagnostic/validation/dataframe.py +274 -0
  198. ml4t/diagnostic/validation/returns.py +280 -0
  199. ml4t/diagnostic/validation/timeseries.py +299 -0
  200. ml4t/diagnostic/visualization/AGENT.md +19 -0
  201. ml4t/diagnostic/visualization/__init__.py +223 -0
  202. ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
  203. ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
  204. ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
  205. ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
  206. ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
  207. ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
  208. ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
  209. ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
  210. ml4t/diagnostic/visualization/barrier_plots.py +782 -0
  211. ml4t/diagnostic/visualization/core.py +1060 -0
  212. ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
  213. ml4t/diagnostic/visualization/dashboards/base.py +582 -0
  214. ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
  215. ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
  216. ml4t/diagnostic/visualization/dashboards.py +43 -0
  217. ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
  218. ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
  219. ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
  220. ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
  221. ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
  222. ml4t/diagnostic/visualization/feature_plots.py +888 -0
  223. ml4t/diagnostic/visualization/interaction_plots.py +618 -0
  224. ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
  225. ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
  226. ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
  227. ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
  228. ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
  229. ml4t/diagnostic/visualization/report_generation.py +1343 -0
  230. ml4t/diagnostic/visualization/signal/__init__.py +103 -0
  231. ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
  232. ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
  233. ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
  234. ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
  235. ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
  236. ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
  237. ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
  238. ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
  239. ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
  240. ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
  241. ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
  242. ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
+++ b/ml4t/diagnostic/evaluation/metrics/ic_statistics.py
@@ -0,0 +1,446 @@
"""IC statistical analysis: HAC-adjusted significance and decay analysis.

This module provides advanced statistical analysis for IC time series,
including autocorrelation-robust significance tests and decay analysis.
"""

import warnings
from typing import TYPE_CHECKING, Any, Union, cast

import numpy as np
import pandas as pd
import polars as pl
from scipy import stats
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.sandwich_covariance import cov_hac

from ml4t.diagnostic.evaluation.metrics.information_coefficient import compute_ic_by_horizon

if TYPE_CHECKING:
    from numpy.typing import NDArray

def compute_ic_hac_stats(
    ic_series: Union[pl.DataFrame, pd.DataFrame, "NDArray[Any]"],
    ic_col: str = "ic",
    maxlags: int | None = None,
    kernel: str = "bartlett",
    use_correction: bool = True,
) -> dict[str, float]:
    """Compute HAC-adjusted significance statistics for an IC time series.

    Uses Newey-West HAC (Heteroskedasticity and Autocorrelation Consistent)
    standard errors to account for autocorrelation in the IC time series. This
    provides robust t-statistics and p-values when IC exhibits serial correlation.

    The Newey-West estimator accounts for:

    1. Heteroskedasticity: non-constant variance of IC over time
    2. Autocorrelation: serial correlation in IC values
    3. Lag selection: automatic choice of the lag window (when maxlags is None)

    Parameters
    ----------
    ic_series : Union[pl.DataFrame, pd.DataFrame, np.ndarray]
        Time series of IC values (from compute_ic_series)
    ic_col : str, default "ic"
        Column name for IC values (if a DataFrame is passed)
    maxlags : int | None, default None
        Maximum lag for the HAC adjustment. If None, uses the Newey-West
        formula maxlags = floor(4 * (T/100)^(2/9)), where T is the sample size
    kernel : str, default "bartlett"
        Kernel function for lag weighting:
        - "bartlett": triangular kernel (Newey-West default)
        - "uniform": equal weights
        - "parzen": Parzen kernel
    use_correction : bool, default True
        Apply a small-sample correction to the standard errors

    Returns
    -------
    dict[str, float]
        Dictionary with HAC-adjusted statistics:
        - mean_ic: mean IC across the time series
        - hac_se: HAC-adjusted standard error
        - t_stat: t-statistic (mean_ic / hac_se)
        - p_value: two-tailed p-value for H0: IC = 0
        - n_periods: number of observations
        - effective_lags: number of lags used in the HAC adjustment
        - naive_se: standard OLS standard error (for comparison)
        - naive_t_stat: naive t-statistic without the HAC adjustment

    Examples
    --------
    >>> # Compute the IC series first
    >>> ic_series = compute_ic_series(pred_df, ret_df)
    >>>
    >>> # Compute HAC-adjusted statistics
    >>> stats = compute_ic_hac_stats(ic_series)
    >>> print(f"Mean IC: {stats['mean_ic']:.4f}")
    Mean IC: 0.0234
    >>> print(f"HAC t-stat: {stats['t_stat']:.2f}")
    HAC t-stat: 2.14
    >>> print(f"P-value: {stats['p_value']:.4f}")
    P-value: 0.0327
    >>> print(f"Significant: {stats['p_value'] < 0.05}")
    Significant: True
    >>>
    >>> # Compare with the naive statistics
    >>> print(f"Naive t-stat: {stats['naive_t_stat']:.2f}")
    Naive t-stat: 3.45
    >>> print(f"HAC adjustment factor: {stats['hac_se'] / stats['naive_se']:.2f}x")
    HAC adjustment factor: 1.61x

    Notes
    -----
    Interpreting the HAC adjustment:
    - HAC SE > naive SE: positive autocorrelation detected
    - HAC SE < naive SE: negative autocorrelation (rare)
    - HAC SE ~ naive SE: little autocorrelation

    The Newey-West automatic lag selection formula is:
        maxlags = floor(4 * (T/100)^(2/9))

    For example:
    - T=100 -> maxlags=4
    - T=252 -> maxlags=4
    - T=500 -> maxlags=5

    References
    ----------
    .. [1] Newey, W. K., & West, K. D. (1987). "A Simple, Positive Semi-Definite,
       Heteroskedasticity and Autocorrelation Consistent Covariance Matrix."
       Econometrica, 55(3), 703-708.
    .. [2] Andrews, D. W. K. (1991). "Heteroskedasticity and Autocorrelation
       Consistent Covariance Matrix Estimation." Econometrica, 59(3), 817-858.
    """
    # Extract IC values
    ic_values: NDArray[Any]
    if isinstance(ic_series, pl.DataFrame | pd.DataFrame):
        is_polars = isinstance(ic_series, pl.DataFrame)
        if is_polars:
            ic_values = cast(pl.DataFrame, ic_series)[ic_col].to_numpy()
        else:
            ic_values = cast(pd.DataFrame, ic_series)[ic_col].to_numpy()
    else:
        ic_values = np.asarray(ic_series).flatten()

    # Remove NaN values
    ic_clean: NDArray[Any] = ic_values[~np.isnan(ic_values)]

    # Validate sufficient data
    n = len(ic_clean)
    if n < 3:
        return {
            "mean_ic": np.nan,
            "hac_se": np.nan,
            "t_stat": np.nan,
            "p_value": np.nan,
            "n_periods": n,
            "effective_lags": 0,
            "naive_se": np.nan,
            "naive_t_stat": np.nan,
        }

    # Compute mean IC
    mean_ic = float(np.mean(ic_clean))

    # Compute the naive (OLS) standard error
    naive_var = float(np.var(ic_clean, ddof=1))  # Sample variance
    naive_se = np.sqrt(naive_var / n)  # Standard error of the mean
    naive_t_stat = mean_ic / naive_se if naive_se > 0 else np.nan

    # Determine optimal lags if not specified
    if maxlags is None:
        # Newey-West automatic lag selection: maxlags = floor(4 * (T/100)^(2/9))
        maxlags = int(np.floor(4 * (n / 100) ** (2 / 9)))
        maxlags = max(1, maxlags)  # At least 1 lag
        maxlags = min(maxlags, n // 2)  # No more than T/2

    # Fit OLS model: IC ~ constant (testing whether mean IC != 0).
    # This is equivalent to a one-sample t-test.
    exog = np.ones((n, 1))  # Just the constant term
    y = ic_clean.reshape(-1, 1)

    # Compute the HAC covariance matrix
    try:
        # Fit the OLS model
        model = OLS(y, exog)
        ols_results = model.fit()

        # Get the HAC-robust covariance matrix
        hac_cov = cov_hac(
            ols_results,
            nlags=maxlags,
            weights_func=_get_kernel_weights(kernel),
            use_correction=use_correction,
        )

        # Extract the HAC variance (a 1x1 matrix for the constant)
        hac_var = hac_cov[0, 0]
        hac_se = np.sqrt(hac_var)

    except Exception as e:
        # If the HAC computation fails, fall back to the naive SE
        warnings.warn(f"HAC computation failed ({e}); using naive SE", stacklevel=2)
        hac_se = naive_se

    # Compute the HAC-adjusted t-statistic
    t_stat = mean_ic / hac_se if hac_se > 0 else np.nan

    # Compute the two-tailed p-value using a t-distribution with n-1 degrees
    # of freedom (sf = 1 - cdf, with better precision in the tails)
    p_value = 2 * stats.t.sf(abs(t_stat), df=n - 1) if not np.isnan(t_stat) else np.nan

    return {
        "mean_ic": float(mean_ic),
        "hac_se": float(hac_se),
        "t_stat": float(t_stat),
        "p_value": float(p_value),
        "n_periods": n,
        "effective_lags": maxlags,
        "naive_se": float(naive_se),
        "naive_t_stat": float(naive_t_stat),
    }

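A quick way to sanity-check the function above is to feed it a synthetic, positively autocorrelated IC series: with positive serial correlation the HAC standard error should exceed the naive one, shrinking the t-statistic. The sketch below is illustrative only (not shipped with the wheel); the import path mirrors the file's location in the RECORD, and everything else is plain numpy.

    # Illustrative sketch: HAC vs naive SE on an AR(1) IC series.
    import numpy as np

    from ml4t.diagnostic.evaluation.metrics.ic_statistics import compute_ic_hac_stats

    rng = np.random.default_rng(0)
    T, phi = 252, 0.5  # one year of daily ICs with moderate positive autocorrelation
    ic = np.zeros(T)
    for t in range(1, T):
        ic[t] = 0.01 + phi * ic[t - 1] + 0.02 * rng.standard_normal()

    result = compute_ic_hac_stats(ic)
    # Positive autocorrelation should inflate the HAC SE relative to the naive SE.
    print(result["naive_se"], result["hac_se"])      # expect hac_se > naive_se
    print(result["naive_t_stat"], result["t_stat"])  # expect |t_stat| < |naive_t_stat|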
def _get_kernel_weights(kernel: str):
    """Get the kernel weight function for HAC estimation.

    Parameters
    ----------
    kernel : str
        Kernel name: "bartlett", "uniform", or "parzen"

    Returns
    -------
    callable
        Weight function that takes nlags and returns an array of weights
    """
    if kernel == "bartlett":
        # Bartlett kernel: weights decline linearly (Newey-West default)
        def bartlett_weights(nlags):
            return np.array([1 - h / (nlags + 1) for h in range(nlags + 1)])

        return bartlett_weights

    if kernel == "uniform":
        # Uniform kernel: equal weights
        def uniform_weights(nlags):
            return np.ones(nlags + 1)

        return uniform_weights

    if kernel == "parzen":
        # Parzen kernel: smoother decay
        def parzen_weights(nlags):
            weights = np.zeros(nlags + 1)
            for h in range(nlags + 1):
                z = h / (nlags + 1)
                if z <= 0.5:
                    weights[h] = 1 - 6 * z**2 + 6 * z**3
                else:
                    weights[h] = 2 * (1 - z) ** 3
            return weights

        return parzen_weights

    raise ValueError(f"Unknown kernel: {kernel}. Use 'bartlett', 'uniform', or 'parzen'.")

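For intuition, the Bartlett weights are easy to verify by hand: for nlags=4 they decline linearly as 1 - h/5 for h = 0..4. A minimal check (illustrative only):

    # Bartlett weights for nlags=4: 1 - h/5 for h = 0..4.
    import numpy as np

    nlags = 4
    weights = np.array([1 - h / (nlags + 1) for h in range(nlags + 1)])
    assert np.allclose(weights, [1.0, 0.8, 0.6, 0.4, 0.2])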
def compute_ic_decay(
    predictions: pl.DataFrame | pd.DataFrame,
    prices: pl.DataFrame | pd.DataFrame,
    horizons: list[int] | None = None,
    pred_col: str = "prediction",
    price_col: str = "close",
    date_col: str = "date",
    group_col: str | None = None,
    method: str = "spearman",
    estimate_half_life: bool = True,
) -> dict[str, Any]:
    """Analyze how IC decays across prediction horizons.

    Computes IC at multiple forward-looking horizons to understand how long
    predictions retain predictive power. Faster IC decay indicates shorter
    signal persistence.

    This is critical for:

    1. Determining optimal holding periods
    2. Understanding alpha decay dynamics
    3. Identifying when to retrain models
    4. Avoiding stale predictions

    Parameters
    ----------
    predictions : Union[pl.DataFrame, pd.DataFrame]
        DataFrame with predictions; must have pred_col, date_col, and optionally group_col
    prices : Union[pl.DataFrame, pd.DataFrame]
        DataFrame with prices; must have price_col, date_col, and optionally group_col
    horizons : list[int] | None, default None
        Forward horizons in days. If None, uses [1, 2, 5, 10, 21]
    pred_col : str, default "prediction"
        Column name for predictions
    price_col : str, default "close"
        Column name for prices
    date_col : str, default "date"
        Column name for dates
    group_col : str | None, default None
        Column name for grouping (e.g., "symbol" for multi-asset)
    method : str, default "spearman"
        Correlation method: "spearman" or "pearson"
    estimate_half_life : bool, default True
        Whether to estimate the IC half-life (the horizon at which IC drops
        to 50% of its initial value)

    Returns
    -------
    dict[str, Any]
        Dictionary with the decay analysis:
        - ic_by_horizon: dict mapping horizon -> IC value
        - horizons: list of horizons analyzed
        - decay_rate: exponential decay rate (if estimable)
        - half_life: estimated half-life in days (if estimate_half_life=True)
        - optimal_horizon: horizon with the highest absolute IC
        - optimal_ic: IC at the optimal horizon
        - n_observations: number of observations per horizon

    Examples
    --------
    >>> # Analyze IC decay for multi-asset predictions
    >>> decay = compute_ic_decay(
    ...     predictions=pred_df,
    ...     prices=price_df,
    ...     horizons=[1, 2, 5, 10, 21],
    ...     group_col="symbol",
    ... )
    >>> print(f"IC at 1-day: {decay['ic_by_horizon'][1]:.3f}")
    IC at 1-day: 0.045
    >>> print(f"IC at 21-day: {decay['ic_by_horizon'][21]:.3f}")
    IC at 21-day: 0.012
    >>> print(f"Half-life: {decay['half_life']:.1f} days")
    Half-life: 8.3 days
    >>> print(f"Optimal horizon: {decay['optimal_horizon']} days")
    Optimal horizon: 1 days

    Notes
    -----
    IC decay patterns:
    - Fast decay: IC drops >50% within 5 days -> high-frequency signal
    - Moderate decay: IC half-life of 5-20 days -> medium-term signal
    - Slow decay: IC half-life >20 days -> long-term signal
    - No decay: stable IC -> structural/fundamental signal

    The half-life is estimated by fitting an exponential decay:
        IC(h) = IC(0) * exp(-lambda * h)
        half_life = ln(2) / lambda

    The optimal horizon is the horizon with the maximum absolute IC, useful
    for determining the best rebalancing frequency.

    References
    ----------
    .. [1] Kakushadze, Z. (2016). "101 Formulaic Alphas." Wilmott, 2016(84), 72-81.
    """
    # Set default horizons if not provided
    if horizons is None:
        horizons = [1, 2, 5, 10, 21]

    # Ensure horizons are sorted
    horizons = sorted(horizons)

    # Compute IC for each horizon using compute_ic_by_horizon
    ic_results = compute_ic_by_horizon(
        predictions=predictions,
        prices=prices,
        horizons=horizons,
        pred_col=pred_col,
        price_col=price_col,
        date_col=date_col,
        group_col=group_col,
        method=method,
    )

    # Extract IC values and observation counts
    ic_by_horizon: dict[int, float] = {}
    n_obs_by_horizon: dict[int, int] = {}

    for horizon, ic_value in ic_results.items():
        ic_by_horizon[horizon] = ic_value
        # Note: compute_ic_by_horizon returns only IC values, not counts,
        # so we approximate n_obs from the input data.
        n_obs_by_horizon[horizon] = len(predictions)

    # Find the optimal horizon (highest absolute IC)
    optimal_ic: float
    optimal_horizon: int | None
    if ic_by_horizon:
        optimal_horizon = max(ic_by_horizon.keys(), key=lambda h: abs(ic_by_horizon[h]))
        optimal_ic = ic_by_horizon[optimal_horizon]
    else:
        optimal_horizon = None
        optimal_ic = np.nan

    # Estimate decay rate and half-life
    decay_rate = np.nan
    half_life = np.nan

    if estimate_half_life and len(ic_by_horizon) >= 2:
        # Extract horizons and IC values for fitting
        h_vals = np.array(list(ic_by_horizon.keys()))
        ic_vals = np.array([ic_by_horizon[h] for h in h_vals])

        # Remove NaN values
        valid_mask = ~np.isnan(ic_vals)
        h_vals = h_vals[valid_mask]
        ic_vals = ic_vals[valid_mask]

        if len(h_vals) >= 2 and np.all(ic_vals > 0):
            # Fit exponential decay: IC(h) = IC(0) * exp(-lambda * h).
            # Taking logs gives ln(IC(h)) = ln(IC(0)) - lambda * h,
            # a linear regression y = a + b*x with slope b = -lambda.
            try:
                log_ic = np.log(ic_vals)

                # Linear regression
                coeffs = np.polyfit(h_vals, log_ic, deg=1)
                decay_rate = -coeffs[0]  # slope is -lambda, so negate to recover lambda

                # Half-life: t_{1/2} = ln(2) / lambda
                if decay_rate > 0:
                    half_life = np.log(2) / decay_rate
                elif decay_rate < 0:
                    # A negative decay rate means IC is increasing (unusual)
                    half_life = np.inf
                else:
                    half_life = np.nan

            except (ValueError, np.linalg.LinAlgError):
                # Fitting failed (e.g., all IC values identical)
                decay_rate = np.nan
                half_life = np.nan

        elif len(h_vals) >= 2:
            # Cannot fit an exponential when the IC values are not all positive;
            # fall back to fitting the absolute values.
            try:
                abs_ic_vals = np.abs(ic_vals)
                if np.all(abs_ic_vals > 0):
                    log_abs_ic = np.log(abs_ic_vals)
                    coeffs = np.polyfit(h_vals, log_abs_ic, deg=1)
                    decay_rate = -coeffs[0]

                    half_life = np.log(2) / decay_rate if decay_rate > 0 else np.nan
            except (ValueError, np.linalg.LinAlgError):
                pass

    return {
        "ic_by_horizon": ic_by_horizon,
        "horizons": horizons,
        "decay_rate": float(decay_rate) if not np.isnan(decay_rate) else None,
        "half_life": float(half_life)
        if not np.isnan(half_life) and not np.isinf(half_life)
        else None,
        "optimal_horizon": optimal_horizon,
        "optimal_ic": optimal_ic if not np.isnan(optimal_ic) else None,
        "n_observations": n_obs_by_horizon,
    }
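As a closing illustration of the fitting step in compute_ic_decay (not a test shipped with the package): if the ICs decay exponentially at a known rate, the log-linear regression used above should recover that rate, and lambda = 0.1 implies a half-life of ln(2)/0.1, about 6.93 days.

    # Illustrative check of the log-linear half-life fit used in compute_ic_decay.
    import numpy as np

    horizons = np.array([1, 2, 5, 10, 21])
    true_lambda = 0.1
    ic_vals = 0.05 * np.exp(-true_lambda * horizons)  # IC(h) = IC(0) * exp(-lambda * h)

    coeffs = np.polyfit(horizons, np.log(ic_vals), deg=1)
    decay_rate = -coeffs[0]  # slope is -lambda
    half_life = np.log(2) / decay_rate

    assert abs(decay_rate - true_lambda) < 1e-9  # exact data -> exact recovery
    print(f"half-life = {half_life:.2f} days")  # ln(2)/0.1 = 6.93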