ml4t-diagnostic 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. ml4t/diagnostic/AGENT.md +25 -0
  2. ml4t/diagnostic/__init__.py +166 -0
  3. ml4t/diagnostic/backends/__init__.py +10 -0
  4. ml4t/diagnostic/backends/adapter.py +192 -0
  5. ml4t/diagnostic/backends/polars_backend.py +899 -0
  6. ml4t/diagnostic/caching/__init__.py +40 -0
  7. ml4t/diagnostic/caching/cache.py +331 -0
  8. ml4t/diagnostic/caching/decorators.py +131 -0
  9. ml4t/diagnostic/caching/smart_cache.py +339 -0
  10. ml4t/diagnostic/config/AGENT.md +24 -0
  11. ml4t/diagnostic/config/README.md +267 -0
  12. ml4t/diagnostic/config/__init__.py +219 -0
  13. ml4t/diagnostic/config/barrier_config.py +277 -0
  14. ml4t/diagnostic/config/base.py +301 -0
  15. ml4t/diagnostic/config/event_config.py +148 -0
  16. ml4t/diagnostic/config/feature_config.py +404 -0
  17. ml4t/diagnostic/config/multi_signal_config.py +55 -0
  18. ml4t/diagnostic/config/portfolio_config.py +215 -0
  19. ml4t/diagnostic/config/report_config.py +391 -0
  20. ml4t/diagnostic/config/sharpe_config.py +202 -0
  21. ml4t/diagnostic/config/signal_config.py +206 -0
  22. ml4t/diagnostic/config/trade_analysis_config.py +310 -0
  23. ml4t/diagnostic/config/validation.py +279 -0
  24. ml4t/diagnostic/core/__init__.py +29 -0
  25. ml4t/diagnostic/core/numba_utils.py +315 -0
  26. ml4t/diagnostic/core/purging.py +372 -0
  27. ml4t/diagnostic/core/sampling.py +471 -0
  28. ml4t/diagnostic/errors/__init__.py +205 -0
  29. ml4t/diagnostic/evaluation/AGENT.md +26 -0
  30. ml4t/diagnostic/evaluation/__init__.py +437 -0
  31. ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
  32. ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
  33. ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
  34. ml4t/diagnostic/evaluation/dashboard.py +715 -0
  35. ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
  36. ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
  37. ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
  38. ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
  39. ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
  40. ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
  41. ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
  42. ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
  43. ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
  44. ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
  45. ml4t/diagnostic/evaluation/event_analysis.py +647 -0
  46. ml4t/diagnostic/evaluation/excursion.py +390 -0
  47. ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
  48. ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
  49. ml4t/diagnostic/evaluation/framework.py +935 -0
  50. ml4t/diagnostic/evaluation/metric_registry.py +255 -0
  51. ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
  52. ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
  53. ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
  54. ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
  55. ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
  56. ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
  57. ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
  58. ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
  59. ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
  60. ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
  61. ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
  62. ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
  63. ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
  64. ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
  65. ml4t/diagnostic/evaluation/multi_signal.py +550 -0
  66. ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
  67. ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
  68. ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
  69. ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
  70. ml4t/diagnostic/evaluation/report_generation.py +824 -0
  71. ml4t/diagnostic/evaluation/signal_selector.py +452 -0
  72. ml4t/diagnostic/evaluation/stat_registry.py +139 -0
  73. ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
  74. ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
  75. ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
  76. ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
  77. ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
  78. ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
  79. ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
  80. ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
  81. ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
  82. ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
  83. ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
  84. ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
  85. ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
  86. ml4t/diagnostic/evaluation/stats/moments.py +164 -0
  87. ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
  88. ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
  89. ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
  90. ml4t/diagnostic/evaluation/themes.py +330 -0
  91. ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
  92. ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
  93. ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
  94. ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
  95. ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
  96. ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
  97. ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
  98. ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
  99. ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
  100. ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
  101. ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
  102. ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
  103. ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
  104. ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
  105. ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
  106. ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
  107. ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
  108. ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
  109. ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
  110. ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
  111. ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
  112. ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
  113. ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
  114. ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
  115. ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
  116. ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
  117. ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
  118. ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
  119. ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
  120. ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
  121. ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
  122. ml4t/diagnostic/evaluation/validated_cv.py +535 -0
  123. ml4t/diagnostic/evaluation/visualization.py +1050 -0
  124. ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
  125. ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
  126. ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
  127. ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
  128. ml4t/diagnostic/integration/__init__.py +48 -0
  129. ml4t/diagnostic/integration/backtest_contract.py +671 -0
  130. ml4t/diagnostic/integration/data_contract.py +316 -0
  131. ml4t/diagnostic/integration/engineer_contract.py +226 -0
  132. ml4t/diagnostic/logging/__init__.py +77 -0
  133. ml4t/diagnostic/logging/logger.py +245 -0
  134. ml4t/diagnostic/logging/performance.py +234 -0
  135. ml4t/diagnostic/logging/progress.py +234 -0
  136. ml4t/diagnostic/logging/wandb.py +412 -0
  137. ml4t/diagnostic/metrics/__init__.py +9 -0
  138. ml4t/diagnostic/metrics/percentiles.py +128 -0
  139. ml4t/diagnostic/py.typed +1 -0
  140. ml4t/diagnostic/reporting/__init__.py +43 -0
  141. ml4t/diagnostic/reporting/base.py +130 -0
  142. ml4t/diagnostic/reporting/html_renderer.py +275 -0
  143. ml4t/diagnostic/reporting/json_renderer.py +51 -0
  144. ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
  145. ml4t/diagnostic/results/AGENT.md +24 -0
  146. ml4t/diagnostic/results/__init__.py +105 -0
  147. ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
  148. ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
  149. ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
  150. ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
  151. ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
  152. ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
  153. ml4t/diagnostic/results/barrier_results/validation.py +38 -0
  154. ml4t/diagnostic/results/base.py +177 -0
  155. ml4t/diagnostic/results/event_results.py +349 -0
  156. ml4t/diagnostic/results/feature_results.py +787 -0
  157. ml4t/diagnostic/results/multi_signal_results.py +431 -0
  158. ml4t/diagnostic/results/portfolio_results.py +281 -0
  159. ml4t/diagnostic/results/sharpe_results.py +448 -0
  160. ml4t/diagnostic/results/signal_results/__init__.py +74 -0
  161. ml4t/diagnostic/results/signal_results/ic.py +581 -0
  162. ml4t/diagnostic/results/signal_results/irtc.py +110 -0
  163. ml4t/diagnostic/results/signal_results/quantile.py +392 -0
  164. ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
  165. ml4t/diagnostic/results/signal_results/turnover.py +213 -0
  166. ml4t/diagnostic/results/signal_results/validation.py +147 -0
  167. ml4t/diagnostic/signal/AGENT.md +17 -0
  168. ml4t/diagnostic/signal/__init__.py +69 -0
  169. ml4t/diagnostic/signal/_report.py +152 -0
  170. ml4t/diagnostic/signal/_utils.py +261 -0
  171. ml4t/diagnostic/signal/core.py +275 -0
  172. ml4t/diagnostic/signal/quantile.py +148 -0
  173. ml4t/diagnostic/signal/result.py +214 -0
  174. ml4t/diagnostic/signal/signal_ic.py +129 -0
  175. ml4t/diagnostic/signal/turnover.py +182 -0
  176. ml4t/diagnostic/splitters/AGENT.md +19 -0
  177. ml4t/diagnostic/splitters/__init__.py +36 -0
  178. ml4t/diagnostic/splitters/base.py +501 -0
  179. ml4t/diagnostic/splitters/calendar.py +421 -0
  180. ml4t/diagnostic/splitters/calendar_config.py +91 -0
  181. ml4t/diagnostic/splitters/combinatorial.py +1064 -0
  182. ml4t/diagnostic/splitters/config.py +322 -0
  183. ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
  184. ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
  185. ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
  186. ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
  187. ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
  188. ml4t/diagnostic/splitters/group_isolation.py +329 -0
  189. ml4t/diagnostic/splitters/persistence.py +316 -0
  190. ml4t/diagnostic/splitters/utils.py +207 -0
  191. ml4t/diagnostic/splitters/walk_forward.py +757 -0
  192. ml4t/diagnostic/utils/__init__.py +42 -0
  193. ml4t/diagnostic/utils/config.py +542 -0
  194. ml4t/diagnostic/utils/dependencies.py +318 -0
  195. ml4t/diagnostic/utils/sessions.py +127 -0
  196. ml4t/diagnostic/validation/__init__.py +54 -0
  197. ml4t/diagnostic/validation/dataframe.py +274 -0
  198. ml4t/diagnostic/validation/returns.py +280 -0
  199. ml4t/diagnostic/validation/timeseries.py +299 -0
  200. ml4t/diagnostic/visualization/AGENT.md +19 -0
  201. ml4t/diagnostic/visualization/__init__.py +223 -0
  202. ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
  203. ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
  204. ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
  205. ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
  206. ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
  207. ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
  208. ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
  209. ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
  210. ml4t/diagnostic/visualization/barrier_plots.py +782 -0
  211. ml4t/diagnostic/visualization/core.py +1060 -0
  212. ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
  213. ml4t/diagnostic/visualization/dashboards/base.py +582 -0
  214. ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
  215. ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
  216. ml4t/diagnostic/visualization/dashboards.py +43 -0
  217. ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
  218. ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
  219. ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
  220. ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
  221. ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
  222. ml4t/diagnostic/visualization/feature_plots.py +888 -0
  223. ml4t/diagnostic/visualization/interaction_plots.py +618 -0
  224. ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
  225. ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
  226. ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
  227. ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
  228. ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
  229. ml4t/diagnostic/visualization/report_generation.py +1343 -0
  230. ml4t/diagnostic/visualization/signal/__init__.py +103 -0
  231. ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
  232. ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
  233. ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
  234. ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
  235. ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
  236. ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
  237. ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
  238. ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
  239. ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
  240. ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
  241. ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
  242. ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,129 @@
1
+ """Information Coefficient (IC) computation.
2
+
3
+ Simple, pure functions for IC analysis.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any
9
+
10
+ import numpy as np
11
+ import polars as pl
12
+ from scipy.stats import spearmanr
13
+ from scipy.stats import t as t_dist
14
+
15
+
16
def compute_ic_series(
    data: pl.DataFrame,
    period: int,
    method: str = "spearman",
    factor_col: str = "factor",
    date_col: str = "date",
    min_obs: int = 10,
) -> tuple[list[Any], list[float]]:
    """Compute the per-date Information Coefficient (IC) for one period.

    For every date, the factor values are correlated with the forward
    returns stored in the ``f"{period}D_fwd_return"`` column.

    Parameters
    ----------
    data : pl.DataFrame
        Factor data containing ``date_col``, ``factor_col`` and the
        ``f"{period}D_fwd_return"`` column.
    period : int
        Forward return period; only used to build the return column name.
    method : str, default "spearman"
        ``"spearman"`` uses ``scipy.stats.spearmanr``. Any other value
        falls back to the Pearson coefficient from ``np.corrcoef``.
    factor_col : str, default "factor"
        Factor column name.
    date_col : str, default "date"
        Date column name.
    min_obs : int, default 10
        Minimum number of rows (and of non-NaN pairs) a date must have.

    Returns
    -------
    tuple[list[Any], list[float]]
        ``(dates, ic_values)`` in ascending date order, restricted to
        dates whose IC is not NaN.
    """
    return_col = f"{period}D_fwd_return"

    valid_data = data.filter(pl.col(return_col).is_not_null())
    unique_dates = valid_data.select(date_col).unique().sort(date_col).to_series().to_list()

    # Split the frame by date once instead of re-filtering the whole frame
    # for every date, which costs O(dates * rows).
    frames_by_date: dict[Any, pl.DataFrame] = {}
    for key, frame in valid_data.partition_by(date_col, as_dict=True).items():
        # Depending on the polars version the key is a 1-tuple or a bare value.
        frames_by_date[key[0] if isinstance(key, tuple) else key] = frame

    dates: list[Any] = []
    ic_values: list[float] = []

    for date in unique_dates:
        if date is None:
            # An equality filter never matches a null date, so rows without
            # a date have never contributed an IC value; keep it that way.
            continue

        date_data = frames_by_date.get(date)
        if date_data is None or date_data.height < min_obs:
            continue

        factors = date_data[factor_col].to_numpy()
        returns = date_data[return_col].to_numpy()

        # Keep only the pairs where both factor and return are present.
        mask = ~(np.isnan(factors) | np.isnan(returns))
        if mask.sum() < min_obs:
            continue

        factors = factors[mask]
        returns = returns[mask]

        if method == "spearman":
            ic, _ = spearmanr(factors, returns)
        else:
            ic = float(np.corrcoef(factors, returns)[0, 1])

        # Constant inputs yield NaN; such dates are left out of the result.
        if not np.isnan(ic):
            dates.append(date)
            ic_values.append(float(ic))

    return dates, ic_values
80
+
81
+
82
def compute_ic_summary(
    ic_series: list[float],
) -> dict[str, float]:
    """Compute summary statistics for an IC series.

    NaN entries are discarded first, so the mean, standard deviation,
    sample size used in the t-test, and share of positive values are all
    based on the same set of observations.

    Parameters
    ----------
    ic_series : list[float]
        IC values over time.

    Returns
    -------
    dict[str, float]
        ``mean``, ``std`` (sample, ``ddof=1``), ``t_stat`` and two-sided
        ``p_value`` for the null hypothesis of zero mean, and
        ``pct_positive``. Every entry is NaN when fewer than two non-NaN
        values are available; ``t_stat`` and ``p_value`` are NaN when the
        standard deviation is not positive.
    """
    arr = np.asarray(ic_series, dtype=float)
    arr = arr[~np.isnan(arr)]
    n = int(arr.size)

    if n < 2:
        return {
            "mean": float("nan"),
            "std": float("nan"),
            "t_stat": float("nan"),
            "p_value": float("nan"),
            "pct_positive": float("nan"),
        }

    mean_ic = float(np.mean(arr))
    std_ic = float(np.std(arr, ddof=1))

    if std_ic > 0:
        t_stat = float(mean_ic / (std_ic / np.sqrt(n)))
        # The survival function avoids the cancellation in 1 - cdf, which
        # rounds to exactly 0 for large |t|.
        p_value = float(2 * t_dist.sf(abs(t_stat), df=n - 1))
    else:
        t_stat = float("nan")
        p_value = float("nan")

    pct_positive = float(np.mean(arr > 0))

    return {
        "mean": mean_ic,
        "std": std_ic,
        "t_stat": t_stat,
        "p_value": p_value,
        "pct_positive": pct_positive,
    }
127
+
128
+
129
+ __all__ = ["compute_ic_series", "compute_ic_summary"]
@@ -0,0 +1,182 @@
1
+ """Turnover and autocorrelation analysis.
2
+
3
+ Simple, pure functions for factor persistence analysis.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any
9
+
10
+ import numpy as np
11
+ import polars as pl
12
+ from scipy.stats import spearmanr
13
+
14
+
15
def compute_turnover(
    data: pl.DataFrame,
    n_quantiles: int,
    date_col: str = "date",
    asset_col: str = "asset",
    quantile_col: str = "quantile",
) -> float:
    """Compute the mean turnover rate across quantiles.

    For each quantile label ``1 .. n_quantiles`` and each pair of
    consecutive dates, turnover is ``1 - overlap / max(size_t, size_t1)``
    where ``overlap`` is the number of assets present in that quantile on
    both dates. Date pairs where the quantile is empty on either side are
    ignored. The per-quantile means are then averaged.

    Parameters
    ----------
    data : pl.DataFrame
        Data with date, asset, and quantile columns.
    n_quantiles : int
        Number of quantiles; labels are looked up as ``1 .. n_quantiles``.
    date_col, asset_col, quantile_col : str
        Column names.

    Returns
    -------
    float
        Mean turnover rate, or NaN when there are fewer than two dates or
        no quantile produced a usable date pair.
    """
    ordered_dates = data.select(date_col).unique().sort(date_col).to_series().to_list()
    if len(ordered_dates) < 2:
        return float("nan")

    # One aggregation pass: the list of assets for every (date, quantile).
    grouped = data.group_by([date_col, quantile_col]).agg(pl.col(asset_col).alias("assets"))
    grouped = grouped.sort([date_col, quantile_col])

    members: dict[tuple[Any, int], set[Any]] = {}
    for day, bucket, assets in grouped.rows():
        members[(day, bucket)] = set(assets)

    consecutive = list(zip(ordered_dates, ordered_dates[1:]))
    per_quantile_means: list[float] = []

    for bucket in range(1, n_quantiles + 1):
        rates: list[float] = []

        for day_before, day_after in consecutive:
            held_before = members.get((day_before, bucket), set())
            held_after = members.get((day_after, bucket), set())

            # A quantile that is empty on either date gives no turnover value.
            if not held_before or not held_after:
                continue

            shared = len(held_before & held_after)
            rates.append(1 - shared / max(len(held_before), len(held_after)))

        if rates:
            per_quantile_means.append(float(np.mean(rates)))

    if not per_quantile_means:
        return float("nan")
    return float(np.nanmean(per_quantile_means))
78
+
79
+
80
def compute_autocorrelation(
    data: pl.DataFrame,
    lags: list[int],
    date_col: str = "date",
    asset_col: str = "asset",
    factor_col: str = "factor",
    min_obs: int = 10,
) -> list[float]:
    """Compute factor rank autocorrelation at different lags.

    For each lag, every date is paired with the date ``lag`` positions
    later in the sorted list of unique dates. The two cross-sections are
    inner-joined on the asset column and the Spearman correlation of the
    factor values is taken. The result for a lag is the mean over all
    date pairs that have at least ``min_obs`` common assets and a non-NaN
    correlation.

    Parameters
    ----------
    data : pl.DataFrame
        Data with date, asset, and factor columns.
    lags : list[int]
        Lag values (e.g., [1, 2, 3, 4, 5]), counted in unique dates.
    date_col, asset_col, factor_col : str
        Column names.
    min_obs : int, default 10
        Minimum observations per date pair.

    Returns
    -------
    list[float]
        Autocorrelation at each lag, in the order of ``lags``. A lag with
        no usable date pair (including a lag that exceeds the number of
        available dates) yields NaN without affecting the other lags.
        An empty ``lags`` yields an empty list.
    """
    if not lags:
        return []

    unique_dates = data.select(date_col).unique().sort(date_col).to_series().to_list()
    n_dates = len(unique_dates)

    # Cache each date's cross-section with a single partitioning pass.
    date_cache: dict[Any, pl.DataFrame] = {}
    partitions = data.select([date_col, asset_col, factor_col]).partition_by(
        date_col, as_dict=True, include_key=False
    )
    for date_key, df in partitions.items():
        # Depending on the polars version the key is a 1-tuple or a bare value.
        date = date_key[0] if isinstance(date_key, tuple) else date_key
        date_cache[date] = df

    autocorrelations: list[float] = []

    for lag in lags:
        correlations: list[float] = []

        # The range is empty when the lag does not fit into the available
        # dates, so that lag simply ends up as NaN below.
        for i in range(n_dates - lag):
            data_t = date_cache[unique_dates[i]]
            data_t_lag = date_cache[unique_dates[i + lag]]

            merged = data_t.join(data_t_lag, on=asset_col, how="inner", suffix="_lag")

            if merged.height < min_obs:
                continue

            rho, _ = spearmanr(
                merged[factor_col].to_numpy(), merged[f"{factor_col}_lag"].to_numpy()
            )
            if not np.isnan(rho):
                correlations.append(float(rho))

        lag_ac = float(np.mean(correlations)) if correlations else float("nan")
        autocorrelations.append(lag_ac)

    return autocorrelations
148
+
149
+
150
def estimate_half_life(autocorrelations: list[float]) -> float | None:
    """Estimate half-life from autocorrelation decay.

    The half-life is the (linearly interpolated) lag at which the
    autocorrelation first drops below 50% of the first non-NaN value.
    NaN entries are skipped, but every remaining value keeps its true lag
    position, so a gap widens the interpolation interval instead of
    shifting later lags forward.

    Parameters
    ----------
    autocorrelations : list[float]
        Autocorrelation at lags 1, 2, 3, ... (position ``k`` is lag
        ``k + 1``).

    Returns
    -------
    float | None
        Half-life in periods, or None if fewer than two non-NaN values
        exist, the first non-NaN value is not positive, or the series
        never falls below the threshold.
    """
    points = [
        (lag, ac)
        for lag, ac in enumerate(autocorrelations, start=1)
        if not np.isnan(ac)
    ]

    if len(points) < 2 or points[0][1] <= 0:
        return None

    threshold = 0.5 * points[0][1]

    for (prev_lag, prev_ac), (lag, ac) in zip(points, points[1:]):
        if ac < threshold:
            # prev_ac >= threshold > ac here, so the denominator is positive.
            fraction = (prev_ac - threshold) / (prev_ac - ac)
            return prev_lag + (lag - prev_lag) * fraction

    return None  # Never decayed below threshold
180
+
181
+
182
+ __all__ = ["compute_turnover", "compute_autocorrelation", "estimate_half_life"]
@@ -0,0 +1,19 @@
1
+ # splitters/ - Cross-Validation
2
+
3
+ Time-series CV with purging and embargo.
4
+
5
+ ## Modules
6
+
7
+ | File | Lines | Purpose |
8
+ |------|-------|---------|
9
+ | combinatorial.py | 1064 | `CombinatorialPurgedCV` (CPCV) |
10
+ | walk_forward.py | 757 | `PurgedWalkForwardCV` |
11
+ | base.py | 501 | `BaseSplitter` abstract |
12
+ | calendar.py | 421 | `TradingCalendar` |
13
+ | config.py | 322 | Configuration classes |
14
+ | group_isolation.py | 329 | Multi-asset isolation |
15
+ | persistence.py | 316 | Fold save/load |
16
+
17
+ ## Key Classes
18
+
19
+ `CombinatorialPurgedCV`, `PurgedWalkForwardCV`, `TradingCalendar`
@@ -0,0 +1,36 @@
1
+ """Time-series cross-validation splitters with purging and embargo support.
2
+
3
+ This module provides advanced cross-validation methods designed specifically for
4
+ financial time-series data, addressing common issues like data leakage and
5
+ backtest overfitting.
6
+ """
7
+
8
+ from ml4t.diagnostic.splitters.base import BaseSplitter
9
+ from ml4t.diagnostic.splitters.combinatorial import CombinatorialPurgedCV
10
+ from ml4t.diagnostic.splitters.config import (
11
+ CombinatorialPurgedConfig,
12
+ PurgedWalkForwardConfig,
13
+ SplitterConfig,
14
+ )
15
+ from ml4t.diagnostic.splitters.persistence import (
16
+ load_config,
17
+ load_folds,
18
+ save_config,
19
+ save_folds,
20
+ verify_folds,
21
+ )
22
+ from ml4t.diagnostic.splitters.walk_forward import PurgedWalkForwardCV
23
+
24
+ __all__ = [
25
+ "BaseSplitter",
26
+ "CombinatorialPurgedCV",
27
+ "CombinatorialPurgedConfig",
28
+ "PurgedWalkForwardCV",
29
+ "PurgedWalkForwardConfig",
30
+ "SplitterConfig",
31
+ "load_config",
32
+ "load_folds",
33
+ "save_config",
34
+ "save_folds",
35
+ "verify_folds",
36
+ ]