ml4t-diagnostic 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml4t/diagnostic/AGENT.md +25 -0
- ml4t/diagnostic/__init__.py +166 -0
- ml4t/diagnostic/backends/__init__.py +10 -0
- ml4t/diagnostic/backends/adapter.py +192 -0
- ml4t/diagnostic/backends/polars_backend.py +899 -0
- ml4t/diagnostic/caching/__init__.py +40 -0
- ml4t/diagnostic/caching/cache.py +331 -0
- ml4t/diagnostic/caching/decorators.py +131 -0
- ml4t/diagnostic/caching/smart_cache.py +339 -0
- ml4t/diagnostic/config/AGENT.md +24 -0
- ml4t/diagnostic/config/README.md +267 -0
- ml4t/diagnostic/config/__init__.py +219 -0
- ml4t/diagnostic/config/barrier_config.py +277 -0
- ml4t/diagnostic/config/base.py +301 -0
- ml4t/diagnostic/config/event_config.py +148 -0
- ml4t/diagnostic/config/feature_config.py +404 -0
- ml4t/diagnostic/config/multi_signal_config.py +55 -0
- ml4t/diagnostic/config/portfolio_config.py +215 -0
- ml4t/diagnostic/config/report_config.py +391 -0
- ml4t/diagnostic/config/sharpe_config.py +202 -0
- ml4t/diagnostic/config/signal_config.py +206 -0
- ml4t/diagnostic/config/trade_analysis_config.py +310 -0
- ml4t/diagnostic/config/validation.py +279 -0
- ml4t/diagnostic/core/__init__.py +29 -0
- ml4t/diagnostic/core/numba_utils.py +315 -0
- ml4t/diagnostic/core/purging.py +372 -0
- ml4t/diagnostic/core/sampling.py +471 -0
- ml4t/diagnostic/errors/__init__.py +205 -0
- ml4t/diagnostic/evaluation/AGENT.md +26 -0
- ml4t/diagnostic/evaluation/__init__.py +437 -0
- ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
- ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
- ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
- ml4t/diagnostic/evaluation/dashboard.py +715 -0
- ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
- ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
- ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
- ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
- ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
- ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
- ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
- ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
- ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
- ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
- ml4t/diagnostic/evaluation/event_analysis.py +647 -0
- ml4t/diagnostic/evaluation/excursion.py +390 -0
- ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
- ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
- ml4t/diagnostic/evaluation/framework.py +935 -0
- ml4t/diagnostic/evaluation/metric_registry.py +255 -0
- ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
- ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
- ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
- ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
- ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
- ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
- ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
- ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
- ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
- ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
- ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
- ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
- ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
- ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
- ml4t/diagnostic/evaluation/multi_signal.py +550 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
- ml4t/diagnostic/evaluation/report_generation.py +824 -0
- ml4t/diagnostic/evaluation/signal_selector.py +452 -0
- ml4t/diagnostic/evaluation/stat_registry.py +139 -0
- ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
- ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
- ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
- ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
- ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
- ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
- ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
- ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
- ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
- ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
- ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
- ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
- ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
- ml4t/diagnostic/evaluation/stats/moments.py +164 -0
- ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
- ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
- ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
- ml4t/diagnostic/evaluation/themes.py +330 -0
- ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
- ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
- ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
- ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
- ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
- ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
- ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
- ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
- ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
- ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
- ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
- ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
- ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
- ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
- ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
- ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
- ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
- ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
- ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
- ml4t/diagnostic/evaluation/validated_cv.py +535 -0
- ml4t/diagnostic/evaluation/visualization.py +1050 -0
- ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
- ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
- ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
- ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
- ml4t/diagnostic/integration/__init__.py +48 -0
- ml4t/diagnostic/integration/backtest_contract.py +671 -0
- ml4t/diagnostic/integration/data_contract.py +316 -0
- ml4t/diagnostic/integration/engineer_contract.py +226 -0
- ml4t/diagnostic/logging/__init__.py +77 -0
- ml4t/diagnostic/logging/logger.py +245 -0
- ml4t/diagnostic/logging/performance.py +234 -0
- ml4t/diagnostic/logging/progress.py +234 -0
- ml4t/diagnostic/logging/wandb.py +412 -0
- ml4t/diagnostic/metrics/__init__.py +9 -0
- ml4t/diagnostic/metrics/percentiles.py +128 -0
- ml4t/diagnostic/py.typed +1 -0
- ml4t/diagnostic/reporting/__init__.py +43 -0
- ml4t/diagnostic/reporting/base.py +130 -0
- ml4t/diagnostic/reporting/html_renderer.py +275 -0
- ml4t/diagnostic/reporting/json_renderer.py +51 -0
- ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
- ml4t/diagnostic/results/AGENT.md +24 -0
- ml4t/diagnostic/results/__init__.py +105 -0
- ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
- ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
- ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
- ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
- ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
- ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
- ml4t/diagnostic/results/barrier_results/validation.py +38 -0
- ml4t/diagnostic/results/base.py +177 -0
- ml4t/diagnostic/results/event_results.py +349 -0
- ml4t/diagnostic/results/feature_results.py +787 -0
- ml4t/diagnostic/results/multi_signal_results.py +431 -0
- ml4t/diagnostic/results/portfolio_results.py +281 -0
- ml4t/diagnostic/results/sharpe_results.py +448 -0
- ml4t/diagnostic/results/signal_results/__init__.py +74 -0
- ml4t/diagnostic/results/signal_results/ic.py +581 -0
- ml4t/diagnostic/results/signal_results/irtc.py +110 -0
- ml4t/diagnostic/results/signal_results/quantile.py +392 -0
- ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
- ml4t/diagnostic/results/signal_results/turnover.py +213 -0
- ml4t/diagnostic/results/signal_results/validation.py +147 -0
- ml4t/diagnostic/signal/AGENT.md +17 -0
- ml4t/diagnostic/signal/__init__.py +69 -0
- ml4t/diagnostic/signal/_report.py +152 -0
- ml4t/diagnostic/signal/_utils.py +261 -0
- ml4t/diagnostic/signal/core.py +275 -0
- ml4t/diagnostic/signal/quantile.py +148 -0
- ml4t/diagnostic/signal/result.py +214 -0
- ml4t/diagnostic/signal/signal_ic.py +129 -0
- ml4t/diagnostic/signal/turnover.py +182 -0
- ml4t/diagnostic/splitters/AGENT.md +19 -0
- ml4t/diagnostic/splitters/__init__.py +36 -0
- ml4t/diagnostic/splitters/base.py +501 -0
- ml4t/diagnostic/splitters/calendar.py +421 -0
- ml4t/diagnostic/splitters/calendar_config.py +91 -0
- ml4t/diagnostic/splitters/combinatorial.py +1064 -0
- ml4t/diagnostic/splitters/config.py +322 -0
- ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
- ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
- ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
- ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
- ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
- ml4t/diagnostic/splitters/group_isolation.py +329 -0
- ml4t/diagnostic/splitters/persistence.py +316 -0
- ml4t/diagnostic/splitters/utils.py +207 -0
- ml4t/diagnostic/splitters/walk_forward.py +757 -0
- ml4t/diagnostic/utils/__init__.py +42 -0
- ml4t/diagnostic/utils/config.py +542 -0
- ml4t/diagnostic/utils/dependencies.py +318 -0
- ml4t/diagnostic/utils/sessions.py +127 -0
- ml4t/diagnostic/validation/__init__.py +54 -0
- ml4t/diagnostic/validation/dataframe.py +274 -0
- ml4t/diagnostic/validation/returns.py +280 -0
- ml4t/diagnostic/validation/timeseries.py +299 -0
- ml4t/diagnostic/visualization/AGENT.md +19 -0
- ml4t/diagnostic/visualization/__init__.py +223 -0
- ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
- ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
- ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
- ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
- ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
- ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
- ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
- ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
- ml4t/diagnostic/visualization/barrier_plots.py +782 -0
- ml4t/diagnostic/visualization/core.py +1060 -0
- ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
- ml4t/diagnostic/visualization/dashboards/base.py +582 -0
- ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
- ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
- ml4t/diagnostic/visualization/dashboards.py +43 -0
- ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
- ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
- ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
- ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
- ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
- ml4t/diagnostic/visualization/feature_plots.py +888 -0
- ml4t/diagnostic/visualization/interaction_plots.py +618 -0
- ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
- ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
- ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
- ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
- ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
- ml4t/diagnostic/visualization/report_generation.py +1343 -0
- ml4t/diagnostic/visualization/signal/__init__.py +103 -0
- ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
- ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
- ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
- ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
- ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
- ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
- ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
- ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
- ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
- ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
- ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
- ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Information Coefficient (IC) computation.
|
|
2
|
+
|
|
3
|
+
Simple, pure functions for IC analysis.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import polars as pl
|
|
12
|
+
from scipy.stats import spearmanr
|
|
13
|
+
from scipy.stats import t as t_dist
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def compute_ic_series(
    data: pl.DataFrame,
    period: int,
    method: str = "spearman",
    factor_col: str = "factor",
    date_col: str = "date",
    min_obs: int = 10,
) -> tuple[list[Any], list[float]]:
    """Build the per-date information coefficient (IC) series for one horizon.

    For every distinct date, the factor values are correlated with the
    forward returns stored in the column named ``f"{period}D_fwd_return"``.

    Parameters
    ----------
    data : pl.DataFrame
        Frame holding the date, factor and forward-return columns.
    period : int
        Horizon used to build the forward-return column name.
    method : str, default "spearman"
        ``"spearman"`` selects rank correlation; any other value falls
        through to Pearson correlation via ``np.corrcoef``.
    factor_col : str, default "factor"
        Name of the factor column.
    date_col : str, default "date"
        Name of the date column.
    min_obs : int, default 10
        Smallest cross-section size (checked before and after NaN removal)
        for which a correlation is computed.

    Returns
    -------
    tuple[list[Any], list[float]]
        ``(dates, ic_values)`` in ascending date order, restricted to dates
        whose correlation is not NaN.
    """
    fwd_col = f"{period}D_fwd_return"
    use_rank_corr = method == "spearman"

    # Rows without a forward return cannot contribute to any cross-section.
    usable = data.filter(pl.col(fwd_col).is_not_null())
    date_frame = usable.select(date_col).unique().sort(date_col)
    ordered_dates = date_frame.to_series().to_list()

    kept_dates: list[Any] = []
    kept_ics: list[float] = []

    for current in ordered_dates:
        cross_section = usable.filter(pl.col(date_col) == current)
        if cross_section.height < min_obs:
            continue

        x = cross_section[factor_col].to_numpy()
        y = cross_section[fwd_col].to_numpy()

        # Keep only the pairs where both the factor and the return are present.
        keep = ~np.isnan(x) & ~np.isnan(y)
        if keep.sum() < min_obs:
            continue
        x, y = x[keep], y[keep]

        if use_rank_corr:
            coef, _ = spearmanr(x, y)
        else:
            coef = float(np.corrcoef(x, y)[0, 1])

        # A NaN coefficient (e.g. constant input) carries no information.
        if np.isnan(coef):
            continue

        kept_dates.append(current)
        kept_ics.append(float(coef))

    return kept_dates, kept_ics
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def compute_ic_summary(
    ic_series: list[float],
) -> dict[str, float]:
    """Compute summary statistics for an IC series.

    NaN entries are dropped before anything is computed, so the sample size
    used for the t-statistic, its degrees of freedom and the share of
    positive values all refer to the same set of observations.

    Parameters
    ----------
    ic_series : list[float]
        IC values over time. NaN entries are ignored.

    Returns
    -------
    dict[str, float]
        Keys ``mean``, ``std`` (sample standard deviation, ``ddof=1``),
        ``t_stat`` and ``p_value`` (two-sided one-sample t-test of the mean
        against zero) and ``pct_positive`` (fraction of values > 0).
        Every value is NaN when fewer than two non-NaN observations are
        available; ``t_stat`` and ``p_value`` are NaN when the standard
        deviation is zero.
    """
    nan = float("nan")

    arr = np.asarray(ic_series, dtype=float)
    arr = arr[~np.isnan(arr)]
    n = int(arr.size)

    if n < 2:
        return {
            "mean": nan,
            "std": nan,
            "t_stat": nan,
            "p_value": nan,
            "pct_positive": nan,
        }

    mean_ic = float(arr.mean())
    std_ic = float(arr.std(ddof=1))

    if std_ic > 0:
        t_stat = float(mean_ic / (std_ic / np.sqrt(n)))
        # The survival function avoids the cancellation in ``1 - cdf`` that
        # collapses very small p-values to exactly 0.0.
        p_value = float(2 * t_dist.sf(abs(t_stat), df=n - 1))
    else:
        t_stat = nan
        p_value = nan

    pct_positive = float(np.mean(arr > 0))

    return {
        "mean": mean_ic,
        "std": std_ic,
        "t_stat": t_stat,
        "p_value": p_value,
        "pct_positive": pct_positive,
    }
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
__all__ = ["compute_ic_series", "compute_ic_summary"]
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Turnover and autocorrelation analysis.
|
|
2
|
+
|
|
3
|
+
Simple, pure functions for factor persistence analysis.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import polars as pl
|
|
12
|
+
from scipy.stats import spearmanr
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def compute_turnover(
    data: pl.DataFrame,
    n_quantiles: int,
    date_col: str = "date",
    asset_col: str = "asset",
    quantile_col: str = "quantile",
) -> float:
    """Average quantile-membership turnover between consecutive dates.

    For each quantile label ``1 .. n_quantiles`` and each pair of adjacent
    dates, turnover is ``1 - |A_t & A_t1| / max(|A_t|, |A_t1|)`` where
    ``A_t`` is the set of assets in that quantile on date ``t``. Date pairs
    where the quantile is empty on either side are skipped. The per-pair
    values are averaged within each quantile, then across quantiles.

    Parameters
    ----------
    data : pl.DataFrame
        Frame with date, asset and quantile columns.
    n_quantiles : int
        Number of quantiles; labels are looked up as the integers
        ``1`` through ``n_quantiles``.
    date_col, asset_col, quantile_col : str
        Column names.

    Returns
    -------
    float
        Mean turnover rate, or NaN when fewer than two dates exist or no
        quantile produced a usable date pair.
    """
    ordered_dates = data.select(date_col).unique().sort(date_col).to_series().to_list()
    if len(ordered_dates) < 2:
        return float("nan")

    # One row per (date, quantile) carrying the list of assets it contains.
    grouped = (
        data.group_by([date_col, quantile_col])
        .agg(pl.col(asset_col).alias("assets"))
        .sort([date_col, quantile_col])
    )
    members: dict[tuple[Any, int], set[Any]] = {}
    for day, bucket, names in grouped.rows():
        members[(day, bucket)] = set(names)

    consecutive = list(zip(ordered_dates[:-1], ordered_dates[1:]))
    nobody: set[Any] = set()
    per_quantile_means: list[float] = []

    for bucket in range(1, n_quantiles + 1):
        rates: list[float] = []
        for day, next_day in consecutive:
            held = members.get((day, bucket), nobody)
            held_next = members.get((next_day, bucket), nobody)
            if not held or not held_next:
                continue
            shared = len(held & held_next)
            rates.append(1 - shared / max(len(held), len(held_next)))

        if rates:
            per_quantile_means.append(float(np.mean(rates)))

    if not per_quantile_means:
        return float("nan")
    return float(np.nanmean(per_quantile_means))
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def compute_autocorrelation(
    data: pl.DataFrame,
    lags: list[int],
    date_col: str = "date",
    asset_col: str = "asset",
    factor_col: str = "factor",
    min_obs: int = 10,
) -> list[float]:
    """Compute factor rank autocorrelation at different lags.

    For each lag, the factor values on date ``t`` are Spearman-correlated
    with the values of the same assets ``lag`` dates later, and the
    correlations are averaged over all usable date pairs.

    Parameters
    ----------
    data : pl.DataFrame
        Data with date, asset, and factor columns.
    lags : list[int]
        Lag values (e.g., [1, 2, 3, 4, 5]), measured in distinct dates.
    date_col, asset_col, factor_col : str
        Column names.
    min_obs : int, default 10
        Minimum number of assets with a non-NaN factor value on both dates
        for a date pair to be used.

    Returns
    -------
    list[float]
        Autocorrelation at each lag, in the order of ``lags``. An empty
        list when ``lags`` is empty. All NaN when there are fewer distinct
        dates than ``max(lags) + 1``; NaN for any lag with no usable pair.
    """
    # Guard first: ``max`` on an empty sequence raises ValueError.
    if len(lags) == 0:
        return []

    unique_dates = data.select(date_col).unique().sort(date_col).to_series().to_list()

    if len(unique_dates) < max(lags) + 1:
        return [float("nan")] * len(lags)

    # Cache data by date using partition_by so each date's frame is built once.
    partitions = data.select([date_col, asset_col, factor_col]).partition_by(
        date_col, as_dict=True, include_key=False
    )
    date_cache: dict[Any, pl.DataFrame] = {}
    for date_key, df in partitions.items():
        # Unwrap the key when partition_by hands it back as a 1-tuple.
        date = date_key[0] if isinstance(date_key, tuple) else date_key
        date_cache[date] = df

    lagged_col = f"{factor_col}_lag"
    autocorrelations: list[float] = []

    for lag in lags:
        correlations: list[float] = []

        for i in range(len(unique_dates) - lag):
            data_t = date_cache[unique_dates[i]]
            data_t_lag = date_cache[unique_dates[i + lag]]

            # Only assets present on both dates can be compared.
            merged = data_t.join(data_t_lag, on=asset_col, how="inner", suffix="_lag")
            if merged.height < min_obs:
                continue

            current = merged[factor_col].to_numpy()
            lagged = merged[lagged_col].to_numpy()

            # Drop NaN pairs (as compute_ic_series does); otherwise a single
            # missing value turns the whole correlation into NaN.
            keep = ~(np.isnan(current) | np.isnan(lagged))
            if keep.sum() < min_obs:
                continue

            rho, _ = spearmanr(current[keep], lagged[keep])
            if not np.isnan(rho):
                correlations.append(float(rho))

        lag_ac = float(np.mean(correlations)) if correlations else float("nan")
        autocorrelations.append(lag_ac)

    return autocorrelations
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def estimate_half_life(autocorrelations: list[float]) -> float | None:
    """Estimate half-life from autocorrelation decay.

    Half-life is the (linearly interpolated) lag at which the
    autocorrelation has fallen to 50% of its first available value.

    Parameters
    ----------
    autocorrelations : list[float]
        Autocorrelation at lags 1, 2, 3, ... (position ``k`` is lag
        ``k + 1``). NaN entries are skipped but keep their lag position,
        so gaps do not shift later lags.

    Returns
    -------
    float | None
        Half-life in periods, or None if undefined: fewer than two non-NaN
        values, a non-positive first value, or no decay to the threshold.
    """
    # Pair every usable value with its true lag so NaN gaps are preserved.
    observed = [
        (lag, ac)
        for lag, ac in enumerate(autocorrelations, start=1)
        if not np.isnan(ac)
    ]

    if len(observed) < 2 or observed[0][1] <= 0:
        return None

    # The reference is the first non-NaN value (lag 1 when it is available).
    threshold = 0.5 * observed[0][1]

    for (prev_lag, prev_ac), (lag, ac) in zip(observed, observed[1:]):
        if ac <= threshold:
            # prev_ac is strictly above the threshold here, so the
            # denominator is positive and the fraction lies in (0, 1].
            fraction = (prev_ac - threshold) / (prev_ac - ac)
            return float(prev_lag + fraction * (lag - prev_lag))

    return None  # Never decayed to the threshold
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
__all__ = ["compute_turnover", "compute_autocorrelation", "estimate_half_life"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# splitters/ - Cross-Validation
|
|
2
|
+
|
|
3
|
+
Time-series CV with purging and embargo.
|
|
4
|
+
|
|
5
|
+
## Modules
|
|
6
|
+
|
|
7
|
+
| File | Lines | Purpose |
|
|
8
|
+
|------|-------|---------|
|
|
9
|
+
| combinatorial.py | 1064 | `CombinatorialPurgedCV` (CPCV) |
|
|
10
|
+
| walk_forward.py | 757 | `PurgedWalkForwardCV` |
|
|
11
|
+
| base.py | 501 | `BaseSplitter` abstract |
|
|
12
|
+
| calendar.py | 421 | `TradingCalendar` |
|
|
13
|
+
| config.py | 322 | Configuration classes |
|
|
14
|
+
| group_isolation.py | 329 | Multi-asset isolation |
|
|
15
|
+
| persistence.py | 316 | Fold save/load |
|
|
16
|
+
|
|
17
|
+
## Key Classes
|
|
18
|
+
|
|
19
|
+
`CombinatorialPurgedCV`, `PurgedWalkForwardCV`, `TradingCalendar`
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Time-series cross-validation splitters with purging and embargo support.
|
|
2
|
+
|
|
3
|
+
This module provides advanced cross-validation methods designed specifically for
|
|
4
|
+
financial time-series data, addressing common issues like data leakage and
|
|
5
|
+
backtest overfitting.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from ml4t.diagnostic.splitters.base import BaseSplitter
|
|
9
|
+
from ml4t.diagnostic.splitters.combinatorial import CombinatorialPurgedCV
|
|
10
|
+
from ml4t.diagnostic.splitters.config import (
|
|
11
|
+
CombinatorialPurgedConfig,
|
|
12
|
+
PurgedWalkForwardConfig,
|
|
13
|
+
SplitterConfig,
|
|
14
|
+
)
|
|
15
|
+
from ml4t.diagnostic.splitters.persistence import (
|
|
16
|
+
load_config,
|
|
17
|
+
load_folds,
|
|
18
|
+
save_config,
|
|
19
|
+
save_folds,
|
|
20
|
+
verify_folds,
|
|
21
|
+
)
|
|
22
|
+
from ml4t.diagnostic.splitters.walk_forward import PurgedWalkForwardCV
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"BaseSplitter",
|
|
26
|
+
"CombinatorialPurgedCV",
|
|
27
|
+
"CombinatorialPurgedConfig",
|
|
28
|
+
"PurgedWalkForwardCV",
|
|
29
|
+
"PurgedWalkForwardConfig",
|
|
30
|
+
"SplitterConfig",
|
|
31
|
+
"load_config",
|
|
32
|
+
"load_folds",
|
|
33
|
+
"save_config",
|
|
34
|
+
"save_folds",
|
|
35
|
+
"verify_folds",
|
|
36
|
+
]
|