ml4t-diagnostic 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. ml4t/diagnostic/AGENT.md +25 -0
  2. ml4t/diagnostic/__init__.py +166 -0
  3. ml4t/diagnostic/backends/__init__.py +10 -0
  4. ml4t/diagnostic/backends/adapter.py +192 -0
  5. ml4t/diagnostic/backends/polars_backend.py +899 -0
  6. ml4t/diagnostic/caching/__init__.py +40 -0
  7. ml4t/diagnostic/caching/cache.py +331 -0
  8. ml4t/diagnostic/caching/decorators.py +131 -0
  9. ml4t/diagnostic/caching/smart_cache.py +339 -0
  10. ml4t/diagnostic/config/AGENT.md +24 -0
  11. ml4t/diagnostic/config/README.md +267 -0
  12. ml4t/diagnostic/config/__init__.py +219 -0
  13. ml4t/diagnostic/config/barrier_config.py +277 -0
  14. ml4t/diagnostic/config/base.py +301 -0
  15. ml4t/diagnostic/config/event_config.py +148 -0
  16. ml4t/diagnostic/config/feature_config.py +404 -0
  17. ml4t/diagnostic/config/multi_signal_config.py +55 -0
  18. ml4t/diagnostic/config/portfolio_config.py +215 -0
  19. ml4t/diagnostic/config/report_config.py +391 -0
  20. ml4t/diagnostic/config/sharpe_config.py +202 -0
  21. ml4t/diagnostic/config/signal_config.py +206 -0
  22. ml4t/diagnostic/config/trade_analysis_config.py +310 -0
  23. ml4t/diagnostic/config/validation.py +279 -0
  24. ml4t/diagnostic/core/__init__.py +29 -0
  25. ml4t/diagnostic/core/numba_utils.py +315 -0
  26. ml4t/diagnostic/core/purging.py +372 -0
  27. ml4t/diagnostic/core/sampling.py +471 -0
  28. ml4t/diagnostic/errors/__init__.py +205 -0
  29. ml4t/diagnostic/evaluation/AGENT.md +26 -0
  30. ml4t/diagnostic/evaluation/__init__.py +437 -0
  31. ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
  32. ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
  33. ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
  34. ml4t/diagnostic/evaluation/dashboard.py +715 -0
  35. ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
  36. ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
  37. ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
  38. ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
  39. ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
  40. ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
  41. ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
  42. ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
  43. ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
  44. ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
  45. ml4t/diagnostic/evaluation/event_analysis.py +647 -0
  46. ml4t/diagnostic/evaluation/excursion.py +390 -0
  47. ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
  48. ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
  49. ml4t/diagnostic/evaluation/framework.py +935 -0
  50. ml4t/diagnostic/evaluation/metric_registry.py +255 -0
  51. ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
  52. ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
  53. ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
  54. ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
  55. ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
  56. ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
  57. ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
  58. ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
  59. ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
  60. ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
  61. ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
  62. ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
  63. ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
  64. ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
  65. ml4t/diagnostic/evaluation/multi_signal.py +550 -0
  66. ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
  67. ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
  68. ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
  69. ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
  70. ml4t/diagnostic/evaluation/report_generation.py +824 -0
  71. ml4t/diagnostic/evaluation/signal_selector.py +452 -0
  72. ml4t/diagnostic/evaluation/stat_registry.py +139 -0
  73. ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
  74. ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
  75. ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
  76. ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
  77. ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
  78. ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
  79. ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
  80. ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
  81. ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
  82. ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
  83. ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
  84. ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
  85. ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
  86. ml4t/diagnostic/evaluation/stats/moments.py +164 -0
  87. ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
  88. ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
  89. ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
  90. ml4t/diagnostic/evaluation/themes.py +330 -0
  91. ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
  92. ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
  93. ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
  94. ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
  95. ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
  96. ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
  97. ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
  98. ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
  99. ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
  100. ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
  101. ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
  102. ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
  103. ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
  104. ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
  105. ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
  106. ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
  107. ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
  108. ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
  109. ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
  110. ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
  111. ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
  112. ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
  113. ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
  114. ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
  115. ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
  116. ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
  117. ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
  118. ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
  119. ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
  120. ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
  121. ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
  122. ml4t/diagnostic/evaluation/validated_cv.py +535 -0
  123. ml4t/diagnostic/evaluation/visualization.py +1050 -0
  124. ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
  125. ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
  126. ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
  127. ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
  128. ml4t/diagnostic/integration/__init__.py +48 -0
  129. ml4t/diagnostic/integration/backtest_contract.py +671 -0
  130. ml4t/diagnostic/integration/data_contract.py +316 -0
  131. ml4t/diagnostic/integration/engineer_contract.py +226 -0
  132. ml4t/diagnostic/logging/__init__.py +77 -0
  133. ml4t/diagnostic/logging/logger.py +245 -0
  134. ml4t/diagnostic/logging/performance.py +234 -0
  135. ml4t/diagnostic/logging/progress.py +234 -0
  136. ml4t/diagnostic/logging/wandb.py +412 -0
  137. ml4t/diagnostic/metrics/__init__.py +9 -0
  138. ml4t/diagnostic/metrics/percentiles.py +128 -0
  139. ml4t/diagnostic/py.typed +1 -0
  140. ml4t/diagnostic/reporting/__init__.py +43 -0
  141. ml4t/diagnostic/reporting/base.py +130 -0
  142. ml4t/diagnostic/reporting/html_renderer.py +275 -0
  143. ml4t/diagnostic/reporting/json_renderer.py +51 -0
  144. ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
  145. ml4t/diagnostic/results/AGENT.md +24 -0
  146. ml4t/diagnostic/results/__init__.py +105 -0
  147. ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
  148. ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
  149. ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
  150. ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
  151. ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
  152. ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
  153. ml4t/diagnostic/results/barrier_results/validation.py +38 -0
  154. ml4t/diagnostic/results/base.py +177 -0
  155. ml4t/diagnostic/results/event_results.py +349 -0
  156. ml4t/diagnostic/results/feature_results.py +787 -0
  157. ml4t/diagnostic/results/multi_signal_results.py +431 -0
  158. ml4t/diagnostic/results/portfolio_results.py +281 -0
  159. ml4t/diagnostic/results/sharpe_results.py +448 -0
  160. ml4t/diagnostic/results/signal_results/__init__.py +74 -0
  161. ml4t/diagnostic/results/signal_results/ic.py +581 -0
  162. ml4t/diagnostic/results/signal_results/irtc.py +110 -0
  163. ml4t/diagnostic/results/signal_results/quantile.py +392 -0
  164. ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
  165. ml4t/diagnostic/results/signal_results/turnover.py +213 -0
  166. ml4t/diagnostic/results/signal_results/validation.py +147 -0
  167. ml4t/diagnostic/signal/AGENT.md +17 -0
  168. ml4t/diagnostic/signal/__init__.py +69 -0
  169. ml4t/diagnostic/signal/_report.py +152 -0
  170. ml4t/diagnostic/signal/_utils.py +261 -0
  171. ml4t/diagnostic/signal/core.py +275 -0
  172. ml4t/diagnostic/signal/quantile.py +148 -0
  173. ml4t/diagnostic/signal/result.py +214 -0
  174. ml4t/diagnostic/signal/signal_ic.py +129 -0
  175. ml4t/diagnostic/signal/turnover.py +182 -0
  176. ml4t/diagnostic/splitters/AGENT.md +19 -0
  177. ml4t/diagnostic/splitters/__init__.py +36 -0
  178. ml4t/diagnostic/splitters/base.py +501 -0
  179. ml4t/diagnostic/splitters/calendar.py +421 -0
  180. ml4t/diagnostic/splitters/calendar_config.py +91 -0
  181. ml4t/diagnostic/splitters/combinatorial.py +1064 -0
  182. ml4t/diagnostic/splitters/config.py +322 -0
  183. ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
  184. ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
  185. ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
  186. ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
  187. ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
  188. ml4t/diagnostic/splitters/group_isolation.py +329 -0
  189. ml4t/diagnostic/splitters/persistence.py +316 -0
  190. ml4t/diagnostic/splitters/utils.py +207 -0
  191. ml4t/diagnostic/splitters/walk_forward.py +757 -0
  192. ml4t/diagnostic/utils/__init__.py +42 -0
  193. ml4t/diagnostic/utils/config.py +542 -0
  194. ml4t/diagnostic/utils/dependencies.py +318 -0
  195. ml4t/diagnostic/utils/sessions.py +127 -0
  196. ml4t/diagnostic/validation/__init__.py +54 -0
  197. ml4t/diagnostic/validation/dataframe.py +274 -0
  198. ml4t/diagnostic/validation/returns.py +280 -0
  199. ml4t/diagnostic/validation/timeseries.py +299 -0
  200. ml4t/diagnostic/visualization/AGENT.md +19 -0
  201. ml4t/diagnostic/visualization/__init__.py +223 -0
  202. ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
  203. ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
  204. ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
  205. ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
  206. ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
  207. ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
  208. ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
  209. ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
  210. ml4t/diagnostic/visualization/barrier_plots.py +782 -0
  211. ml4t/diagnostic/visualization/core.py +1060 -0
  212. ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
  213. ml4t/diagnostic/visualization/dashboards/base.py +582 -0
  214. ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
  215. ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
  216. ml4t/diagnostic/visualization/dashboards.py +43 -0
  217. ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
  218. ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
  219. ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
  220. ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
  221. ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
  222. ml4t/diagnostic/visualization/feature_plots.py +888 -0
  223. ml4t/diagnostic/visualization/interaction_plots.py +618 -0
  224. ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
  225. ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
  226. ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
  227. ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
  228. ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
  229. ml4t/diagnostic/visualization/report_generation.py +1343 -0
  230. ml4t/diagnostic/visualization/signal/__init__.py +103 -0
  231. ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
  232. ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
  233. ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
  234. ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
  235. ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
  236. ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
  237. ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
  238. ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
  239. ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
  240. ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
  241. ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
  242. ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,339 @@
1
+ """Smart cache with Polars DataFrame fingerprinting.
2
+
3
+ This module provides a memory-only cache optimized for signal analysis workloads,
4
+ featuring fast and stable DataFrame fingerprinting using Polars' hash_rows().
5
+
6
+ The SmartCache is designed for exploration workflows where signals are frequently
7
+ re-analyzed with different parameters. It uses LRU eviction and optional TTL
8
+ expiration to manage memory usage.
9
+
10
+ Examples
11
+ --------
12
+ >>> from ml4t.diagnostic.caching.smart_cache import SmartCache
13
+ >>> cache = SmartCache(max_items=100, ttl_seconds=3600)
14
+ >>>
15
+ >>> # Generate cache key for a signal
16
+ >>> key = cache.make_key("momentum", signal_df, config)
17
+ >>>
18
+ >>> # Check cache
19
+ >>> result = cache.get(key)
20
+ >>> if result is None:
21
+ ... result = expensive_analysis(signal_df)
22
+ ... cache.set(key, result)
23
+
24
+ References
25
+ ----------
26
+ Polars hash_rows: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe.html
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import hashlib
32
+ import time
33
+ from collections import OrderedDict
34
+ from typing import TYPE_CHECKING, Any
35
+
36
+ import polars as pl
37
+
38
+ if TYPE_CHECKING:
39
+ from ml4t.diagnostic.config.base import BaseConfig
40
+
41
+
42
class SmartCache:
    """Memory cache with Polars DataFrame fingerprinting.

    Provides fast, stable caching for signal analysis results using
    content-based keys generated from DataFrames and configurations.

    Features
    --------
    - **Polars fingerprinting**: Uses ``DataFrame.hash_rows()`` for row hashing
    - **LRU eviction**: Automatically removes least recently used items
    - **TTL expiration**: Optional time-based expiration
    - **Memory-only**: No disk persistence (simpler, exploration-focused)

    Parameters
    ----------
    max_items : int, default 100
        Maximum number of items in cache. When exceeded, LRU eviction occurs.
        A value <= 0 disables storage entirely (``set()`` becomes a no-op).
    ttl_seconds : int | None, default 3600
        Time-to-live in seconds. None disables expiration.

    Examples
    --------
    >>> cache = SmartCache(max_items=200, ttl_seconds=None)  # No expiration
    >>>
    >>> # Cache individual signal results
    >>> for name, df in signals.items():
    ...     key = cache.make_key(name, df, config)
    ...     result = cache.get(key)
    ...     if result is None:
    ...         result = analyzer.analyze(df)
    ...         cache.set(key, result)
    """

    def __init__(self, max_items: int = 100, ttl_seconds: int | None = 3600):
        """Initialize SmartCache.

        Parameters
        ----------
        max_items : int
            Maximum cache size (LRU eviction when exceeded)
        ttl_seconds : int | None
            Time-to-live in seconds (None = no expiration)
        """
        # Maps key -> (value, insertion timestamp); order is LRU -> MRU.
        self._cache: OrderedDict[str, tuple[Any, float]] = OrderedDict()
        self.max_items = max_items
        self.ttl_seconds = ttl_seconds
        self._hits = 0
        self._misses = 0

    @staticmethod
    def polars_fingerprint(df: pl.DataFrame, seed: int = 42) -> str:
        """Generate stable hash from Polars DataFrame.

        Uses ``df.hash_rows()`` for row-wise hashing, combined with
        schema and shape information for collision resistance.

        Parameters
        ----------
        df : pl.DataFrame
            DataFrame to fingerprint
        seed : int, default 42
            Seed passed to ``hash_rows()``

        Returns
        -------
        str
            MD5 hex digest of the DataFrame content

        Notes
        -----
        The digest is fed, in this order:

        - Column names and dtypes (schema)
        - Row-wise content hashes from ``hash_rows()``
        - DataFrame shape

        MD5 is used purely as a fingerprint here, not for security.

        Examples
        --------
        >>> df1 = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
        >>> fp1 = SmartCache.polars_fingerprint(df1)
        >>>
        >>> # Same data = same fingerprint
        >>> df2 = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
        >>> fp2 = SmartCache.polars_fingerprint(df2)
        >>> assert fp1 == fp2
        >>>
        >>> # Different data = different fingerprint
        >>> df3 = pl.DataFrame({"a": [1, 2, 4], "b": [4.0, 5.0, 6.0]})
        >>> fp3 = SmartCache.polars_fingerprint(df3)
        >>> assert fp1 != fp3
        """
        # Schema as an ordered list of (name, dtype) pairs.
        schema_str = str([(c, str(d)) for c, d in zip(df.columns, df.dtypes)])

        # Per-row hashes computed by Polars.
        row_hashes = df.hash_rows(seed=seed)

        # Combine schema, content and shape into the final digest.
        hasher = hashlib.md5()
        hasher.update(schema_str.encode())
        hasher.update(row_hashes.to_numpy().tobytes())
        hasher.update(f"{df.shape}".encode())

        return hasher.hexdigest()

    def make_key(
        self,
        signal_name: str,
        signal_df: pl.DataFrame,
        config: BaseConfig,
    ) -> str:
        """Generate cache key from signal name, data, and configuration.

        Parameters
        ----------
        signal_name : str
            Unique identifier for the signal
        signal_df : pl.DataFrame
            Signal data
        config : BaseConfig
            Analysis configuration

        Returns
        -------
        str
            Key of the form ``<signal_name>_<df_hash>_<config_hash>`` where
            both hashes are 12-character hex prefixes of MD5 digests.

        Examples
        --------
        >>> key = cache.make_key("momentum_12m", momentum_df, config)
        >>> key
        'momentum_12m_a1b2c3d4e5f6_g7h8i9j0k1l2'
        """
        # DataFrame fingerprint (first 12 chars)
        df_hash = self.polars_fingerprint(signal_df)[:12]

        # Config hash (first 12 chars)
        config_hash = hashlib.md5(config.model_dump_json().encode()).hexdigest()[:12]

        return f"{signal_name}_{df_hash}_{config_hash}"

    def get(self, key: str) -> Any | None:
        """Retrieve value from cache.

        Parameters
        ----------
        key : str
            Cache key (from make_key())

        Returns
        -------
        Any | None
            Cached value, or None if not found/expired

        Notes
        -----
        Updates LRU ordering and the hit counter on a hit. A missing or
        expired key increments the miss counter; expired entries are removed.
        """
        if key not in self._cache:
            self._misses += 1
            return None

        value, timestamp = self._cache[key]

        # Check TTL expiration
        if self.ttl_seconds is not None:
            age = time.time() - timestamp
            if age > self.ttl_seconds:
                del self._cache[key]
                self._misses += 1
                return None

        # Move to end (most recently used)
        self._cache.move_to_end(key)
        self._hits += 1
        return value

    def set(self, key: str, value: Any) -> None:
        """Store value in cache.

        Parameters
        ----------
        key : str
            Cache key
        value : Any
            Value to cache

        Notes
        -----
        Inserting a new key triggers LRU eviction if the cache is full.
        Refreshing a key that is already cached replaces its value and
        timestamp without evicting any other entry. When ``max_items`` is
        <= 0 nothing is stored.
        """
        if self.max_items <= 0:
            # Nothing can be held; previously this path raised KeyError
            # from popitem() on an empty mapping.
            return

        # Drop any stale copy first so that an update does not count
        # against capacity and push out an unrelated entry.
        self._cache.pop(key, None)

        # Evict oldest entries until there is room for one more.
        while len(self._cache) >= self.max_items:
            self._cache.popitem(last=False)

        # A fresh insertion lands at the most-recently-used end.
        self._cache[key] = (value, time.time())

    def invalidate(self, key: str) -> bool:
        """Remove specific entry from cache.

        Parameters
        ----------
        key : str
            Cache key to invalidate

        Returns
        -------
        bool
            True if key existed and was removed, False otherwise
        """
        if key in self._cache:
            del self._cache[key]
            return True
        return False

    def clear(self) -> None:
        """Remove all entries from cache and reset hit/miss counters."""
        self._cache.clear()
        self._hits = 0
        self._misses = 0

    @staticmethod
    def _signal_of_key(key: str) -> str | None:
        """Return the signal-name segment of a ``make_key()``-style key.

        Keys look like ``<signal_name>_<df_hash>_<config_hash>``. The two
        hash segments are hex and contain no underscore, so splitting from
        the right isolates the name even when it contains underscores.
        Returns None when the key does not have three such segments.
        """
        parts = key.rsplit("_", 2)
        return parts[0] if len(parts) == 3 else None

    def invalidate_signal(self, signal_name: str) -> int:
        """Invalidate all cache entries for a specific signal.

        Useful when signal data has been updated and all cached
        analysis results need to be discarded.

        Parameters
        ----------
        signal_name : str
            Signal name exactly as passed to make_key()

        Returns
        -------
        int
            Number of entries removed

        Notes
        -----
        Only keys whose signal segment equals ``signal_name`` are removed,
        so invalidating ``"momentum"`` leaves ``"momentum_12m"`` entries
        untouched. Keys that do not follow the make_key() layout are ignored.
        """
        keys_to_remove = [
            k for k in self._cache if self._signal_of_key(k) == signal_name
        ]
        for key in keys_to_remove:
            del self._cache[key]
        return len(keys_to_remove)

    @property
    def size(self) -> int:
        """Current number of items in cache (expired entries included)."""
        return len(self._cache)

    @property
    def hit_rate(self) -> float:
        """Cache hit rate (0.0 to 1.0); 0.0 before any lookup."""
        total = self._hits + self._misses
        return self._hits / total if total > 0 else 0.0

    @property
    def stats(self) -> dict[str, Any]:
        """Cache statistics.

        Returns
        -------
        dict
            Dictionary with hits, misses, hit_rate, size, max_items, ttl_seconds
        """
        return {
            "hits": self._hits,
            "misses": self._misses,
            "hit_rate": self.hit_rate,
            "size": self.size,
            "max_items": self.max_items,
            "ttl_seconds": self.ttl_seconds,
        }

    def __repr__(self) -> str:
        """Developer representation."""
        # Only append the unit when a TTL is set; avoids rendering "Nones".
        ttl = "None" if self.ttl_seconds is None else f"{self.ttl_seconds}s"
        return (
            f"SmartCache(size={self.size}/{self.max_items}, "
            f"hit_rate={self.hit_rate:.1%}, ttl={ttl})"
        )

    def __contains__(self, key: str) -> bool:
        """Check if key exists in cache (does not update LRU or count as hit)."""
        if key not in self._cache:
            return False
        # Check expiration without modifying state
        if self.ttl_seconds is not None:
            _, timestamp = self._cache[key]
            age = time.time() - timestamp
            if age > self.ttl_seconds:
                return False
        return True

    def __len__(self) -> int:
        """Return number of items in cache."""
        return len(self._cache)
@@ -0,0 +1,24 @@
1
+ # config/ - Pydantic Configuration
2
+
3
+ 10 primary configs (the nine listed below plus `MultiSignalAnalysisConfig`) with `.for_quick_analysis()`, `.for_research()` presets.
4
+
5
+ ## Primary Configs
6
+
7
+ | Config | Purpose |
8
+ |--------|---------|
9
+ | DiagnosticConfig | Feature diagnostics |
10
+ | StatisticalConfig | DSR, RAS, FDR |
11
+ | PortfolioConfig | Portfolio analysis |
12
+ | SignalConfig | Signal analysis |
13
+ | TradeConfig | Trade analysis |
14
+ | EventConfig | Event studies |
15
+ | BarrierConfig | Barrier analysis |
16
+ | ReportConfig | Report generation |
17
+ | RuntimeConfig | Execution settings |
18
+
19
+ ## Pattern
20
+
21
+ ```python
22
+ config = DiagnosticConfig.for_research()
23
+ config.stationarity.enabled # Single-level nesting
24
+ ```
@@ -0,0 +1,267 @@
1
+ # ML4T Diagnostic Configuration System
2
+
3
+ Type-safe, validated configuration using Pydantic v2.
4
+
5
+ ## Overview
6
+
7
+ The configuration system provides 10 primary config classes:
8
+
9
+ | Config | Purpose |
10
+ |--------|---------|
11
+ | `DiagnosticConfig` | Feature diagnostics (stationarity, IC, volatility) |
12
+ | `StatisticalConfig` | Statistical tests (PSR, DSR, MinTRL, FDR) |
13
+ | `PortfolioConfig` | Portfolio analysis (metrics, Bayesian, drawdown) |
14
+ | `TradeConfig` | Trade analysis (extraction, SHAP, clustering) |
15
+ | `SignalConfig` | Signal analysis (IC, quantiles, RAS) |
16
+ | `EventConfig` | Event studies |
17
+ | `BarrierConfig` | Triple barrier analysis |
18
+ | `ReportConfig` | Report generation (HTML, JSON, output) |
19
+ | `RuntimeConfig` | Execution settings (n_jobs, cache, verbose) |
20
+ | `MultiSignalAnalysisConfig` | Multi-signal comparison |
21
+
22
+ ## Quick Start
23
+
24
+ ```python
25
+ from ml4t.diagnostic.config import (
26
+ DiagnosticConfig,
27
+ PortfolioConfig,
28
+ StatisticalConfig,
29
+ RuntimeConfig,
30
+ )
31
+
32
+ # Use defaults (sensible out-of-the-box)
33
+ config = DiagnosticConfig()
34
+ portfolio_config = PortfolioConfig()
35
+
36
+ # Use presets
37
+ quick_config = DiagnosticConfig.for_quick_analysis()
38
+ research_config = DiagnosticConfig.for_research()
39
+ production_config = DiagnosticConfig.for_production()
40
+
41
+ # Load from YAML
42
+ config = DiagnosticConfig.from_yaml("config.yaml")
43
+
44
+ # Save to YAML
45
+ config.to_yaml("config.yaml")
46
+ ```
47
+
48
+ ## Architecture
49
+
50
+ ### File Structure
51
+
52
+ ```
53
+ config/
54
+ ├── __init__.py # Public API exports
55
+ ├── base.py # BaseConfig, RuntimeConfig
56
+ ├── validation.py # Custom validators and types
57
+ ├── feature_config.py # DiagnosticConfig + Settings
58
+ ├── portfolio_config.py # PortfolioConfig + Settings
59
+ ├── sharpe_config.py # StatisticalConfig + Settings
60
+ ├── signal_config.py # SignalConfig + Settings
61
+ ├── trade_analysis_config.py # TradeConfig + Settings
62
+ ├── event_config.py # EventConfig + WindowSettings
63
+ ├── barrier_config.py # BarrierConfig + Settings
64
+ ├── multi_signal_config.py # MultiSignalAnalysisConfig
65
+ └── report_config.py # ReportConfig + Settings
66
+ ```
67
+
68
+ ### Design Pattern: Single-Level Nesting
69
+
70
+ All configs use a flat structure with Settings classes for grouping:
71
+
72
+ ```python
73
+ from ml4t.diagnostic.config import DiagnosticConfig, StationaritySettings
74
+
75
+ config = DiagnosticConfig(
76
+ stationarity=StationaritySettings(
77
+ enabled=True,
78
+ significance_level=0.01,
79
+ )
80
+ )
81
+
82
+ # Access: config.stationarity.enabled
83
+ ```
84
+
85
+ ## Module Configurations
86
+
87
+ ### Feature Diagnostics
88
+
89
+ ```python
90
+ from ml4t.diagnostic.config import (
91
+ DiagnosticConfig,
92
+ StationaritySettings,
93
+ ICSettings,
94
+ )
95
+
96
+ config = DiagnosticConfig(
97
+ stationarity=StationaritySettings(
98
+ significance_level=0.01,
99
+ adf_enabled=True,
100
+ kpss_enabled=True,
101
+ ),
102
+ ic=ICSettings(
103
+ lag_structure=[0, 1, 5, 10, 21],
104
+ hac_adjustment=True,
105
+ ),
106
+ )
107
+ ```
108
+
109
+ **Settings**: StationaritySettings, ACFSettings, VolatilitySettings, DistributionSettings,
110
+ CorrelationSettings, PCASettings, ClusteringSettings, RedundancySettings, ICSettings,
111
+ BinaryClassificationSettings, ThresholdAnalysisSettings, MLDiagnosticsSettings
112
+
113
+ ### Portfolio Analysis
114
+
115
+ ```python
116
+ from ml4t.diagnostic.config import (
117
+ PortfolioConfig,
118
+ MetricsSettings,
119
+ PortfolioMetric,
120
+ )
121
+
122
+ config = PortfolioConfig(
123
+ metrics=MetricsSettings(
124
+ metrics=[
125
+ PortfolioMetric.SHARPE,
126
+ PortfolioMetric.SORTINO,
127
+ PortfolioMetric.MAX_DRAWDOWN,
128
+ ],
129
+ risk_free_rate=0.02,
130
+ periods_per_year=252,
131
+ ),
132
+ )
133
+ ```
134
+
135
+ **Settings**: MetricsSettings, BayesianSettings, TimeAggregationSettings, DrawdownSettings
136
+
137
+ ### Statistical Testing
138
+
139
+ ```python
140
+ from ml4t.diagnostic.config import (
141
+ StatisticalConfig,
142
+ PSRSettings,
143
+ DSRSettings,
144
+ )
145
+
146
+ config = StatisticalConfig(
147
+ psr=PSRSettings(
148
+ target_sharpe=1.0,
149
+ confidence_level=0.95,
150
+ ),
151
+ dsr=DSRSettings(
152
+ n_trials=500,
153
+ prob_zero_sharpe=0.5,
154
+ ),
155
+ )
156
+ ```
157
+
158
+ **Settings**: PSRSettings, MinTRLSettings, DSRSettings, FDRSettings
159
+
160
+ ### Trade Analysis
161
+
162
+ ```python
163
+ from ml4t.diagnostic.config import (
164
+ TradeConfig,
165
+ ExtractionSettings,
166
+ ClusteringSettings,
167
+ )
168
+
169
+ config = TradeConfig(
170
+ extraction=ExtractionSettings(n_worst=50, n_best=20),
171
+ clustering=ClusteringSettings(min_cluster_size=10),
172
+ )
173
+ ```
174
+
175
+ **Settings**: ExtractionSettings, FilterSettings, AlignmentSettings, ClusteringSettings, HypothesisSettings
176
+
177
+ ### Signal Analysis
178
+
179
+ ```python
180
+ from ml4t.diagnostic.config import SignalConfig, ICSignalSettings
181
+
182
+ config = SignalConfig(
183
+ ic=ICSignalSettings(
184
+ method="spearman",
185
+ periods=[1, 5, 10, 21],
186
+ ),
187
+ )
188
+ ```
189
+
190
+ **Settings**: ICSignalSettings, QuantileSettings, RASSettings, VisualizationSettings
191
+
192
+ ## Presets
193
+
194
+ Each config provides common presets:
195
+
196
+ ```python
197
+ # Quick exploratory analysis
198
+ config = DiagnosticConfig.for_quick_analysis()
199
+ config = PortfolioConfig.for_quick_analysis()
200
+
201
+ # Comprehensive research
202
+ config = DiagnosticConfig.for_research()
203
+ config = StatisticalConfig.for_research()
204
+
205
+ # Production monitoring
206
+ config = DiagnosticConfig.for_production()
207
+ config = TradeConfig.for_production()
208
+ ```
209
+
210
+ ## Serialization
211
+
212
+ ```python
213
+ # YAML (recommended for human editing)
214
+ config.to_yaml("config.yaml")
215
+ config = DiagnosticConfig.from_yaml("config.yaml")
216
+
217
+ # JSON (better for APIs)
218
+ config.to_json("config.json")
219
+ config = DiagnosticConfig.from_json("config.json")
220
+
221
+ # Auto-detect from extension
222
+ config = DiagnosticConfig.from_file("config.yaml")
223
+
224
+ # Dictionary
225
+ config = DiagnosticConfig.from_dict({"verbose": True})
226
+ d = config.to_dict()
227
+ ```
228
+
229
+ ## Validation
230
+
231
+ ```python
232
+ # Automatic validation on construction
233
+ from pydantic import ValidationError
234
+
235
+ try:
236
+ config = StationaritySettings(significance_level=0.5) # Invalid
237
+ except ValidationError as e:
238
+ print(e) # "significance_level must be <= 0.10"
239
+
240
+ # Manual validation
241
+ config = DiagnosticConfig()
242
+ errors = config.validate_fully()
243
+ ```
244
+
245
+ ## Runtime Configuration
246
+
247
+ Runtime settings are separate to avoid coupling with analysis configs:
248
+
249
+ ```python
250
+ from ml4t.diagnostic.config import RuntimeConfig
251
+
252
+ runtime = RuntimeConfig(
253
+ n_jobs=-1, # Use all CPU cores
254
+ cache_enabled=True, # Cache expensive computations
255
+ verbose=True, # Show progress
256
+ random_state=42, # Reproducibility
257
+ )
258
+
259
+ # Pass as separate parameter
260
+ result = analyze_features(df, config=DiagnosticConfig(), runtime=runtime)
261
+ ```
262
+
263
+ ## References
264
+
265
+ - **Pydantic v2**: https://docs.pydantic.dev/latest/
266
+ - **López de Prado, M.**: "Advances in Financial Machine Learning"
267
+ - **Bailey & López de Prado**: Multiple testing papers