ml4t-diagnostic 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml4t/diagnostic/AGENT.md +25 -0
- ml4t/diagnostic/__init__.py +166 -0
- ml4t/diagnostic/backends/__init__.py +10 -0
- ml4t/diagnostic/backends/adapter.py +192 -0
- ml4t/diagnostic/backends/polars_backend.py +899 -0
- ml4t/diagnostic/caching/__init__.py +40 -0
- ml4t/diagnostic/caching/cache.py +331 -0
- ml4t/diagnostic/caching/decorators.py +131 -0
- ml4t/diagnostic/caching/smart_cache.py +339 -0
- ml4t/diagnostic/config/AGENT.md +24 -0
- ml4t/diagnostic/config/README.md +267 -0
- ml4t/diagnostic/config/__init__.py +219 -0
- ml4t/diagnostic/config/barrier_config.py +277 -0
- ml4t/diagnostic/config/base.py +301 -0
- ml4t/diagnostic/config/event_config.py +148 -0
- ml4t/diagnostic/config/feature_config.py +404 -0
- ml4t/diagnostic/config/multi_signal_config.py +55 -0
- ml4t/diagnostic/config/portfolio_config.py +215 -0
- ml4t/diagnostic/config/report_config.py +391 -0
- ml4t/diagnostic/config/sharpe_config.py +202 -0
- ml4t/diagnostic/config/signal_config.py +206 -0
- ml4t/diagnostic/config/trade_analysis_config.py +310 -0
- ml4t/diagnostic/config/validation.py +279 -0
- ml4t/diagnostic/core/__init__.py +29 -0
- ml4t/diagnostic/core/numba_utils.py +315 -0
- ml4t/diagnostic/core/purging.py +372 -0
- ml4t/diagnostic/core/sampling.py +471 -0
- ml4t/diagnostic/errors/__init__.py +205 -0
- ml4t/diagnostic/evaluation/AGENT.md +26 -0
- ml4t/diagnostic/evaluation/__init__.py +437 -0
- ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
- ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
- ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
- ml4t/diagnostic/evaluation/dashboard.py +715 -0
- ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
- ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
- ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
- ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
- ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
- ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
- ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
- ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
- ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
- ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
- ml4t/diagnostic/evaluation/event_analysis.py +647 -0
- ml4t/diagnostic/evaluation/excursion.py +390 -0
- ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
- ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
- ml4t/diagnostic/evaluation/framework.py +935 -0
- ml4t/diagnostic/evaluation/metric_registry.py +255 -0
- ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
- ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
- ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
- ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
- ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
- ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
- ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
- ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
- ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
- ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
- ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
- ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
- ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
- ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
- ml4t/diagnostic/evaluation/multi_signal.py +550 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
- ml4t/diagnostic/evaluation/report_generation.py +824 -0
- ml4t/diagnostic/evaluation/signal_selector.py +452 -0
- ml4t/diagnostic/evaluation/stat_registry.py +139 -0
- ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
- ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
- ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
- ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
- ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
- ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
- ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
- ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
- ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
- ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
- ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
- ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
- ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
- ml4t/diagnostic/evaluation/stats/moments.py +164 -0
- ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
- ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
- ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
- ml4t/diagnostic/evaluation/themes.py +330 -0
- ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
- ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
- ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
- ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
- ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
- ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
- ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
- ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
- ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
- ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
- ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
- ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
- ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
- ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
- ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
- ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
- ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
- ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
- ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
- ml4t/diagnostic/evaluation/validated_cv.py +535 -0
- ml4t/diagnostic/evaluation/visualization.py +1050 -0
- ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
- ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
- ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
- ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
- ml4t/diagnostic/integration/__init__.py +48 -0
- ml4t/diagnostic/integration/backtest_contract.py +671 -0
- ml4t/diagnostic/integration/data_contract.py +316 -0
- ml4t/diagnostic/integration/engineer_contract.py +226 -0
- ml4t/diagnostic/logging/__init__.py +77 -0
- ml4t/diagnostic/logging/logger.py +245 -0
- ml4t/diagnostic/logging/performance.py +234 -0
- ml4t/diagnostic/logging/progress.py +234 -0
- ml4t/diagnostic/logging/wandb.py +412 -0
- ml4t/diagnostic/metrics/__init__.py +9 -0
- ml4t/diagnostic/metrics/percentiles.py +128 -0
- ml4t/diagnostic/py.typed +1 -0
- ml4t/diagnostic/reporting/__init__.py +43 -0
- ml4t/diagnostic/reporting/base.py +130 -0
- ml4t/diagnostic/reporting/html_renderer.py +275 -0
- ml4t/diagnostic/reporting/json_renderer.py +51 -0
- ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
- ml4t/diagnostic/results/AGENT.md +24 -0
- ml4t/diagnostic/results/__init__.py +105 -0
- ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
- ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
- ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
- ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
- ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
- ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
- ml4t/diagnostic/results/barrier_results/validation.py +38 -0
- ml4t/diagnostic/results/base.py +177 -0
- ml4t/diagnostic/results/event_results.py +349 -0
- ml4t/diagnostic/results/feature_results.py +787 -0
- ml4t/diagnostic/results/multi_signal_results.py +431 -0
- ml4t/diagnostic/results/portfolio_results.py +281 -0
- ml4t/diagnostic/results/sharpe_results.py +448 -0
- ml4t/diagnostic/results/signal_results/__init__.py +74 -0
- ml4t/diagnostic/results/signal_results/ic.py +581 -0
- ml4t/diagnostic/results/signal_results/irtc.py +110 -0
- ml4t/diagnostic/results/signal_results/quantile.py +392 -0
- ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
- ml4t/diagnostic/results/signal_results/turnover.py +213 -0
- ml4t/diagnostic/results/signal_results/validation.py +147 -0
- ml4t/diagnostic/signal/AGENT.md +17 -0
- ml4t/diagnostic/signal/__init__.py +69 -0
- ml4t/diagnostic/signal/_report.py +152 -0
- ml4t/diagnostic/signal/_utils.py +261 -0
- ml4t/diagnostic/signal/core.py +275 -0
- ml4t/diagnostic/signal/quantile.py +148 -0
- ml4t/diagnostic/signal/result.py +214 -0
- ml4t/diagnostic/signal/signal_ic.py +129 -0
- ml4t/diagnostic/signal/turnover.py +182 -0
- ml4t/diagnostic/splitters/AGENT.md +19 -0
- ml4t/diagnostic/splitters/__init__.py +36 -0
- ml4t/diagnostic/splitters/base.py +501 -0
- ml4t/diagnostic/splitters/calendar.py +421 -0
- ml4t/diagnostic/splitters/calendar_config.py +91 -0
- ml4t/diagnostic/splitters/combinatorial.py +1064 -0
- ml4t/diagnostic/splitters/config.py +322 -0
- ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
- ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
- ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
- ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
- ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
- ml4t/diagnostic/splitters/group_isolation.py +329 -0
- ml4t/diagnostic/splitters/persistence.py +316 -0
- ml4t/diagnostic/splitters/utils.py +207 -0
- ml4t/diagnostic/splitters/walk_forward.py +757 -0
- ml4t/diagnostic/utils/__init__.py +42 -0
- ml4t/diagnostic/utils/config.py +542 -0
- ml4t/diagnostic/utils/dependencies.py +318 -0
- ml4t/diagnostic/utils/sessions.py +127 -0
- ml4t/diagnostic/validation/__init__.py +54 -0
- ml4t/diagnostic/validation/dataframe.py +274 -0
- ml4t/diagnostic/validation/returns.py +280 -0
- ml4t/diagnostic/validation/timeseries.py +299 -0
- ml4t/diagnostic/visualization/AGENT.md +19 -0
- ml4t/diagnostic/visualization/__init__.py +223 -0
- ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
- ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
- ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
- ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
- ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
- ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
- ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
- ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
- ml4t/diagnostic/visualization/barrier_plots.py +782 -0
- ml4t/diagnostic/visualization/core.py +1060 -0
- ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
- ml4t/diagnostic/visualization/dashboards/base.py +582 -0
- ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
- ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
- ml4t/diagnostic/visualization/dashboards.py +43 -0
- ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
- ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
- ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
- ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
- ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
- ml4t/diagnostic/visualization/feature_plots.py +888 -0
- ml4t/diagnostic/visualization/interaction_plots.py +618 -0
- ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
- ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
- ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
- ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
- ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
- ml4t/diagnostic/visualization/report_generation.py +1343 -0
- ml4t/diagnostic/visualization/signal/__init__.py +103 -0
- ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
- ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
- ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
- ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
- ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
- ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
- ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
- ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
- ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
- ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
- ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
- ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
"""Smart cache with Polars DataFrame fingerprinting.
|
|
2
|
+
|
|
3
|
+
This module provides a memory-only cache optimized for signal analysis workloads,
|
|
4
|
+
featuring fast and stable DataFrame fingerprinting using Polars' hash_rows().
|
|
5
|
+
|
|
6
|
+
The SmartCache is designed for exploration workflows where signals are frequently
|
|
7
|
+
re-analyzed with different parameters. It uses LRU eviction and optional TTL
|
|
8
|
+
expiration to manage memory usage.
|
|
9
|
+
|
|
10
|
+
Examples
|
|
11
|
+
--------
|
|
12
|
+
>>> from ml4t.diagnostic.caching.smart_cache import SmartCache
|
|
13
|
+
>>> cache = SmartCache(max_items=100, ttl_seconds=3600)
|
|
14
|
+
>>>
|
|
15
|
+
>>> # Generate cache key for a signal
|
|
16
|
+
>>> key = cache.make_key("momentum", signal_df, config)
|
|
17
|
+
>>>
|
|
18
|
+
>>> # Check cache
|
|
19
|
+
>>> result = cache.get(key)
|
|
20
|
+
>>> if result is None:
|
|
21
|
+
... result = expensive_analysis(signal_df)
|
|
22
|
+
... cache.set(key, result)
|
|
23
|
+
|
|
24
|
+
References
|
|
25
|
+
----------
|
|
26
|
+
Polars hash_rows: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe.html
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import hashlib
|
|
32
|
+
import time
|
|
33
|
+
from collections import OrderedDict
|
|
34
|
+
from typing import TYPE_CHECKING, Any
|
|
35
|
+
|
|
36
|
+
import polars as pl
|
|
37
|
+
|
|
38
|
+
if TYPE_CHECKING:
|
|
39
|
+
from ml4t.diagnostic.config.base import BaseConfig
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class SmartCache:
    """In-memory LRU/TTL cache keyed by Polars DataFrame fingerprints.

    Provides fast caching for signal analysis results using content-based
    keys generated from DataFrames and configurations.

    Features
    --------
    - **Polars fingerprinting**: Uses ``DataFrame.hash_rows()`` for fast hashing
    - **LRU eviction**: Automatically removes least recently used items
    - **TTL expiration**: Optional time-based expiration
    - **Memory-only**: No disk persistence (simpler, exploration-focused)

    Parameters
    ----------
    max_items : int, default 100
        Maximum number of items in cache. When a new key would exceed it,
        the least recently used entries are evicted. A value of 0 or less
        disables storage: ``set()`` keeps nothing.
    ttl_seconds : int | None, default 3600
        Time-to-live in seconds. None disables expiration.

    Notes
    -----
    Expired entries are removed lazily, when ``get()`` touches them. Until
    then they still count towards ``size`` / ``len()``.

    ``get()`` returns None for a miss, so a cached value of None cannot be
    told apart from a missing entry; use ``key in cache`` for that.

    Examples
    --------
    >>> cache = SmartCache(max_items=200, ttl_seconds=None)  # No expiration
    >>>
    >>> # Cache individual signal results
    >>> for name, df in signals.items():
    ...     key = cache.make_key(name, df, config)
    ...     result = cache.get(key)
    ...     if result is None:
    ...         result = analyzer.analyze(df)
    ...         cache.set(key, result)
    """

    def __init__(self, max_items: int = 100, ttl_seconds: int | None = 3600):
        """Initialize SmartCache.

        Parameters
        ----------
        max_items : int
            Maximum cache size (LRU eviction when exceeded)
        ttl_seconds : int | None
            Time-to-live in seconds (None = no expiration)
        """
        # key -> (value, insertion timestamp); ordered oldest -> most recent.
        self._cache: OrderedDict[str, tuple[Any, float]] = OrderedDict()
        self.max_items = max_items
        self.ttl_seconds = ttl_seconds
        self._hits = 0
        self._misses = 0

    def _is_expired(self, timestamp: float) -> bool:
        """Return True if an entry stored at ``timestamp`` has outlived the TTL."""
        if self.ttl_seconds is None:
            return False
        return time.time() - timestamp > self.ttl_seconds

    @staticmethod
    def polars_fingerprint(df: pl.DataFrame, seed: int = 42) -> str:
        """Generate a content hash for a Polars DataFrame.

        Uses ``DataFrame.hash_rows()`` for fast row-wise hashing, combined
        with schema and shape information for collision resistance.

        Parameters
        ----------
        df : pl.DataFrame
            DataFrame to fingerprint
        seed : int, default 42
            Seed passed to ``hash_rows()`` for reproducibility

        Returns
        -------
        str
            MD5 hex digest of the DataFrame content

        Notes
        -----
        The fingerprint includes:
        - Column names and dtypes (schema)
        - DataFrame shape
        - Row-wise content hash from ``hash_rows()``

        Identical DataFrames produce the same fingerprint within a process.
        Polars documents ``hash_rows()`` as stable only within a single
        Polars version, so fingerprints should not be persisted or compared
        across versions. MD5 is used as a fast checksum here, not for
        security.

        Examples
        --------
        >>> df1 = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
        >>> fp1 = SmartCache.polars_fingerprint(df1)
        >>>
        >>> # Same data = same fingerprint
        >>> df2 = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
        >>> fp2 = SmartCache.polars_fingerprint(df2)
        >>> assert fp1 == fp2
        >>>
        >>> # Different data = different fingerprint
        >>> df3 = pl.DataFrame({"a": [1, 2, 4], "b": [4.0, 5.0, 6.0]})
        >>> fp3 = SmartCache.polars_fingerprint(df3)
        >>> assert fp1 != fp3
        """
        # Schema as an ordered list of (name, dtype) pairs so that renamed or
        # re-typed columns change the fingerprint.
        schema_str = str([(c, str(d)) for c, d in zip(df.columns, df.dtypes)])

        # One hash per row, computed by Polars.
        row_hashes = df.hash_rows(seed=seed)

        # Fold schema, row hashes and shape into a single digest.
        hasher = hashlib.md5()
        hasher.update(schema_str.encode())
        hasher.update(row_hashes.to_numpy().tobytes())
        hasher.update(f"{df.shape}".encode())

        return hasher.hexdigest()

    def make_key(
        self,
        signal_name: str,
        signal_df: pl.DataFrame,
        config: BaseConfig,
    ) -> str:
        """Generate cache key from signal name, data, and configuration.

        Parameters
        ----------
        signal_name : str
            Unique identifier for the signal
        signal_df : pl.DataFrame
            Signal data
        config : BaseConfig
            Analysis configuration

        Returns
        -------
        str
            Key of the form ``"{signal_name}_{df_hash}_{config_hash}"``,
            where each hash is the first 12 hex characters of an MD5 digest.

        Examples
        --------
        >>> key = cache.make_key("momentum_12m", momentum_df, config)
        >>> key
        'momentum_12m_a1b2c3d4e5f6_g7h8i9j0k1l2'
        """
        # DataFrame fingerprint (first 12 chars)
        df_hash = self.polars_fingerprint(signal_df)[:12]

        # Config hash (first 12 chars) over the config's JSON dump
        config_hash = hashlib.md5(config.model_dump_json().encode()).hexdigest()[:12]

        return f"{signal_name}_{df_hash}_{config_hash}"

    def get(self, key: str) -> Any | None:
        """Retrieve value from cache.

        Parameters
        ----------
        key : str
            Cache key (from make_key())

        Returns
        -------
        Any | None
            Cached value, or None if not found/expired

        Notes
        -----
        Updates LRU ordering and the hit counter on a hit. An expired entry
        is removed and counted as a miss.
        """
        entry = self._cache.get(key)
        if entry is None:
            self._misses += 1
            return None

        value, timestamp = entry
        if self._is_expired(timestamp):
            del self._cache[key]
            self._misses += 1
            return None

        # Move to end (most recently used)
        self._cache.move_to_end(key)
        self._hits += 1
        return value

    def set(self, key: str, value: Any) -> None:
        """Store value in cache.

        Parameters
        ----------
        key : str
            Cache key
        value : Any
            Value to cache

        Notes
        -----
        Storing under an existing key replaces the value, resets its
        timestamp and marks it most recently used without evicting any other
        entry. Storing a new key evicts least recently used entries until
        there is room. If ``max_items`` is 0 or less nothing is stored.
        """
        # Drop any previous entry for this key first, so that refreshing an
        # existing key in a full cache does not push out an unrelated entry.
        self._cache.pop(key, None)

        # Make room. The emptiness guard keeps a non-positive max_items from
        # calling popitem() on an empty mapping (which raises KeyError).
        while self._cache and len(self._cache) >= self.max_items:
            self._cache.popitem(last=False)

        if self.max_items > 0:
            # A fresh insertion lands at the end, i.e. most recently used.
            self._cache[key] = (value, time.time())

    def invalidate(self, key: str) -> bool:
        """Remove specific entry from cache.

        Parameters
        ----------
        key : str
            Cache key to invalidate

        Returns
        -------
        bool
            True if key existed and was removed, False otherwise
        """
        if key in self._cache:
            del self._cache[key]
            return True
        return False

    def clear(self) -> None:
        """Remove all entries from cache and reset hit/miss counters."""
        self._cache.clear()
        self._hits = 0
        self._misses = 0

    def invalidate_signal(self, signal_name: str) -> int:
        """Invalidate all cache entries for a specific signal.

        Useful when signal data has been updated and all cached
        analysis results need to be discarded.

        Parameters
        ----------
        signal_name : str
            Signal name prefix to match

        Returns
        -------
        int
            Number of entries removed

        Notes
        -----
        Every key starting with ``"{signal_name}_"`` is removed. A signal
        whose name extends another's with an underscore (for example
        ``"momentum_12m"`` versus ``"momentum"``) is therefore invalidated
        together with the shorter name.
        """
        prefix = f"{signal_name}_"
        # Collect first: the mapping must not change size while iterating.
        keys_to_remove = [k for k in self._cache if k.startswith(prefix)]
        for key in keys_to_remove:
            del self._cache[key]
        return len(keys_to_remove)

    @property
    def size(self) -> int:
        """Current number of items in cache (expired-but-unpurged included)."""
        return len(self._cache)

    @property
    def hit_rate(self) -> float:
        """Cache hit rate (0.0 to 1.0); 0.0 before any lookup."""
        total = self._hits + self._misses
        return self._hits / total if total > 0 else 0.0

    @property
    def stats(self) -> dict[str, Any]:
        """Cache statistics.

        Returns
        -------
        dict
            Dictionary with hits, misses, hit_rate, size, max_items, ttl_seconds
        """
        return {
            "hits": self._hits,
            "misses": self._misses,
            "hit_rate": self.hit_rate,
            "size": self.size,
            "max_items": self.max_items,
            "ttl_seconds": self.ttl_seconds,
        }

    def __repr__(self) -> str:
        """Developer representation."""
        # Only append the unit when there is a number to attach it to.
        ttl = "None" if self.ttl_seconds is None else f"{self.ttl_seconds}s"
        return (
            f"SmartCache(size={self.size}/{self.max_items}, "
            f"hit_rate={self.hit_rate:.1%}, ttl={ttl})"
        )

    def __contains__(self, key: str) -> bool:
        """Check if key exists in cache (does not update LRU or count as hit).

        An expired entry reports False but is left in place.
        """
        entry = self._cache.get(key)
        if entry is None:
            return False
        return not self._is_expired(entry[1])

    def __len__(self) -> int:
        """Return number of items in cache."""
        return len(self._cache)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# config/ - Pydantic Configuration
|
|
2
|
+
|
|
3
|
+
10 primary configs with `.for_quick_analysis()`, `.for_research()` presets.
|
|
4
|
+
|
|
5
|
+
## Primary Configs
|
|
6
|
+
|
|
7
|
+
| Config | Purpose |
|
|
8
|
+
|--------|---------|
|
|
9
|
+
| DiagnosticConfig | Feature diagnostics |
|
|
10
|
+
| StatisticalConfig | DSR, RAS, FDR |
|
|
11
|
+
| PortfolioConfig | Portfolio analysis |
|
|
12
|
+
| SignalConfig | Signal analysis |
|
|
13
|
+
| TradeConfig | Trade analysis |
|
|
14
|
+
| EventConfig | Event studies |
|
|
15
|
+
| BarrierConfig | Barrier analysis |
|
|
16
|
+
| ReportConfig | Report generation |
|
|
17
|
+
| RuntimeConfig | Execution settings |
| MultiSignalAnalysisConfig | Multi-signal comparison |
|
|
18
|
+
|
|
19
|
+
## Pattern
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
config = DiagnosticConfig.for_research()
|
|
23
|
+
config.stationarity.enabled # Single-level nesting
|
|
24
|
+
```
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
# ML4T Diagnostic Configuration System
|
|
2
|
+
|
|
3
|
+
Type-safe, validated configuration using Pydantic v2.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The configuration system provides 10 primary config classes:
|
|
8
|
+
|
|
9
|
+
| Config | Purpose |
|
|
10
|
+
|--------|---------|
|
|
11
|
+
| `DiagnosticConfig` | Feature diagnostics (stationarity, IC, volatility) |
|
|
12
|
+
| `StatisticalConfig` | Statistical tests (PSR, DSR, MinTRL, FDR) |
|
|
13
|
+
| `PortfolioConfig` | Portfolio analysis (metrics, Bayesian, drawdown) |
|
|
14
|
+
| `TradeConfig` | Trade analysis (extraction, SHAP, clustering) |
|
|
15
|
+
| `SignalConfig` | Signal analysis (IC, quantiles, RAS) |
|
|
16
|
+
| `EventConfig` | Event studies |
|
|
17
|
+
| `BarrierConfig` | Triple barrier analysis |
|
|
18
|
+
| `ReportConfig` | Report generation (HTML, JSON, output) |
|
|
19
|
+
| `RuntimeConfig` | Execution settings (n_jobs, cache, verbose) |
|
|
20
|
+
| `MultiSignalAnalysisConfig` | Multi-signal comparison |
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from ml4t.diagnostic.config import (
|
|
26
|
+
DiagnosticConfig,
|
|
27
|
+
PortfolioConfig,
|
|
28
|
+
StatisticalConfig,
|
|
29
|
+
RuntimeConfig,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Use defaults (sensible out-of-the-box)
|
|
33
|
+
config = DiagnosticConfig()
|
|
34
|
+
portfolio_config = PortfolioConfig()
|
|
35
|
+
|
|
36
|
+
# Use presets
|
|
37
|
+
quick_config = DiagnosticConfig.for_quick_analysis()
|
|
38
|
+
research_config = DiagnosticConfig.for_research()
|
|
39
|
+
production_config = DiagnosticConfig.for_production()
|
|
40
|
+
|
|
41
|
+
# Load from YAML
|
|
42
|
+
config = DiagnosticConfig.from_yaml("config.yaml")
|
|
43
|
+
|
|
44
|
+
# Save to YAML
|
|
45
|
+
config.to_yaml("config.yaml")
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Architecture
|
|
49
|
+
|
|
50
|
+
### File Structure
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
config/
|
|
54
|
+
├── __init__.py # Public API exports
|
|
55
|
+
├── base.py # BaseConfig, RuntimeConfig
|
|
56
|
+
├── validation.py # Custom validators and types
|
|
57
|
+
├── feature_config.py # DiagnosticConfig + Settings
|
|
58
|
+
├── portfolio_config.py # PortfolioConfig + Settings
|
|
59
|
+
├── sharpe_config.py # StatisticalConfig + Settings
|
|
60
|
+
├── signal_config.py # SignalConfig + Settings
|
|
61
|
+
├── trade_analysis_config.py # TradeConfig + Settings
|
|
62
|
+
├── event_config.py # EventConfig + WindowSettings
|
|
63
|
+
├── barrier_config.py # BarrierConfig + Settings
|
|
64
|
+
├── multi_signal_config.py # MultiSignalAnalysisConfig
|
|
65
|
+
└── report_config.py # ReportConfig + Settings
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Design Pattern: Single-Level Nesting
|
|
69
|
+
|
|
70
|
+
All configs use a single level of nesting: each top-level config groups its options into Settings classes:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from ml4t.diagnostic.config import DiagnosticConfig, StationaritySettings
|
|
74
|
+
|
|
75
|
+
config = DiagnosticConfig(
|
|
76
|
+
stationarity=StationaritySettings(
|
|
77
|
+
enabled=True,
|
|
78
|
+
significance_level=0.01,
|
|
79
|
+
)
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# Access: config.stationarity.enabled
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Module Configurations
|
|
86
|
+
|
|
87
|
+
### Feature Diagnostics
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from ml4t.diagnostic.config import (
|
|
91
|
+
DiagnosticConfig,
|
|
92
|
+
StationaritySettings,
|
|
93
|
+
ICSettings,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
config = DiagnosticConfig(
|
|
97
|
+
stationarity=StationaritySettings(
|
|
98
|
+
significance_level=0.01,
|
|
99
|
+
adf_enabled=True,
|
|
100
|
+
kpss_enabled=True,
|
|
101
|
+
),
|
|
102
|
+
ic=ICSettings(
|
|
103
|
+
lag_structure=[0, 1, 5, 10, 21],
|
|
104
|
+
hac_adjustment=True,
|
|
105
|
+
),
|
|
106
|
+
)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**Settings**: StationaritySettings, ACFSettings, VolatilitySettings, DistributionSettings,
|
|
110
|
+
CorrelationSettings, PCASettings, ClusteringSettings, RedundancySettings, ICSettings,
|
|
111
|
+
BinaryClassificationSettings, ThresholdAnalysisSettings, MLDiagnosticsSettings
|
|
112
|
+
|
|
113
|
+
### Portfolio Analysis
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from ml4t.diagnostic.config import (
|
|
117
|
+
PortfolioConfig,
|
|
118
|
+
MetricsSettings,
|
|
119
|
+
PortfolioMetric,
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
config = PortfolioConfig(
|
|
123
|
+
metrics=MetricsSettings(
|
|
124
|
+
metrics=[
|
|
125
|
+
PortfolioMetric.SHARPE,
|
|
126
|
+
PortfolioMetric.SORTINO,
|
|
127
|
+
PortfolioMetric.MAX_DRAWDOWN,
|
|
128
|
+
],
|
|
129
|
+
risk_free_rate=0.02,
|
|
130
|
+
periods_per_year=252,
|
|
131
|
+
),
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
**Settings**: MetricsSettings, BayesianSettings, TimeAggregationSettings, DrawdownSettings
|
|
136
|
+
|
|
137
|
+
### Statistical Testing
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from ml4t.diagnostic.config import (
|
|
141
|
+
StatisticalConfig,
|
|
142
|
+
PSRSettings,
|
|
143
|
+
DSRSettings,
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
config = StatisticalConfig(
|
|
147
|
+
psr=PSRSettings(
|
|
148
|
+
target_sharpe=1.0,
|
|
149
|
+
confidence_level=0.95,
|
|
150
|
+
),
|
|
151
|
+
dsr=DSRSettings(
|
|
152
|
+
n_trials=500,
|
|
153
|
+
prob_zero_sharpe=0.5,
|
|
154
|
+
),
|
|
155
|
+
)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
**Settings**: PSRSettings, MinTRLSettings, DSRSettings, FDRSettings
|
|
159
|
+
|
|
160
|
+
### Trade Analysis
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from ml4t.diagnostic.config import (
|
|
164
|
+
TradeConfig,
|
|
165
|
+
ExtractionSettings,
|
|
166
|
+
ClusteringSettings,
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
config = TradeConfig(
|
|
170
|
+
extraction=ExtractionSettings(n_worst=50, n_best=20),
|
|
171
|
+
clustering=ClusteringSettings(min_cluster_size=10),
|
|
172
|
+
)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Settings**: ExtractionSettings, FilterSettings, AlignmentSettings, ClusteringSettings, HypothesisSettings
|
|
176
|
+
|
|
177
|
+
### Signal Analysis
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
from ml4t.diagnostic.config import SignalConfig, ICSignalSettings
|
|
181
|
+
|
|
182
|
+
config = SignalConfig(
|
|
183
|
+
ic=ICSignalSettings(
|
|
184
|
+
method="spearman",
|
|
185
|
+
periods=[1, 5, 10, 21],
|
|
186
|
+
),
|
|
187
|
+
)
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
**Settings**: ICSignalSettings, QuantileSettings, RASSettings, VisualizationSettings
|
|
191
|
+
|
|
192
|
+
## Presets
|
|
193
|
+
|
|
194
|
+
Each config class provides preset constructors for common use cases:
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
# Quick exploratory analysis
|
|
198
|
+
config = DiagnosticConfig.for_quick_analysis()
|
|
199
|
+
config = PortfolioConfig.for_quick_analysis()
|
|
200
|
+
|
|
201
|
+
# Comprehensive research
|
|
202
|
+
config = DiagnosticConfig.for_research()
|
|
203
|
+
config = StatisticalConfig.for_research()
|
|
204
|
+
|
|
205
|
+
# Production monitoring
|
|
206
|
+
config = DiagnosticConfig.for_production()
|
|
207
|
+
config = TradeConfig.for_production()
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Serialization
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
# YAML (recommended for human editing)
|
|
214
|
+
config.to_yaml("config.yaml")
|
|
215
|
+
config = DiagnosticConfig.from_yaml("config.yaml")
|
|
216
|
+
|
|
217
|
+
# JSON (better for APIs)
|
|
218
|
+
config.to_json("config.json")
|
|
219
|
+
config = DiagnosticConfig.from_json("config.json")
|
|
220
|
+
|
|
221
|
+
# Auto-detect from extension
|
|
222
|
+
config = DiagnosticConfig.from_file("config.yaml")
|
|
223
|
+
|
|
224
|
+
# Dictionary
|
|
225
|
+
config = DiagnosticConfig.from_dict({"verbose": True})
|
|
226
|
+
d = config.to_dict()
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## Validation
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
# Automatic validation on construction
|
|
233
|
+
from pydantic import ValidationError
|
|
234
|
+
|
|
235
|
+
try:
|
|
236
|
+
config = StationaritySettings(significance_level=0.5) # Invalid
|
|
237
|
+
except ValidationError as e:
|
|
238
|
+
print(e) # "significance_level must be <= 0.10"
|
|
239
|
+
|
|
240
|
+
# Manual validation
|
|
241
|
+
config = DiagnosticConfig()
|
|
242
|
+
errors = config.validate_fully()
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## Runtime Configuration
|
|
246
|
+
|
|
247
|
+
Runtime settings (parallelism, caching, verbosity, random seed) live in a separate `RuntimeConfig`, so execution concerns stay decoupled from the analysis configs:
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
from ml4t.diagnostic.config import RuntimeConfig
|
|
251
|
+
|
|
252
|
+
runtime = RuntimeConfig(
|
|
253
|
+
n_jobs=-1, # Use all CPU cores
|
|
254
|
+
cache_enabled=True, # Cache expensive computations
|
|
255
|
+
verbose=True, # Show progress
|
|
256
|
+
random_state=42, # Reproducibility
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
# Pass as separate parameter
|
|
260
|
+
result = analyze_features(df, config=DiagnosticConfig(), runtime=runtime)
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## References
|
|
264
|
+
|
|
265
|
+
- **Pydantic v2**: https://docs.pydantic.dev/latest/
|
|
266
|
+
- **López de Prado, M.**: "Advances in Financial Machine Learning" (Wiley, 2018)
|
|
267
|
+
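- **Bailey & López de Prado**: Multiple testing papers — "The Sharpe Ratio Efficient Frontier" (2012; PSR, MinTRL) and "The Deflated Sharpe Ratio" (2014; DSR)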
|