ml4t-diagnostic 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml4t/diagnostic/AGENT.md +25 -0
- ml4t/diagnostic/__init__.py +166 -0
- ml4t/diagnostic/backends/__init__.py +10 -0
- ml4t/diagnostic/backends/adapter.py +192 -0
- ml4t/diagnostic/backends/polars_backend.py +899 -0
- ml4t/diagnostic/caching/__init__.py +40 -0
- ml4t/diagnostic/caching/cache.py +331 -0
- ml4t/diagnostic/caching/decorators.py +131 -0
- ml4t/diagnostic/caching/smart_cache.py +339 -0
- ml4t/diagnostic/config/AGENT.md +24 -0
- ml4t/diagnostic/config/README.md +267 -0
- ml4t/diagnostic/config/__init__.py +219 -0
- ml4t/diagnostic/config/barrier_config.py +277 -0
- ml4t/diagnostic/config/base.py +301 -0
- ml4t/diagnostic/config/event_config.py +148 -0
- ml4t/diagnostic/config/feature_config.py +404 -0
- ml4t/diagnostic/config/multi_signal_config.py +55 -0
- ml4t/diagnostic/config/portfolio_config.py +215 -0
- ml4t/diagnostic/config/report_config.py +391 -0
- ml4t/diagnostic/config/sharpe_config.py +202 -0
- ml4t/diagnostic/config/signal_config.py +206 -0
- ml4t/diagnostic/config/trade_analysis_config.py +310 -0
- ml4t/diagnostic/config/validation.py +279 -0
- ml4t/diagnostic/core/__init__.py +29 -0
- ml4t/diagnostic/core/numba_utils.py +315 -0
- ml4t/diagnostic/core/purging.py +372 -0
- ml4t/diagnostic/core/sampling.py +471 -0
- ml4t/diagnostic/errors/__init__.py +205 -0
- ml4t/diagnostic/evaluation/AGENT.md +26 -0
- ml4t/diagnostic/evaluation/__init__.py +437 -0
- ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
- ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
- ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
- ml4t/diagnostic/evaluation/dashboard.py +715 -0
- ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
- ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
- ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
- ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
- ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
- ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
- ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
- ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
- ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
- ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
- ml4t/diagnostic/evaluation/event_analysis.py +647 -0
- ml4t/diagnostic/evaluation/excursion.py +390 -0
- ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
- ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
- ml4t/diagnostic/evaluation/framework.py +935 -0
- ml4t/diagnostic/evaluation/metric_registry.py +255 -0
- ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
- ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
- ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
- ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
- ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
- ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
- ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
- ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
- ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
- ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
- ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
- ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
- ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
- ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
- ml4t/diagnostic/evaluation/multi_signal.py +550 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
- ml4t/diagnostic/evaluation/report_generation.py +824 -0
- ml4t/diagnostic/evaluation/signal_selector.py +452 -0
- ml4t/diagnostic/evaluation/stat_registry.py +139 -0
- ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
- ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
- ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
- ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
- ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
- ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
- ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
- ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
- ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
- ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
- ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
- ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
- ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
- ml4t/diagnostic/evaluation/stats/moments.py +164 -0
- ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
- ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
- ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
- ml4t/diagnostic/evaluation/themes.py +330 -0
- ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
- ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
- ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
- ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
- ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
- ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
- ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
- ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
- ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
- ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
- ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
- ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
- ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
- ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
- ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
- ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
- ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
- ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
- ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
- ml4t/diagnostic/evaluation/validated_cv.py +535 -0
- ml4t/diagnostic/evaluation/visualization.py +1050 -0
- ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
- ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
- ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
- ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
- ml4t/diagnostic/integration/__init__.py +48 -0
- ml4t/diagnostic/integration/backtest_contract.py +671 -0
- ml4t/diagnostic/integration/data_contract.py +316 -0
- ml4t/diagnostic/integration/engineer_contract.py +226 -0
- ml4t/diagnostic/logging/__init__.py +77 -0
- ml4t/diagnostic/logging/logger.py +245 -0
- ml4t/diagnostic/logging/performance.py +234 -0
- ml4t/diagnostic/logging/progress.py +234 -0
- ml4t/diagnostic/logging/wandb.py +412 -0
- ml4t/diagnostic/metrics/__init__.py +9 -0
- ml4t/diagnostic/metrics/percentiles.py +128 -0
- ml4t/diagnostic/py.typed +1 -0
- ml4t/diagnostic/reporting/__init__.py +43 -0
- ml4t/diagnostic/reporting/base.py +130 -0
- ml4t/diagnostic/reporting/html_renderer.py +275 -0
- ml4t/diagnostic/reporting/json_renderer.py +51 -0
- ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
- ml4t/diagnostic/results/AGENT.md +24 -0
- ml4t/diagnostic/results/__init__.py +105 -0
- ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
- ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
- ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
- ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
- ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
- ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
- ml4t/diagnostic/results/barrier_results/validation.py +38 -0
- ml4t/diagnostic/results/base.py +177 -0
- ml4t/diagnostic/results/event_results.py +349 -0
- ml4t/diagnostic/results/feature_results.py +787 -0
- ml4t/diagnostic/results/multi_signal_results.py +431 -0
- ml4t/diagnostic/results/portfolio_results.py +281 -0
- ml4t/diagnostic/results/sharpe_results.py +448 -0
- ml4t/diagnostic/results/signal_results/__init__.py +74 -0
- ml4t/diagnostic/results/signal_results/ic.py +581 -0
- ml4t/diagnostic/results/signal_results/irtc.py +110 -0
- ml4t/diagnostic/results/signal_results/quantile.py +392 -0
- ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
- ml4t/diagnostic/results/signal_results/turnover.py +213 -0
- ml4t/diagnostic/results/signal_results/validation.py +147 -0
- ml4t/diagnostic/signal/AGENT.md +17 -0
- ml4t/diagnostic/signal/__init__.py +69 -0
- ml4t/diagnostic/signal/_report.py +152 -0
- ml4t/diagnostic/signal/_utils.py +261 -0
- ml4t/diagnostic/signal/core.py +275 -0
- ml4t/diagnostic/signal/quantile.py +148 -0
- ml4t/diagnostic/signal/result.py +214 -0
- ml4t/diagnostic/signal/signal_ic.py +129 -0
- ml4t/diagnostic/signal/turnover.py +182 -0
- ml4t/diagnostic/splitters/AGENT.md +19 -0
- ml4t/diagnostic/splitters/__init__.py +36 -0
- ml4t/diagnostic/splitters/base.py +501 -0
- ml4t/diagnostic/splitters/calendar.py +421 -0
- ml4t/diagnostic/splitters/calendar_config.py +91 -0
- ml4t/diagnostic/splitters/combinatorial.py +1064 -0
- ml4t/diagnostic/splitters/config.py +322 -0
- ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
- ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
- ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
- ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
- ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
- ml4t/diagnostic/splitters/group_isolation.py +329 -0
- ml4t/diagnostic/splitters/persistence.py +316 -0
- ml4t/diagnostic/splitters/utils.py +207 -0
- ml4t/diagnostic/splitters/walk_forward.py +757 -0
- ml4t/diagnostic/utils/__init__.py +42 -0
- ml4t/diagnostic/utils/config.py +542 -0
- ml4t/diagnostic/utils/dependencies.py +318 -0
- ml4t/diagnostic/utils/sessions.py +127 -0
- ml4t/diagnostic/validation/__init__.py +54 -0
- ml4t/diagnostic/validation/dataframe.py +274 -0
- ml4t/diagnostic/validation/returns.py +280 -0
- ml4t/diagnostic/validation/timeseries.py +299 -0
- ml4t/diagnostic/visualization/AGENT.md +19 -0
- ml4t/diagnostic/visualization/__init__.py +223 -0
- ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
- ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
- ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
- ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
- ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
- ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
- ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
- ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
- ml4t/diagnostic/visualization/barrier_plots.py +782 -0
- ml4t/diagnostic/visualization/core.py +1060 -0
- ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
- ml4t/diagnostic/visualization/dashboards/base.py +582 -0
- ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
- ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
- ml4t/diagnostic/visualization/dashboards.py +43 -0
- ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
- ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
- ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
- ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
- ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
- ml4t/diagnostic/visualization/feature_plots.py +888 -0
- ml4t/diagnostic/visualization/interaction_plots.py +618 -0
- ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
- ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
- ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
- ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
- ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
- ml4t/diagnostic/visualization/report_generation.py +1343 -0
- ml4t/diagnostic/visualization/signal/__init__.py +103 -0
- ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
- ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
- ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
- ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
- ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
- ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
- ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
- ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
- ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
- ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
- ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
- ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,499 @@
|
|
|
1
|
+
"""Distribution diagnostics for financial returns analysis.
|
|
2
|
+
|
|
3
|
+
This module provides statistical tests and metrics for analyzing the distribution
|
|
4
|
+
properties of financial returns:
|
|
5
|
+
|
|
6
|
+
- Moments (skewness and excess kurtosis) with significance tests
|
|
7
|
+
- Jarque-Bera test for normality (based on moments)
|
|
8
|
+
- Shapiro-Wilk test for normality (more powerful for small samples)
|
|
9
|
+
- Heavy tail detection using Hill estimator and QQ plots
|
|
10
|
+
- Tail classification (thin, medium, heavy) for power law analysis
|
|
11
|
+
|
|
12
|
+
Distribution analysis is critical for understanding return characteristics and
|
|
13
|
+
validating modeling assumptions. Many financial models assume normally distributed
|
|
14
|
+
returns, but real financial data often exhibits:
|
|
15
|
+
- Skewness (asymmetry): Negative skew common in equity returns
|
|
16
|
+
- Excess kurtosis (fat tails): More extreme events than normal distribution
|
|
17
|
+
- Non-normality: Violations of Gaussian assumptions
|
|
18
|
+
- Heavy tails: Power law behavior in extreme events
|
|
19
|
+
|
|
20
|
+
Example:
|
|
21
|
+
>>> import numpy as np
|
|
22
|
+
>>> from ml4t.diagnostic.evaluation.distribution import (
|
|
23
|
+
... compute_moments, jarque_bera_test, shapiro_wilk_test,
|
|
24
|
+
... hill_estimator, analyze_tails, analyze_distribution
|
|
25
|
+
... )
|
|
26
|
+
>>>
|
|
27
|
+
>>> # Quick comprehensive analysis (recommended)
|
|
28
|
+
>>> returns = np.random.standard_t(df=5, size=1000) * 0.01
|
|
29
|
+
>>> result = analyze_distribution(returns)
|
|
30
|
+
>>> print(result.summary())
|
|
31
|
+
>>> print(f"Recommended: {result.recommended_distribution}")
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
from dataclasses import dataclass
|
|
37
|
+
|
|
38
|
+
import numpy as np
|
|
39
|
+
import pandas as pd
|
|
40
|
+
|
|
41
|
+
from ml4t.diagnostic.errors import ComputationError, ValidationError
|
|
42
|
+
|
|
43
|
+
# Import from submodules
|
|
44
|
+
from ml4t.diagnostic.evaluation.distribution.moments import (
|
|
45
|
+
MomentsResult,
|
|
46
|
+
compute_moments,
|
|
47
|
+
)
|
|
48
|
+
from ml4t.diagnostic.evaluation.distribution.tails import (
|
|
49
|
+
HillEstimatorResult,
|
|
50
|
+
QQPlotData,
|
|
51
|
+
TailAnalysisResult,
|
|
52
|
+
analyze_tails,
|
|
53
|
+
generate_qq_data,
|
|
54
|
+
hill_estimator,
|
|
55
|
+
)
|
|
56
|
+
from ml4t.diagnostic.evaluation.distribution.tests import (
|
|
57
|
+
JarqueBeraResult,
|
|
58
|
+
ShapiroWilkResult,
|
|
59
|
+
jarque_bera_test,
|
|
60
|
+
shapiro_wilk_test,
|
|
61
|
+
)
|
|
62
|
+
from ml4t.diagnostic.logging import get_logger
|
|
63
|
+
|
|
64
|
+
logger = get_logger(__name__)
|
|
65
|
+
|
|
66
|
+
# Public API
|
|
67
|
+
__all__ = [
|
|
68
|
+
# Result classes
|
|
69
|
+
"MomentsResult",
|
|
70
|
+
"JarqueBeraResult",
|
|
71
|
+
"ShapiroWilkResult",
|
|
72
|
+
"HillEstimatorResult",
|
|
73
|
+
"QQPlotData",
|
|
74
|
+
"TailAnalysisResult",
|
|
75
|
+
"DistributionAnalysisResult",
|
|
76
|
+
# Functions
|
|
77
|
+
"compute_moments",
|
|
78
|
+
"jarque_bera_test",
|
|
79
|
+
"shapiro_wilk_test",
|
|
80
|
+
"hill_estimator",
|
|
81
|
+
"generate_qq_data",
|
|
82
|
+
"analyze_tails",
|
|
83
|
+
"analyze_distribution",
|
|
84
|
+
]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class DistributionAnalysisResult:
    """Aggregated outcome of a full distribution analysis.

    Bundles the moment estimates, both normality tests, and (optionally) the
    tail diagnostics into a single result object, together with a consensus
    normality verdict and a recommended modeling distribution.

    Attributes:
        moments_result: Skewness/kurtosis estimates with significance flags
        jarque_bera_result: Outcome of the Jarque-Bera normality test
        shapiro_wilk_result: Outcome of the Shapiro-Wilk normality test
        tail_analysis_result: Hill/QQ tail diagnostics, or None if skipped
        is_normal: True only when both normality tests accept H0
        recommended_distribution: Suggested fit ("normal", "t", "stable", "heavy-tailed")
        recommended_df: Degrees of freedom when a t-distribution is suggested, else None
        interpretation: Plain-language summary of the findings
    """

    moments_result: MomentsResult
    jarque_bera_result: JarqueBeraResult
    shapiro_wilk_result: ShapiroWilkResult
    tail_analysis_result: TailAnalysisResult | None
    is_normal: bool
    recommended_distribution: str
    recommended_df: int | None
    interpretation: str

    def __repr__(self) -> str:
        """Compact one-line representation for logs and debugging."""
        return (
            f"DistributionAnalysisResult(is_normal={self.is_normal}, "
            f"recommended='{self.recommended_distribution}', "
            f"n={self.moments_result.n_obs})"
        )

    def summary(self) -> str:
        """Render the full analysis as a formatted, human-readable report.

        Returns:
            Multi-line string covering moments, normality tests, tail
            diagnostics (when available), the recommendation, the
            interpretation text, and risk implications.
        """
        mom = self.moments_result
        rule = "=" * 70

        # Header and basic sample statistics.
        parts: list[str] = [
            rule,
            "COMPREHENSIVE DISTRIBUTION ANALYSIS",
            rule,
            f"Sample Size: {mom.n_obs}",
            f"Mean: {mom.mean:.6f}",
            f"Std Dev: {mom.std:.6f}",
            "",
            "MOMENTS:",
            f" Skewness: {mom.skewness:.4f}",
        ]

        # Skewness significance note.
        if mom.skewness_significant:
            direction = "right-skewed" if mom.skewness > 0 else "left-skewed"
            parts.append(f" (Significantly {direction})")
        else:
            parts.append(" (Not significantly different from 0)")

        # Kurtosis significance note.
        parts.append(f" Excess Kurtosis: {mom.excess_kurtosis:.4f}")
        if mom.excess_kurtosis_significant:
            tail_kind = "fat tails" if mom.excess_kurtosis > 0 else "thin tails"
            parts.append(f" (Significantly {tail_kind})")
        else:
            parts.append(" (Not significantly different from 0)")

        # Normality test outcomes and consensus.
        jb = self.jarque_bera_result
        sw = self.shapiro_wilk_result
        parts += [
            "",
            "NORMALITY TESTS:",
            f" Jarque-Bera: p={jb.p_value:.4f} ({'PASS' if jb.is_normal else 'FAIL'})",
            f" Shapiro-Wilk: p={sw.p_value:.4f} ({'PASS' if sw.is_normal else 'FAIL'})",
            f" Consensus: {'NORMAL' if self.is_normal else 'NON-NORMAL'}",
        ]

        # Tail section appears only when tail analysis was computed.
        tails = self.tail_analysis_result
        if tails is not None:
            hill = tails.hill_result
            parts += [
                "",
                "TAIL ANALYSIS:",
                f" Hill Tail Index: {hill.tail_index:.4f}",
                f" Tail Classification: {hill.classification.upper()}",
                f" Normal R²: {tails.qq_normal.r_squared:.4f}",
            ]
            if tails.qq_t is not None:
                parts.append(
                    f" Student's t R²: {tails.qq_t.r_squared:.4f} "
                    f"(df={tails.qq_t.df})"
                )
            parts.append(f" Best Fit: {tails.best_fit.upper()}")

        # Recommendation.
        parts += [
            "",
            rule,
            "RECOMMENDATION:",
            f" Distribution: {self.recommended_distribution.upper()}",
        ]
        if self.recommended_df is not None:
            parts.append(f" Degrees of Freedom: {self.recommended_df}")

        # Interpretation text, indented line by line.
        parts += ["", "INTERPRETATION:"]
        parts.extend(f" {line}" for line in self.interpretation.split("\n"))

        # Risk implications keyed off the recommended distribution.
        parts += ["", "RISK IMPLICATIONS:"]
        if self.recommended_distribution == "normal":
            parts += [
                " - Standard normal-based risk measures appropriate (VaR, Sharpe)",
                " - Classical portfolio optimization methods valid",
                " - Parametric statistical inference reliable",
            ]
        elif self.recommended_distribution == "t":
            parts += [
                f" - Use Student's t distribution (df={self.recommended_df}) for modeling",
                " - Heavier tails than normal => higher extreme event probability",
                " - Consider robust Sharpe ratio alternatives (e.g., Sortino)",
                " - VaR should account for fat tails",
            ]
        elif self.recommended_distribution in ["stable", "heavy-tailed"]:
            parts += [
                " - WARNING: Heavy tails detected => use extreme value theory",
                " - Standard risk measures (VaR, Sharpe) may be unreliable",
                " - Use CVaR (Expected Shortfall) instead of VaR",
                " - Consider tail risk hedging strategies",
                " - Apply robust portfolio optimization methods",
            ]

        parts.append(rule)

        return "\n".join(parts)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def analyze_distribution(
    data: pd.Series | np.ndarray,
    alpha: float = 0.05,
    compute_tails: bool = True,
) -> DistributionAnalysisResult:
    """Comprehensive distribution analysis combining all methods.

    Performs complete statistical characterization of distribution properties:
    1. Computes moments (skewness, kurtosis) with significance tests
    2. Runs normality tests (Jarque-Bera, Shapiro-Wilk)
    3. Analyzes tail behavior (Hill estimator, QQ plots) if compute_tails=True
    4. Determines consensus and recommends appropriate distribution

    This unified analysis provides actionable guidance for selecting statistical
    methods and risk models appropriate for the data characteristics.

    Args:
        data: Time series data (1D array or Series), typically financial returns
        alpha: Significance level for statistical tests (default 0.05)
        compute_tails: Whether to run tail analysis (default True, can be slow for large n)

    Returns:
        DistributionAnalysisResult with comprehensive analysis and recommendations

    Raises:
        ValidationError: If data is invalid (empty, wrong shape, etc.)
        ComputationError: If analysis fails

    Example:
        >>> import numpy as np
        >>> from ml4t.diagnostic.evaluation.distribution import analyze_distribution
        >>>
        >>> returns = np.random.standard_t(df=5, size=1000) * 0.01  # Heavy-tailed returns
        >>> result = analyze_distribution(returns, alpha=0.05, compute_tails=True)
        >>> print(result.summary())
        >>> print(f"Use {result.recommended_distribution} distribution")
        >>> if result.recommended_df:
        ...     print(f"Degrees of freedom: {result.recommended_df}")
        >>>
        >>> # Quick analysis without tail computation (faster)
        >>> result_fast = analyze_distribution(returns, compute_tails=False)

    Notes:
        - Tail analysis (compute_tails=True) adds Hill estimator and QQ plots
        - Skip tail analysis for very large datasets or when speed is critical
        - Consensus normality requires both JB and SW to accept H0
        - Recommendation logic prioritizes tail analysis over simple normality tests
        - For n < 50, Shapiro-Wilk test may be unreliable (warning issued)
    """
    # Input validation (basic check, detailed checks in subfunctions)
    if data is None:
        raise ValidationError("Data cannot be None", context={"function": "analyze_distribution"})

    logger.info(
        "Starting comprehensive distribution analysis",
        compute_tails=compute_tails,
        alpha=alpha,
    )

    try:
        # 1. Compute moments
        moments_result = compute_moments(data, test_significance=True, alpha=alpha)

        # 2. Jarque-Bera test
        jarque_bera_result = jarque_bera_test(data, alpha=alpha)

        # 3. Shapiro-Wilk test
        shapiro_wilk_result = shapiro_wilk_test(data, alpha=alpha)

        # 4. Tail analysis (optional, best-effort: a failure here degrades
        #    gracefully to a moments-based recommendation instead of aborting)
        tail_analysis_result = None
        if compute_tails:
            try:
                tail_analysis_result = analyze_tails(data)
            except Exception as e:
                logger.warning(f"Tail analysis failed, skipping: {e}")
                # Continue without tail analysis

        # 5. Determine consensus normality
        # Both tests must accept H0 for consensus normality
        is_normal = jarque_bera_result.is_normal and shapiro_wilk_result.is_normal

        # 6. Recommend distribution
        recommended_distribution, recommended_df = _recommend_distribution(
            is_normal=is_normal,
            moments_result=moments_result,
            tail_analysis_result=tail_analysis_result,
        )

        # 7. Generate interpretation
        interpretation = _generate_interpretation(
            is_normal=is_normal,
            moments_result=moments_result,
            jarque_bera_result=jarque_bera_result,
            shapiro_wilk_result=shapiro_wilk_result,
            tail_analysis_result=tail_analysis_result,
            recommended_distribution=recommended_distribution,
        )

        logger.info(
            "Distribution analysis completed",
            is_normal=is_normal,
            recommended=recommended_distribution,
            n_obs=moments_result.n_obs,
        )

        return DistributionAnalysisResult(
            moments_result=moments_result,
            jarque_bera_result=jarque_bera_result,
            shapiro_wilk_result=shapiro_wilk_result,
            tail_analysis_result=tail_analysis_result,
            is_normal=is_normal,
            recommended_distribution=recommended_distribution,
            recommended_df=recommended_df,
            interpretation=interpretation,
        )

    except (ValidationError, ComputationError):
        # Already domain-specific; propagate unchanged.
        raise
    except Exception as e:
        logger.error("Distribution analysis failed", error=str(e))
        # Chain explicitly with `from e` (replaces the previous `# noqa: B904`
        # suppression): this sets __cause__ so the original traceback is kept,
        # in addition to the project error's own `cause` field.
        raise ComputationError(
            f"Distribution analysis failed: {e}",
            context={"function": "analyze_distribution"},
            cause=e,
        ) from e
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _recommend_distribution(
|
|
368
|
+
is_normal: bool,
|
|
369
|
+
moments_result: MomentsResult,
|
|
370
|
+
tail_analysis_result: TailAnalysisResult | None,
|
|
371
|
+
) -> tuple[str, int | None]:
|
|
372
|
+
"""Internal: Recommend distribution based on analysis results.
|
|
373
|
+
|
|
374
|
+
Logic:
|
|
375
|
+
1. If tail analysis available, prioritize its recommendation
|
|
376
|
+
2. If both normality tests pass, recommend normal
|
|
377
|
+
3. If heavy tails detected (alpha <= 2), recommend stable/heavy-tailed
|
|
378
|
+
4. If medium tails (2 < alpha <= 4), recommend Student's t with estimated df
|
|
379
|
+
5. Otherwise, recommend t-distribution for non-normal data
|
|
380
|
+
|
|
381
|
+
Returns:
|
|
382
|
+
Tuple of (distribution_name, degrees_of_freedom)
|
|
383
|
+
"""
|
|
384
|
+
# If tail analysis available, use its recommendation
|
|
385
|
+
if tail_analysis_result is not None:
|
|
386
|
+
best_fit = tail_analysis_result.best_fit
|
|
387
|
+
tail_index = tail_analysis_result.hill_result.tail_index
|
|
388
|
+
classification = tail_analysis_result.hill_result.classification
|
|
389
|
+
|
|
390
|
+
if best_fit == "normal":
|
|
391
|
+
return ("normal", None)
|
|
392
|
+
elif best_fit == "t":
|
|
393
|
+
# Use df from QQ plot if available
|
|
394
|
+
if tail_analysis_result.qq_t is not None:
|
|
395
|
+
return ("t", tail_analysis_result.qq_t.df)
|
|
396
|
+
else:
|
|
397
|
+
# Estimate df from tail index: df ≈ 2*alpha for medium tails
|
|
398
|
+
df = max(2, min(30, int(round(2 * tail_index))))
|
|
399
|
+
return ("t", df)
|
|
400
|
+
elif best_fit == "heavy-tailed":
|
|
401
|
+
# Very heavy tails
|
|
402
|
+
if classification == "heavy" and tail_index <= 2.0:
|
|
403
|
+
return ("stable", None) # Stable distribution for alpha <= 2
|
|
404
|
+
else:
|
|
405
|
+
return ("heavy-tailed", None)
|
|
406
|
+
|
|
407
|
+
# Fallback: Use normality tests and moments
|
|
408
|
+
if is_normal:
|
|
409
|
+
return ("normal", None)
|
|
410
|
+
|
|
411
|
+
# Non-normal: check excess kurtosis
|
|
412
|
+
if moments_result.excess_kurtosis > 2.0:
|
|
413
|
+
# Very fat tails => recommend heavy-tailed
|
|
414
|
+
return ("heavy-tailed", None)
|
|
415
|
+
elif moments_result.excess_kurtosis > 0.5:
|
|
416
|
+
# Moderate fat tails => recommend t with estimated df
|
|
417
|
+
# Heuristic: df ≈ 6/excess_kurtosis + 4 (for excess kurtosis)
|
|
418
|
+
df = max(3, min(30, int(round(6 / moments_result.excess_kurtosis + 4))))
|
|
419
|
+
return ("t", df)
|
|
420
|
+
else:
|
|
421
|
+
# Slight deviation from normal => t with higher df
|
|
422
|
+
return ("t", 10)
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def _generate_interpretation(
    is_normal: bool,
    moments_result: MomentsResult,
    jarque_bera_result: JarqueBeraResult,
    shapiro_wilk_result: ShapiroWilkResult,
    tail_analysis_result: TailAnalysisResult | None,
    recommended_distribution: str,
) -> str:
    """Internal: Generate human-readable interpretation.

    Combines the overall normality verdict, the moment diagnostics,
    the optional tail analysis, and the rationale for the recommended
    distribution into a single prose summary.

    Returns:
        Multi-line interpretation string
    """
    msgs: list[str] = []

    # --- Normality verdict -------------------------------------------------
    if is_normal:
        msgs.append("Data is consistent with normal distribution (both tests pass).")
        msgs.append("Standard statistical methods and risk measures are appropriate.")
    else:
        msgs.append("Data deviates from normality (at least one test rejects H0).")

        # Name which of the two tests drove the rejection.
        jb_ok = jarque_bera_result.is_normal
        sw_ok = shapiro_wilk_result.is_normal
        if jb_ok and not sw_ok:
            msgs.append("Shapiro-Wilk test rejects normality (more powerful for small samples).")
        elif sw_ok and not jb_ok:
            msgs.append("Jarque-Bera test rejects normality (based on skewness/kurtosis).")
        else:
            msgs.append("Both normality tests reject H0.")

    # --- Third- and fourth-moment diagnostics ------------------------------
    skew = moments_result.skewness
    if moments_result.skewness_significant:
        if skew > 0:
            msgs.append(
                f"Significant positive skewness ({skew:.3f}) indicates right tail is heavier."
            )
        else:
            msgs.append(
                f"Significant negative skewness ({skew:.3f}) "
                "indicates left tail is heavier (common for equity returns)."
            )

    kurt = moments_result.excess_kurtosis
    if moments_result.excess_kurtosis_significant and kurt > 0:
        msgs.append(
            f"Significant excess kurtosis ({kurt:.3f}) "
            "indicates fat tails and higher extreme event probability."
        )

    # --- Hill-estimator tail diagnostics (only when available) -------------
    if tail_analysis_result is not None:
        hill = tail_analysis_result.hill_result
        alpha = hill.tail_index
        if hill.classification == "heavy":
            tail_msg = f"Heavy tails detected (α={alpha:.2f} ≤ 2): power law behavior in extremes."
        elif hill.classification == "medium":
            tail_msg = f"Medium-heavy tails (α={alpha:.2f}): heavier than normal but finite variance."
        else:
            tail_msg = f"Thin tails detected (α={alpha:.2f} > 4): approaching normal tail behavior."
        msgs.append(tail_msg)

    # --- Rationale for the recommended distribution ------------------------
    # "stable" and "heavy-tailed" share the same rationale; unknown values
    # (none are produced upstream) simply add no line, as before.
    rationale = {
        "normal": "Normal distribution provides adequate fit for this data.",
        "t": "Student's t distribution recommended for heavier tails than normal.",
        "stable": "Heavy-tailed distribution required due to extreme power law behavior.",
        "heavy-tailed": "Heavy-tailed distribution required due to extreme power law behavior.",
    }.get(recommended_distribution)
    if rationale is not None:
        msgs.append(rationale)

    return "\n".join(msgs)
|