ml4t-diagnostic 0.1.0a1 (py3-none-any.whl)

Files changed (242)
  1. ml4t/diagnostic/AGENT.md +25 -0
  2. ml4t/diagnostic/__init__.py +166 -0
  3. ml4t/diagnostic/backends/__init__.py +10 -0
  4. ml4t/diagnostic/backends/adapter.py +192 -0
  5. ml4t/diagnostic/backends/polars_backend.py +899 -0
  6. ml4t/diagnostic/caching/__init__.py +40 -0
  7. ml4t/diagnostic/caching/cache.py +331 -0
  8. ml4t/diagnostic/caching/decorators.py +131 -0
  9. ml4t/diagnostic/caching/smart_cache.py +339 -0
  10. ml4t/diagnostic/config/AGENT.md +24 -0
  11. ml4t/diagnostic/config/README.md +267 -0
  12. ml4t/diagnostic/config/__init__.py +219 -0
  13. ml4t/diagnostic/config/barrier_config.py +277 -0
  14. ml4t/diagnostic/config/base.py +301 -0
  15. ml4t/diagnostic/config/event_config.py +148 -0
  16. ml4t/diagnostic/config/feature_config.py +404 -0
  17. ml4t/diagnostic/config/multi_signal_config.py +55 -0
  18. ml4t/diagnostic/config/portfolio_config.py +215 -0
  19. ml4t/diagnostic/config/report_config.py +391 -0
  20. ml4t/diagnostic/config/sharpe_config.py +202 -0
  21. ml4t/diagnostic/config/signal_config.py +206 -0
  22. ml4t/diagnostic/config/trade_analysis_config.py +310 -0
  23. ml4t/diagnostic/config/validation.py +279 -0
  24. ml4t/diagnostic/core/__init__.py +29 -0
  25. ml4t/diagnostic/core/numba_utils.py +315 -0
  26. ml4t/diagnostic/core/purging.py +372 -0
  27. ml4t/diagnostic/core/sampling.py +471 -0
  28. ml4t/diagnostic/errors/__init__.py +205 -0
  29. ml4t/diagnostic/evaluation/AGENT.md +26 -0
  30. ml4t/diagnostic/evaluation/__init__.py +437 -0
  31. ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
  32. ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
  33. ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
  34. ml4t/diagnostic/evaluation/dashboard.py +715 -0
  35. ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
  36. ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
  37. ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
  38. ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
  39. ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
  40. ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
  41. ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
  42. ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
  43. ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
  44. ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
  45. ml4t/diagnostic/evaluation/event_analysis.py +647 -0
  46. ml4t/diagnostic/evaluation/excursion.py +390 -0
  47. ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
  48. ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
  49. ml4t/diagnostic/evaluation/framework.py +935 -0
  50. ml4t/diagnostic/evaluation/metric_registry.py +255 -0
  51. ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
  52. ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
  53. ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
  54. ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
  55. ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
  56. ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
  57. ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
  58. ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
  59. ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
  60. ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
  61. ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
  62. ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
  63. ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
  64. ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
  65. ml4t/diagnostic/evaluation/multi_signal.py +550 -0
  66. ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
  67. ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
  68. ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
  69. ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
  70. ml4t/diagnostic/evaluation/report_generation.py +824 -0
  71. ml4t/diagnostic/evaluation/signal_selector.py +452 -0
  72. ml4t/diagnostic/evaluation/stat_registry.py +139 -0
  73. ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
  74. ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
  75. ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
  76. ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
  77. ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
  78. ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
  79. ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
  80. ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
  81. ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
  82. ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
  83. ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
  84. ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
  85. ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
  86. ml4t/diagnostic/evaluation/stats/moments.py +164 -0
  87. ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
  88. ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
  89. ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
  90. ml4t/diagnostic/evaluation/themes.py +330 -0
  91. ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
  92. ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
  93. ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
  94. ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
  95. ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
  96. ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
  97. ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
  98. ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
  99. ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
  100. ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
  101. ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
  102. ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
  103. ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
  104. ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
  105. ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
  106. ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
  107. ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
  108. ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
  109. ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
  110. ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
  111. ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
  112. ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
  113. ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
  114. ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
  115. ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
  116. ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
  117. ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
  118. ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
  119. ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
  120. ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
  121. ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
  122. ml4t/diagnostic/evaluation/validated_cv.py +535 -0
  123. ml4t/diagnostic/evaluation/visualization.py +1050 -0
  124. ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
  125. ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
  126. ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
  127. ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
  128. ml4t/diagnostic/integration/__init__.py +48 -0
  129. ml4t/diagnostic/integration/backtest_contract.py +671 -0
  130. ml4t/diagnostic/integration/data_contract.py +316 -0
  131. ml4t/diagnostic/integration/engineer_contract.py +226 -0
  132. ml4t/diagnostic/logging/__init__.py +77 -0
  133. ml4t/diagnostic/logging/logger.py +245 -0
  134. ml4t/diagnostic/logging/performance.py +234 -0
  135. ml4t/diagnostic/logging/progress.py +234 -0
  136. ml4t/diagnostic/logging/wandb.py +412 -0
  137. ml4t/diagnostic/metrics/__init__.py +9 -0
  138. ml4t/diagnostic/metrics/percentiles.py +128 -0
  139. ml4t/diagnostic/py.typed +1 -0
  140. ml4t/diagnostic/reporting/__init__.py +43 -0
  141. ml4t/diagnostic/reporting/base.py +130 -0
  142. ml4t/diagnostic/reporting/html_renderer.py +275 -0
  143. ml4t/diagnostic/reporting/json_renderer.py +51 -0
  144. ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
  145. ml4t/diagnostic/results/AGENT.md +24 -0
  146. ml4t/diagnostic/results/__init__.py +105 -0
  147. ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
  148. ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
  149. ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
  150. ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
  151. ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
  152. ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
  153. ml4t/diagnostic/results/barrier_results/validation.py +38 -0
  154. ml4t/diagnostic/results/base.py +177 -0
  155. ml4t/diagnostic/results/event_results.py +349 -0
  156. ml4t/diagnostic/results/feature_results.py +787 -0
  157. ml4t/diagnostic/results/multi_signal_results.py +431 -0
  158. ml4t/diagnostic/results/portfolio_results.py +281 -0
  159. ml4t/diagnostic/results/sharpe_results.py +448 -0
  160. ml4t/diagnostic/results/signal_results/__init__.py +74 -0
  161. ml4t/diagnostic/results/signal_results/ic.py +581 -0
  162. ml4t/diagnostic/results/signal_results/irtc.py +110 -0
  163. ml4t/diagnostic/results/signal_results/quantile.py +392 -0
  164. ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
  165. ml4t/diagnostic/results/signal_results/turnover.py +213 -0
  166. ml4t/diagnostic/results/signal_results/validation.py +147 -0
  167. ml4t/diagnostic/signal/AGENT.md +17 -0
  168. ml4t/diagnostic/signal/__init__.py +69 -0
  169. ml4t/diagnostic/signal/_report.py +152 -0
  170. ml4t/diagnostic/signal/_utils.py +261 -0
  171. ml4t/diagnostic/signal/core.py +275 -0
  172. ml4t/diagnostic/signal/quantile.py +148 -0
  173. ml4t/diagnostic/signal/result.py +214 -0
  174. ml4t/diagnostic/signal/signal_ic.py +129 -0
  175. ml4t/diagnostic/signal/turnover.py +182 -0
  176. ml4t/diagnostic/splitters/AGENT.md +19 -0
  177. ml4t/diagnostic/splitters/__init__.py +36 -0
  178. ml4t/diagnostic/splitters/base.py +501 -0
  179. ml4t/diagnostic/splitters/calendar.py +421 -0
  180. ml4t/diagnostic/splitters/calendar_config.py +91 -0
  181. ml4t/diagnostic/splitters/combinatorial.py +1064 -0
  182. ml4t/diagnostic/splitters/config.py +322 -0
  183. ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
  184. ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
  185. ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
  186. ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
  187. ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
  188. ml4t/diagnostic/splitters/group_isolation.py +329 -0
  189. ml4t/diagnostic/splitters/persistence.py +316 -0
  190. ml4t/diagnostic/splitters/utils.py +207 -0
  191. ml4t/diagnostic/splitters/walk_forward.py +757 -0
  192. ml4t/diagnostic/utils/__init__.py +42 -0
  193. ml4t/diagnostic/utils/config.py +542 -0
  194. ml4t/diagnostic/utils/dependencies.py +318 -0
  195. ml4t/diagnostic/utils/sessions.py +127 -0
  196. ml4t/diagnostic/validation/__init__.py +54 -0
  197. ml4t/diagnostic/validation/dataframe.py +274 -0
  198. ml4t/diagnostic/validation/returns.py +280 -0
  199. ml4t/diagnostic/validation/timeseries.py +299 -0
  200. ml4t/diagnostic/visualization/AGENT.md +19 -0
  201. ml4t/diagnostic/visualization/__init__.py +223 -0
  202. ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
  203. ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
  204. ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
  205. ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
  206. ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
  207. ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
  208. ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
  209. ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
  210. ml4t/diagnostic/visualization/barrier_plots.py +782 -0
  211. ml4t/diagnostic/visualization/core.py +1060 -0
  212. ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
  213. ml4t/diagnostic/visualization/dashboards/base.py +582 -0
  214. ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
  215. ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
  216. ml4t/diagnostic/visualization/dashboards.py +43 -0
  217. ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
  218. ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
  219. ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
  220. ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
  221. ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
  222. ml4t/diagnostic/visualization/feature_plots.py +888 -0
  223. ml4t/diagnostic/visualization/interaction_plots.py +618 -0
  224. ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
  225. ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
  226. ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
  227. ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
  228. ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
  229. ml4t/diagnostic/visualization/report_generation.py +1343 -0
  230. ml4t/diagnostic/visualization/signal/__init__.py +103 -0
  231. ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
  232. ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
  233. ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
  234. ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
  235. ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
  236. ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
  237. ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
  238. ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
  239. ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
  240. ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
  241. ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
  242. ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
ml4t/diagnostic/evaluation/distribution/__init__.py
@@ -0,0 +1,499 @@
+ """Distribution diagnostics for financial returns analysis.
+
+ This module provides statistical tests and metrics for analyzing the distribution
+ properties of financial returns:
+
+ - Moments (skewness and excess kurtosis) with significance tests
+ - Jarque-Bera test for normality (based on moments)
+ - Shapiro-Wilk test for normality (more powerful for small samples)
+ - Heavy tail detection using Hill estimator and QQ plots
+ - Tail classification (thin, medium, heavy) for power law analysis
+
+ Distribution analysis is critical for understanding return characteristics and
+ validating modeling assumptions. Many financial models assume normally distributed
+ returns, but real financial data often exhibits:
+ - Skewness (asymmetry): Negative skew common in equity returns
+ - Excess kurtosis (fat tails): More extreme events than normal distribution
+ - Non-normality: Violations of Gaussian assumptions
+ - Heavy tails: Power law behavior in extreme events
+
+ Example:
+     >>> import numpy as np
+     >>> from ml4t.diagnostic.evaluation.distribution import (
+     ...     compute_moments, jarque_bera_test, shapiro_wilk_test,
+     ...     hill_estimator, analyze_tails, analyze_distribution
+     ... )
+     >>>
+     >>> # Quick comprehensive analysis (recommended)
+     >>> returns = np.random.standard_t(df=5, size=1000) * 0.01
+     >>> result = analyze_distribution(returns)
+     >>> print(result.summary())
+     >>> print(f"Recommended: {result.recommended_distribution}")
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ import numpy as np
+ import pandas as pd
+
+ from ml4t.diagnostic.errors import ComputationError, ValidationError
+
+ # Import from submodules
+ from ml4t.diagnostic.evaluation.distribution.moments import (
+     MomentsResult,
+     compute_moments,
+ )
+ from ml4t.diagnostic.evaluation.distribution.tails import (
+     HillEstimatorResult,
+     QQPlotData,
+     TailAnalysisResult,
+     analyze_tails,
+     generate_qq_data,
+     hill_estimator,
+ )
+ from ml4t.diagnostic.evaluation.distribution.tests import (
+     JarqueBeraResult,
+     ShapiroWilkResult,
+     jarque_bera_test,
+     shapiro_wilk_test,
+ )
+ from ml4t.diagnostic.logging import get_logger
+
+ logger = get_logger(__name__)
+
+ # Public API
+ __all__ = [
+     # Result classes
+     "MomentsResult",
+     "JarqueBeraResult",
+     "ShapiroWilkResult",
+     "HillEstimatorResult",
+     "QQPlotData",
+     "TailAnalysisResult",
+     "DistributionAnalysisResult",
+     # Functions
+     "compute_moments",
+     "jarque_bera_test",
+     "shapiro_wilk_test",
+     "hill_estimator",
+     "generate_qq_data",
+     "analyze_tails",
+     "analyze_distribution",
+ ]
+
+
+ @dataclass
+ class DistributionAnalysisResult:
+     """Comprehensive distribution analysis results.
+
+     Combines moments, normality tests, and tail analysis to provide complete
+     characterization of distribution properties. This unified analysis helps
+     determine appropriate statistical methods and risk models.
+
+     Attributes:
+         moments_result: Distribution moments (skewness, kurtosis) with significance
+         jarque_bera_result: Jarque-Bera normality test result
+         shapiro_wilk_result: Shapiro-Wilk normality test result (more powerful for small n)
+         tail_analysis_result: Comprehensive tail analysis (Hill, QQ plots)
+         is_normal: Consensus normality assessment from all tests
+         recommended_distribution: Best-fit distribution ("normal", "t", "stable", "heavy-tailed")
+         recommended_df: Degrees of freedom for t-distribution (None otherwise)
+         interpretation: Human-readable summary of key findings
+     """
+
+     moments_result: MomentsResult
+     jarque_bera_result: JarqueBeraResult
+     shapiro_wilk_result: ShapiroWilkResult
+     tail_analysis_result: TailAnalysisResult | None
+     is_normal: bool
+     recommended_distribution: str
+     recommended_df: int | None
+     interpretation: str
+
+     def __repr__(self) -> str:
+         """String representation."""
+         return (
+             f"DistributionAnalysisResult(is_normal={self.is_normal}, "
+             f"recommended='{self.recommended_distribution}', "
+             f"n={self.moments_result.n_obs})"
+         )
+
+     def summary(self) -> str:
+         """Comprehensive distribution analysis summary.
+
+         Returns:
+             Formatted summary string combining all analyses
+         """
+         lines = [
+             "=" * 70,
+             "COMPREHENSIVE DISTRIBUTION ANALYSIS",
+             "=" * 70,
+             f"Sample Size: {self.moments_result.n_obs}",
+             f"Mean: {self.moments_result.mean:.6f}",
+             f"Std Dev: {self.moments_result.std:.6f}",
+             "",
+         ]
+
+         # Moments summary
+         lines.append("MOMENTS:")
+         lines.append(f" Skewness: {self.moments_result.skewness:.4f}")
+         if self.moments_result.skewness_significant:
+             skew_interp = "right-skewed" if self.moments_result.skewness > 0 else "left-skewed"
+             lines.append(f" (Significantly {skew_interp})")
+         else:
+             lines.append(" (Not significantly different from 0)")
+
+         lines.append(f" Excess Kurtosis: {self.moments_result.excess_kurtosis:.4f}")
+         if self.moments_result.excess_kurtosis_significant:
+             kurt_interp = "fat tails" if self.moments_result.excess_kurtosis > 0 else "thin tails"
+             lines.append(f" (Significantly {kurt_interp})")
+         else:
+             lines.append(" (Not significantly different from 0)")
+
+         # Normality tests summary
+         lines.append("")
+         lines.append("NORMALITY TESTS:")
+         lines.append(
+             f" Jarque-Bera: p={self.jarque_bera_result.p_value:.4f} "
+             f"({'PASS' if self.jarque_bera_result.is_normal else 'FAIL'})"
+         )
+         lines.append(
+             f" Shapiro-Wilk: p={self.shapiro_wilk_result.p_value:.4f} "
+             f"({'PASS' if self.shapiro_wilk_result.is_normal else 'FAIL'})"
+         )
+         lines.append(f" Consensus: {'NORMAL' if self.is_normal else 'NON-NORMAL'}")
+
+         # Tail analysis summary (if computed)
+         if self.tail_analysis_result is not None:
+             lines.append("")
+             lines.append("TAIL ANALYSIS:")
+             lines.append(
+                 f" Hill Tail Index: {self.tail_analysis_result.hill_result.tail_index:.4f}"
+             )
+             lines.append(
+                 f" Tail Classification: {self.tail_analysis_result.hill_result.classification.upper()}"
+             )
+             lines.append(
+                 f" Normal R²: {self.tail_analysis_result.qq_normal.r_squared:.4f}"
+             )
+             if self.tail_analysis_result.qq_t is not None:
+                 lines.append(
+                     f" Student's t R²: {self.tail_analysis_result.qq_t.r_squared:.4f} "
+                     f"(df={self.tail_analysis_result.qq_t.df})"
+                 )
+             lines.append(f" Best Fit: {self.tail_analysis_result.best_fit.upper()}")
+
+         # Recommendation
+         lines.append("")
+         lines.append("=" * 70)
+         lines.append("RECOMMENDATION:")
+         lines.append(f" Distribution: {self.recommended_distribution.upper()}")
+         if self.recommended_df is not None:
+             lines.append(f" Degrees of Freedom: {self.recommended_df}")
+
+         # Interpretation
+         lines.append("")
+         lines.append("INTERPRETATION:")
+         for line in self.interpretation.split("\n"):
+             lines.append(f" {line}")
+
+         # Risk implications
+         lines.append("")
+         lines.append("RISK IMPLICATIONS:")
+         if self.recommended_distribution == "normal":
+             lines.append(" - Standard normal-based risk measures appropriate (VaR, Sharpe)")
+             lines.append(" - Classical portfolio optimization methods valid")
+             lines.append(" - Parametric statistical inference reliable")
+         elif self.recommended_distribution == "t":
+             lines.append(
+                 f" - Use Student's t distribution (df={self.recommended_df}) for modeling"
+             )
+             lines.append(" - Heavier tails than normal => higher extreme event probability")
+             lines.append(" - Consider robust Sharpe ratio alternatives (e.g., Sortino)")
+             lines.append(" - VaR should account for fat tails")
+         elif self.recommended_distribution in ["stable", "heavy-tailed"]:
+             lines.append(" - WARNING: Heavy tails detected => use extreme value theory")
+             lines.append(" - Standard risk measures (VaR, Sharpe) may be unreliable")
+             lines.append(" - Use CVaR (Expected Shortfall) instead of VaR")
+             lines.append(" - Consider tail risk hedging strategies")
+             lines.append(" - Apply robust portfolio optimization methods")
+
+         lines.append("=" * 70)
+
+         return "\n".join(lines)
+
+
+ def analyze_distribution(
+     data: pd.Series | np.ndarray,
+     alpha: float = 0.05,
+     compute_tails: bool = True,
+ ) -> DistributionAnalysisResult:
+     """Comprehensive distribution analysis combining all methods.
+
+     Performs complete statistical characterization of distribution properties:
+     1. Computes moments (skewness, kurtosis) with significance tests
+     2. Runs normality tests (Jarque-Bera, Shapiro-Wilk)
+     3. Analyzes tail behavior (Hill estimator, QQ plots) if compute_tails=True
+     4. Determines consensus and recommends appropriate distribution
+
+     This unified analysis provides actionable guidance for selecting statistical
+     methods and risk models appropriate for the data characteristics.
+
+     Args:
+         data: Time series data (1D array or Series), typically financial returns
+         alpha: Significance level for statistical tests (default 0.05)
+         compute_tails: Whether to run tail analysis (default True, can be slow for large n)
+
+     Returns:
+         DistributionAnalysisResult with comprehensive analysis and recommendations
+
+     Raises:
+         ValidationError: If data is invalid (empty, wrong shape, etc.)
+         ComputationError: If analysis fails
+
+     Example:
+         >>> import numpy as np
+         >>> from ml4t.diagnostic.evaluation.distribution import analyze_distribution
+         >>>
+         >>> # Analyze financial returns
+         >>> returns = np.random.standard_t(df=5, size=1000) * 0.01  # Heavy-tailed returns
+         >>> result = analyze_distribution(returns, alpha=0.05, compute_tails=True)
+         >>>
+         >>> # Print comprehensive summary
+         >>> print(result.summary())
+         >>>
+         >>> # Get recommendation for risk modeling
+         >>> print(f"Use {result.recommended_distribution} distribution")
+         >>> if result.recommended_df:
+         ...     print(f"Degrees of freedom: {result.recommended_df}")
+         >>>
+         >>> # Check if standard methods are appropriate
+         >>> if result.is_normal:
+         ...     print("Standard normal-based methods OK")
+         ... else:
+         ...     print("Use robust methods for non-normal data")
+         >>>
+         >>> # Quick analysis without tail computation (faster)
+         >>> result_fast = analyze_distribution(returns, compute_tails=False)
+
+     Notes:
+         - Tail analysis (compute_tails=True) adds Hill estimator and QQ plots
+         - Skip tail analysis for very large datasets or when speed is critical
+         - Consensus normality requires both JB and SW to accept H0
+         - Recommendation logic prioritizes tail analysis over simple normality tests
+         - For n < 50, Shapiro-Wilk test may be unreliable (warning issued)
+     """
+     # Input validation (basic check, detailed checks in subfunctions)
+     if data is None:
+         raise ValidationError("Data cannot be None", context={"function": "analyze_distribution"})
+
+     logger.info(
+         "Starting comprehensive distribution analysis",
+         compute_tails=compute_tails,
+         alpha=alpha,
+     )
+
+     try:
+         # 1. Compute moments
+         moments_result = compute_moments(data, test_significance=True, alpha=alpha)
+
+         # 2. Jarque-Bera test
+         jarque_bera_result = jarque_bera_test(data, alpha=alpha)
+
+         # 3. Shapiro-Wilk test
+         shapiro_wilk_result = shapiro_wilk_test(data, alpha=alpha)
+
+         # 4. Tail analysis (optional)
+         tail_analysis_result = None
+         if compute_tails:
+             try:
+                 tail_analysis_result = analyze_tails(data)
+             except Exception as e:
+                 logger.warning(f"Tail analysis failed, skipping: {e}")
+                 # Continue without tail analysis
+
+         # 5. Determine consensus normality
+         # Both tests must accept H0 for consensus normality
+         is_normal = jarque_bera_result.is_normal and shapiro_wilk_result.is_normal
+
+         # 6. Recommend distribution
+         recommended_distribution, recommended_df = _recommend_distribution(
+             is_normal=is_normal,
+             moments_result=moments_result,
+             tail_analysis_result=tail_analysis_result,
+         )
+
+         # 7. Generate interpretation
+         interpretation = _generate_interpretation(
+             is_normal=is_normal,
+             moments_result=moments_result,
+             jarque_bera_result=jarque_bera_result,
+             shapiro_wilk_result=shapiro_wilk_result,
+             tail_analysis_result=tail_analysis_result,
+             recommended_distribution=recommended_distribution,
+         )
+
+         logger.info(
+             "Distribution analysis completed",
+             is_normal=is_normal,
+             recommended=recommended_distribution,
+             n_obs=moments_result.n_obs,
+         )
+
+         return DistributionAnalysisResult(
+             moments_result=moments_result,
+             jarque_bera_result=jarque_bera_result,
+             shapiro_wilk_result=shapiro_wilk_result,
+             tail_analysis_result=tail_analysis_result,
+             is_normal=is_normal,
+             recommended_distribution=recommended_distribution,
+             recommended_df=recommended_df,
+             interpretation=interpretation,
+         )
+
+     except (ValidationError, ComputationError):
+         raise
+     except Exception as e:
+         logger.error("Distribution analysis failed", error=str(e))
+         raise ComputationError(  # noqa: B904
+             f"Distribution analysis failed: {e}",
+             context={"function": "analyze_distribution"},
+             cause=e,
+         )
+
+
+ def _recommend_distribution(
+     is_normal: bool,
+     moments_result: MomentsResult,
+     tail_analysis_result: TailAnalysisResult | None,
+ ) -> tuple[str, int | None]:
+     """Internal: Recommend distribution based on analysis results.
+
+     Logic:
+     1. If tail analysis available, prioritize its recommendation
+     2. If both normality tests pass, recommend normal
+     3. If heavy tails detected (alpha <= 2), recommend stable/heavy-tailed
+     4. If medium tails (2 < alpha <= 4), recommend Student's t with estimated df
+     5. Otherwise, recommend t-distribution for non-normal data
+
+     Returns:
+         Tuple of (distribution_name, degrees_of_freedom)
+     """
+     # If tail analysis available, use its recommendation
+     if tail_analysis_result is not None:
+         best_fit = tail_analysis_result.best_fit
+         tail_index = tail_analysis_result.hill_result.tail_index
+         classification = tail_analysis_result.hill_result.classification
+
+         if best_fit == "normal":
+             return ("normal", None)
+         elif best_fit == "t":
+             # Use df from QQ plot if available
+             if tail_analysis_result.qq_t is not None:
+                 return ("t", tail_analysis_result.qq_t.df)
+             else:
+                 # Estimate df from tail index: df ≈ 2*alpha for medium tails
+                 df = max(2, min(30, int(round(2 * tail_index))))
+                 return ("t", df)
+         elif best_fit == "heavy-tailed":
+             # Very heavy tails
+             if classification == "heavy" and tail_index <= 2.0:
+                 return ("stable", None)  # Stable distribution for alpha <= 2
+             else:
+                 return ("heavy-tailed", None)
+
+     # Fallback: Use normality tests and moments
+     if is_normal:
+         return ("normal", None)
+
+     # Non-normal: check excess kurtosis
+     if moments_result.excess_kurtosis > 2.0:
+         # Very fat tails => recommend heavy-tailed
+         return ("heavy-tailed", None)
+     elif moments_result.excess_kurtosis > 0.5:
+         # Moderate fat tails => recommend t with estimated df
+         # Heuristic: df ≈ 6/excess_kurtosis + 4 (for excess kurtosis)
+         df = max(3, min(30, int(round(6 / moments_result.excess_kurtosis + 4))))
+         return ("t", df)
+     else:
+         # Slight deviation from normal => t with higher df
+         return ("t", 10)
+
+
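Editor's aside (not part of the packaged file): the `df ≈ 6/excess_kurtosis + 4` heuristic in `_recommend_distribution` above is the inversion of the Student's t excess-kurtosis formula, excess kurtosis = 6/(df - 4) for df > 4. A minimal sketch of that relationship, assuming SciPy is available:

from scipy import stats

nu = 10
kappa = 6 / (nu - 4)          # theoretical excess kurtosis of t(10) = 1.0
print(6 / kappa + 4)          # inverting the formula recovers nu = 10.0

# Empirical version: sample kurtosis is noisy for heavy-tailed data, so the
# recovered df is approximate (the package code also clamps it to [3, 30]).
sample = stats.t.rvs(df=nu, size=200_000, random_state=0)
kappa_hat = stats.kurtosis(sample, fisher=True)   # Fisher convention = excess kurtosis
print(kappa_hat, 6 / kappa_hat + 4)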
+ def _generate_interpretation(
+     is_normal: bool,
+     moments_result: MomentsResult,
+     jarque_bera_result: JarqueBeraResult,
+     shapiro_wilk_result: ShapiroWilkResult,
+     tail_analysis_result: TailAnalysisResult | None,
+     recommended_distribution: str,
+ ) -> str:
+     """Internal: Generate human-readable interpretation.
+
+     Returns:
+         Multi-line interpretation string
+     """
+     lines = []
+
+     # Normality assessment
+     if is_normal:
+         lines.append("Data is consistent with normal distribution (both tests pass).")
+         lines.append("Standard statistical methods and risk measures are appropriate.")
+     else:
+         lines.append("Data deviates from normality (at least one test rejects H0).")
+
+         # Explain why
+         if jarque_bera_result.is_normal and not shapiro_wilk_result.is_normal:
+             lines.append("Shapiro-Wilk test rejects normality (more powerful for small samples).")
+         elif not jarque_bera_result.is_normal and shapiro_wilk_result.is_normal:
+             lines.append("Jarque-Bera test rejects normality (based on skewness/kurtosis).")
+         else:
+             lines.append("Both normality tests reject H0.")
+
+     # Moments interpretation
+     if moments_result.skewness_significant:
+         if moments_result.skewness > 0:
+             lines.append(
+                 f"Significant positive skewness ({moments_result.skewness:.3f}) indicates right tail is heavier."
+             )
+         else:
+             lines.append(
+                 f"Significant negative skewness ({moments_result.skewness:.3f}) "
+                 "indicates left tail is heavier (common for equity returns)."
+             )
+
+     if moments_result.excess_kurtosis_significant and moments_result.excess_kurtosis > 0:
+         lines.append(
+             f"Significant excess kurtosis ({moments_result.excess_kurtosis:.3f}) "
+             "indicates fat tails and higher extreme event probability."
+         )
+
+     # Tail analysis interpretation
+     if tail_analysis_result is not None:
+         classification = tail_analysis_result.hill_result.classification
+         tail_index = tail_analysis_result.hill_result.tail_index
+
+         if classification == "heavy":
+             lines.append(
+                 f"Heavy tails detected (α={tail_index:.2f} ≤ 2): power law behavior in extremes."
+             )
+         elif classification == "medium":
+             lines.append(
+                 f"Medium-heavy tails (α={tail_index:.2f}): heavier than normal but finite variance."
+             )
+         else:
+             lines.append(
+                 f"Thin tails detected (α={tail_index:.2f} > 4): approaching normal tail behavior."
+             )
+
+     # Recommendation rationale
+     if recommended_distribution == "normal":
+         lines.append("Normal distribution provides adequate fit for this data.")
+     elif recommended_distribution == "t":
+         lines.append("Student's t distribution recommended for heavier tails than normal.")
+     elif recommended_distribution in ["stable", "heavy-tailed"]:
+         lines.append("Heavy-tailed distribution required due to extreme power law behavior.")
+
+     return "\n".join(lines)
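Editor's aside (not part of the packaged file): the tail classification used above (heavy for α ≤ 2, medium for 2 < α ≤ 4, thin for α > 4) is driven by a Hill tail-index estimate from `distribution/tails.py`, whose implementation is not shown in this diff. For orientation only, a textbook Hill estimator applied to absolute returns looks roughly like the sketch below; the package's `hill_estimator` may differ in tail choice and in how k is selected.

import numpy as np

def hill_tail_index(x: np.ndarray, k: int) -> float:
    """Textbook Hill estimate of the tail index from the k largest values of |x|."""
    z = np.sort(np.abs(np.asarray(x, dtype=float)))   # ascending order statistics
    top = z[-k:]                                       # k largest observations
    threshold = z[-k - 1]                              # (k+1)-th largest observation
    return k / np.sum(np.log(top / threshold))

rng = np.random.default_rng(0)
returns = rng.standard_t(df=5, size=10_000) * 0.01
# For Student's t the theoretical tail index equals the degrees of freedom (5 here);
# the estimate depends on the choice of k and carries some bias.
print(hill_tail_index(returns, k=500))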
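A second hedged aside: the module docstring calls Jarque-Bera a moment-based test. The statistic is JB = n/6 * (S^2 + K^2/4), with S the sample skewness and K the sample excess kurtosis, which is why it sits naturally alongside `compute_moments`. The cross-check below uses SciPy for illustration only; it is not necessarily how `jarque_bera_test` in `distribution/tests.py` is implemented.

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.standard_t(df=5, size=1_000) * 0.01

n = x.size
S = stats.skew(x)                        # sample skewness
K = stats.kurtosis(x, fisher=True)       # sample excess kurtosis
jb_manual = n / 6 * (S**2 + K**2 / 4)

res = stats.jarque_bera(x)
print(jb_manual, res.statistic, res.pvalue)   # manual statistic matches SciPy's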