ml4t-diagnostic 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. ml4t/diagnostic/AGENT.md +25 -0
  2. ml4t/diagnostic/__init__.py +166 -0
  3. ml4t/diagnostic/backends/__init__.py +10 -0
  4. ml4t/diagnostic/backends/adapter.py +192 -0
  5. ml4t/diagnostic/backends/polars_backend.py +899 -0
  6. ml4t/diagnostic/caching/__init__.py +40 -0
  7. ml4t/diagnostic/caching/cache.py +331 -0
  8. ml4t/diagnostic/caching/decorators.py +131 -0
  9. ml4t/diagnostic/caching/smart_cache.py +339 -0
  10. ml4t/diagnostic/config/AGENT.md +24 -0
  11. ml4t/diagnostic/config/README.md +267 -0
  12. ml4t/diagnostic/config/__init__.py +219 -0
  13. ml4t/diagnostic/config/barrier_config.py +277 -0
  14. ml4t/diagnostic/config/base.py +301 -0
  15. ml4t/diagnostic/config/event_config.py +148 -0
  16. ml4t/diagnostic/config/feature_config.py +404 -0
  17. ml4t/diagnostic/config/multi_signal_config.py +55 -0
  18. ml4t/diagnostic/config/portfolio_config.py +215 -0
  19. ml4t/diagnostic/config/report_config.py +391 -0
  20. ml4t/diagnostic/config/sharpe_config.py +202 -0
  21. ml4t/diagnostic/config/signal_config.py +206 -0
  22. ml4t/diagnostic/config/trade_analysis_config.py +310 -0
  23. ml4t/diagnostic/config/validation.py +279 -0
  24. ml4t/diagnostic/core/__init__.py +29 -0
  25. ml4t/diagnostic/core/numba_utils.py +315 -0
  26. ml4t/diagnostic/core/purging.py +372 -0
  27. ml4t/diagnostic/core/sampling.py +471 -0
  28. ml4t/diagnostic/errors/__init__.py +205 -0
  29. ml4t/diagnostic/evaluation/AGENT.md +26 -0
  30. ml4t/diagnostic/evaluation/__init__.py +437 -0
  31. ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
  32. ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
  33. ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
  34. ml4t/diagnostic/evaluation/dashboard.py +715 -0
  35. ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
  36. ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
  37. ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
  38. ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
  39. ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
  40. ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
  41. ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
  42. ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
  43. ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
  44. ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
  45. ml4t/diagnostic/evaluation/event_analysis.py +647 -0
  46. ml4t/diagnostic/evaluation/excursion.py +390 -0
  47. ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
  48. ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
  49. ml4t/diagnostic/evaluation/framework.py +935 -0
  50. ml4t/diagnostic/evaluation/metric_registry.py +255 -0
  51. ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
  52. ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
  53. ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
  54. ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
  55. ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
  56. ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
  57. ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
  58. ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
  59. ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
  60. ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
  61. ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
  62. ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
  63. ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
  64. ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
  65. ml4t/diagnostic/evaluation/multi_signal.py +550 -0
  66. ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
  67. ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
  68. ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
  69. ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
  70. ml4t/diagnostic/evaluation/report_generation.py +824 -0
  71. ml4t/diagnostic/evaluation/signal_selector.py +452 -0
  72. ml4t/diagnostic/evaluation/stat_registry.py +139 -0
  73. ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
  74. ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
  75. ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
  76. ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
  77. ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
  78. ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
  79. ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
  80. ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
  81. ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
  82. ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
  83. ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
  84. ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
  85. ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
  86. ml4t/diagnostic/evaluation/stats/moments.py +164 -0
  87. ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
  88. ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
  89. ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
  90. ml4t/diagnostic/evaluation/themes.py +330 -0
  91. ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
  92. ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
  93. ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
  94. ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
  95. ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
  96. ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
  97. ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
  98. ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
  99. ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
  100. ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
  101. ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
  102. ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
  103. ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
  104. ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
  105. ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
  106. ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
  107. ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
  108. ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
  109. ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
  110. ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
  111. ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
  112. ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
  113. ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
  114. ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
  115. ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
  116. ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
  117. ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
  118. ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
  119. ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
  120. ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
  121. ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
  122. ml4t/diagnostic/evaluation/validated_cv.py +535 -0
  123. ml4t/diagnostic/evaluation/visualization.py +1050 -0
  124. ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
  125. ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
  126. ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
  127. ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
  128. ml4t/diagnostic/integration/__init__.py +48 -0
  129. ml4t/diagnostic/integration/backtest_contract.py +671 -0
  130. ml4t/diagnostic/integration/data_contract.py +316 -0
  131. ml4t/diagnostic/integration/engineer_contract.py +226 -0
  132. ml4t/diagnostic/logging/__init__.py +77 -0
  133. ml4t/diagnostic/logging/logger.py +245 -0
  134. ml4t/diagnostic/logging/performance.py +234 -0
  135. ml4t/diagnostic/logging/progress.py +234 -0
  136. ml4t/diagnostic/logging/wandb.py +412 -0
  137. ml4t/diagnostic/metrics/__init__.py +9 -0
  138. ml4t/diagnostic/metrics/percentiles.py +128 -0
  139. ml4t/diagnostic/py.typed +1 -0
  140. ml4t/diagnostic/reporting/__init__.py +43 -0
  141. ml4t/diagnostic/reporting/base.py +130 -0
  142. ml4t/diagnostic/reporting/html_renderer.py +275 -0
  143. ml4t/diagnostic/reporting/json_renderer.py +51 -0
  144. ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
  145. ml4t/diagnostic/results/AGENT.md +24 -0
  146. ml4t/diagnostic/results/__init__.py +105 -0
  147. ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
  148. ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
  149. ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
  150. ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
  151. ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
  152. ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
  153. ml4t/diagnostic/results/barrier_results/validation.py +38 -0
  154. ml4t/diagnostic/results/base.py +177 -0
  155. ml4t/diagnostic/results/event_results.py +349 -0
  156. ml4t/diagnostic/results/feature_results.py +787 -0
  157. ml4t/diagnostic/results/multi_signal_results.py +431 -0
  158. ml4t/diagnostic/results/portfolio_results.py +281 -0
  159. ml4t/diagnostic/results/sharpe_results.py +448 -0
  160. ml4t/diagnostic/results/signal_results/__init__.py +74 -0
  161. ml4t/diagnostic/results/signal_results/ic.py +581 -0
  162. ml4t/diagnostic/results/signal_results/irtc.py +110 -0
  163. ml4t/diagnostic/results/signal_results/quantile.py +392 -0
  164. ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
  165. ml4t/diagnostic/results/signal_results/turnover.py +213 -0
  166. ml4t/diagnostic/results/signal_results/validation.py +147 -0
  167. ml4t/diagnostic/signal/AGENT.md +17 -0
  168. ml4t/diagnostic/signal/__init__.py +69 -0
  169. ml4t/diagnostic/signal/_report.py +152 -0
  170. ml4t/diagnostic/signal/_utils.py +261 -0
  171. ml4t/diagnostic/signal/core.py +275 -0
  172. ml4t/diagnostic/signal/quantile.py +148 -0
  173. ml4t/diagnostic/signal/result.py +214 -0
  174. ml4t/diagnostic/signal/signal_ic.py +129 -0
  175. ml4t/diagnostic/signal/turnover.py +182 -0
  176. ml4t/diagnostic/splitters/AGENT.md +19 -0
  177. ml4t/diagnostic/splitters/__init__.py +36 -0
  178. ml4t/diagnostic/splitters/base.py +501 -0
  179. ml4t/diagnostic/splitters/calendar.py +421 -0
  180. ml4t/diagnostic/splitters/calendar_config.py +91 -0
  181. ml4t/diagnostic/splitters/combinatorial.py +1064 -0
  182. ml4t/diagnostic/splitters/config.py +322 -0
  183. ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
  184. ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
  185. ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
  186. ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
  187. ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
  188. ml4t/diagnostic/splitters/group_isolation.py +329 -0
  189. ml4t/diagnostic/splitters/persistence.py +316 -0
  190. ml4t/diagnostic/splitters/utils.py +207 -0
  191. ml4t/diagnostic/splitters/walk_forward.py +757 -0
  192. ml4t/diagnostic/utils/__init__.py +42 -0
  193. ml4t/diagnostic/utils/config.py +542 -0
  194. ml4t/diagnostic/utils/dependencies.py +318 -0
  195. ml4t/diagnostic/utils/sessions.py +127 -0
  196. ml4t/diagnostic/validation/__init__.py +54 -0
  197. ml4t/diagnostic/validation/dataframe.py +274 -0
  198. ml4t/diagnostic/validation/returns.py +280 -0
  199. ml4t/diagnostic/validation/timeseries.py +299 -0
  200. ml4t/diagnostic/visualization/AGENT.md +19 -0
  201. ml4t/diagnostic/visualization/__init__.py +223 -0
  202. ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
  203. ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
  204. ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
  205. ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
  206. ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
  207. ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
  208. ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
  209. ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
  210. ml4t/diagnostic/visualization/barrier_plots.py +782 -0
  211. ml4t/diagnostic/visualization/core.py +1060 -0
  212. ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
  213. ml4t/diagnostic/visualization/dashboards/base.py +582 -0
  214. ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
  215. ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
  216. ml4t/diagnostic/visualization/dashboards.py +43 -0
  217. ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
  218. ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
  219. ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
  220. ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
  221. ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
  222. ml4t/diagnostic/visualization/feature_plots.py +888 -0
  223. ml4t/diagnostic/visualization/interaction_plots.py +618 -0
  224. ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
  225. ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
  226. ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
  227. ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
  228. ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
  229. ml4t/diagnostic/visualization/report_generation.py +1343 -0
  230. ml4t/diagnostic/visualization/signal/__init__.py +103 -0
  231. ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
  232. ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
  233. ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
  234. ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
  235. ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
  236. ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
  237. ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
  238. ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
  239. ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
  240. ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
  241. ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
  242. ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,777 @@
1
+ """Tail analysis for heavy-tailed distributions.
2
+
3
+ This module provides tools for analyzing tail behavior:
4
+ - Hill estimator for tail index estimation
5
+ - Q-Q plots for distribution comparison
6
+ - Comprehensive tail analysis combining multiple methods
7
+
8
+ Tail Classification:
9
+ - Heavy tails (α ≤ 2): Infinite variance regime, extreme power law behavior
10
+ - Medium tails (2 < α ≤ 4): Finite variance, infinite 4th moment
11
+ - Thin tails (α > 4): All moments finite, close to normal
12
+
13
+ References:
14
+ - Hill, B. M. (1975). A simple general approach to inference about the tail
15
+ of a distribution. The Annals of Statistics, 3(5), 1163-1174.
16
+ - Mandelbrot, B. (1963). The variation of certain speculative prices.
17
+ The Journal of Business, 36(4), 394-419.
18
+ - Clauset, A., Shalizi, C. R., & Newman, M. E. (2009). Power-law distributions
19
+ in empirical data. SIAM Review, 51(4), 661-703.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from dataclasses import dataclass
25
+
26
+ import numpy as np
27
+ import pandas as pd
28
+ from scipy import stats
29
+
30
+ from ml4t.diagnostic.errors import ComputationError, ValidationError
31
+ from ml4t.diagnostic.logging import get_logger
32
+
33
+ logger = get_logger(__name__)
34
+
35
+
36
@dataclass
class HillEstimatorResult:
    """Result container for Hill tail-index estimation.

    The Hill estimator measures the tail index α of a power law tail
    P(X > x) ~ x^(-α); a larger α means a thinner tail.

    Classification thresholds used by this package:
        - "heavy":  α ≤ 2      (infinite-variance regime, extreme power law)
        - "medium": 2 < α ≤ 4  (finite variance, infinite 4th moment)
        - "thin":   α > 4      (all moments finite, close to normal)

    Attributes:
        tail_index: Estimated tail index α (higher = thinner tail)
        tail_index_se: Standard error of the estimate, α̂ / sqrt(k)
        k: Number of upper order statistics used in estimation
        classification: Tail classification ("heavy", "medium", "thin")
        tail: Which tail was analyzed ("upper", "lower", "both")
        n_obs: Total number of observations

    Notes:
        - SE(α̂) = α̂ / sqrt(k)
        - Financial returns typically have α ∈ [2, 4] (medium tails)
        - Normal distribution has α → ∞ (exponential tail decay)
    """

    tail_index: float
    tail_index_se: float
    k: int
    classification: str
    tail: str
    n_obs: int

    def __repr__(self) -> str:
        """Compact one-line representation with the key numbers."""
        return (
            f"HillEstimatorResult(alpha={self.tail_index:.4f}, "
            f"classification='{self.classification}', k={self.k})"
        )

    def summary(self) -> str:
        """Build a human-readable report of the Hill estimator analysis.

        Returns:
            Formatted multi-line summary string.
        """
        # Interpretation bullets keyed by classification; "thin" serves as
        # the fallback, mirroring the heavy/medium/else decision.
        notes_by_class = {
            "heavy": [
                " - Infinite variance regime (α ≤ 2)",
                " - Extreme power law behavior",
                " - Mean may not exist for α ≤ 1",
                " - Very high probability of extreme events",
                " - Standard risk measures (VaR, Sharpe) unreliable",
            ],
            "medium": [
                " - Finite variance but heavy-tailed (2 < α ≤ 4)",
                " - Fourth moment may not exist",
                " - Higher extreme event probability than normal",
                " - Typical for financial returns",
                " - Use robust risk measures (CVaR, drawdown)",
            ],
        }
        thin_notes = [
            " - All moments finite (α > 4)",
            " - Tail behavior approaching normal distribution",
            " - Standard statistical methods applicable",
            " - Lower extreme event probability",
        ]

        report = [
            "Hill Estimator - Tail Index Analysis",
            "=" * 50,
            f"Tail Index (α): {self.tail_index:.4f}",
            f"Standard Error: {self.tail_index_se:.4f}",
            f"Z-score: {self.tail_index / self.tail_index_se:.4f}",
            f"Order Statistics: k={self.k}",
            f"Total Observations: n={self.n_obs}",
            f"Tail Analyzed: {self.tail}",
            "",
            f"Classification: {self.classification.upper()}",
        ]
        report += notes_by_class.get(self.classification, thin_notes)
        report += [
            "",
            "Methodology:",
            " - Hill estimator: α̂ = k / Σ(log(X_i) - log(X_{k+1}))",
            f" - Uses k={self.k} largest order statistics",
            " - Asymptotic SE: α̂ / sqrt(k)",
        ]
        return "\n".join(report)
129
+
130
+
131
@dataclass
class QQPlotData:
    """Quantile-quantile comparison of data against a reference distribution.

    A Q-Q plot pairs empirical quantiles with theoretical quantiles from a
    reference distribution. When the data follows that distribution, the
    paired points fall on the diagonal line y = x.

    Attributes:
        theoretical_quantiles: Quantiles from reference distribution
        sample_quantiles: Empirical quantiles from data
        distribution: Reference distribution name ("normal", "t", "uniform", etc.)
        r_squared: R² goodness of fit (closer to 1 = better fit)
        df: Degrees of freedom (for t-distribution, None otherwise)
        n_obs: Number of observations
    """

    theoretical_quantiles: np.ndarray
    sample_quantiles: np.ndarray
    distribution: str
    r_squared: float
    df: int | None = None
    n_obs: int = 0

    def __repr__(self) -> str:
        """Compact one-line representation; includes df only when set."""
        suffix = "" if self.df is None else f", df={self.df}"
        return f"QQPlotData(distribution='{self.distribution}', R²={self.r_squared:.4f}{suffix})"

    def summary(self) -> str:
        """Build a human-readable report of the Q-Q plot analysis.

        Returns:
            Formatted multi-line summary string.
        """
        report = [
            f"Q-Q Plot Analysis - {self.distribution.title()} Distribution",
            "=" * 50,
            f"Reference Dist: {self.distribution}",
            f"R² (Goodness): {self.r_squared:.4f}",
            f"Observations: {self.n_obs}",
        ]
        if self.df is not None:
            report.append(f"Degrees of Freedom: {self.df}")

        # Grade the fit by descending R² thresholds; "Poor" is the fallback
        # when no threshold is met (for/else fires only without a break).
        for cutoff, quality, note in (
            (0.99, "Excellent", "Data closely follows reference distribution"),
            (0.95, "Good", "Data reasonably follows reference distribution"),
            (0.90, "Moderate", "Some deviation from reference distribution"),
        ):
            if self.r_squared >= cutoff:
                break
        else:
            quality = "Poor"
            note = "Significant deviation from reference distribution"

        report += [
            "",
            f"Fit Quality: {quality}",
            f" {note}",
            "",
            "Interpretation:",
            " - Points on diagonal => data follows reference distribution",
            " - Deviations in tails => different tail behavior",
            " - S-shaped pattern => skewness difference",
            " - Curved pattern => kurtosis difference",
        ]
        return "\n".join(report)
202
+
203
+
204
@dataclass
class TailAnalysisResult:
    """Combined tail diagnosis from the Hill estimator and Q-Q comparisons.

    Bundles three complementary views of tail behavior:
    1. Hill tail-index estimate (power law exponent)
    2. Q-Q fit against the normal distribution
    3. Q-Q fit against Student's t (optional)
    together with the distribution judged to fit best. Using several methods
    gives a more robust characterization than any single test and guides the
    choice of distributional assumptions for downstream modeling.

    Attributes:
        hill_result: Hill estimator analysis results
        qq_normal: QQ plot comparison with normal distribution
        qq_t: QQ plot comparison with Student's t (None if not computed)
        best_fit: Best fitting distribution ("normal", "t", "heavy-tailed")
    """

    hill_result: HillEstimatorResult
    qq_normal: QQPlotData
    qq_t: QQPlotData | None
    best_fit: str

    def __repr__(self) -> str:
        """Compact one-line representation with the headline findings."""
        return (
            f"TailAnalysisResult(tail_index={self.hill_result.tail_index:.4f}, "
            f"classification='{self.hill_result.classification}', "
            f"best_fit='{self.best_fit}')"
        )

    def summary(self) -> str:
        """Build a human-readable report of the comprehensive tail analysis.

        Returns:
            Formatted multi-line summary string.
        """
        hill = self.hill_result

        # Header plus tail-index and distribution-comparison sections.
        report = [
            "Comprehensive Tail Analysis",
            "=" * 50,
            "",
            "TAIL INDEX ESTIMATION:",
            f" Hill α: {hill.tail_index:.4f}",
            f" Classification: {hill.classification}",
            f" Tail Type: {hill.tail}",
            "",
            "DISTRIBUTION COMPARISON:",
            f" Normal R²: {self.qq_normal.r_squared:.4f}",
        ]
        if self.qq_t is not None:
            report.append(f" Student's t R²: {self.qq_t.r_squared:.4f} (df={self.qq_t.df})")
        report.append(f" Best Fit: {self.best_fit}")

        # Interpretation keyed on the best-fit distribution.
        report += ["", "INTERPRETATION:"]
        if self.best_fit == "normal":
            report += [
                " - Data is consistent with normal distribution",
                " - Thin tails (low extreme event probability)",
                " - Standard statistical methods appropriate",
            ]
        elif self.best_fit == "t":
            report += [
                f" - Data best fit by Student's t (df={self.qq_t.df if self.qq_t else 'unknown'})",
                " - Heavier tails than normal but finite variance",
                " - Moderate extreme event probability",
                " - Use robust statistical methods",
            ]
        else:  # heavy-tailed
            report += [
                " - Data exhibits heavy tail behavior",
                " - Power law distribution indicated",
                " - High extreme event probability",
                " - Standard risk measures may be unreliable",
                " - Consider tail risk models (CVaR, extreme value theory)",
            ]

        # Actionable recommendations keyed on the Hill classification.
        report += ["", "RECOMMENDATIONS:"]
        if hill.classification == "heavy":
            report += [
                " - Use tail risk measures (CVaR, expected shortfall)",
                " - Consider extreme value theory for VaR",
                " - Apply robust portfolio optimization",
                " - Monitor for regime changes",
            ]
        elif hill.classification == "medium":
            report += [
                " - Use robust Sharpe ratio alternatives",
                " - Consider CVaR alongside VaR",
                " - Account for non-normality in models",
            ]
        else:
            report += [
                " - Standard statistical methods appropriate",
                " - Monitor for changes in tail behavior",
            ]

        return "\n".join(report)
308
+
309
+
310
def hill_estimator(
    data: pd.Series | np.ndarray,
    k: int | None = None,
    tail: str = "both",
) -> HillEstimatorResult:
    """Estimate tail index using Hill estimator.

    The Hill estimator computes the tail index α for power law distributions.
    For a power law tail P(X > x) ~ x^(-α), the Hill estimator is:

        α̂ = k / Σ(log(X_i) - log(X_{k+1}))

    where X_1 ≥ X_2 ≥ ... ≥ X_n are order statistics and k is the number of
    upper order statistics used.

    Tail Classification:
        - Heavy tails (α ≤ 2): Infinite variance regime
        - Medium tails (2 < α ≤ 4): Finite variance, heavy-tailed
        - Thin tails (α > 4): All moments finite

    Args:
        data: Time series data (1D array or Series)
        k: Number of upper order statistics (default: sqrt(n))
        tail: Which tail to analyze - "upper", "lower", or "both" (default)

    Returns:
        HillEstimatorResult with tail index and classification

    Raises:
        ValidationError: If data is invalid
        ComputationError: If estimation fails (e.g. non-positive order
            statistics, or too few negative observations for the lower tail)

    Example:
        >>> import numpy as np
        >>> # Student's t distribution (df=3) has heavy tails
        >>> t_data = np.random.standard_t(df=3, size=1000)
        >>> result = hill_estimator(t_data)
        >>> print(f"Tail index: {result.tail_index:.2f}")
        >>> print(f"Classification: {result.classification}")
        >>>
        >>> # Normal distribution has thin tails (large α)
        >>> normal_data = np.random.normal(0, 1, 1000)
        >>> result = hill_estimator(normal_data)
        >>> print(f"Tail index: {result.tail_index:.2f}")

    Notes:
        - Optimal k selection is an open research problem
        - Default k = sqrt(n) is a common heuristic
        - SE(α̂) = α̂ / sqrt(k)
        - Works best for truly power law tails
        - The lower tail is estimated from the magnitudes of the strictly
          negative observations only
        - For "both" tails, returns minimum of upper and lower estimates

    References:
        - Hill, B. M. (1975). A simple general approach to inference about the
          tail of a distribution. The Annals of Statistics, 3(5), 1163-1174.
    """
    # ---- Input validation ---------------------------------------------------
    if data is None:
        raise ValidationError("Data cannot be None", context={"function": "hill_estimator"})

    # Convert to numpy array
    if isinstance(data, pd.Series):
        arr = data.to_numpy()
    elif isinstance(data, np.ndarray):
        arr = data
    else:
        raise ValidationError(
            f"Data must be pandas Series or numpy array, got {type(data)}",
            context={"function": "hill_estimator", "data_type": type(data).__name__},
        )

    # Check array properties
    if arr.ndim != 1:
        raise ValidationError(
            f"Data must be 1-dimensional, got {arr.ndim}D",
            context={"function": "hill_estimator", "shape": arr.shape},
        )

    if len(arr) == 0:
        raise ValidationError(
            "Data cannot be empty", context={"function": "hill_estimator", "length": 0}
        )

    # Check for missing/infinite values (compute the mask once)
    finite_mask = np.isfinite(arr)
    if not finite_mask.all():
        n_invalid = int(np.sum(~finite_mask))
        raise ValidationError(
            f"Data contains {n_invalid} NaN or infinite values",
            context={"function": "hill_estimator", "n_invalid": n_invalid, "length": len(arr)},
        )

    # Check minimum length
    min_length = 50  # Need sufficient data for tail estimation
    if len(arr) < min_length:
        raise ValidationError(
            f"Insufficient data for Hill estimator (need at least {min_length} observations)",
            context={"function": "hill_estimator", "length": len(arr), "min_length": min_length},
        )

    # Validate tail parameter
    if tail not in ["upper", "lower", "both"]:
        raise ValidationError(
            f"Invalid tail parameter: {tail}. Must be 'upper', 'lower', or 'both'",
            context={"function": "hill_estimator", "tail": tail},
        )

    # Set k if not provided (common heuristic: sqrt(n))
    n = len(arr)
    if k is None:
        k = int(np.sqrt(n))
    elif k < 2:
        raise ValidationError(
            f"k must be at least 2, got {k}",
            context={"function": "hill_estimator", "k": k},
        )
    elif k >= n:
        raise ValidationError(
            f"k must be less than n={n}, got {k}",
            context={"function": "hill_estimator", "k": k, "n": n},
        )

    logger.info("Computing Hill estimator", n_obs=n, k=k, tail=tail)

    # ---- Estimation ---------------------------------------------------------
    try:

        def compute_hill_alpha(data_sorted: np.ndarray, k: int) -> tuple[float, float]:
            """Hill estimate and asymptotic SE from data sorted descending."""
            # (k+1)th largest value anchors the log-ratios
            X_k_plus_1 = data_sorted[k]

            # Log transform requires strictly positive order statistics
            if X_k_plus_1 <= 0:
                raise ComputationError(
                    "Hill estimator requires positive data for log transform",
                    context={"function": "hill_estimator", "X_k_plus_1": float(X_k_plus_1)},
                )

            # α̂ = k / Σ(log(X_i) - log(X_{k+1}))
            log_ratios = np.log(data_sorted[:k]) - np.log(X_k_plus_1)
            alpha = float(k / np.sum(log_ratios))

            # Asymptotic standard error: SE(α̂) = α̂ / sqrt(k)
            alpha_se = float(alpha / np.sqrt(k))

            return alpha, alpha_se

        def lower_tail_sorted(values: np.ndarray) -> np.ndarray:
            """Magnitudes of strictly negative observations, sorted descending.

            BUG FIX: the previous implementation took np.abs() of ALL
            observations, which mixes the upper tail into the lower-tail
            estimate. Only the negative observations belong to the lower
            tail, so we restrict to them before flipping the sign.
            """
            negatives = values[values < 0.0]
            if len(negatives) <= k:
                # Need at least k+1 order statistics in the lower tail
                raise ComputationError(
                    f"Insufficient lower-tail observations for Hill estimator (need more than k={k})",
                    context={
                        "function": "hill_estimator",
                        "n_negative": int(len(negatives)),
                        "k": k,
                    },
                )
            return np.sort(-negatives)[::-1]

        # Compute for requested tail(s)
        if tail == "upper":
            # Sort descending for upper tail
            alpha, alpha_se = compute_hill_alpha(np.sort(arr)[::-1], k)

        elif tail == "lower":
            alpha, alpha_se = compute_hill_alpha(lower_tail_sorted(arr), k)

        else:  # both
            # Compute both tails and take minimum (more conservative)
            alpha_upper, alpha_se_upper = compute_hill_alpha(np.sort(arr)[::-1], k)
            alpha_lower, alpha_se_lower = compute_hill_alpha(lower_tail_sorted(arr), k)

            # Use minimum (heavier tail)
            if alpha_upper < alpha_lower:
                alpha, alpha_se = alpha_upper, alpha_se_upper
            else:
                alpha, alpha_se = alpha_lower, alpha_se_lower

        # Classify tail by the α thresholds documented above
        if alpha <= 2.0:
            classification = "heavy"
        elif alpha <= 4.0:
            classification = "medium"
        else:
            classification = "thin"

        logger.info(
            "Hill estimator computed",
            alpha=alpha,
            classification=classification,
            k=k,
        )

        return HillEstimatorResult(
            tail_index=alpha,
            tail_index_se=alpha_se,
            k=k,
            classification=classification,
            tail=tail,
            n_obs=n,
        )

    except ComputationError:
        # Already a domain error with context — propagate unchanged
        raise
    except Exception as e:
        logger.error("Hill estimator failed", error=str(e), n_obs=n, k=k)
        # Chain the original cause explicitly (replaces the noqa: B904 suppression)
        raise ComputationError(
            f"Hill estimator computation failed: {e}",
            context={"function": "hill_estimator", "n_obs": n, "k": k, "tail": tail},
            cause=e,
        ) from e
516
+
517
+
518
def generate_qq_data(
    data: pd.Series | np.ndarray,
    distribution: str = "normal",
    df: int | None = None,
) -> QQPlotData:
    """Generate Q-Q plot data for distribution comparison.

    Computes empirical quantiles and theoretical quantiles from a reference
    distribution. Q-Q plots visualize how well data follows a theoretical
    distribution - points on the diagonal indicate good fit.

    Args:
        data: Time series data (1D array or Series)
        distribution: Reference distribution ("normal", "t", "uniform", "exponential")
        df: Degrees of freedom for Student's t (required if distribution="t")

    Returns:
        QQPlotData with quantiles and R² goodness of fit

    Raises:
        ValidationError: If data or parameters are invalid
        ComputationError: If computation fails

    Example:
        >>> import numpy as np
        >>> # Normal data should fit normal QQ plot well
        >>> normal_data = np.random.normal(0, 1, 1000)
        >>> qq = generate_qq_data(normal_data, distribution="normal")
        >>> print(f"R²: {qq.r_squared:.4f}")  # Should be close to 1
        >>>
        >>> # Heavy-tailed data fits t-distribution better
        >>> t_data = np.random.standard_t(df=3, size=1000)
        >>> qq_normal = generate_qq_data(t_data, distribution="normal")
        >>> qq_t = generate_qq_data(t_data, distribution="t", df=3)
        >>> print(f"Normal R²: {qq_normal.r_squared:.4f}")
        >>> print(f"t R²: {qq_t.r_squared:.4f}")  # Better fit

    Notes:
        - Uses scipy.stats.probplot for QQ data generation
        - R² measures goodness of fit (1 = perfect fit)
        - Deviations in tails indicate different tail behavior
        - Works for any sample size, but more reliable for n > 100
    """
    # Map public distribution names to scipy distribution identifiers.
    # Single source of truth: used for both validation and dispatch, so an
    # unvalidated name can never reach the probplot call below.
    scipy_dist_names = {
        "normal": "norm",
        "t": "t",
        "uniform": "uniform",
        "exponential": "expon",
    }

    # Input validation
    if data is None:
        raise ValidationError("Data cannot be None", context={"function": "generate_qq_data"})

    # Convert to numpy array
    if isinstance(data, pd.Series):
        arr = data.to_numpy()
    elif isinstance(data, np.ndarray):
        arr = data
    else:
        raise ValidationError(
            f"Data must be pandas Series or numpy array, got {type(data)}",
            context={"function": "generate_qq_data", "data_type": type(data).__name__},
        )

    # Check array properties
    if arr.ndim != 1:
        raise ValidationError(
            f"Data must be 1-dimensional, got {arr.ndim}D",
            context={"function": "generate_qq_data", "shape": arr.shape},
        )

    if len(arr) == 0:
        raise ValidationError(
            "Data cannot be empty", context={"function": "generate_qq_data", "length": 0}
        )

    # Check for missing/infinite values (probplot would silently misbehave)
    if np.any(~np.isfinite(arr)):
        n_invalid = np.sum(~np.isfinite(arr))
        raise ValidationError(
            f"Data contains {n_invalid} NaN or infinite values",
            context={"function": "generate_qq_data", "n_invalid": n_invalid, "length": len(arr)},
        )

    # Validate distribution parameter
    valid_distributions = list(scipy_dist_names)
    if distribution not in scipy_dist_names:
        raise ValidationError(
            f"Invalid distribution: {distribution}. Must be one of {valid_distributions}",
            context={"function": "generate_qq_data", "distribution": distribution},
        )

    # Validate df for t-distribution
    if distribution == "t":
        if df is None:
            raise ValidationError(
                "Degrees of freedom (df) required for t-distribution",
                context={"function": "generate_qq_data", "distribution": distribution},
            )
        if df < 1:
            raise ValidationError(
                f"Degrees of freedom must be >= 1, got {df}",
                context={"function": "generate_qq_data", "df": df},
            )

    logger.info("Generating QQ plot data", n_obs=len(arr), distribution=distribution)

    try:
        # Only Student's t takes a shape parameter; the others use an empty
        # sparams tuple (scipy's default).
        sparams = (df,) if distribution == "t" else ()

        # probplot returns ((theoretical, ordered-sample), (slope, intercept, r));
        # slope/intercept of the fitted line are not needed here.
        (theoretical_q, sample_q), (_slope, _intercept, r) = stats.probplot(
            arr, dist=scipy_dist_names[distribution], sparams=sparams
        )

        # Compute R² from correlation coefficient
        r_squared = float(r**2)

        logger.info(
            "QQ plot data generated",
            distribution=distribution,
            r_squared=r_squared,
        )

        return QQPlotData(
            theoretical_quantiles=theoretical_q,
            sample_quantiles=sample_q,
            distribution=distribution,
            r_squared=r_squared,
            df=df,
            n_obs=len(arr),
        )

    except Exception as e:
        logger.error("QQ plot generation failed", error=str(e), distribution=distribution)
        raise ComputationError(  # noqa: B904
            f"QQ plot generation failed: {e}",
            context={
                "function": "generate_qq_data",
                "distribution": distribution,
                "n_obs": len(arr),
            },
            cause=e,
        )
668
+
669
+
670
def analyze_tails(
    data: pd.Series | np.ndarray,
    k: int | None = None,
) -> TailAnalysisResult:
    """Comprehensive tail analysis combining Hill estimator and QQ plots.

    Runs a multi-method tail diagnostic:
    1. Hill estimator for the tail index (both tails, conservative minimum)
    2. QQ comparison against the normal distribution
    3. QQ comparison against Student's t (only when tails are heavy/medium)
    4. Best-fit distribution selection from the combined evidence

    The result characterizes tail behavior robustly and suggests which
    distributional assumption is appropriate for risk modeling.

    Args:
        data: Time series data (1D array or Series)
        k: Number of order statistics for Hill estimator (default: sqrt(n))

    Returns:
        TailAnalysisResult with comprehensive tail diagnostics

    Raises:
        ValidationError: If data is invalid
        ComputationError: If analysis fails

    Example:
        >>> import numpy as np
        >>> # Analyze heavy-tailed data
        >>> t_data = np.random.standard_t(df=3, size=1000)
        >>> result = analyze_tails(t_data)
        >>> print(result.summary())
        >>>
        >>> # Check best fit
        >>> print(f"Best fit: {result.best_fit}")
        >>> print(f"Tail classification: {result.hill_result.classification}")
        >>>
        >>> # Analyze normal data for comparison
        >>> normal_data = np.random.normal(0, 1, 1000)
        >>> result = analyze_tails(normal_data)
        >>> print(f"Best fit: {result.best_fit}")

    Notes:
        - Combines multiple methods for robust analysis
        - Best fit selected based on Hill estimator and R² values
        - Heavy tails (α ≤ 2) automatically compared to t-distribution
        - Provides actionable recommendations for risk modeling
    """
    # Basic guard only; detailed validation lives in the sub-functions.
    if data is None:
        raise ValidationError("Data cannot be None", context={"function": "analyze_tails"})

    logger.info("Starting comprehensive tail analysis")

    try:
        # Step 1: tail index via Hill estimator (conservative "both" mode)
        hill_result = hill_estimator(data, k=k, tail="both")

        # Step 2: how well does a normal distribution describe the data?
        qq_normal = generate_qq_data(data, distribution="normal")

        # Step 3: for fat tails, also fit Student's t. For a t-distribution
        # the tail index α ≈ df, so use the estimated index (clamped to a
        # sane [2, 30] range) as the degrees-of-freedom guess.
        qq_t = None
        if hill_result.classification in ("heavy", "medium"):
            df_guess = min(30, max(2, int(round(hill_result.tail_index))))
            qq_t = generate_qq_data(data, distribution="t", df=df_guess)

        # Step 4: pick the best-fitting description of the tails.
        r2_normal = qq_normal.r_squared
        tail_class = hill_result.classification
        if tail_class == "thin" and r2_normal >= 0.95:
            best_fit = "normal"
        elif qq_t is not None and qq_t.r_squared > r2_normal + 0.02:
            # t-distribution fits meaningfully better than normal
            best_fit = "t"
        elif tail_class == "heavy":
            best_fit = "heavy-tailed"
        elif r2_normal >= 0.90:
            best_fit = "normal"
        elif tail_class == "thin":
            # Neither QQ fit is convincing; fall back on the Hill verdict.
            best_fit = "normal"
        else:
            best_fit = "heavy-tailed"

        logger.info(
            "Tail analysis completed",
            tail_index=hill_result.tail_index,
            classification=hill_result.classification,
            best_fit=best_fit,
        )

        return TailAnalysisResult(
            hill_result=hill_result,
            qq_normal=qq_normal,
            qq_t=qq_t,
            best_fit=best_fit,
        )

    except (ValidationError, ComputationError):
        # Domain errors from the sub-analyses pass through unchanged.
        raise
    except Exception as e:
        logger.error("Tail analysis failed", error=str(e))
        raise ComputationError(  # noqa: B904
            f"Tail analysis failed: {e}",
            context={"function": "analyze_tails"},
            cause=e,
        )