ml4t-diagnostic 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml4t/diagnostic/AGENT.md +25 -0
- ml4t/diagnostic/__init__.py +166 -0
- ml4t/diagnostic/backends/__init__.py +10 -0
- ml4t/diagnostic/backends/adapter.py +192 -0
- ml4t/diagnostic/backends/polars_backend.py +899 -0
- ml4t/diagnostic/caching/__init__.py +40 -0
- ml4t/diagnostic/caching/cache.py +331 -0
- ml4t/diagnostic/caching/decorators.py +131 -0
- ml4t/diagnostic/caching/smart_cache.py +339 -0
- ml4t/diagnostic/config/AGENT.md +24 -0
- ml4t/diagnostic/config/README.md +267 -0
- ml4t/diagnostic/config/__init__.py +219 -0
- ml4t/diagnostic/config/barrier_config.py +277 -0
- ml4t/diagnostic/config/base.py +301 -0
- ml4t/diagnostic/config/event_config.py +148 -0
- ml4t/diagnostic/config/feature_config.py +404 -0
- ml4t/diagnostic/config/multi_signal_config.py +55 -0
- ml4t/diagnostic/config/portfolio_config.py +215 -0
- ml4t/diagnostic/config/report_config.py +391 -0
- ml4t/diagnostic/config/sharpe_config.py +202 -0
- ml4t/diagnostic/config/signal_config.py +206 -0
- ml4t/diagnostic/config/trade_analysis_config.py +310 -0
- ml4t/diagnostic/config/validation.py +279 -0
- ml4t/diagnostic/core/__init__.py +29 -0
- ml4t/diagnostic/core/numba_utils.py +315 -0
- ml4t/diagnostic/core/purging.py +372 -0
- ml4t/diagnostic/core/sampling.py +471 -0
- ml4t/diagnostic/errors/__init__.py +205 -0
- ml4t/diagnostic/evaluation/AGENT.md +26 -0
- ml4t/diagnostic/evaluation/__init__.py +437 -0
- ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
- ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
- ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
- ml4t/diagnostic/evaluation/dashboard.py +715 -0
- ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
- ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
- ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
- ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
- ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
- ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
- ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
- ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
- ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
- ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
- ml4t/diagnostic/evaluation/event_analysis.py +647 -0
- ml4t/diagnostic/evaluation/excursion.py +390 -0
- ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
- ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
- ml4t/diagnostic/evaluation/framework.py +935 -0
- ml4t/diagnostic/evaluation/metric_registry.py +255 -0
- ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
- ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
- ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
- ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
- ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
- ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
- ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
- ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
- ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
- ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
- ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
- ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
- ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
- ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
- ml4t/diagnostic/evaluation/multi_signal.py +550 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
- ml4t/diagnostic/evaluation/report_generation.py +824 -0
- ml4t/diagnostic/evaluation/signal_selector.py +452 -0
- ml4t/diagnostic/evaluation/stat_registry.py +139 -0
- ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
- ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
- ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
- ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
- ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
- ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
- ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
- ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
- ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
- ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
- ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
- ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
- ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
- ml4t/diagnostic/evaluation/stats/moments.py +164 -0
- ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
- ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
- ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
- ml4t/diagnostic/evaluation/themes.py +330 -0
- ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
- ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
- ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
- ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
- ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
- ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
- ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
- ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
- ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
- ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
- ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
- ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
- ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
- ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
- ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
- ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
- ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
- ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
- ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
- ml4t/diagnostic/evaluation/validated_cv.py +535 -0
- ml4t/diagnostic/evaluation/visualization.py +1050 -0
- ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
- ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
- ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
- ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
- ml4t/diagnostic/integration/__init__.py +48 -0
- ml4t/diagnostic/integration/backtest_contract.py +671 -0
- ml4t/diagnostic/integration/data_contract.py +316 -0
- ml4t/diagnostic/integration/engineer_contract.py +226 -0
- ml4t/diagnostic/logging/__init__.py +77 -0
- ml4t/diagnostic/logging/logger.py +245 -0
- ml4t/diagnostic/logging/performance.py +234 -0
- ml4t/diagnostic/logging/progress.py +234 -0
- ml4t/diagnostic/logging/wandb.py +412 -0
- ml4t/diagnostic/metrics/__init__.py +9 -0
- ml4t/diagnostic/metrics/percentiles.py +128 -0
- ml4t/diagnostic/py.typed +1 -0
- ml4t/diagnostic/reporting/__init__.py +43 -0
- ml4t/diagnostic/reporting/base.py +130 -0
- ml4t/diagnostic/reporting/html_renderer.py +275 -0
- ml4t/diagnostic/reporting/json_renderer.py +51 -0
- ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
- ml4t/diagnostic/results/AGENT.md +24 -0
- ml4t/diagnostic/results/__init__.py +105 -0
- ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
- ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
- ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
- ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
- ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
- ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
- ml4t/diagnostic/results/barrier_results/validation.py +38 -0
- ml4t/diagnostic/results/base.py +177 -0
- ml4t/diagnostic/results/event_results.py +349 -0
- ml4t/diagnostic/results/feature_results.py +787 -0
- ml4t/diagnostic/results/multi_signal_results.py +431 -0
- ml4t/diagnostic/results/portfolio_results.py +281 -0
- ml4t/diagnostic/results/sharpe_results.py +448 -0
- ml4t/diagnostic/results/signal_results/__init__.py +74 -0
- ml4t/diagnostic/results/signal_results/ic.py +581 -0
- ml4t/diagnostic/results/signal_results/irtc.py +110 -0
- ml4t/diagnostic/results/signal_results/quantile.py +392 -0
- ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
- ml4t/diagnostic/results/signal_results/turnover.py +213 -0
- ml4t/diagnostic/results/signal_results/validation.py +147 -0
- ml4t/diagnostic/signal/AGENT.md +17 -0
- ml4t/diagnostic/signal/__init__.py +69 -0
- ml4t/diagnostic/signal/_report.py +152 -0
- ml4t/diagnostic/signal/_utils.py +261 -0
- ml4t/diagnostic/signal/core.py +275 -0
- ml4t/diagnostic/signal/quantile.py +148 -0
- ml4t/diagnostic/signal/result.py +214 -0
- ml4t/diagnostic/signal/signal_ic.py +129 -0
- ml4t/diagnostic/signal/turnover.py +182 -0
- ml4t/diagnostic/splitters/AGENT.md +19 -0
- ml4t/diagnostic/splitters/__init__.py +36 -0
- ml4t/diagnostic/splitters/base.py +501 -0
- ml4t/diagnostic/splitters/calendar.py +421 -0
- ml4t/diagnostic/splitters/calendar_config.py +91 -0
- ml4t/diagnostic/splitters/combinatorial.py +1064 -0
- ml4t/diagnostic/splitters/config.py +322 -0
- ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
- ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
- ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
- ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
- ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
- ml4t/diagnostic/splitters/group_isolation.py +329 -0
- ml4t/diagnostic/splitters/persistence.py +316 -0
- ml4t/diagnostic/splitters/utils.py +207 -0
- ml4t/diagnostic/splitters/walk_forward.py +757 -0
- ml4t/diagnostic/utils/__init__.py +42 -0
- ml4t/diagnostic/utils/config.py +542 -0
- ml4t/diagnostic/utils/dependencies.py +318 -0
- ml4t/diagnostic/utils/sessions.py +127 -0
- ml4t/diagnostic/validation/__init__.py +54 -0
- ml4t/diagnostic/validation/dataframe.py +274 -0
- ml4t/diagnostic/validation/returns.py +280 -0
- ml4t/diagnostic/validation/timeseries.py +299 -0
- ml4t/diagnostic/visualization/AGENT.md +19 -0
- ml4t/diagnostic/visualization/__init__.py +223 -0
- ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
- ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
- ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
- ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
- ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
- ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
- ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
- ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
- ml4t/diagnostic/visualization/barrier_plots.py +782 -0
- ml4t/diagnostic/visualization/core.py +1060 -0
- ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
- ml4t/diagnostic/visualization/dashboards/base.py +582 -0
- ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
- ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
- ml4t/diagnostic/visualization/dashboards.py +43 -0
- ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
- ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
- ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
- ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
- ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
- ml4t/diagnostic/visualization/feature_plots.py +888 -0
- ml4t/diagnostic/visualization/interaction_plots.py +618 -0
- ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
- ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
- ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
- ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
- ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
- ml4t/diagnostic/visualization/report_generation.py +1343 -0
- ml4t/diagnostic/visualization/signal/__init__.py +103 -0
- ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
- ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
- ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
- ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
- ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
- ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
- ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
- ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
- ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
- ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
- ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
- ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,777 @@
|
|
|
1
|
+
"""Tail analysis for heavy-tailed distributions.
|
|
2
|
+
|
|
3
|
+
This module provides tools for analyzing tail behavior:
|
|
4
|
+
- Hill estimator for tail index estimation
|
|
5
|
+
- Q-Q plots for distribution comparison
|
|
6
|
+
- Comprehensive tail analysis combining multiple methods
|
|
7
|
+
|
|
8
|
+
Tail Classification:
|
|
9
|
+
- Heavy tails (α ≤ 2): Infinite variance regime, extreme power law behavior
|
|
10
|
+
- Medium tails (2 < α ≤ 4): Finite variance, infinite 4th moment
|
|
11
|
+
- Thin tails (α > 4): All moments finite, close to normal
|
|
12
|
+
|
|
13
|
+
References:
|
|
14
|
+
- Hill, B. M. (1975). A simple general approach to inference about the tail
|
|
15
|
+
of a distribution. The Annals of Statistics, 3(5), 1163-1174.
|
|
16
|
+
- Mandelbrot, B. (1963). The variation of certain speculative prices.
|
|
17
|
+
The Journal of Business, 36(4), 394-419.
|
|
18
|
+
- Clauset, A., Shalizi, C. R., & Newman, M. E. (2009). Power-law distributions
|
|
19
|
+
in empirical data. SIAM Review, 51(4), 661-703.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
|
|
26
|
+
import numpy as np
|
|
27
|
+
import pandas as pd
|
|
28
|
+
from scipy import stats
|
|
29
|
+
|
|
30
|
+
from ml4t.diagnostic.errors import ComputationError, ValidationError
|
|
31
|
+
from ml4t.diagnostic.logging import get_logger
|
|
32
|
+
|
|
33
|
+
logger = get_logger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class HillEstimatorResult:
    """Container for the output of a Hill tail-index estimation.

    The Hill estimator measures the tail index α of a power-law tail
    P(X > x) ~ x^(-α); a larger α means a thinner tail.

    Classification bands used throughout this module:
      - "heavy":  α ≤ 2      (infinite-variance regime, extreme power law)
      - "medium": 2 < α ≤ 4  (finite variance, infinite 4th moment)
      - "thin":   α > 4      (all moments finite, near-normal tail)

    Attributes:
        tail_index: Estimated tail index α (higher = thinner tail).
        tail_index_se: Standard error of the estimate, α̂ / sqrt(k).
        k: Number of upper order statistics used in estimation.
        classification: One of "heavy", "medium", "thin".
        tail: Which tail was analyzed ("upper", "lower", "both").
        n_obs: Total number of observations in the input sample.

    Notes:
        - Financial returns typically have α ∈ [2, 4] (medium tails).
        - A normal distribution has α → ∞ (exponential tail decay).
    """

    tail_index: float
    tail_index_se: float
    k: int
    classification: str
    tail: str
    n_obs: int

    def __repr__(self) -> str:
        """Compact single-line representation."""
        return f"HillEstimatorResult(alpha={self.tail_index:.4f}, classification='{self.classification}', k={self.k})"

    def summary(self) -> str:
        """Build a human-readable, multi-line report of the estimate.

        Returns:
            Formatted summary string
        """
        report: list[str] = [
            "Hill Estimator - Tail Index Analysis",
            "=" * 50,
            f"Tail Index (α): {self.tail_index:.4f}",
            f"Standard Error: {self.tail_index_se:.4f}",
            f"Z-score: {self.tail_index / self.tail_index_se:.4f}",
            f"Order Statistics: k={self.k}",
            f"Total Observations: n={self.n_obs}",
            f"Tail Analyzed: {self.tail}",
            "",
            f"Classification: {self.classification.upper()}",
        ]

        # Interpretation bullets depend on the classification band; anything
        # that is not "heavy" or "medium" is treated as "thin".
        if self.classification == "heavy":
            notes = [
                " - Infinite variance regime (α ≤ 2)",
                " - Extreme power law behavior",
                " - Mean may not exist for α ≤ 1",
                " - Very high probability of extreme events",
                " - Standard risk measures (VaR, Sharpe) unreliable",
            ]
        elif self.classification == "medium":
            notes = [
                " - Finite variance but heavy-tailed (2 < α ≤ 4)",
                " - Fourth moment may not exist",
                " - Higher extreme event probability than normal",
                " - Typical for financial returns",
                " - Use robust risk measures (CVaR, drawdown)",
            ]
        else:  # thin
            notes = [
                " - All moments finite (α > 4)",
                " - Tail behavior approaching normal distribution",
                " - Standard statistical methods applicable",
                " - Lower extreme event probability",
            ]
        report.extend(notes)

        report += [
            "",
            "Methodology:",
            " - Hill estimator: α̂ = k / Σ(log(X_i) - log(X_{k+1}))",
            f" - Uses k={self.k} largest order statistics",
            " - Asymptotic SE: α̂ / sqrt(k)",
        ]
        return "\n".join(report)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@dataclass
class QQPlotData:
    """Data backing a quantile-quantile (Q-Q) plot against a reference law.

    A Q-Q plot compares empirical quantiles with the quantiles of a
    theoretical distribution; points lying on the diagonal y=x indicate
    that the sample follows the reference distribution.

    Attributes:
        theoretical_quantiles: Quantiles of the reference distribution.
        sample_quantiles: Matching empirical quantiles of the data.
        distribution: Name of the reference distribution ("normal", "t", ...).
        r_squared: R² goodness of fit; closer to 1 means a better fit.
        df: Degrees of freedom when the reference is Student's t, else None.
        n_obs: Number of observations behind the plot.
    """

    theoretical_quantiles: np.ndarray
    sample_quantiles: np.ndarray
    distribution: str
    r_squared: float
    df: int | None = None
    n_obs: int = 0

    def __repr__(self) -> str:
        """Compact single-line representation."""
        suffix = "" if self.df is None else f", df={self.df}"
        return f"QQPlotData(distribution='{self.distribution}', R²={self.r_squared:.4f}{suffix})"

    def summary(self) -> str:
        """Build a human-readable, multi-line report of the Q-Q fit.

        Returns:
            Formatted summary string
        """
        report = [
            f"Q-Q Plot Analysis - {self.distribution.title()} Distribution",
            "=" * 50,
            f"Reference Dist: {self.distribution}",
            f"R² (Goodness): {self.r_squared:.4f}",
            f"Observations: {self.n_obs}",
        ]
        if self.df is not None:
            report.append(f"Degrees of Freedom: {self.df}")
        report.append("")

        # Map R² onto a qualitative fit grade; the highest matching band wins,
        # and anything below 0.90 falls through to "Poor".
        for cutoff, grade, note in (
            (0.99, "Excellent", "Data closely follows reference distribution"),
            (0.95, "Good", "Data reasonably follows reference distribution"),
            (0.90, "Moderate", "Some deviation from reference distribution"),
        ):
            if self.r_squared >= cutoff:
                fit_quality, interpretation = grade, note
                break
        else:
            fit_quality = "Poor"
            interpretation = "Significant deviation from reference distribution"

        report += [
            f"Fit Quality: {fit_quality}",
            f" {interpretation}",
            "",
            "Interpretation:",
            " - Points on diagonal => data follows reference distribution",
            " - Deviations in tails => different tail behavior",
            " - S-shaped pattern => skewness difference",
            " - Curved pattern => kurtosis difference",
        ]
        return "\n".join(report)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
@dataclass
class TailAnalysisResult:
    """Combined tail diagnostics: Hill estimator plus Q-Q comparisons.

    Bundles four pieces of evidence about tail behavior:
      1. A Hill tail-index estimate.
      2. A Q-Q comparison against the normal distribution.
      3. An optional Q-Q comparison against Student's t.
      4. The name of the best-fitting distribution.

    Attributes:
        hill_result: Hill estimator analysis results.
        qq_normal: Q-Q comparison with the normal distribution.
        qq_t: Q-Q comparison with Student's t, or None if not computed.
        best_fit: Best fitting distribution ("normal", "t", "heavy-tailed").
    """

    hill_result: HillEstimatorResult
    qq_normal: QQPlotData
    qq_t: QQPlotData | None
    best_fit: str

    def __repr__(self) -> str:
        """Compact single-line representation."""
        hill = self.hill_result
        return (
            f"TailAnalysisResult(tail_index={hill.tail_index:.4f}, "
            f"classification='{hill.classification}', best_fit='{self.best_fit}')"
        )

    def summary(self) -> str:
        """Build a human-readable, multi-line report of the full analysis.

        Returns:
            Formatted summary string
        """
        report = [
            "Comprehensive Tail Analysis",
            "=" * 50,
            "",
            "TAIL INDEX ESTIMATION:",
            f" Hill α: {self.hill_result.tail_index:.4f}",
            f" Classification: {self.hill_result.classification}",
            f" Tail Type: {self.hill_result.tail}",
            "",
            "DISTRIBUTION COMPARISON:",
            f" Normal R²: {self.qq_normal.r_squared:.4f}",
        ]
        # The Student's t comparison is optional and only reported if present.
        if self.qq_t is not None:
            report.append(f" Student's t R²: {self.qq_t.r_squared:.4f} (df={self.qq_t.df})")
        report.append(f" Best Fit: {self.best_fit}")

        # Interpretation depends on the best-fit label; anything other than
        # "normal" or "t" is treated as heavy-tailed.
        report += ["", "INTERPRETATION:"]
        if self.best_fit == "normal":
            report += [
                " - Data is consistent with normal distribution",
                " - Thin tails (low extreme event probability)",
                " - Standard statistical methods appropriate",
            ]
        elif self.best_fit == "t":
            report += [
                f" - Data best fit by Student's t (df={self.qq_t.df if self.qq_t else 'unknown'})",
                " - Heavier tails than normal but finite variance",
                " - Moderate extreme event probability",
                " - Use robust statistical methods",
            ]
        else:  # heavy-tailed
            report += [
                " - Data exhibits heavy tail behavior",
                " - Power law distribution indicated",
                " - High extreme event probability",
                " - Standard risk measures may be unreliable",
                " - Consider tail risk models (CVaR, extreme value theory)",
            ]

        # Recommendations follow the Hill classification band, not best_fit.
        report += ["", "RECOMMENDATIONS:"]
        if self.hill_result.classification == "heavy":
            report += [
                " - Use tail risk measures (CVaR, expected shortfall)",
                " - Consider extreme value theory for VaR",
                " - Apply robust portfolio optimization",
                " - Monitor for regime changes",
            ]
        elif self.hill_result.classification == "medium":
            report += [
                " - Use robust Sharpe ratio alternatives",
                " - Consider CVaR alongside VaR",
                " - Account for non-normality in models",
            ]
        else:
            report += [
                " - Standard statistical methods appropriate",
                " - Monitor for changes in tail behavior",
            ]
        return "\n".join(report)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def hill_estimator(
|
|
311
|
+
data: pd.Series | np.ndarray,
|
|
312
|
+
k: int | None = None,
|
|
313
|
+
tail: str = "both",
|
|
314
|
+
) -> HillEstimatorResult:
|
|
315
|
+
"""Estimate tail index using Hill estimator.
|
|
316
|
+
|
|
317
|
+
The Hill estimator computes the tail index α for power law distributions.
|
|
318
|
+
For a power law tail P(X > x) ~ x^(-α), the Hill estimator is:
|
|
319
|
+
|
|
320
|
+
α̂ = k / Σ(log(X_i) - log(X_{k+1}))
|
|
321
|
+
|
|
322
|
+
where X_1 ≥ X_2 ≥ ... ≥ X_n are order statistics and k is the number of
|
|
323
|
+
upper order statistics used.
|
|
324
|
+
|
|
325
|
+
Tail Classification:
|
|
326
|
+
- Heavy tails (α ≤ 2): Infinite variance regime
|
|
327
|
+
- Medium tails (2 < α ≤ 4): Finite variance, heavy-tailed
|
|
328
|
+
- Thin tails (α > 4): All moments finite
|
|
329
|
+
|
|
330
|
+
Args:
|
|
331
|
+
data: Time series data (1D array or Series)
|
|
332
|
+
k: Number of upper order statistics (default: sqrt(n))
|
|
333
|
+
tail: Which tail to analyze - "upper", "lower", or "both" (default)
|
|
334
|
+
|
|
335
|
+
Returns:
|
|
336
|
+
HillEstimatorResult with tail index and classification
|
|
337
|
+
|
|
338
|
+
Raises:
|
|
339
|
+
ValidationError: If data is invalid
|
|
340
|
+
ComputationError: If estimation fails
|
|
341
|
+
|
|
342
|
+
Example:
|
|
343
|
+
>>> import numpy as np
|
|
344
|
+
>>> # Student's t distribution (df=3) has heavy tails
|
|
345
|
+
>>> t_data = np.random.standard_t(df=3, size=1000)
|
|
346
|
+
>>> result = hill_estimator(t_data)
|
|
347
|
+
>>> print(f"Tail index: {result.tail_index:.2f}")
|
|
348
|
+
>>> print(f"Classification: {result.classification}")
|
|
349
|
+
>>>
|
|
350
|
+
>>> # Normal distribution has thin tails (large α)
|
|
351
|
+
>>> normal_data = np.random.normal(0, 1, 1000)
|
|
352
|
+
>>> result = hill_estimator(normal_data)
|
|
353
|
+
>>> print(f"Tail index: {result.tail_index:.2f}")
|
|
354
|
+
|
|
355
|
+
Notes:
|
|
356
|
+
- Optimal k selection is an open research problem
|
|
357
|
+
- Default k = sqrt(n) is a common heuristic
|
|
358
|
+
- SE(α̂) = α̂ / sqrt(k)
|
|
359
|
+
- Works best for truly power law tails
|
|
360
|
+
- For "both" tails, returns minimum of upper and lower estimates
|
|
361
|
+
|
|
362
|
+
References:
|
|
363
|
+
- Hill, B. M. (1975). A simple general approach to inference about the
|
|
364
|
+
tail of a distribution. The Annals of Statistics, 3(5), 1163-1174.
|
|
365
|
+
"""
|
|
366
|
+
# Input validation
|
|
367
|
+
if data is None:
|
|
368
|
+
raise ValidationError("Data cannot be None", context={"function": "hill_estimator"})
|
|
369
|
+
|
|
370
|
+
# Convert to numpy array
|
|
371
|
+
if isinstance(data, pd.Series):
|
|
372
|
+
arr = data.to_numpy()
|
|
373
|
+
elif isinstance(data, np.ndarray):
|
|
374
|
+
arr = data
|
|
375
|
+
else:
|
|
376
|
+
raise ValidationError(
|
|
377
|
+
f"Data must be pandas Series or numpy array, got {type(data)}",
|
|
378
|
+
context={"function": "hill_estimator", "data_type": type(data).__name__},
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
# Check array properties
|
|
382
|
+
if arr.ndim != 1:
|
|
383
|
+
raise ValidationError(
|
|
384
|
+
f"Data must be 1-dimensional, got {arr.ndim}D",
|
|
385
|
+
context={"function": "hill_estimator", "shape": arr.shape},
|
|
386
|
+
)
|
|
387
|
+
|
|
388
|
+
if len(arr) == 0:
|
|
389
|
+
raise ValidationError(
|
|
390
|
+
"Data cannot be empty", context={"function": "hill_estimator", "length": 0}
|
|
391
|
+
)
|
|
392
|
+
|
|
393
|
+
# Check for missing/infinite values
|
|
394
|
+
if np.any(~np.isfinite(arr)):
|
|
395
|
+
n_invalid = np.sum(~np.isfinite(arr))
|
|
396
|
+
raise ValidationError(
|
|
397
|
+
f"Data contains {n_invalid} NaN or infinite values",
|
|
398
|
+
context={"function": "hill_estimator", "n_invalid": n_invalid, "length": len(arr)},
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
# Check minimum length
|
|
402
|
+
min_length = 50 # Need sufficient data for tail estimation
|
|
403
|
+
if len(arr) < min_length:
|
|
404
|
+
raise ValidationError(
|
|
405
|
+
f"Insufficient data for Hill estimator (need at least {min_length} observations)",
|
|
406
|
+
context={"function": "hill_estimator", "length": len(arr), "min_length": min_length},
|
|
407
|
+
)
|
|
408
|
+
|
|
409
|
+
# Validate tail parameter
|
|
410
|
+
if tail not in ["upper", "lower", "both"]:
|
|
411
|
+
raise ValidationError(
|
|
412
|
+
f"Invalid tail parameter: {tail}. Must be 'upper', 'lower', or 'both'",
|
|
413
|
+
context={"function": "hill_estimator", "tail": tail},
|
|
414
|
+
)
|
|
415
|
+
|
|
416
|
+
# Set k if not provided (common heuristic: sqrt(n))
|
|
417
|
+
n = len(arr)
|
|
418
|
+
if k is None:
|
|
419
|
+
k = int(np.sqrt(n))
|
|
420
|
+
elif k < 2:
|
|
421
|
+
raise ValidationError(
|
|
422
|
+
f"k must be at least 2, got {k}",
|
|
423
|
+
context={"function": "hill_estimator", "k": k},
|
|
424
|
+
)
|
|
425
|
+
elif k >= n:
|
|
426
|
+
raise ValidationError(
|
|
427
|
+
f"k must be less than n={n}, got {k}",
|
|
428
|
+
context={"function": "hill_estimator", "k": k, "n": n},
|
|
429
|
+
)
|
|
430
|
+
|
|
431
|
+
logger.info("Computing Hill estimator", n_obs=n, k=k, tail=tail)
|
|
432
|
+
|
|
433
|
+
try:
|
|
434
|
+
|
|
435
|
+
def compute_hill_alpha(data_sorted: np.ndarray, k: int) -> tuple[float, float]:
|
|
436
|
+
"""Compute Hill estimator for sorted data (descending order)."""
|
|
437
|
+
# Get k largest values and the (k+1)th value
|
|
438
|
+
X_k_plus_1 = data_sorted[k] # (k+1)th largest value
|
|
439
|
+
|
|
440
|
+
# Check for zero or negative values (can't take log)
|
|
441
|
+
if X_k_plus_1 <= 0:
|
|
442
|
+
raise ComputationError(
|
|
443
|
+
"Hill estimator requires positive data for log transform",
|
|
444
|
+
context={"function": "hill_estimator", "X_k_plus_1": float(X_k_plus_1)},
|
|
445
|
+
)
|
|
446
|
+
|
|
447
|
+
# Compute Hill estimator: α̂ = k / Σ(log(X_i) - log(X_{k+1}))
|
|
448
|
+
log_ratios = np.log(data_sorted[:k]) - np.log(X_k_plus_1)
|
|
449
|
+
alpha = float(k / np.sum(log_ratios))
|
|
450
|
+
|
|
451
|
+
# Standard error: SE(α̂) = α̂ / sqrt(k)
|
|
452
|
+
alpha_se = float(alpha / np.sqrt(k))
|
|
453
|
+
|
|
454
|
+
return alpha, alpha_se
|
|
455
|
+
|
|
456
|
+
# Compute for requested tail(s)
|
|
457
|
+
if tail == "upper":
|
|
458
|
+
# Sort descending for upper tail
|
|
459
|
+
sorted_data = np.sort(arr)[::-1]
|
|
460
|
+
alpha, alpha_se = compute_hill_alpha(sorted_data, k)
|
|
461
|
+
|
|
462
|
+
elif tail == "lower":
|
|
463
|
+
# For lower tail, analyze absolute values of negative tail
|
|
464
|
+
# Take absolute values to ensure positive data for log transform
|
|
465
|
+
sorted_data = np.sort(np.abs(arr))[::-1]
|
|
466
|
+
alpha, alpha_se = compute_hill_alpha(sorted_data, k)
|
|
467
|
+
|
|
468
|
+
else: # both
|
|
469
|
+
# Compute both tails and take minimum (more conservative)
|
|
470
|
+
sorted_upper = np.sort(arr)[::-1]
|
|
471
|
+
alpha_upper, alpha_se_upper = compute_hill_alpha(sorted_upper, k)
|
|
472
|
+
|
|
473
|
+
# For lower tail, use absolute values
|
|
474
|
+
sorted_lower = np.sort(np.abs(arr))[::-1]
|
|
475
|
+
alpha_lower, alpha_se_lower = compute_hill_alpha(sorted_lower, k)
|
|
476
|
+
|
|
477
|
+
# Use minimum (heavier tail)
|
|
478
|
+
if alpha_upper < alpha_lower:
|
|
479
|
+
alpha, alpha_se = alpha_upper, alpha_se_upper
|
|
480
|
+
else:
|
|
481
|
+
alpha, alpha_se = alpha_lower, alpha_se_lower
|
|
482
|
+
|
|
483
|
+
# Classify tail
|
|
484
|
+
if alpha <= 2.0:
|
|
485
|
+
classification = "heavy"
|
|
486
|
+
elif alpha <= 4.0:
|
|
487
|
+
classification = "medium"
|
|
488
|
+
else:
|
|
489
|
+
classification = "thin"
|
|
490
|
+
|
|
491
|
+
logger.info(
|
|
492
|
+
"Hill estimator computed",
|
|
493
|
+
alpha=alpha,
|
|
494
|
+
classification=classification,
|
|
495
|
+
k=k,
|
|
496
|
+
)
|
|
497
|
+
|
|
498
|
+
return HillEstimatorResult(
|
|
499
|
+
tail_index=alpha,
|
|
500
|
+
tail_index_se=alpha_se,
|
|
501
|
+
k=k,
|
|
502
|
+
classification=classification,
|
|
503
|
+
tail=tail,
|
|
504
|
+
n_obs=n,
|
|
505
|
+
)
|
|
506
|
+
|
|
507
|
+
except ComputationError:
|
|
508
|
+
raise
|
|
509
|
+
except Exception as e:
|
|
510
|
+
logger.error("Hill estimator failed", error=str(e), n_obs=n, k=k)
|
|
511
|
+
raise ComputationError( # noqa: B904
|
|
512
|
+
f"Hill estimator computation failed: {e}",
|
|
513
|
+
context={"function": "hill_estimator", "n_obs": n, "k": k, "tail": tail},
|
|
514
|
+
cause=e,
|
|
515
|
+
)
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def generate_qq_data(
    data: pd.Series | np.ndarray,
    distribution: str = "normal",
    df: int | None = None,
) -> QQPlotData:
    """Generate Q-Q plot data for distribution comparison.

    Computes empirical quantiles and theoretical quantiles from a reference
    distribution. Q-Q plots visualize how well data follows a theoretical
    distribution - points on the diagonal indicate good fit.

    Args:
        data: Time series data (1D array or Series)
        distribution: Reference distribution ("normal", "t", "uniform", "exponential")
        df: Degrees of freedom for Student's t (required if distribution="t")

    Returns:
        QQPlotData with quantiles and R² goodness of fit

    Raises:
        ValidationError: If data or parameters are invalid
        ComputationError: If computation fails

    Example:
        >>> import numpy as np
        >>> # Normal data should fit normal QQ plot well
        >>> normal_data = np.random.normal(0, 1, 1000)
        >>> qq = generate_qq_data(normal_data, distribution="normal")
        >>> print(f"R²: {qq.r_squared:.4f}")  # Should be close to 1
        >>>
        >>> # Heavy-tailed data fits t-distribution better
        >>> t_data = np.random.standard_t(df=3, size=1000)
        >>> qq_normal = generate_qq_data(t_data, distribution="normal")
        >>> qq_t = generate_qq_data(t_data, distribution="t", df=3)
        >>> print(f"Normal R²: {qq_normal.r_squared:.4f}")
        >>> print(f"t R²: {qq_t.r_squared:.4f}")  # Better fit

    Notes:
        - Uses scipy.stats.probplot for QQ data generation
        - R² measures goodness of fit (1 = perfect fit)
        - Deviations in tails indicate different tail behavior
        - Works for any sample size, but more reliable for n > 100
    """
    # Input validation
    if data is None:
        raise ValidationError("Data cannot be None", context={"function": "generate_qq_data"})

    # Convert to numpy array
    if isinstance(data, pd.Series):
        arr = data.to_numpy()
    elif isinstance(data, np.ndarray):
        arr = data
    else:
        raise ValidationError(
            f"Data must be pandas Series or numpy array, got {type(data)}",
            context={"function": "generate_qq_data", "data_type": type(data).__name__},
        )

    # Check array properties
    if arr.ndim != 1:
        raise ValidationError(
            f"Data must be 1-dimensional, got {arr.ndim}D",
            context={"function": "generate_qq_data", "shape": arr.shape},
        )

    if len(arr) == 0:
        raise ValidationError(
            "Data cannot be empty", context={"function": "generate_qq_data", "length": 0}
        )

    # Check for missing/infinite values
    if np.any(~np.isfinite(arr)):
        n_invalid = np.sum(~np.isfinite(arr))
        raise ValidationError(
            f"Data contains {n_invalid} NaN or infinite values",
            context={"function": "generate_qq_data", "n_invalid": n_invalid, "length": len(arr)},
        )

    # Validate distribution parameter.
    # Maps this module's public distribution names to scipy.stats identifiers,
    # so the dispatch below is a single probplot call (no duplicated branches).
    scipy_dist_names = {
        "normal": "norm",
        "t": "t",
        "uniform": "uniform",
        "exponential": "expon",
    }
    valid_distributions = list(scipy_dist_names)
    if distribution not in valid_distributions:
        raise ValidationError(
            f"Invalid distribution: {distribution}. Must be one of {valid_distributions}",
            context={"function": "generate_qq_data", "distribution": distribution},
        )

    # Validate df for t-distribution
    if distribution == "t":
        if df is None:
            raise ValidationError(
                "Degrees of freedom (df) required for t-distribution",
                context={"function": "generate_qq_data", "distribution": distribution},
            )
        if df < 1:
            raise ValidationError(
                f"Degrees of freedom must be >= 1, got {df}",
                context={"function": "generate_qq_data", "df": df},
            )

    logger.info("Generating QQ plot data", n_obs=len(arr), distribution=distribution)

    # Resolve dispatch BEFORE the try block so no ValidationError can be
    # accidentally swallowed and re-wrapped as ComputationError below.
    dist_name = scipy_dist_names[distribution]
    # Student's t needs its df as a shape parameter; the others take none.
    sparams = (df,) if distribution == "t" else ()

    try:
        # Generate QQ plot data using scipy
        (theoretical_q, sample_q), (slope, intercept, r) = stats.probplot(
            arr, dist=dist_name, sparams=sparams
        )

        # Compute R² from correlation coefficient
        r_squared = float(r**2)

        logger.info(
            "QQ plot data generated",
            distribution=distribution,
            r_squared=r_squared,
        )

        return QQPlotData(
            theoretical_quantiles=theoretical_q,
            sample_quantiles=sample_q,
            distribution=distribution,
            r_squared=r_squared,
            df=df,
            n_obs=len(arr),
        )

    except ValidationError:
        # Consistency with analyze_tails/hill_estimator: never re-wrap our
        # own validation errors as ComputationError.
        raise
    except Exception as e:
        logger.error("QQ plot generation failed", error=str(e), distribution=distribution)
        raise ComputationError(  # noqa: B904
            f"QQ plot generation failed: {e}",
            context={
                "function": "generate_qq_data",
                "distribution": distribution,
                "n_obs": len(arr),
            },
            cause=e,
        )
669
|
+
|
|
670
|
+
def analyze_tails(
    data: pd.Series | np.ndarray,
    k: int | None = None,
) -> TailAnalysisResult:
    """Comprehensive tail analysis combining Hill estimator and QQ plots.

    Performs multi-method tail analysis:
    1. Hill estimator for tail index
    2. QQ plot comparison with normal distribution
    3. QQ plot comparison with Student's t (if heavy-tailed)
    4. Best-fit distribution determination

    This provides robust characterization of tail behavior and helps identify
    appropriate distributional assumptions for risk modeling.

    Args:
        data: Time series data (1D array or Series)
        k: Number of order statistics for Hill estimator (default: sqrt(n))

    Returns:
        TailAnalysisResult with comprehensive tail diagnostics

    Raises:
        ValidationError: If data is invalid
        ComputationError: If analysis fails

    Example:
        >>> import numpy as np
        >>> # Analyze heavy-tailed data
        >>> t_data = np.random.standard_t(df=3, size=1000)
        >>> result = analyze_tails(t_data)
        >>> print(result.summary())
        >>>
        >>> # Check best fit
        >>> print(f"Best fit: {result.best_fit}")
        >>> print(f"Tail classification: {result.hill_result.classification}")
        >>>
        >>> # Analyze normal data for comparison
        >>> normal_data = np.random.normal(0, 1, 1000)
        >>> result = analyze_tails(normal_data)
        >>> print(f"Best fit: {result.best_fit}")

    Notes:
        - Combines multiple methods for robust analysis
        - Best fit selected based on Hill estimator and R² values
        - Heavy tails (α ≤ 2) automatically compared to t-distribution
        - Provides actionable recommendations for risk modeling
    """
    # Basic validation only; the underlying estimators validate in depth.
    if data is None:
        raise ValidationError("Data cannot be None", context={"function": "analyze_tails"})

    logger.info("Starting comprehensive tail analysis")

    def _select_best_fit(hill, qq_norm, qq_tdist) -> str:
        """Pick the best-fitting distribution label from the diagnostics."""
        # Thin tails with a strong normal fit: call it normal outright.
        if hill.classification == "thin" and qq_norm.r_squared >= 0.95:
            return "normal"
        # A meaningfully better t fit (R² margin > 0.02) wins over normal.
        if qq_tdist is not None and qq_tdist.r_squared > qq_norm.r_squared + 0.02:
            return "t"
        if hill.classification == "heavy":
            return "heavy-tailed"
        if qq_norm.r_squared >= 0.90:
            return "normal"
        # Neither fits well; fall back to the Hill classification.
        return "heavy-tailed" if hill.classification != "thin" else "normal"

    try:
        # Step 1: tail index via Hill estimator (both tails, conservative).
        hill_result = hill_estimator(data, k=k, tail="both")

        # Step 2: goodness of fit against the normal distribution.
        qq_normal = generate_qq_data(data, distribution="normal")

        # Step 3: only bother with Student's t when tails are not thin.
        # For Student's t the tail index α ≈ df, so seed df from the
        # estimate and clamp it into [2, 30].
        qq_t = None
        if hill_result.classification in ["heavy", "medium"]:
            estimated_df = max(2, min(30, int(round(hill_result.tail_index))))
            qq_t = generate_qq_data(data, distribution="t", df=estimated_df)

        # Step 4: decide which distribution describes the data best.
        best_fit = _select_best_fit(hill_result, qq_normal, qq_t)

        logger.info(
            "Tail analysis completed",
            tail_index=hill_result.tail_index,
            classification=hill_result.classification,
            best_fit=best_fit,
        )

        return TailAnalysisResult(
            hill_result=hill_result,
            qq_normal=qq_normal,
            qq_t=qq_t,
            best_fit=best_fit,
        )

    except (ValidationError, ComputationError):
        raise
    except Exception as e:
        logger.error("Tail analysis failed", error=str(e))
        raise ComputationError(  # noqa: B904
            f"Tail analysis failed: {e}",
            context={"function": "analyze_tails"},
            cause=e,
        )