rsd 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rsd/README.md +156 -0
- rsd/__init__.py +57 -0
- rsd/_kernels.py +230 -0
- rsd/_validation.py +217 -0
- rsd/_version.py +11 -0
- rsd/_warnings.py +24 -0
- rsd/accumulation.py +78 -0
- rsd/core.py +264 -0
- rsd/defaults.py +56 -0
- rsd/deseasonalize.py +159 -0
- rsd/diagnostics.py +1142 -0
- rsd/ecdf.py +87 -0
- rsd/parametric.py +200 -0
- rsd/py.typed +0 -0
- rsd/tails.py +239 -0
- rsd/transform.py +137 -0
- rsd/xarray.py +474 -0
- rsd-1.0.0.dist-info/METADATA +200 -0
- rsd-1.0.0.dist-info/RECORD +21 -0
- rsd-1.0.0.dist-info/WHEEL +4 -0
- rsd-1.0.0.dist-info/licenses/LICENSE +29 -0
rsd/README.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/thchilly/rsd/main/assets/rsd_logo_banner.png" alt="rsd - Reference-based standardization framework for drought indices under distribution shift" width="100%">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
# rsd
|
|
6
|
+
|
|
7
|
+
*Standardized drought indices (SPI, SSI, SDI, SPEI) that are **comparable** across model runs, scenarios, and reanalysis products.*
|
|
8
|
+
|
|
9
|
+
Full documentation: <https://hydro-rsd.readthedocs.io>
|
|
10
|
+
|
|
11
|
+
`rsd` computes standardized hydroclimate indices by fitting the CDF from a
|
|
12
|
+
fixed reference dataset rather than from the target itself, so that values
|
|
13
|
+
from different model runs, scenarios, or observational products can be
|
|
14
|
+
compared on the same scale.
|
|
15
|
+
|
|
16
|
+
## Why RSD?
|
|
17
|
+
|
|
18
|
+
Standard implementations of SPI/SSI standardize each series against itself,
|
|
19
|
+
which removes the cross-series differences you want to measure. RSD solves
|
|
20
|
+
three interdependent problems:
|
|
21
|
+
|
|
22
|
+
1. **Fixed reference** - fit the CDF once from a reference period or
|
|
23
|
+
dataset; evaluate target values against it.
|
|
24
|
+
2. **GPD tail extension** - empirical CDFs cap at the observed range. RSD
|
|
25
|
+
fits Generalized Pareto Distribution tails for smooth extrapolation
|
|
26
|
+
beyond the reference sample.
|
|
27
|
+
3. **Pooled deseasonalization** - per-month samples are too sparse to
|
|
28
|
+
fit EVT tails. A 50-year record gives ~50 values per calendar month;
|
|
29
|
+
the top/bottom 10% (the tail) is only ~5 exceedances per month - too
|
|
30
|
+
few for a stable GPD fit. RSD removes the per-month location and
|
|
31
|
+
scale, then pools all 12 months into one sample (~600 values, ~60
|
|
32
|
+
exceedances per tail) where the fit becomes feasible.
|
|
33
|
+
|
|
34
|
+
This pooling is what keeps RSD usable on short records. With 20 years
|
|
35
|
+
of monthly data (typical for satellite-era datasets) the pooled tail
|
|
36
|
+
still has ~24 exceedances - enough to fit a single GPD - while a
|
|
37
|
+
per-month tail fit would have only ~2 exceedances per month per tail
|
|
38
|
+
and is infeasible.
|
|
39
|
+
|
|
40
|
+
Monthwise ECDF and fully parametric (e.g. gamma) methods are also included
|
|
41
|
+
as baselines.
|
|
42
|
+
|
|
43
|
+
## Requirements
|
|
44
|
+
|
|
45
|
+
- Python ≥ 3.10
|
|
46
|
+
- NumPy ≥ 1.24
|
|
47
|
+
- SciPy ≥ 1.10
|
|
48
|
+
|
|
49
|
+
Optional extras:
|
|
50
|
+
|
|
51
|
+
- `[xarray]` - xarray ≥ 2023.1, dask ≥ 2023.1 (N-D + parallel computation)
|
|
52
|
+
- `[diagnostics]` - matplotlib ≥ 3.7 (`rsd.diagnose` plots)
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install rsd # NumPy/SciPy only
|
|
58
|
+
pip install "rsd[xarray]" # adds xarray + dask support (quotes keep the shell from globbing the brackets)
|
|
59
|
+
pip install "rsd[diagnostics]" # adds matplotlib for rsd.diagnose
|
|
60
|
+
pip install "rsd[all]" # everything above
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quick start
|
|
64
|
+
|
|
65
|
+
In RSD vocabulary, the **reference** defines what "normal" looks like (e.g.
|
|
66
|
+
observed climate over a baseline period) and the **target** is the series
|
|
67
|
+
you want to score against that normal (e.g. a future projection or a
|
|
68
|
+
different scenario). Output `z` is in standard-normal units: `z ≈ 0` is
|
|
69
|
+
climatology and `|z| > 2` is extreme.
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import numpy as np
|
|
73
|
+
import rsd
|
|
74
|
+
|
|
75
|
+
# 1-D: standardize a 1200-month target against a 600-month reference
|
|
76
|
+
rng = np.random.default_rng(0)
|
|
77
|
+
months_ref = np.tile(np.arange(1, 13), 50) # 600 months
|
|
78
|
+
months_tgt = np.tile(np.arange(1, 13), 100) # 1200 months
|
|
79
|
+
ref = rng.gamma(shape=2, scale=5, size=600)
|
|
80
|
+
tgt = rng.gamma(shape=2, scale=5, size=1200)
|
|
81
|
+
|
|
82
|
+
z = rsd.standardize(
|
|
83
|
+
target=tgt,
|
|
84
|
+
reference=ref,
|
|
85
|
+
months_target=months_tgt,
|
|
86
|
+
months_reference=months_ref,
|
|
87
|
+
scale=3, # 3-month accumulation (e.g. SPI-3)
|
|
88
|
+
)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
# N-D: xarray wrapper (dask-parallelized for large grids).
|
|
93
|
+
# Months are extracted automatically from the time coordinate, so you do
|
|
94
|
+
# not pass months_target / months_reference here.
|
|
95
|
+
import rsd
|
|
96
|
+
|
|
97
|
+
z = rsd.standardize_xr(
|
|
98
|
+
target=target_da, # xr.DataArray with a "time" dimension
|
|
99
|
+
reference=ref_da, # xr.DataArray with a "time" dimension
|
|
100
|
+
method="rsd", # or "monthwise_ecdf" / "monthwise_parametric"
|
|
101
|
+
scale=3,
|
|
102
|
+
parallel=True,
|
|
103
|
+
)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Diagnostics
|
|
107
|
+
|
|
108
|
+
`rsd.diagnose(values, months, name, ...)` is the one-call entry point
|
|
109
|
+
that verifies the exchangeability assumption underlying RSD pooling. It
|
|
110
|
+
prints an overview block (configuration plus extracted seasonal location
|
|
111
|
+
and scale), renders a combined summary figure (before / after
|
|
112
|
+
deseasonalization KDEs), and prints an Anderson-Darling omnibus test plus a
|
|
113
|
+
per-month Kolmogorov-Smirnov leave-one-out report. Pass `bounds=(L, U)`
|
|
114
|
+
to add a logit-bounded pathway alongside the baseline; add
|
|
115
|
+
`auto_bounds=True` to also see a heuristic data-driven bound via
|
|
116
|
+
`rsd.estimate_bounds`. Use `quiet=True` for batch / CI runs. See the
|
|
117
|
+
`diagnostics_showcase.ipynb` notebook for a worked example.
|
|
118
|
+
|
|
119
|
+
## Methods
|
|
120
|
+
|
|
121
|
+
| method | description |
|
|
122
|
+
|--------|-------------|
|
|
123
|
+
| `"rsd"` | Deseasonalize -> pool -> ECDF core + GPD tails |
|
|
124
|
+
| `"monthwise_ecdf"` | Per-month empirical CDF (classical SPI-style) |
|
|
125
|
+
| `"monthwise_parametric"` | Per-month parametric fit (gamma, norm, …) |
|
|
126
|
+
|
|
127
|
+
The `monthwise_ecdf` baseline matches the SDAT framework of
|
|
128
|
+
[Farahmand & AghaKouchak (2015)](https://doi.org/10.1016/j.advwatres.2014.11.012).
|
|
129
|
+
The `monthwise_parametric` path defaults to `floc=0` (the canonical SPI
|
|
130
|
+
convention of [Stagge et al. (2015)](https://doi.org/10.1002/joc.4267));
|
|
131
|
+
pass `floc=None` to recover scipy's free-location 3-parameter fit.
|
|
132
|
+
|
|
133
|
+
## Contributing & issues
|
|
134
|
+
|
|
135
|
+
Bug reports and questions are welcome at
|
|
136
|
+
<https://github.com/thchilly/rsd/issues>. Contributions follow the workflow
|
|
137
|
+
in [CONTRIBUTING.md](https://github.com/thchilly/rsd/blob/main/CONTRIBUTING.md).
|
|
138
|
+
|
|
139
|
+
## How to cite
|
|
140
|
+
|
|
141
|
+
If you use this package in your research, please cite the methodology paper:
|
|
142
|
+
|
|
143
|
+
> Tsilimigkras, A., Grillakis, M., & Koutroulis, A. (2026). *A
|
|
144
|
+
> reference-based standardization framework for hydroclimate drought
|
|
145
|
+
> indices under distribution shift*. Manuscript submitted to *Water
|
|
146
|
+
> Resources Research*. DOI pending acceptance.
|
|
147
|
+
|
|
148
|
+
For reproducibility, you may additionally cite the specific software version:
|
|
149
|
+
|
|
150
|
+
> Tsilimigkras, A. (2026). *rsd*: Reference-based standardization
|
|
151
|
+
> framework for hydroclimate drought indices (Version 1.0.0)
|
|
152
|
+
> [Computer software]. Zenodo DOI: pending.
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
BSD 3-Clause
|
rsd/__init__.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""RSD: Reference-based standardization framework for hydroclimate drought indices.
|
|
2
|
+
|
|
3
|
+
A hybrid empirical-parametric standardized-index framework with a fixed
|
|
4
|
+
reference distribution. Provides :func:`standardize` for 1-D series and
|
|
5
|
+
:func:`standardize_xr` for N-D xarray DataArrays (with dask parallelism).
|
|
6
|
+
|
|
7
|
+
All public-API outputs are float32 standardized z-scores; internal
|
|
8
|
+
computations run in float64 throughout the pipeline and are cast to
|
|
9
|
+
float32 only at the final storage boundary.
|
|
10
|
+
|
|
11
|
+
Custom warning categories (:class:`RSDFitWarning`,
|
|
12
|
+
:class:`RSDDegenerateWarning`) signal tail-fit fallbacks and degenerate
|
|
13
|
+
reference months. Filter them with the standard
|
|
14
|
+
``warnings.filterwarnings`` machinery if needed.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from rsd._version import __version__
|
|
18
|
+
from rsd._warnings import RSDDegenerateWarning, RSDFitWarning
|
|
19
|
+
from rsd.core import standardize
|
|
20
|
+
from rsd.defaults import METHODS
|
|
21
|
+
from rsd.transform import logit_transform, inverse_logit_transform
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def __getattr__(name: str):
    """Resolve optional-dependency attributes lazily on first access.

    Only ``standardize_xr``, ``diagnostics``, ``diagnose`` and
    ``estimate_bounds`` are handled; the backing modules are imported
    inside this function so that ``import rsd`` itself does not import
    them.

    Raises
    ------
    AttributeError
        If *name* is not one of the lazily provided attributes.
    """
    from importlib import import_module

    # attribute name -> (module to import, attribute to fetch);
    # ``None`` means "return the module object itself".
    lazy_targets = {
        "standardize_xr": ("rsd.xarray", "standardize_xr"),
        "diagnostics": ("rsd.diagnostics", None),
        "diagnose": ("rsd.diagnostics", "diagnose"),
        "estimate_bounds": ("rsd.diagnostics", "estimate_bounds"),
    }
    if name not in lazy_targets:
        raise AttributeError(f"module 'rsd' has no attribute {name!r}")

    module_path, attribute = lazy_targets[name]
    module = import_module(module_path)
    if attribute is None:
        return module
    return getattr(module, attribute)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"standardize",
|
|
47
|
+
"standardize_xr",
|
|
48
|
+
"METHODS",
|
|
49
|
+
"__version__",
|
|
50
|
+
"logit_transform",
|
|
51
|
+
"inverse_logit_transform",
|
|
52
|
+
"RSDFitWarning",
|
|
53
|
+
"RSDDegenerateWarning",
|
|
54
|
+
"diagnostics",
|
|
55
|
+
"diagnose",
|
|
56
|
+
"estimate_bounds",
|
|
57
|
+
]
|
rsd/_kernels.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""Pure-NumPy 1-D kernels for each standardization method.
|
|
2
|
+
|
|
3
|
+
Each kernel operates on pre-accumulated 1-D arrays and returns
|
|
4
|
+
float32 z-scores. These are the inner loops called by core.py
|
|
5
|
+
(for 1-D inputs) and xarray.py (for N-D inputs via apply_ufunc).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from rsd.defaults import (
|
|
13
|
+
DEFAULT_LOC,
|
|
14
|
+
DEFAULT_MIN_TAIL_SIZE,
|
|
15
|
+
DEFAULT_PLOTTING_POSITION,
|
|
16
|
+
DEFAULT_SCALE_METHOD,
|
|
17
|
+
DEFAULT_TAIL_QUANTILE,
|
|
18
|
+
DEFAULT_DISTRIBUTION,
|
|
19
|
+
MIN_REF_SIZE,
|
|
20
|
+
)
|
|
21
|
+
from rsd.deseasonalize import compute_seasonal_params, deseasonalize
|
|
22
|
+
from rsd.ecdf import ecdf_probs
|
|
23
|
+
from rsd.parametric import fit_monthwise, monthwise_cdf
|
|
24
|
+
from rsd.tails import fit_gpd_tails, hybrid_cdf
|
|
25
|
+
from rsd.transform import cdf_to_zscore
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def kernel_rsd_1d(
    tgt_accum: np.ndarray,
    ref_accum: np.ndarray,
    months_tgt: np.ndarray,
    months_ref: np.ndarray,
    tail_quantile: float = DEFAULT_TAIL_QUANTILE,
    min_tail_size: int = DEFAULT_MIN_TAIL_SIZE,
    loc: str = DEFAULT_LOC,
    scale_method: str = DEFAULT_SCALE_METHOD,
    plotting_position: str | tuple[float, float] = DEFAULT_PLOTTING_POSITION,
) -> np.ndarray:
    """RSD hybrid kernel: deseasonalize, pool, ECDF core + GPD tails, z-score.

    Parameters
    ----------
    tgt_accum : 1-D float array
        Accumulated target values. For bounded variables the caller is
        expected to have applied the logit pre-transform already; this
        kernel performs no transform of its own.
    ref_accum : 1-D float array
        Accumulated reference values, in the same space as *tgt_accum*.
    months_tgt : 1-D int array
        Calendar months (1-12) aligned with *tgt_accum*.
    months_ref : 1-D int array
        Calendar months (1-12) aligned with *ref_accum*.
    tail_quantile : float
        Forwarded to :func:`fit_gpd_tails` (defaults to
        ``DEFAULT_TAIL_QUANTILE``).
    min_tail_size : int
        Forwarded to :func:`fit_gpd_tails` (defaults to
        ``DEFAULT_MIN_TAIL_SIZE``).
    loc : str
        Location estimator forwarded to :func:`compute_seasonal_params`.
    scale_method : str
        Scale estimator forwarded to :func:`compute_seasonal_params`.
    plotting_position : str or (float, float)
        Forwarded to both :func:`fit_gpd_tails` and :func:`hybrid_cdf`.

    Returns
    -------
    np.ndarray
        Whatever :func:`cdf_to_zscore` returns for the hybrid-CDF
        probabilities (presumably float32 z-scores - confirm against
        ``rsd.transform``). When the reference has fewer than
        ``MIN_REF_SIZE`` finite values (before or after
        deseasonalization) or the target has no finite value, an
        all-NaN float32 array of ``len(tgt_accum)`` is returned instead.
    """
    nan_result = np.full(len(tgt_accum), np.nan, dtype=np.float32)

    # Bail out early when there is nothing usable to fit or to score.
    n_finite_ref = np.isfinite(ref_accum).sum()
    n_finite_tgt = np.isfinite(tgt_accum).sum()
    if n_finite_tgt == 0 or n_finite_ref < MIN_REF_SIZE:
        return nan_result

    # Seasonal parameters come from the reference only and are applied
    # to both series, so the target is scored on the reference's scale.
    seasonal = compute_seasonal_params(
        ref_accum, months_ref, loc=loc, scale_method=scale_method
    )
    ref_anom = deseasonalize(ref_accum, months_ref, seasonal)
    tgt_anom = deseasonalize(tgt_accum, months_tgt, seasonal)

    # Pool the finite deseasonalized reference values across all months.
    pooled = ref_anom[np.isfinite(ref_anom)]
    if len(pooled) < MIN_REF_SIZE:
        return nan_result
    pooled_sorted = np.sort(pooled)

    gpd_lower, gpd_upper = fit_gpd_tails(
        pooled_sorted,
        tail_quantile=tail_quantile,
        min_tail_size=min_tail_size,
        plotting_position=plotting_position,
    )

    probabilities = hybrid_cdf(
        tgt_anom,
        pooled_sorted,
        gpd_lower,
        gpd_upper,
        plotting_position=plotting_position,
    )
    return cdf_to_zscore(probabilities)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def kernel_monthwise_ecdf_1d(
    tgt_accum: np.ndarray,
    ref_accum: np.ndarray,
    months_tgt: np.ndarray,
    months_ref: np.ndarray,
    plotting_position: str | tuple[float, float] = DEFAULT_PLOTTING_POSITION,
) -> np.ndarray:
    """Monthwise ECDF kernel: one empirical CDF per calendar month.

    For each month 1-12 the finite reference values of that month are
    sorted and the target values of the same month are evaluated against
    them with :func:`ecdf_probs`.

    Parameters
    ----------
    tgt_accum : 1-D float array
        Accumulated target values.
    ref_accum : 1-D float array
        Accumulated reference values.
    months_tgt : 1-D int array
        Calendar months (1-12) aligned with *tgt_accum*.
    months_ref : 1-D int array
        Calendar months (1-12) aligned with *ref_accum*.
    plotting_position : str or (float, float)
        Passed through unchanged to :func:`ecdf_probs`.

    Returns
    -------
    np.ndarray
        Result of :func:`cdf_to_zscore` applied to the float64
        probability vector (presumably float32 z-scores - confirm
        against ``rsd.transform``). Positions whose month has no finite
        reference value, and non-finite target values, carry NaN
        probabilities into that call.
    """
    probabilities = np.full(len(tgt_accum), np.nan, dtype=np.float64)

    for month in range(1, 13):
        in_month_tgt = months_tgt == month

        month_ref = ref_accum[months_ref == month]
        month_ref = month_ref[np.isfinite(month_ref)]

        # Nothing to fit against, or nothing to score: leave NaN in place.
        if not in_month_tgt.any() or len(month_ref) == 0:
            continue

        month_tgt = tgt_accum[in_month_tgt]
        month_probs = ecdf_probs(month_tgt, np.sort(month_ref), plotting_position)

        # Keep missing target values missing regardless of what the ECDF
        # lookup produced for them.
        probabilities[in_month_tgt] = np.where(
            np.isfinite(month_tgt), month_probs, np.nan
        )

    return cdf_to_zscore(probabilities)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def kernel_monthwise_parametric_1d(
    tgt_accum: np.ndarray,
    ref_accum: np.ndarray,
    months_tgt: np.ndarray,
    months_ref: np.ndarray,
    distribution: str = DEFAULT_DISTRIBUTION,
    prob_zero: bool = False,
    floc: float | None = 0,
) -> np.ndarray:
    """Monthwise parametric kernel: per-month distribution fit, then z-score.

    The reference is fitted with :func:`fit_monthwise`, the target is
    evaluated with :func:`monthwise_cdf`, and the probabilities are
    converted with :func:`cdf_to_zscore`.

    Parameters
    ----------
    tgt_accum : 1-D float array
        Accumulated target values.
    ref_accum : 1-D float array
        Accumulated reference values.
    months_tgt : 1-D int array
        Calendar months (1-12) aligned with *tgt_accum*.
    months_ref : 1-D int array
        Calendar months (1-12) aligned with *ref_accum*.
    distribution : str
        Distribution name forwarded to :func:`fit_monthwise`.
    prob_zero : bool
        Zero-inflation switch forwarded to :func:`fit_monthwise`.
    floc : float or None, default 0
        Fixed-location setting forwarded to :func:`fit_monthwise`.

    Returns
    -------
    np.ndarray
        Output of :func:`cdf_to_zscore` (presumably float32 z-scores the
        same length as *tgt_accum* - confirm against ``rsd.transform``).
    """
    monthly_fit = fit_monthwise(
        ref_accum,
        months_ref,
        distribution=distribution,
        prob_zero=prob_zero,
        floc=floc,
    )
    probabilities = monthwise_cdf(tgt_accum, months_tgt, monthly_fit)
    return cdf_to_zscore(probabilities)
|
rsd/_validation.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""Input validation for the public RSD API.
|
|
2
|
+
|
|
3
|
+
These helpers are called at the function entry of :func:`rsd.standardize`
|
|
4
|
+
and :func:`rsd.standardize_xr` so that obviously-invalid inputs raise a
|
|
5
|
+
clear ``ValueError`` (or ``TypeError``) immediately, instead of producing
|
|
6
|
+
silently-wrong output or a cryptic error from deep inside the pipeline.
|
|
7
|
+
|
|
8
|
+
The validators are intentionally small and unit-testable; each one
|
|
9
|
+
focuses on a single parameter and is independent of the others.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import numbers
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
from rsd.defaults import METHODS, PLOTTING_POSITIONS
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def validate_method(method: str) -> None:
    """Raise ``ValueError`` unless *method* is a member of ``METHODS``."""
    if method in METHODS:
        return
    choices = ", ".join(METHODS)
    raise ValueError(f"Unknown method {method!r}. Choose from: {choices}")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def validate_scale(scale: int) -> None:
    """Require *scale* to be an integer (Python or NumPy) that is >= 1.

    Raises
    ------
    ValueError
        If *scale* is not an integer type, is a ``bool``, or is below 1.
    """
    # bool is a subclass of int, so it has to be excluded explicitly.
    is_integer = isinstance(scale, (int, np.integer)) and not isinstance(scale, bool)
    if not is_integer:
        raise ValueError(f"scale must be a positive integer, got {scale!r}")

    as_int = int(scale)
    if as_int < 1:
        raise ValueError(f"scale must be >= 1, got {as_int}")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def validate_agg(agg: str) -> None:
    """Raise ``ValueError`` unless *agg* is ``'sum'`` or ``'mean'``."""
    allowed = ("sum", "mean")
    if agg in allowed:
        return
    raise ValueError(f"agg must be 'sum' or 'mean', got {agg!r}")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def validate_loc(loc: str) -> None:
    """Raise ``ValueError`` unless *loc* is ``'median'`` or ``'mean'``."""
    allowed = ("median", "mean")
    if loc in allowed:
        return
    raise ValueError(f"loc must be 'median' or 'mean', got {loc!r}")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def validate_scale_method(scale_method: str) -> None:
    """Raise ``ValueError`` unless *scale_method* is ``'iqr'`` or ``'std'``."""
    allowed = ("iqr", "std")
    if scale_method in allowed:
        return
    raise ValueError(f"scale_method must be 'iqr' or 'std', got {scale_method!r}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def validate_tail_quantile(tail_quantile: float) -> None:
    """Require *tail_quantile* to be a real number in the interval (0, 0.20].

    Raises
    ------
    ValueError
        If the argument is not a Python/NumPy int or float, is a
        ``bool``, or lies outside ``(0, 0.20]`` (NaN included, since it
        fails the range comparison).
    """
    numeric_types = (int, float, np.floating, np.integer)
    # bool passes the ``int`` check, so reject it first.
    if isinstance(tail_quantile, bool) or not isinstance(tail_quantile, numeric_types):
        raise ValueError(
            f"tail_quantile must be a number in (0, 0.20], got {tail_quantile!r}"
        )

    value = float(tail_quantile)
    if 0.0 < value <= 0.20:
        return
    raise ValueError(
        f"tail_quantile must be in (0, 0.20], got {value}. "
        f"Values >= 0.5 produce overlapping tails. Values > 0.20 collapse "
        f"the empirical core. Values <= 0 disable tail fitting entirely "
        f"(use a pure-ECDF method instead)."
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def validate_min_tail_size(min_tail_size: int) -> None:
    """Require *min_tail_size* to be an integer (Python or NumPy) >= 10.

    Raises
    ------
    ValueError
        If the argument is not an integer type, is a ``bool``, or is
        smaller than 10.
    """
    # bool is an int subclass and must not slip through.
    is_integer = isinstance(min_tail_size, (int, np.integer)) and not isinstance(
        min_tail_size, bool
    )
    if not is_integer:
        raise ValueError(
            f"min_tail_size must be an integer >= 10, got {min_tail_size!r}"
        )

    as_int = int(min_tail_size)
    if as_int < 10:
        raise ValueError(
            f"min_tail_size must be >= 10, got {as_int}. "
            f"Fewer than 10 exceedances per tail makes the GPD fit unreliable."
        )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def validate_bounds_scalar(bounds) -> None:
    """Validate scalar ``bounds`` for the 1-D :func:`rsd.standardize` path.

    ``bounds`` may be ``None`` (nothing to check) or a length-2 sequence
    ``(lower, upper)`` whose items convert to finite floats with
    ``lower < upper``.

    Raises
    ------
    ValueError
        If *bounds* is not of length 2, an item cannot be converted with
        ``float()``, an item is non-finite, or ``lower >= upper``.
    """
    if bounds is None:
        return

    is_pair = hasattr(bounds, "__len__") and len(bounds) == 2
    if not is_pair:
        raise ValueError(f"bounds must be a (lower, upper) 2-tuple, got {bounds!r}")

    lower, upper = bounds
    try:
        lower_f, upper_f = float(lower), float(upper)
    except (TypeError, ValueError) as e:
        raise ValueError(
            f"bounds must be a (lower, upper) 2-tuple of numbers, "
            f"got ({lower!r}, {upper!r})"
        ) from e

    if not np.isfinite(lower_f) or not np.isfinite(upper_f):
        raise ValueError(
            f"bounds must contain finite numbers, got ({lower_f}, {upper_f})"
        )

    if lower_f >= upper_f:
        raise ValueError(
            f"bounds lower ({lower_f}) must be strictly less than upper ({upper_f})"
        )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def validate_bounds_xr(bounds) -> None:
    """Validate ``bounds`` for the :func:`rsd.standardize_xr` path.

    ``bounds`` may be ``None`` or a length-2 container. When both items
    are ``numbers.Number`` instances the pair is checked by
    :func:`validate_bounds_scalar`; any other combination (e.g. array-like
    per-pixel bounds) is accepted here without further checks.

    Raises
    ------
    ValueError
        If *bounds* is not of length 2, or the scalar checks fail.
    """
    if bounds is None:
        return

    if not hasattr(bounds, "__len__") or len(bounds) != 2:
        raise ValueError(f"bounds must be a (lower, upper) 2-tuple, got {bounds!r}")

    lower, upper = bounds
    both_scalar = all(isinstance(item, numbers.Number) for item in (lower, upper))
    if both_scalar:
        validate_bounds_scalar(bounds)
    # Non-scalar bounds are not ordered here; the original note says the
    # check is deferred to _logit_transform_xr (defined elsewhere).
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def validate_plotting_position(pp) -> None:
    """Validate a plotting-position specification.

    Accepted forms are a name present in ``PLOTTING_POSITIONS`` or a
    length-2, integer-indexable container ``(a, b)`` whose two items
    convert with ``float()``.

    Raises
    ------
    ValueError
        If *pp* is an unknown name, is not of length 2, cannot be indexed
        at positions 0 and 1, or holds items that are not numeric.
    """
    if isinstance(pp, str):
        if pp not in PLOTTING_POSITIONS:
            valid = sorted(PLOTTING_POSITIONS)
            raise ValueError(
                f"Unknown plotting_position name {pp!r}. "
                f"Choose from: {valid}, or pass a 2-tuple (a, b) of floats."
            )
        return

    if hasattr(pp, "__len__") and len(pp) == 2:
        try:
            float(pp[0])
            float(pp[1])
        # KeyError / IndexError cover length-2 containers that are not
        # indexable by 0 and 1 (e.g. a two-key mapping); without them the
        # lookup error would escape instead of a clear ValueError.
        except (TypeError, ValueError, KeyError, IndexError) as e:
            raise ValueError(
                f"plotting_position 2-tuple must contain numbers, got {pp!r}"
            ) from e
        return

    raise ValueError(
        f"plotting_position must be a known name or a 2-tuple of floats, got {pp!r}"
    )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def validate_months(months: np.ndarray, name: str) -> np.ndarray:
    """Check that *months* is a 1-D integer array with values in [1, 12].

    Parameters
    ----------
    months : array-like
        Candidate calendar-month labels.
    name : str
        Argument name used in error messages.

    Returns
    -------
    np.ndarray
        The validated months as an int32 array (no copy is made when the
        input already is int32).

    Raises
    ------
    ValueError
        If the input is not 1-D, cannot be cast to int32 under NumPy's
        ``"safe"`` casting rule, or contains a value outside [1, 12].
    """
    arr = np.asarray(months)
    if arr.ndim != 1:
        raise ValueError(f"{name} must be 1-D, got {arr.ndim}-D")

    original_dtype = arr.dtype
    if not np.issubdtype(original_dtype, np.integer):
        # "safe" casting refuses lossy conversions (e.g. float -> int)
        # by raising TypeError, which is reported as a ValueError.
        try:
            arr = arr.astype(np.int32, casting="safe")
        except TypeError as e:
            raise ValueError(
                f"{name} must contain integers in [1, 12], "
                f"got dtype {original_dtype}"
            ) from e

    invalid = ~((arr >= 1) & (arr <= 12))
    if invalid.any():
        sample = arr[invalid][:5].tolist()
        raise ValueError(
            f"{name} must contain integers in [1, 12]; "
            f"found {int(np.count_nonzero(invalid))} out-of-range value(s) "
            f"(examples: {sample}). The calendar-month convention is "
            f"1=January through 12=December."
        )
    return arr.astype(np.int32, copy=False)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def validate_floc(floc) -> None:
    """Validate ``floc`` for the parametric path: ``None`` or a finite number.

    Raises
    ------
    ValueError
        If *floc* is a ``bool``, is not a Python/NumPy int or float, or is
        NaN or infinite.
    """
    if floc is None:
        return

    numeric_types = (int, float, np.floating, np.integer)
    # bool is an int subclass; exclude it before the numeric check.
    if isinstance(floc, bool) or not isinstance(floc, numeric_types):
        raise ValueError(f"floc must be None or a finite number, got {floc!r}")

    if np.isfinite(float(floc)):
        return
    raise ValueError(
        f"floc must be a finite number (or None to disable), got {floc!r}"
    )
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def validate_reference_length_for_tail_fit(
    n_reference: int,
    scale: int,
    tail_quantile: float,
    min_tail_size: int,
) -> None:
    """Reject configurations whose reference is too short for a tail fit.

    The pooled reference length is estimated as
    ``max(0, n_reference - scale + 1)`` and the expected number of
    exceedances per tail as that length times *tail_quantile*. The
    configuration is rejected when the expectation is below
    *min_tail_size*.

    Raises
    ------
    ValueError
        If the expected exceedances per tail are fewer than
        *min_tail_size*.
    """
    n_pooled_est = int(n_reference) - int(scale) + 1
    if n_pooled_est < 0:
        n_pooled_est = 0

    expected = float(n_pooled_est) * float(tail_quantile)
    if expected >= float(min_tail_size):
        return

    raise ValueError(
        f"Reference is too short for GPD tail fitting at the requested "
        f"configuration: n_reference={n_reference}, scale={scale} "
        f"(pooled n ~ {n_pooled_est}), tail_quantile={tail_quantile} "
        f"→ expected exceedances per tail ~ {expected:.1f}, "
        f"but min_tail_size={min_tail_size}. "
        f"Provide a longer reference, lower tail_quantile, or lower "
        f"min_tail_size (>= 10)."
    )
|
rsd/_version.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Version management for the rsd package."""
|
|
2
|
+
|
|
3
|
+
# Resolve the installed distribution's version, falling back to a
# development placeholder when the metadata cannot be read.
try:
    from importlib.metadata import PackageNotFoundError, version
except ImportError:
    __version__ = "0.0.0.dev0"
else:
    try:
        __version__ = version("rsd")
    # PackageNotFoundError derives from ImportError, so this single clause
    # handles the "not installed" case as well as any other import failure
    # raised during the lookup.
    except ImportError:
        __version__ = "0.0.0.dev0"
|
rsd/_warnings.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Custom warning categories emitted by the RSD pipeline.
|
|
2
|
+
|
|
3
|
+
Users can silence or escalate these categories independently of generic
|
|
4
|
+
``UserWarning`` traffic, e.g.::
|
|
5
|
+
|
|
6
|
+
import warnings, rsd
|
|
7
|
+
warnings.filterwarnings("ignore", category=rsd.RSDFitWarning)
|
|
8
|
+
warnings.filterwarnings("error", category=rsd.RSDDegenerateWarning)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RSDFitWarning(UserWarning):
    """Warning category for a parametric fit that fell back or failed.

    Intended for GPD, exponential, or scipy monthwise distribution fits
    that could not be used for a sample. A value is still returned via a
    documented fallback path (for example ECDF-only when the GPD fit
    fails).
    """
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class RSDDegenerateWarning(UserWarning):
    """Warning category for a degenerate input sample.

    Intended for samples with zero spread, constant series, or values
    fully outside the declared physical bounds. Such samples are handled
    by a fallback, and results at the affected positions may be
    undefined or NaN.
    """
|