ggh4x-python 0.3.1.9000__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ggh4x/__init__.py +140 -0
- ggh4x/_aimed_text_grob.py +432 -0
- ggh4x/_borrowed_ggplot2.py +273 -0
- ggh4x/_cli.py +84 -0
- ggh4x/_datasets.py +106 -0
- ggh4x/_download.py +111 -0
- ggh4x/_facet_helpers.py +313 -0
- ggh4x/_facet_utils.py +649 -0
- ggh4x/_gap_grobs.py +606 -0
- ggh4x/_registry.py +10 -0
- ggh4x/_rlang.py +93 -0
- ggh4x/_utils.py +150 -0
- ggh4x/_vctrs.py +233 -0
- ggh4x/conveniences.py +601 -0
- ggh4x/coord_axes_inside.py +380 -0
- ggh4x/element_part_rect.py +545 -0
- ggh4x/facet_grid2.py +1018 -0
- ggh4x/facet_manual.py +901 -0
- ggh4x/facet_nested.py +776 -0
- ggh4x/facet_nested_wrap.py +193 -0
- ggh4x/facet_wrap2.py +896 -0
- ggh4x/geom_box.py +536 -0
- ggh4x/geom_outline_point.py +444 -0
- ggh4x/geom_pointpath.py +259 -0
- ggh4x/geom_polygonraster.py +252 -0
- ggh4x/geom_rectrug.py +489 -0
- ggh4x/geom_text_aimed.py +279 -0
- ggh4x/guide_stringlegend.py +354 -0
- ggh4x/help_secondary.py +549 -0
- ggh4x/multiscale/__init__.py +51 -0
- ggh4x/multiscale/_multiscale_add.py +207 -0
- ggh4x/multiscale/scale_listed.py +167 -0
- ggh4x/multiscale/scale_manual.py +478 -0
- ggh4x/multiscale/scale_multi.py +393 -0
- ggh4x/panel_scales/__init__.py +58 -0
- ggh4x/panel_scales/at_panel.py +115 -0
- ggh4x/panel_scales/facetted_pos_scales.py +647 -0
- ggh4x/panel_scales/force_panelsize.py +411 -0
- ggh4x/panel_scales/scale_facet.py +222 -0
- ggh4x/position_disjoint_ranges.py +229 -0
- ggh4x/position_lineartrans.py +242 -0
- ggh4x/py.typed +0 -0
- ggh4x/resources/faithful.csv +273 -0
- ggh4x/resources/iris.csv +151 -0
- ggh4x/resources/mtcars.csv +33 -0
- ggh4x/resources/pressure.csv +20 -0
- ggh4x/resources/volcano.csv +87 -0
- ggh4x/save.py +255 -0
- ggh4x/stat_difference.py +388 -0
- ggh4x/stat_funxy.py +436 -0
- ggh4x/stat_rle.py +290 -0
- ggh4x/stat_rollingkernel.py +369 -0
- ggh4x/stat_theodensity.py +681 -0
- ggh4x/strip_nested.py +448 -0
- ggh4x/strip_split.py +687 -0
- ggh4x/strip_tag.py +636 -0
- ggh4x/strip_themed.py +232 -0
- ggh4x/strip_vanilla.py +1464 -0
- ggh4x/themes.py +31 -0
- ggh4x/themes_ggh4x.py +67 -0
- ggh4x_python-0.3.1.9000.dist-info/METADATA +40 -0
- ggh4x_python-0.3.1.9000.dist-info/RECORD +64 -0
- ggh4x_python-0.3.1.9000.dist-info/WHEEL +4 -0
- ggh4x_python-0.3.1.9000.dist-info/licenses/LICENSE +3 -0
|
@@ -0,0 +1,681 @@
|
|
|
1
|
+
"""Fitted theoretical density.
|
|
2
|
+
|
|
3
|
+
Port of ``ggh4x``'s ``stat_theodensity.R``. Estimates the parameters of a
|
|
4
|
+
named theoretical distribution by maximum likelihood and evaluates the
|
|
5
|
+
distribution's probability density (or mass) function on a grid. This is the
|
|
6
|
+
Python analogue of fitting a parametric distribution with
|
|
7
|
+
``fitdistrplus::fitdist`` and then calling ``d<distri>`` from R's ``stats``
|
|
8
|
+
package.
|
|
9
|
+
|
|
10
|
+
R uses ``fitdistrplus::fitdist`` for maximum-likelihood estimation. The Python
|
|
11
|
+
port replaces this with :mod:`scipy.stats` MLE: continuous distributions use the
|
|
12
|
+
``<dist>.fit`` method (with location/scale fixed where R's parameterization
|
|
13
|
+
fixes them), while discrete distributions (``pois``/``geom``/``binom``/
|
|
14
|
+
``nbinom``) use closed-form or numerically optimized maximum-likelihood
|
|
15
|
+
estimators. A hand-built mapping table translates R distribution names and
|
|
16
|
+
parameterizations (e.g. R's ``gamma`` rate vs scipy's scale) into scipy
|
|
17
|
+
objects and parameters.
|
|
18
|
+
|
|
19
|
+
R source
|
|
20
|
+
--------
|
|
21
|
+
``ggh4x/R/stat_theodensity.R``
|
|
22
|
+
|
|
23
|
+
Notes
|
|
24
|
+
-----
|
|
25
|
+
The single largest fidelity risk is that :func:`scipy.stats.<dist>.fit` returns
|
|
26
|
+
parameters in a different order and parameterization than R's ``d<distri>``
|
|
27
|
+
functions. The :data:`_DISTRI_TABLE` mapping captures, per R distribution name,
|
|
28
|
+
the scipy distribution object, the fixed-parameter constraints required to
|
|
29
|
+
reproduce R's MLE, and the conversion from the scipy fit tuple back into R's
|
|
30
|
+
named parameters. Verified against live ``fitdistrplus`` output on identical
|
|
31
|
+
data samples.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
37
|
+
|
|
38
|
+
import numpy as np
|
|
39
|
+
import pandas as pd
|
|
40
|
+
|
|
41
|
+
from ggplot2_py import ggproto_parent
|
|
42
|
+
from ggplot2_py.aes import AfterStat
|
|
43
|
+
from ggplot2_py.stat import StatDensity, _layer
|
|
44
|
+
|
|
45
|
+
from ._cli import cli_abort, cli_inform, cli_warn
|
|
46
|
+
|
|
47
|
+
__all__ = ["stat_theodensity", "StatTheoDensity", "_class_distri"]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# Distribution parameterization mapping table
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
#
|
|
54
|
+
# R's ``d<distri>`` functions and ``fitdistrplus`` use parameterizations that
|
|
55
|
+
# differ from :mod:`scipy.stats`. Each entry maps an R distribution name to:
|
|
56
|
+
# - ``kind`` : "continuous" or "discrete".
|
|
57
|
+
# - ``fitter`` : callable ``(x, fix_arg, start_arg) -> dict`` returning the
|
|
58
|
+
# fitted parameters in R's named parameterization (the same
|
|
59
|
+
# names ``coef(fitdist(...))`` would produce, plus any fixed
|
|
60
|
+
# parameters).
|
|
61
|
+
# - ``pdf`` : callable ``(xseq, params) -> ndarray`` evaluating the
|
|
62
|
+
# density / mass function using R's named parameters.
|
|
63
|
+
#
|
|
64
|
+
# This indirection reproduces ``get(paste0("d", distri))`` together with the
|
|
65
|
+
# ``coef(fitdistrplus::fitdist(...))`` call in the R source.
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _require_scipy() -> Any:
    """Return :mod:`scipy.stats`, importing it lazily.

    The import is done inside the function so that the module can be loaded
    without scipy; the dependency is only needed once a fit is requested.

    Returns
    -------
    module
        The :mod:`scipy.stats` module.

    Raises
    ------
    ImportError
        If scipy cannot be imported. The original error is chained.
    """
    try:
        import scipy.stats as scipy_stats  # noqa: WPS433 (local import by design)
    except ImportError as exc:  # pragma: no cover - environment dependent
        message = "The 'scipy' package is required for `stat_theodensity()`."
        raise ImportError(message) from exc
    return scipy_stats
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# -- Continuous fitters ------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _fit_norm(x, fix_arg, start_arg):
    """Fit a normal distribution and return R-named ``mean`` / ``sd``.

    ``fix_arg`` and ``start_arg`` belong to the common fitter signature and
    are not used here.
    """
    mean, sd = _require_scipy().norm.fit(x)
    return {"mean": mean, "sd": sd}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _fit_lnorm(x, fix_arg, start_arg):
    """Fit a log-normal distribution and return ``meanlog`` / ``sdlog``.

    The scipy location is pinned at 0; the fitted scipy ``scale`` is converted
    to ``meanlog`` by taking its logarithm. ``fix_arg`` and ``start_arg`` are
    not used.
    """
    stats = _require_scipy()
    sdlog, _unused_loc, exp_meanlog = stats.lognorm.fit(x, floc=0)
    meanlog = float(np.log(exp_meanlog))
    return {"meanlog": meanlog, "sdlog": sdlog}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _fit_cauchy(x, fix_arg, start_arg):
    """Fit a Cauchy distribution and return ``location`` / ``scale``.

    ``fix_arg`` and ``start_arg`` are not used.
    """
    location, spread = _require_scipy().cauchy.fit(x)
    return {"location": location, "scale": spread}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _fit_gamma(x, fix_arg, start_arg):
    """Fit a gamma distribution and return R-named ``shape`` / ``rate``.

    The scipy location is always pinned at 0. Entries ``"rate"`` and
    ``"shape"`` of ``fix_arg`` (when present) are passed to scipy as fixed
    ``fscale = 1 / rate`` and ``fa = shape`` and are echoed back verbatim in
    the result. ``start_arg`` is not used.
    """
    stats = _require_scipy()
    constraints: Dict[str, Any] = {"floc": 0}
    pinned: Dict[str, Any] = {}
    if fix_arg:
        if "rate" in fix_arg:
            # R fixes ``rate``; scipy is parameterized by ``scale = 1 / rate``.
            constraints["fscale"] = 1.0 / float(fix_arg["rate"])
            pinned["rate"] = float(fix_arg["rate"])
        if "shape" in fix_arg:
            constraints["fa"] = float(fix_arg["shape"])
            pinned["shape"] = float(fix_arg["shape"])
    shape_hat, _unused_loc, scale_hat = stats.gamma.fit(x, **constraints)
    # Fixed values overwrite the fitted ones while keeping the key order.
    return {"shape": shape_hat, "rate": 1.0 / scale_hat, **pinned}
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _fit_weibull(x, fix_arg, start_arg):
    """Fit a Weibull (``weibull_min``) distribution with location pinned at 0.

    Returns the R-named ``shape`` and ``scale``. ``fix_arg`` and ``start_arg``
    are not used.
    """
    stats = _require_scipy()
    shape_hat, _unused_loc, scale_hat = stats.weibull_min.fit(x, floc=0)
    return {"shape": shape_hat, "scale": scale_hat}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _fit_exp(x, fix_arg, start_arg):
    """Fit an exponential distribution (location 0) and return its ``rate``.

    The rate is the reciprocal of the fitted scipy scale. ``fix_arg`` and
    ``start_arg`` are not used.
    """
    stats = _require_scipy()
    _unused_loc, scale_hat = stats.expon.fit(x, floc=0)
    rate = 1.0 / scale_hat
    return {"rate": rate}
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _fit_logis(x, fix_arg, start_arg):
    """Fit a logistic distribution and return ``location`` / ``scale``.

    ``fix_arg`` and ``start_arg`` are not used.
    """
    location, spread = _require_scipy().logistic.fit(x)
    return {"location": location, "scale": spread}
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _fit_beta(x, fix_arg, start_arg):
    """Fit a beta distribution with location 0 and scale 1 held fixed.

    Returns the R-named ``shape1`` and ``shape2``. ``fix_arg`` and
    ``start_arg`` are not used.
    """
    stats = _require_scipy()
    shape1, shape2, _unused_loc, _unused_scale = stats.beta.fit(
        x, floc=0, fscale=1
    )
    return {"shape1": shape1, "shape2": shape2}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _fit_unif(x, fix_arg, start_arg):
    """Return the sample minimum and maximum as the uniform ``min`` / ``max``.

    ``fix_arg`` and ``start_arg`` are not used.
    """
    lower = float(np.min(x))
    upper = float(np.max(x))
    return {"min": lower, "max": upper}
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _fit_t(x, fix_arg, start_arg):
    """Fit the degrees of freedom of a standard Student t distribution.

    R's ``dt`` has a single ``df`` parameter (location 0, scale 1), so both
    scipy location and scale are held fixed during the fit. ``fix_arg`` and
    ``start_arg`` are not used.
    """
    stats = _require_scipy()
    dof, _unused_loc, _unused_scale = stats.t.fit(x, floc=0, fscale=1)
    return {"df": dof}
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _fit_f(x, fix_arg, start_arg):
    """Fit an F distribution with location 0 and scale 1 held fixed.

    Returns the R-named ``df1`` (numerator) and ``df2`` (denominator).
    ``fix_arg`` and ``start_arg`` are not used.
    """
    stats = _require_scipy()
    df1, df2, _unused_loc, _unused_scale = stats.f.fit(x, floc=0, fscale=1)
    return {"df1": df1, "df2": df2}
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _fit_chisq(x, fix_arg, start_arg):
    """Fit the ``df`` of a chi-squared distribution (location 0, scale 1).

    ``setup_params`` normally remaps ``chisq`` to a gamma fit, so this fitter
    is only reached when that remap is bypassed. ``fix_arg`` and
    ``start_arg`` are not used.
    """
    stats = _require_scipy()
    dof, _unused_loc, _unused_scale = stats.chi2.fit(x, floc=0, fscale=1)
    return {"df": dof}
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# -- Discrete fitters --------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _fit_pois(x, fix_arg, start_arg):
    """Return the sample mean as the Poisson ``lambda``.

    ``fix_arg`` and ``start_arg`` are not used.
    """
    rate = float(np.mean(x))
    return {"lambda": rate}
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _fit_geom(x, fix_arg, start_arg):
    """Return the geometric success probability ``prob = 1 / (1 + mean(x))``.

    R's ``dgeom`` counts failures before the first success, which is the
    convention this estimator follows. ``fix_arg`` and ``start_arg`` are not
    used.
    """
    mean_failures = float(np.mean(x))
    return {"prob": 1.0 / (1.0 + mean_failures)}
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _fit_binom(x, fix_arg, start_arg):
    """Estimate the binomial ``prob`` for a fixed number of trials.

    ``fix_arg`` must provide ``"size"``; otherwise ``cli_abort`` is called.
    With ``size`` fixed, ``prob`` is computed as ``mean(x) / size``.
    ``start_arg`` is not used.
    """
    if not (fix_arg and "size" in fix_arg):
        cli_abort("Fitting a binomial distribution requires a fixed 'size'.")
    trials = int(fix_arg["size"])
    success_rate = float(np.mean(x)) / trials
    return {"size": trials, "prob": success_rate}
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _fit_nbinom(x, fix_arg, start_arg):
    """Fit a negative binomial by numerically maximizing the likelihood.

    The optimization runs over ``(size, mu)`` with
    ``prob = size / (size + mu)`` and uses Nelder-Mead. The starting ``size``
    is ``mean^2 / (var - mean)`` when the variance exceeds the mean, and 1.0
    otherwise (also when that value is non-finite or non-positive); the
    starting ``mu`` is the sample mean. ``fix_arg`` and ``start_arg`` are not
    used.

    Returns
    -------
    dict
        ``{"size": ..., "mu": ...}`` as Python floats.
    """
    stats = _require_scipy()
    from scipy import optimize  # noqa: WPS433

    counts = np.asarray(x, dtype=float)
    sample_mean = float(np.mean(counts))
    sample_var = float(np.var(counts))

    if sample_var > sample_mean:
        start_size = sample_mean * sample_mean / (sample_var - sample_mean)
    else:
        start_size = 1.0
    if start_size <= 0 or not np.isfinite(start_size):
        start_size = 1.0

    def _neg_loglik(theta: np.ndarray) -> float:
        size, mu = theta
        # Large constant penalty keeps the simplex inside the valid region.
        if size <= 0 or mu <= 0:
            return 1e10
        prob = size / (size + mu)
        loglik = np.sum(stats.nbinom.logpmf(counts, size, prob))
        return -float(loglik)

    start = np.array([start_size, sample_mean])
    solver_options = {"xatol": 1e-8, "fatol": 1e-10, "maxiter": 10000}
    result = optimize.minimize(
        _neg_loglik, start, method="Nelder-Mead", options=solver_options
    )
    size_hat, mu_hat = result.x
    return {"size": float(size_hat), "mu": float(mu_hat)}
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# -- PDF / PMF evaluators (R parameterization) -------------------------------
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _pdf_norm(xseq, p):
    """Evaluate the normal density at ``xseq`` from ``p["mean"]`` / ``p["sd"]``."""
    stats = _require_scipy()
    return stats.norm.pdf(xseq, loc=p["mean"], scale=p["sd"])
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _pdf_lnorm(xseq, p):
    """Evaluate the log-normal density from ``meanlog`` / ``sdlog``.

    scipy's ``lognorm`` takes ``sdlog`` as its shape and ``exp(meanlog)`` as
    its scale, with location 0.
    """
    stats = _require_scipy()
    return stats.lognorm.pdf(xseq, p["sdlog"], loc=0, scale=np.exp(p["meanlog"]))
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _pdf_cauchy(xseq, p):
    """Evaluate the Cauchy density at ``xseq`` from ``location`` / ``scale``."""
    stats = _require_scipy()
    return stats.cauchy.pdf(xseq, loc=p["location"], scale=p["scale"])
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _pdf_gamma(xseq, p):
    """Evaluate the gamma density from R-style ``shape`` / ``rate``.

    The rate is converted to scipy's ``scale = 1 / rate``; location is 0.
    """
    stats = _require_scipy()
    return stats.gamma.pdf(xseq, p["shape"], loc=0, scale=1.0 / p["rate"])
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _pdf_weibull(xseq, p):
    """Evaluate the Weibull (``weibull_min``) density from ``shape`` / ``scale``."""
    stats = _require_scipy()
    return stats.weibull_min.pdf(xseq, p["shape"], loc=0, scale=p["scale"])
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _pdf_exp(xseq, p):
    """Evaluate the exponential density from ``rate`` (scipy scale ``1 / rate``)."""
    stats = _require_scipy()
    return stats.expon.pdf(xseq, loc=0, scale=1.0 / p["rate"])
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _pdf_logis(xseq, p):
    """Evaluate the logistic density at ``xseq`` from ``location`` / ``scale``."""
    stats = _require_scipy()
    return stats.logistic.pdf(xseq, loc=p["location"], scale=p["scale"])
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _pdf_beta(xseq, p):
    """Evaluate the beta density from ``shape1`` / ``shape2`` (loc 0, scale 1)."""
    stats = _require_scipy()
    return stats.beta.pdf(xseq, p["shape1"], p["shape2"], loc=0, scale=1)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _pdf_unif(xseq, p):
    """Evaluate the uniform density from R-style ``min`` / ``max``.

    scipy's ``uniform`` is parameterized by ``loc = min`` and
    ``scale = max - min``.
    """
    lower = p["min"]
    upper = p["max"]
    width = upper - lower
    return _require_scipy().uniform.pdf(xseq, loc=lower, scale=width)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _pdf_t(xseq, p):
    """Evaluate the standard Student t density at ``xseq`` with ``p["df"]``."""
    stats = _require_scipy()
    return stats.t.pdf(xseq, p["df"])
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _pdf_f(xseq, p):
    """Evaluate the F density at ``xseq`` with ``p["df1"]`` and ``p["df2"]``."""
    stats = _require_scipy()
    return stats.f.pdf(xseq, p["df1"], p["df2"])
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _pdf_chisq(xseq, p):
    """Evaluate the chi-squared density at ``xseq`` with ``p["df"]``."""
    stats = _require_scipy()
    return stats.chi2.pdf(xseq, p["df"])
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _pmf_pois(xseq, p):
    """Evaluate the Poisson mass function at ``xseq`` with ``p["lambda"]``."""
    stats = _require_scipy()
    return stats.poisson.pmf(xseq, p["lambda"])
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _pmf_geom(xseq, p):
    """Evaluate the geometric mass function in R's failure-count convention.

    scipy's ``geom`` has support 1, 2, ..., so ``xseq`` is shifted by one
    before evaluation.
    """
    stats = _require_scipy()
    shifted = xseq + 1  # scipy geom support: 1,2,...
    return stats.geom.pmf(shifted, p["prob"])
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _pmf_binom(xseq, p):
    """Evaluate the binomial mass function with ``int(p["size"])`` trials."""
    stats = _require_scipy()
    return stats.binom.pmf(xseq, int(p["size"]), p["prob"])
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _pmf_nbinom(xseq, p):
    """Evaluate the negative binomial mass function from ``size`` / ``mu``.

    The ``(size, mu)`` pair is converted to scipy's ``(n, p)`` form via
    ``prob = size / (size + mu)``.
    """
    size = p["size"]
    mu = p["mu"]
    success_prob = size / (size + mu)
    return _require_scipy().nbinom.pmf(xseq, size, success_prob)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# One row per R distribution name: (name, kind, fitter, density-or-mass).
# The comprehension expands every row into the
# ``{"kind": ..., "fitter": ..., "pdf": ...}`` entry read by
# ``StatTheoDensity.compute_group``; row order is the resulting key order.
_DISTRI_TABLE: Dict[str, Dict[str, Any]] = {
    name: {"kind": kind, "fitter": fitter, "pdf": density}
    for name, kind, fitter, density in (
        ("norm", "continuous", _fit_norm, _pdf_norm),
        ("lnorm", "continuous", _fit_lnorm, _pdf_lnorm),
        ("cauchy", "continuous", _fit_cauchy, _pdf_cauchy),
        ("gamma", "continuous", _fit_gamma, _pdf_gamma),
        ("weibull", "continuous", _fit_weibull, _pdf_weibull),
        ("exp", "continuous", _fit_exp, _pdf_exp),
        ("logis", "continuous", _fit_logis, _pdf_logis),
        ("beta", "continuous", _fit_beta, _pdf_beta),
        ("unif", "continuous", _fit_unif, _pdf_unif),
        ("t", "continuous", _fit_t, _pdf_t),
        ("f", "continuous", _fit_f, _pdf_f),
        ("chisq", "continuous", _fit_chisq, _pdf_chisq),
        ("pois", "discrete", _fit_pois, _pmf_pois),
        ("geom", "discrete", _fit_geom, _pmf_geom),
        ("binom", "discrete", _fit_binom, _pmf_binom),
        ("nbinom", "discrete", _fit_nbinom, _pmf_nbinom),
    )
}

#: Distributions explicitly rejected by ``stat_theodensity`` (mirrors R).
_UNSUPPORTED = (
    "multinom",
    "hyper",
    "wilcox",
    "signrank",
)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# ---------------------------------------------------------------------------
|
|
326
|
+
# Helper: classify a distribution as discrete or continuous
|
|
327
|
+
# ---------------------------------------------------------------------------
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def _class_distri(distri: str) -> str:
    """Classify a distribution name as ``"discrete"`` or ``"continuous"``.

    Port of ``class_distri`` (``stat_theodensity.R``). Only the two fixed name
    sets are consulted; R's fallback of probing a user-supplied ``r<distri>``
    function is out of scope for the port, so any other name is handed to
    ``cli_abort``.

    Parameters
    ----------
    distri : str
        Distribution name without the ``d``/``r``/``p``/``q`` prefix.

    Returns
    -------
    str
        ``"discrete"`` or ``"continuous"``.

    Raises
    ------
    ValueError
        If the distribution cannot be classified (presumably raised by
        ``cli_abort``; confirm against ``_cli``).
    """
    # Discrete names are checked first, matching the original lookup order.
    families = (
        (
            "discrete",
            (
                "pois", "nbinom", "binom", "geom",
                "hyper", "signrank", "multinom", "wilcox",
            ),
        ),
        (
            "continuous",
            (
                "beta", "cauchy", "chisq", "exp", "f", "gamma",
                "lnorm", "norm", "t", "unif", "weibull", "logis",
            ),
        ),
    )
    for label, names in families:
        if distri in names:
            return label

    cli_abort(
        f"`stat_theodensity()` failed to determine if the '{distri}' "
        "distribution is discrete or continuous."
    )
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
# ---------------------------------------------------------------------------
|
|
394
|
+
# ggproto
|
|
395
|
+
# ---------------------------------------------------------------------------
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
class StatTheoDensity(StatDensity):
    """Fit a theoretical distribution by MLE and evaluate its density.

    Extends :class:`ggplot2_py.StatDensity`. The kernel-density computation of
    the parent is replaced by maximum-likelihood fitting of a named theoretical
    distribution followed by evaluation of its probability density (continuous)
    or mass (discrete) function.

    Attributes
    ----------
    extra_params : list of str
        ``["na_rm", "orientation"]``.
    """

    extra_params: List[str] = ["na_rm", "orientation"]

    def compute_group(
        self,
        data: pd.DataFrame,
        scales: Any,
        distri: str = "norm",
        n: int = 512,
        distri_type: str = "continuous",
        fix_arg: Optional[Dict[str, Any]] = None,
        start_arg: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Fit ``distri`` to ``data['x']`` and evaluate its density on a grid.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain an ``x`` column.
        scales : dict-like
            Panel scales; ``scales['x'].dimension()`` provides the evaluation
            range. When no usable ``x`` scale is found the data range is used.
        distri : str, default ``"norm"``
            Distribution name (without prefix); must be a key of
            ``_DISTRI_TABLE``.
        n : int, default 512
            Number of equally spaced evaluation points (continuous only).
        distri_type : str, default ``"continuous"``
            ``"discrete"`` evaluates on the integers spanning the range; any
            other value evaluates on ``n`` equally spaced points.
        fix_arg : dict, optional
            Fixed parameters in R parameterization, forwarded to the fitter.
        start_arg : dict, optional
            Starting parameters, forwarded to the fitter.

        Returns
        -------
        pandas.DataFrame
            Columns ``x``, ``density``, ``scaled``, ``count``, ``n`` on success;
            a single NaN row with columns ``x``, ``density``, ``ndensity``,
            ``count``, ``n`` on failure (``< 2`` points or estimation failure).
        """
        _require_scipy()

        # Data to return upon failure (mirrors R's ``nulldata``).
        nulldata = pd.DataFrame(
            {
                "x": [np.nan],
                "density": [np.nan],
                "ndensity": [np.nan],
                "count": [np.nan],
                "n": [np.nan],
            }
        )

        entry = _DISTRI_TABLE.get(distri)
        if entry is None:
            cli_abort(
                "The `distri` argument must have a valid density function "
                f"called `d{distri}`."
            )

        x_all = np.asarray(data["x"].to_numpy(), dtype=float)
        x = x_all[~np.isnan(x_all)]
        nx = len(data["x"])  # R uses length(data$x), i.e. including NA rows.

        if nx < 2:
            cli_warn("Groups with fewer than two data points have been dropped.")
            return nulldata

        # ``scales`` may be a plain dict or an object exposing ``.x``.
        scale = (
            scales.get("x")
            if isinstance(scales, dict)
            else getattr(scales, "x", None)
        )
        if scale is not None and hasattr(scale, "dimension"):
            rng = tuple(scale.dimension())
        else:
            rng = (float(np.nanmin(x_all)), float(np.nanmax(x_all)))

        if distri_type == "discrete":
            # Integer grid covering the whole range, end point included.
            xseq = np.arange(np.floor(rng[0]), np.ceil(rng[1]) + 1, 1.0)
        else:
            xseq = np.linspace(rng[0], rng[1], n)

        # Maximum-likelihood estimation (replaces fitdistrplus::fitdist).
        try:
            params = entry["fitter"](x, fix_arg, start_arg)
        except Exception:  # noqa: BLE001 - any estimation failure -> nulldata
            cli_warn(f"Failed to estimate parameters of '{distri}' distribution.")
            return nulldata

        # ``isfinite`` is False for NaN as well as +/-inf, so one test covers
        # every unusable estimate.
        par_values = np.asarray(list(params.values()), dtype=float)
        if par_values.size == 0 or not np.all(np.isfinite(par_values)):
            cli_warn(f"Failed to estimate parameters of '{distri}' distribution.")
            return nulldata

        dens = np.asarray(entry["pdf"](xseq, params), dtype=float)

        dens_max = np.nanmax(dens)
        return pd.DataFrame(
            {
                "x": xseq,
                "density": dens,
                "scaled": dens / dens_max,
                "count": dens * nx,
                "n": nx,
            }
        )

    def setup_params(
        self, data: pd.DataFrame, params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Classify the distribution and apply R's parameter remaps.

        Parameters
        ----------
        data : pandas.DataFrame
            Layer data; ``data['x']`` is inspected for integrality.
        params : dict
            Stat parameters (``distri``, ``fix_arg``, ``start_arg``, ...). The
            input mapping is copied, not mutated.

        Returns
        -------
        dict
            Updated parameters with ``distri_type`` injected and ``chisq`` /
            ``binom`` remaps applied.

        Raises
        ------
        ValueError
            If a discrete distribution is requested for non-integer data
            (presumably raised by ``cli_abort``; confirm against ``_cli``).
        """
        distri = params.get("distri", "norm")
        dtype = _class_distri(distri)
        if dtype == "discrete":
            x = np.asarray(data["x"].to_numpy(), dtype=float)
            x = x[~np.isnan(x)]
            # Any fractional part means the data cannot be counts.
            if float(np.sum(np.abs(np.mod(x, 1)))) > 0:
                cli_abort(
                    f"A discrete '{distri}' distribution cannot be fitted "
                    "to continuous data."
                )
        params = dict(params)
        params["distri_type"] = dtype

        # Chi square estimator causes trouble; estimate as gamma with rate=0.5.
        if params.get("distri") == "chisq":
            params["distri"] = "gamma"
            fix_arg = params.get("fix_arg")
            if fix_arg is None or "df" not in fix_arg:
                # Without a fixed ``df`` only the rate is pinned. A ``fix_arg``
                # lacking ``df`` used to raise ``KeyError`` here.
                params["fix_arg"] = {"rate": 0.5}
            else:
                params["fix_arg"] = {
                    "shape": float(fix_arg["df"]) / 2.0,
                    "rate": 0.5,
                }

        # Binomial does not operate without a fixed size.
        if params.get("distri") == "binom":
            x = np.asarray(data["x"].to_numpy(), dtype=float)
            x = x[~np.isnan(x)]
            has_data = x.size > 0
            x_max = float(np.max(x)) if has_data else float("nan")
            # With no usable data the size cannot be guessed; ``fix_arg`` is
            # left unset so the fitter reports the failure instead of
            # ``np.max`` raising on an empty array here.
            if params.get("fix_arg") is None and has_data:
                params["fix_arg"] = {"size": int(x_max)}
                cli_inform(
                    "Estimating binomial PMF with size set to maximum data value."
                )
            # NOTE(review): assumed to sit outside the ``fix_arg`` branch, as
            # in the R source; the original indentation was not recoverable.
            # All-zero or empty data yields NaN (R's ``0 / 0``) rather than a
            # ZeroDivisionError.
            if has_data and x_max != 0:
                start_prob = float(np.mean(x)) / x_max
            else:
                start_prob = float("nan")
            params["start_arg"] = {"prob": start_prob}

        return params
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
# ---------------------------------------------------------------------------
|
|
590
|
+
# Constructor
|
|
591
|
+
# ---------------------------------------------------------------------------
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def stat_theodensity(
    mapping: Optional[Any] = None,
    data: Any = None,
    geom: str = "line",
    position: str = "identity",
    *,
    distri: str = "norm",
    n: int = 512,
    fix_arg: Optional[Dict[str, Any]] = None,
    start_arg: Optional[Dict[str, Any]] = None,
    na_rm: bool = True,
    show_legend: Optional[bool] = None,
    inherit_aes: bool = True,
    **kwargs: Any,
) -> Any:
    """Construct a fitted-theoretical-density layer.

    Estimates the parameters of ``distri`` by maximum likelihood and evaluates
    its probability density function, useful for comparing histograms or kernel
    density estimates against a theoretical distribution.

    Parameters
    ----------
    mapping : aes, optional
        Aesthetic mapping.
    data : DataFrame or callable, optional
        Layer data.
    geom : str, default ``"line"``
        Geometry used to render the layer.
    position : str, default ``"identity"``
        Position adjustment.
    distri : str, default ``"norm"``
        Distribution name without prefix (e.g. ``"norm"``, ``"nbinom"``). See
        :data:`_DISTRI_TABLE` for supported names.
    n : int, default 512
        Number of equally spaced evaluation points (ignored for discrete
        distributions).
    fix_arg : dict, optional
        Fixed parameters of the named distribution (R parameterization).
    start_arg : dict, optional
        Starting parameters for the estimation.
    na_rm : bool, default True
        Whether to silently remove missing values.
    show_legend : bool, optional
        Whether to show this layer in the legend.
    inherit_aes : bool, default True
        Whether to inherit aesthetics from the plot.
    **kwargs
        Additional parameters forwarded to the layer.

    Returns
    -------
    Layer
        A ggplot2_py layer.

    Raises
    ------
    ValueError
        If ``distri`` has no known density function, or names an unsupported
        distribution (``multinom``/``hyper``/``wilcox``/``signrank``).
    """
    # The explicitly unsupported names are not keys of ``_DISTRI_TABLE``, so
    # they must be tested first; otherwise the generic "no density function"
    # abort fires and the dedicated message below is unreachable.
    if distri in _UNSUPPORTED:
        cli_abort(
            f"`stat_theodensity()` does not support the '{distri}' distribution."
        )
    if distri not in _DISTRI_TABLE:
        cli_abort(
            "The `distri` argument must have a valid density function "
            f"called `d{distri}`."
        )

    return _layer(
        stat=StatTheoDensity,
        geom=geom,
        data=data,
        mapping=mapping,
        position=position,
        show_legend=show_legend,
        inherit_aes=inherit_aes,
        params={
            "distri": distri,
            "n": n,
            "fix_arg": fix_arg,
            "start_arg": start_arg,
            "na_rm": na_rm,
            **kwargs,
        },
    )
|