pysofra 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysofra/__init__.py +82 -0
- pysofra/core/__init__.py +14 -0
- pysofra/core/compose.py +167 -0
- pysofra/core/format.py +155 -0
- pysofra/core/frames.py +69 -0
- pysofra/core/schema.py +128 -0
- pysofra/core/table.py +924 -0
- pysofra/io/__init__.py +1 -0
- pysofra/models/__init__.py +6 -0
- pysofra/models/extract.py +249 -0
- pysofra/models/pool.py +119 -0
- pysofra/models/regression.py +507 -0
- pysofra/models/survival.py +395 -0
- pysofra/models/uvregression.py +438 -0
- pysofra/notebook/__init__.py +6 -0
- pysofra/plot/__init__.py +23 -0
- pysofra/plot/_backend.py +32 -0
- pysofra/plot/forest.py +159 -0
- pysofra/plot/inline.py +171 -0
- pysofra/plot/km.py +249 -0
- pysofra/render/__init__.py +28 -0
- pysofra/render/_zip_determinism.py +57 -0
- pysofra/render/base.py +22 -0
- pysofra/render/docx.py +286 -0
- pysofra/render/html.py +442 -0
- pysofra/render/image.py +130 -0
- pysofra/render/latex.py +253 -0
- pysofra/render/markdown.py +128 -0
- pysofra/render/pptx.py +340 -0
- pysofra/render/xlsx.py +226 -0
- pysofra/summary/__init__.py +6 -0
- pysofra/summary/calibrate.py +214 -0
- pysofra/summary/design.py +246 -0
- pysofra/summary/effect_size.py +187 -0
- pysofra/summary/extras.py +745 -0
- pysofra/summary/smd.py +133 -0
- pysofra/summary/stats.py +135 -0
- pysofra/summary/tbl_cross.py +339 -0
- pysofra/summary/tbl_one.py +1220 -0
- pysofra/summary/tbl_summary.py +51 -0
- pysofra/summary/tests.py +370 -0
- pysofra/summary/typing.py +129 -0
- pysofra/summary/weights.py +161 -0
- pysofra/themes/__init__.py +5 -0
- pysofra/themes/registry.py +272 -0
- pysofra-0.1.0a1.dist-info/METADATA +301 -0
- pysofra-0.1.0a1.dist-info/RECORD +50 -0
- pysofra-0.1.0a1.dist-info/WHEEL +4 -0
- pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
- pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""General descriptive summary tables — equivalent to ``gtsummary::tbl_summary``.
|
|
2
|
+
|
|
3
|
+
``tbl_summary`` is broadly the same engine as :func:`tbl_one` but with more
|
|
4
|
+
flexible knobs: you can pass custom statistic templates, override missing
|
|
5
|
+
handling per variable, and produce summaries without a stratification
|
|
6
|
+
variable as the natural default.
|
|
7
|
+
|
|
8
|
+
For the MVP, ``tbl_summary`` delegates to the same :func:`_build` engine
|
|
9
|
+
under the hood — there is genuinely one statistical computation; only the
|
|
10
|
+
defaults differ between the two front doors.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from ..core.table import SofraTable
|
|
18
|
+
from .tbl_one import tbl_one
|
|
19
|
+
from .typing import VarKind
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def tbl_summary(
    data: pd.DataFrame,
    *,
    by: str | None = None,
    variables: list[str] | None = None,
    labels: dict[str, str] | None = None,
    types: dict[str, VarKind] | None = None,
    nonnormal: list[str] | None = None,
    digits: int = 2,
    pct_digits: int = 1,
    missing: str = "ifany",
) -> SofraTable:
    """Produce a general descriptive summary table for ``data``.

    This is a thin front door over :func:`pysofra.tbl_one`: every argument
    is forwarded unchanged and the result of ``tbl_one`` is returned as-is,
    so refer to that function for the meaning of each parameter. The
    separate name exists to express intent (an arbitrary descriptive
    summary rather than a "Table 1" baseline table); the defaults of the
    two functions may diverge in future releases.
    """
    # Collect the keyword options once so the delegation is a single call.
    options = {
        "by": by,
        "variables": variables,
        "labels": labels,
        "types": types,
        "nonnormal": nonnormal,
        "digits": digits,
        "pct_digits": pct_digits,
        "missing": missing,
    }
    return tbl_one(data, **options)
|
pysofra/summary/tests.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
"""Statistical test selection for Table 1 / summary tables.
|
|
2
|
+
|
|
3
|
+
Two layers:
|
|
4
|
+
|
|
5
|
+
* **Defaults** — :func:`continuous_test` and :func:`categorical_test` choose
|
|
6
|
+
a sensible test for a variable given its kind, following the
|
|
7
|
+
``tableone`` / ``gtsummary`` conventions:
|
|
8
|
+
|
|
9
|
+
Continuous, 2 groups → Welch's t-test (Wilcoxon if ``nonnormal``)
|
|
10
|
+
Continuous, 3+ groups → one-way ANOVA (Kruskal–Wallis if ``nonnormal``)
|
|
11
|
+
Categorical, 2×2 → Fisher's exact
|
|
12
|
+
Categorical, larger → Pearson chi-square (flagged sparse if any expected < 5)
|
|
13
|
+
|
|
14
|
+
* **Per-variable overrides** — :func:`run_named_test` dispatches a named
|
|
15
|
+
test by string key. Builders accept a ``tests={'age': 'wilcoxon'}`` map
|
|
16
|
+
and call into this dispatcher, falling back to the defaults otherwise.
|
|
17
|
+
|
|
18
|
+
Returns a small :class:`TestResult` so callers can render both the p-value
|
|
19
|
+
and the test name for the footnote.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import warnings
|
|
25
|
+
from collections.abc import Callable
|
|
26
|
+
from contextlib import contextmanager
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
|
|
29
|
+
import numpy as np
|
|
30
|
+
import pandas as pd
|
|
31
|
+
from scipy import stats as sp_stats
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True)
|
|
35
|
+
class TestResult:
|
|
36
|
+
p_value: float | None
|
|
37
|
+
test: str # short human-readable name; used in footnote
|
|
38
|
+
statistic: float | None = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
_NA = TestResult(p_value=None, test="—")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@contextmanager
def _quiet_scipy():  # type: ignore[no-untyped-def]
    """Silence floating-point and ``RuntimeWarning`` noise for the wrapped code.

    Inside the ``with`` block NumPy's ``invalid`` / ``over`` / ``divide``
    error states are set to ``"ignore"`` and every ``RuntimeWarning`` is
    filtered out; both settings are restored on exit.

    The hypothesis tests in this module are wrapped in it because scipy
    emits advisory warnings on degenerate inputs (for example
    ``RuntimeWarning: Precision loss occurred in moment calculation`` from
    ``ttest_ind`` on near-constant arrays). The ``nan`` / boundary p-value
    is the result we want to surface, and without this guard a user running
    with ``-W error::RuntimeWarning`` would get an exception instead.
    """
    numpy_quiet = np.errstate(invalid="ignore", over="ignore", divide="ignore")
    with numpy_quiet:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            yield
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _group_arrays(values: pd.Series, groups: pd.Series) -> list[np.ndarray]:
    """Split ``values`` into one numeric array per group.

    ``values`` is coerced to numeric (unparseable entries become NaN), rows
    with a missing value or a missing group label are dropped, and the
    remaining values are returned as one NumPy array per observed group, in
    the order produced by ``groupby``. Returns an empty list when no usable
    rows remain.
    """
    numeric = pd.to_numeric(values, errors="coerce")
    frame = pd.DataFrame({"v": numeric, "g": groups}).dropna(subset=["v", "g"])
    if frame.empty:
        return []

    per_group: list[np.ndarray] = []
    for _, part in frame.groupby("g", observed=True):
        arr = part["v"].to_numpy()
        # Keep only groups that actually contributed observations.
        if arr.size > 0:
            per_group.append(arr)
    return per_group
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ----------------------------------------------------------------------
|
|
74
|
+
# Continuous
|
|
75
|
+
# ----------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
def continuous_test(
    values: pd.Series,
    groups: pd.Series,
    nonnormal: bool = False,
) -> TestResult:
    """Pick and run the default test for a continuous variable.

    With two groups this runs Welch's t-test, or the Wilcoxon rank-sum test
    when ``nonnormal`` is set. With three or more groups it runs one-way
    ANOVA, or Kruskal–Wallis when ``nonnormal`` is set. Returns the ``_NA``
    placeholder when fewer than two groups have usable data.
    """
    samples = _group_arrays(values, groups)
    n_groups = len(samples)
    if n_groups < 2:
        return _NA

    two_groups = n_groups == 2
    if nonnormal:
        chosen = _wilcoxon if two_groups else _kruskal
    else:
        chosen = _welch if two_groups else _anova
    return chosen(samples)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _welch(arrs: list[np.ndarray]) -> TestResult:
    """Run Welch's unequal-variance t-test on exactly two groups.

    Args:
        arrs: the two samples to compare, one array per group.

    Returns:
        A ``TestResult`` with the two-sided p-value and the t statistic.

    Raises:
        ValueError: if ``arrs`` does not contain exactly two arrays.
    """
    if len(arrs) != 2:
        # ``ttest_ind`` is a two-sample test. Unpacking more groups would
        # hand the extra arrays to scipy as additional positional arguments
        # instead of samples, so reject the call with a clear message.
        raise ValueError(
            f"Welch's t-test compares exactly 2 groups; got {len(arrs)}."
        )
    first, second = arrs
    with _quiet_scipy():
        stat, p = sp_stats.ttest_ind(first, second, equal_var=False, nan_policy="omit")
    return TestResult(p_value=float(p), test="Welch's t-test", statistic=float(stat))
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _student_t(arrs: list[np.ndarray]) -> TestResult:
    """Run Student's equal-variance t-test on exactly two groups.

    Args:
        arrs: the two samples to compare, one array per group.

    Returns:
        A ``TestResult`` with the two-sided p-value and the t statistic.

    Raises:
        ValueError: if ``arrs`` does not contain exactly two arrays.
    """
    if len(arrs) != 2:
        # ``ttest_ind`` is a two-sample test. Unpacking more groups would
        # hand the extra arrays to scipy as additional positional arguments
        # instead of samples, so reject the call with a clear message.
        raise ValueError(
            f"Student's t-test compares exactly 2 groups; got {len(arrs)}."
        )
    first, second = arrs
    with _quiet_scipy():
        stat, p = sp_stats.ttest_ind(first, second, equal_var=True, nan_policy="omit")
    return TestResult(p_value=float(p), test="Student's t-test", statistic=float(stat))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _wilcoxon(arrs: list[np.ndarray]) -> TestResult:
    """Run the two-sided Wilcoxon rank-sum (Mann–Whitney U) test on two groups.

    Args:
        arrs: the two samples to compare, one array per group.

    Returns:
        A ``TestResult`` with the two-sided p-value and the U statistic.

    Raises:
        ValueError: if ``arrs`` does not contain exactly two arrays.
    """
    if len(arrs) != 2:
        # ``mannwhitneyu`` is a two-sample test. Unpacking more groups would
        # hand the extra arrays to scipy as additional positional arguments
        # instead of samples, so reject the call with a clear message.
        raise ValueError(
            f"Wilcoxon rank-sum test compares exactly 2 groups; got {len(arrs)}."
        )
    first, second = arrs
    with _quiet_scipy():
        stat, p = sp_stats.mannwhitneyu(first, second, alternative="two-sided")
    return TestResult(p_value=float(p), test="Wilcoxon rank-sum", statistic=float(stat))
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _anova(arrs: list[np.ndarray]) -> TestResult:
    """Run a one-way ANOVA across the given groups.

    Each array in ``arrs`` is passed to ``scipy.stats.f_oneway`` as one
    sample; the result carries the p-value and the F statistic.
    """
    with _quiet_scipy():
        outcome = sp_stats.f_oneway(*arrs)
    f_stat, p_val = outcome
    return TestResult(
        p_value=float(p_val),
        test="One-way ANOVA",
        statistic=float(f_stat),
    )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _kruskal(arrs: list[np.ndarray]) -> TestResult:
    """Run the Kruskal–Wallis H-test across the given groups.

    Each array in ``arrs`` is passed to ``scipy.stats.kruskal`` as one
    sample; the result carries the p-value and the H statistic.
    """
    with _quiet_scipy():
        outcome = sp_stats.kruskal(*arrs)
    h_stat, p_val = outcome
    return TestResult(
        p_value=float(p_val),
        test="Kruskal–Wallis",
        statistic=float(h_stat),
    )
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# ----------------------------------------------------------------------
|
|
122
|
+
# Categorical
|
|
123
|
+
# ----------------------------------------------------------------------
|
|
124
|
+
|
|
125
|
+
def categorical_test(values: pd.Series, groups: pd.Series) -> TestResult:
    """Pick and run the default test for a categorical variable.

    The variable is cross-tabulated against ``groups``. A 2×2 table goes to
    Fisher's exact test and anything larger to the chi-square test. Returns
    the ``_NA`` placeholder when no usable contingency table can be built.
    """
    table = _crosstab(values, groups)
    if table is None:
        return _NA

    counts = table.to_numpy()
    runner = _fisher if counts.shape == (2, 2) else _chisq
    return runner(counts)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def svyttest(
    values: pd.Series,
    groups: pd.Series,
    weights: pd.Series,
    *,
    strata: pd.Series | None = None,
    cluster: pd.Series | None = None,
) -> TestResult:
    """Design-adjusted two-sample t-test (svyttest analogue).

    For exactly two groups the statistic is

        t = (ȳ₂_w − ȳ₁_w) / SE(diff)

    where each weighted group mean and its variance come from
    :func:`pysofra.summary.design.design_mean_var` (called once per group),
    and the variance of the difference is the sum of the two group
    variances. The statistic is referred to a ``t`` distribution with
    ``n − H`` degrees of freedom (at least 1), where ``n`` is the number of
    rows used and ``H`` is the number of distinct strata, or 1 when no
    strata are given.

    Rows with a missing value, group or weight, and rows with a
    non-positive weight, are discarded first. Returns the ``_NA``
    placeholder when no rows remain, when the number of groups is not
    exactly two, or when the standard error is zero or not finite.
    """
    from .design import design_mean_var

    frame = pd.DataFrame({
        "v": pd.to_numeric(values, errors="coerce"),
        "g": groups,
        "w": pd.to_numeric(weights, errors="coerce"),
    })
    # Design columns are attached positionally (``.values``), not by index.
    if strata is not None:
        frame["strata"] = strata.values
    if cluster is not None:
        frame["cluster"] = cluster.values

    frame = frame.dropna(subset=["v", "g", "w"])
    frame = frame[frame["w"] > 0]
    if frame.empty:
        return _NA

    # Sort by string form so the group order is deterministic.
    levels = sorted(frame["g"].unique(), key=str)
    if len(levels) != 2:
        return _NA

    summaries = []
    for level in levels:
        rows = frame[frame["g"] == level]
        mean, var, _ = design_mean_var(
            rows["v"], rows["w"],
            strata=rows.get("strata"),
            cluster=rows.get("cluster"),
        )
        summaries.append((mean, var, int(len(rows))))

    (mean_a, var_a, n_a), (mean_b, var_b, n_b) = summaries
    diff = mean_b - mean_a
    if np.isnan(var_a) or np.isnan(var_b):
        se = float("nan")
    else:
        se = float(np.sqrt(var_a + var_b))
    if se == 0 or not np.isfinite(se):
        return _NA

    t_stat = diff / se
    # Degrees of freedom: total n minus number of strata (1 if unstratified).
    n_strata = 1 if strata is None else int(pd.Series(strata).nunique())
    dof = max(1, (n_a + n_b) - n_strata)
    p_value = 2 * float(sp_stats.t.sf(abs(t_stat), df=dof))

    return TestResult(p_value=p_value, test="Design-adjusted t-test", statistic=t_stat)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def rao_scott_chisq(
    values: pd.Series,
    groups: pd.Series,
    weights: pd.Series,
) -> TestResult:
    """Rao–Scott first-order corrected chi-square for survey-weighted data.

    Builds the weighted contingency table of ``values`` by ``groups``,
    rescales it so its total equals the number of observations ``n``,
    computes the Pearson chi-square statistic on it, and divides that
    statistic by an estimated design effect derived from the weights:

        DEFF ≈ n * Σ w_i² / (Σ w_i)²

    The corrected statistic is referred to a χ² distribution with the usual
    ``(R-1)(C-1)`` degrees of freedom. This is a first-order correction
    only; full second-order accuracy needs a generalised design matrix and
    is left to dedicated survey packages.

    Rescaling to ``n`` matters because the Pearson statistic is
    proportional to the table total. Computed on raw weighted totals it
    would grow with the scale of the weights (for example expansion weights
    summing to a population size), while DEFF is scale-free. With the
    rescaling, multiplying every weight by a constant leaves the result
    unchanged, and unit weights reproduce the unweighted Pearson test.

    Args:
        values: the categorical variable.
        groups: the grouping variable.
        weights: per-row weights. Rows with a missing value, group or
            weight, or a non-positive weight, are ignored.

    Returns:
        A ``TestResult`` with the corrected p-value and statistic, or the
        ``_NA`` placeholder when no rows remain or the table has fewer than
        two rows or two columns.
    """
    df = pd.DataFrame({
        "v": values,
        "g": groups,
        "w": pd.to_numeric(weights, errors="coerce"),
    }).dropna()
    df = df[df["w"] > 0]
    if df.empty:
        return _NA

    # Weighted contingency table.
    ctab = df.groupby(["v", "g"], observed=True)["w"].sum().unstack(
        fill_value=0,
    ).astype(float)
    if ctab.shape[0] < 2 or ctab.shape[1] < 2:
        return _NA

    w = df["w"].to_numpy(dtype=float)
    n = float(len(w))
    # Strictly positive: the frame is non-empty and every weight is > 0.
    total_w = float(w.sum())

    # Express the weighted table on the sample-size scale so the Pearson
    # statistic does not depend on how the weights happen to be scaled.
    observed_n = ctab.to_numpy(dtype=float) * (n / total_w)

    # Pearson chi-square on the rescaled weighted table.
    with _quiet_scipy():
        chi2, _, dof, _expected = sp_stats.chi2_contingency(
            observed_n, correction=False,
        )

    deff = float(n * (w**2).sum() / total_w**2)
    chi2_adj = float(chi2) / max(deff, 1e-12)
    p_adj = float(sp_stats.chi2.sf(chi2_adj, df=dof))

    return TestResult(
        p_value=p_adj,
        test="Rao–Scott chi-square",
        statistic=chi2_adj,
    )
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _crosstab(values: pd.Series, groups: pd.Series) -> pd.DataFrame | None:
|
|
261
|
+
df = pd.DataFrame({"v": values, "g": groups}).dropna()
|
|
262
|
+
if df.empty:
|
|
263
|
+
return None
|
|
264
|
+
ctab = pd.crosstab(df["v"], df["g"])
|
|
265
|
+
if ctab.shape[0] < 2 or ctab.shape[1] < 2:
|
|
266
|
+
return None
|
|
267
|
+
return ctab
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _fisher(observed: np.ndarray) -> TestResult:
    """Run Fisher's exact test on a table of observed counts.

    For a 2×2 table the two-sided alternative is requested explicitly. For
    any other shape ``fisher_exact`` is called with its defaults, because
    ``alternative`` is only meaningful for 2×2 tables. Only the p-value is
    reported; no statistic is stored.
    """
    # NOTE(review): tables larger than 2×2 depend on the installed scipy
    # supporting them in ``fisher_exact``; confirm the minimum version.
    options = {"alternative": "two-sided"} if observed.shape == (2, 2) else {}
    with _quiet_scipy():
        _, p_val = sp_stats.fisher_exact(observed, **options)
    return TestResult(p_value=float(p_val), test="Fisher's exact")
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _chisq(observed: np.ndarray) -> TestResult:
    """Run Pearson's chi-square test (no continuity correction).

    The result is labelled ``"Chi-square (sparse)"`` when any expected cell
    count is below 5, and ``"Pearson's chi-square"`` otherwise. The p-value
    and statistic are the same in both cases; only the label differs.
    """
    with _quiet_scipy():
        chi2, p_val, _, expected = sp_stats.chi2_contingency(observed, correction=False)

    sparse = np.any(expected < 5)
    label = "Chi-square (sparse)" if sparse else "Pearson's chi-square"
    return TestResult(p_value=float(p_val), test=label, statistic=float(chi2))
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# ----------------------------------------------------------------------
|
|
290
|
+
# Named-test dispatcher (per-variable overrides)
|
|
291
|
+
# ----------------------------------------------------------------------
|
|
292
|
+
|
|
293
|
+
# Signatures of the test runners: continuous tests receive one array per
# group, categorical tests receive the observed contingency table.
ContinuousFn = Callable[[list[np.ndarray]], TestResult]
CategoricalFn = Callable[[np.ndarray], TestResult]

# Accepted names for each continuous test. Every alias in a group maps to the
# same runner; note that the bare "t" / "ttest" names select Welch's test.
_CONTINUOUS_TESTS: dict[str, ContinuousFn] = {
    **dict.fromkeys(("welch", "welch_t", "t", "ttest"), _welch),
    **dict.fromkeys(("student", "student_t", "equal_var_t"), _student_t),
    **dict.fromkeys(("wilcoxon", "mannwhitney", "mwu", "rank_sum"), _wilcoxon),
    **dict.fromkeys(("anova", "oneway_anova"), _anova),
    **dict.fromkeys(("kruskal", "kruskal_wallis"), _kruskal),
}

# Accepted names for each categorical test.
_CATEGORICAL_TESTS: dict[str, CategoricalFn] = {
    **dict.fromkeys(("fisher", "fisher_exact"), _fisher),
    **dict.fromkeys(("chisq", "chi_square", "chi2", "pearson"), _chisq),
}
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def available_tests() -> dict[str, list[str]]:
    """List every accepted test name, grouped by variable kind.

    The result has the keys ``"continuous"`` and ``"categorical"``, each
    holding the alphabetically sorted names (aliases included) from the
    matching dispatch table.
    """
    tables = (
        ("continuous", _CONTINUOUS_TESTS),
        ("categorical", _CATEGORICAL_TESTS),
    )
    return {kind: sorted(table) for kind, table in tables}
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def run_named_test(
    name: str,
    values: pd.Series,
    groups: pd.Series,
    *,
    kind: str,
) -> TestResult:
    """Run the test called ``name`` on ``(values, groups)``.

    ``name`` is matched case-insensitively after stripping surrounding
    whitespace. ``kind`` must be ``"continuous"`` or ``"categorical"`` and
    selects the dispatch table to look the name up in.

    Returns the ``_NA`` placeholder when the data do not yield at least two
    groups (continuous) or a usable contingency table (categorical).

    Raises:
        ValueError: if ``name`` is not a known test for ``kind``, or if
            ``kind`` itself is not recognised.
    """
    key = name.lower().strip()

    if kind == "continuous":
        cont_fn = _CONTINUOUS_TESTS.get(key)
        if cont_fn is not None:
            samples = _group_arrays(values, groups)
            return cont_fn(samples) if len(samples) >= 2 else _NA
        known = ", ".join(sorted(_CONTINUOUS_TESTS))
        raise ValueError(
            f"Unknown continuous test {name!r}. Available: "
            + known
        )

    if kind == "categorical":
        cat_fn = _CATEGORICAL_TESTS.get(key)
        if cat_fn is not None:
            table = _crosstab(values, groups)
            return _NA if table is None else cat_fn(table.to_numpy())
        known = ", ".join(sorted(_CATEGORICAL_TESTS))
        raise ValueError(
            f"Unknown categorical test {name!r}. Available: "
            + known
        )

    raise ValueError(f"Unknown variable kind {kind!r}")
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Automatic variable typing for summary tables.
|
|
2
|
+
|
|
3
|
+
We classify each variable into one of four kinds:
|
|
4
|
+
|
|
5
|
+
* ``continuous`` — numeric and treated as continuous
|
|
6
|
+
* ``categorical`` — discrete factor variable (strings, booleans, low-cardinality ints)
|
|
7
|
+
* ``dichotomous`` — categorical with exactly two non-missing levels
|
|
8
|
+
* ``ordinal`` — pandas ``Categorical`` with ``ordered=True``
|
|
9
|
+
|
|
10
|
+
The classifier errs on the side of categorical when ambiguous (e.g. integer
|
|
11
|
+
columns with very few unique values) because mistakenly summarising a
|
|
12
|
+
factor as continuous produces nonsense output in publication tables.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from typing import Literal
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
from pandas.api.types import (
|
|
22
|
+
is_bool_dtype,
|
|
23
|
+
is_datetime64_any_dtype,
|
|
24
|
+
is_numeric_dtype,
|
|
25
|
+
is_object_dtype,
|
|
26
|
+
is_string_dtype,
|
|
27
|
+
is_timedelta64_dtype,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _is_categorical(series: pd.Series) -> bool:
    """Return True when ``series`` has a pandas ``CategoricalDtype``."""
    return isinstance(series.dtype, pd.CategoricalDtype)


# The four variable kinds the summary builders distinguish.
VarKind = Literal["continuous", "categorical", "dichotomous", "ordinal"]

# Integer columns with at most this many distinct values *and* whose values
# all lie in a small low-integer range get classified as categorical.
# Real-world continuous variables (age, BMI, systolic BP, etc.) virtually
# always exceed this — the heuristic is intentionally conservative so we
# don't accidentally summarise a continuous variable as n (%) per level.
_MAX_CAT_CARDINALITY_INT = 5
_MAX_CAT_INT_VALUE = 20


def infer_kind(series: pd.Series) -> VarKind:
    """Infer the variable kind of a pandas Series.

    Rules, applied in order to the non-missing values:

    * no non-missing values → ``"categorical"``;
    * ``Categorical`` dtype → ``"ordinal"`` if ordered, otherwise
      ``"dichotomous"`` for exactly two observed levels, else
      ``"categorical"``;
    * boolean dtype → ``"dichotomous"``;
    * string / object dtype → ``"dichotomous"`` for exactly two distinct
      values, else ``"categorical"``;
    * numeric dtype → ``"dichotomous"`` when the distinct values are
      exactly 0 and 1; ``"dichotomous"`` / ``"categorical"`` for an
      integer column with at most ``_MAX_CAT_CARDINALITY_INT`` distinct
      values, all within ``±_MAX_CAT_INT_VALUE``; otherwise
      ``"continuous"``;
    * datetime / timedelta dtype → ``"categorical"``, with a
      ``UserWarning`` asking for a derived numeric column instead;
    * anything else → ``"categorical"``.
    """
    s = series.dropna()
    if s.empty:
        # No information — default to categorical so we render n (%) of NaNs.
        return "categorical"

    if _is_categorical(series):
        if getattr(series.cat, "ordered", False):
            return "ordinal"
        return "dichotomous" if s.nunique() == 2 else "categorical"

    if is_bool_dtype(series):
        return "dichotomous"

    if is_string_dtype(series) or is_object_dtype(series):
        return "dichotomous" if s.nunique() == 2 else "categorical"

    if is_numeric_dtype(series):
        uniques = s.unique()
        # Exactly {0, 1} (or {0.0, 1.0}) is a strong dichotomous signal.
        # Compare the values themselves rather than ``int(x)``: truncation
        # would turn a float column such as 0.7 … 1.4 into {0, 1} and
        # misclassify a continuous variable. Equality is also safe for
        # ``inf`` / ``-inf``, which are not 0/1 and fall through below.
        if len(uniques) == 2 and set(np.asarray(uniques).tolist()) == {0, 1}:
            return "dichotomous"

        try:
            # ``np.issubdtype`` only understands NumPy dtypes. If NumPy
            # cannot interpret the dtype it raises ``TypeError`` and the
            # column is treated as non-integer.
            is_int = bool(np.issubdtype(s.dtype, np.integer))  # type: ignore[arg-type]
        except TypeError:  # pragma: no cover — defensive
            is_int = False
        if (
            is_int
            and len(uniques) <= _MAX_CAT_CARDINALITY_INT
            and float(np.min(uniques)) >= -_MAX_CAT_INT_VALUE
            and float(np.max(uniques)) <= _MAX_CAT_INT_VALUE
        ):
            return "dichotomous" if len(uniques) == 2 else "categorical"
        return "continuous"

    # Datetime / timedelta — PySofra doesn't summarise temporal columns
    # natively (there is no "median date (Q1, Q3)" idiom that maps cleanly
    # to a publication table). Treating them as categorical puts every
    # unique timestamp on its own row, which is almost never what the user
    # wants, so emit a UserWarning pointing to a derived numeric column
    # (e.g. ``(df.date - ref).dt.days``) and still return categorical so the
    # call doesn't crash.
    if is_datetime64_any_dtype(series) or is_timedelta64_dtype(series):
        import warnings
        warnings.warn(
            f"Variable {series.name!r} has dtype {series.dtype!s}; PySofra "
            "does not summarise temporal columns. Convert it to a numeric "
            "duration (e.g. (df.date - reference).dt.days) and pass that "
            "instead.",
            UserWarning,
            stacklevel=2,
        )
    # Any other unrecognised dtype — treat as categorical for safety.
    return "categorical"
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def apply_overrides(
    inferred: dict[str, VarKind],
    overrides: dict[str, VarKind] | None,
) -> dict[str, VarKind]:
    """Merge user-supplied kind overrides on top of the inferred kinds.

    Args:
        inferred: mapping of variable name to inferred kind. Never mutated.
        overrides: mapping of variable name to the kind the caller wants,
            or ``None`` / empty for no overrides. Names absent from
            ``inferred`` are added.

    Returns:
        A new dict in every case, so changing the result never alters
        ``inferred``.

    Raises:
        ValueError: if an override value is not one of ``continuous``,
            ``categorical``, ``dichotomous`` or ``ordinal``.
    """
    # Copy up front so both return paths hand back an independent dict.
    merged = dict(inferred)
    if not overrides:
        return merged

    valid_kinds = ("continuous", "categorical", "dichotomous", "ordinal")
    for variable, kind in overrides.items():
        if kind not in valid_kinds:
            raise ValueError(
                f"Invalid variable kind {kind!r} for {variable!r}. "
                "Must be one of continuous, categorical, dichotomous, ordinal."
            )
        merged[variable] = kind
    return merged
|