pysofra 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. pysofra/__init__.py +82 -0
  2. pysofra/core/__init__.py +14 -0
  3. pysofra/core/compose.py +167 -0
  4. pysofra/core/format.py +155 -0
  5. pysofra/core/frames.py +69 -0
  6. pysofra/core/schema.py +128 -0
  7. pysofra/core/table.py +924 -0
  8. pysofra/io/__init__.py +1 -0
  9. pysofra/models/__init__.py +6 -0
  10. pysofra/models/extract.py +249 -0
  11. pysofra/models/pool.py +119 -0
  12. pysofra/models/regression.py +507 -0
  13. pysofra/models/survival.py +395 -0
  14. pysofra/models/uvregression.py +438 -0
  15. pysofra/notebook/__init__.py +6 -0
  16. pysofra/plot/__init__.py +23 -0
  17. pysofra/plot/_backend.py +32 -0
  18. pysofra/plot/forest.py +159 -0
  19. pysofra/plot/inline.py +171 -0
  20. pysofra/plot/km.py +249 -0
  21. pysofra/render/__init__.py +28 -0
  22. pysofra/render/_zip_determinism.py +57 -0
  23. pysofra/render/base.py +22 -0
  24. pysofra/render/docx.py +286 -0
  25. pysofra/render/html.py +442 -0
  26. pysofra/render/image.py +130 -0
  27. pysofra/render/latex.py +253 -0
  28. pysofra/render/markdown.py +128 -0
  29. pysofra/render/pptx.py +340 -0
  30. pysofra/render/xlsx.py +226 -0
  31. pysofra/summary/__init__.py +6 -0
  32. pysofra/summary/calibrate.py +214 -0
  33. pysofra/summary/design.py +246 -0
  34. pysofra/summary/effect_size.py +187 -0
  35. pysofra/summary/extras.py +745 -0
  36. pysofra/summary/smd.py +133 -0
  37. pysofra/summary/stats.py +135 -0
  38. pysofra/summary/tbl_cross.py +339 -0
  39. pysofra/summary/tbl_one.py +1220 -0
  40. pysofra/summary/tbl_summary.py +51 -0
  41. pysofra/summary/tests.py +370 -0
  42. pysofra/summary/typing.py +129 -0
  43. pysofra/summary/weights.py +161 -0
  44. pysofra/themes/__init__.py +5 -0
  45. pysofra/themes/registry.py +272 -0
  46. pysofra-0.1.0a1.dist-info/METADATA +301 -0
  47. pysofra-0.1.0a1.dist-info/RECORD +50 -0
  48. pysofra-0.1.0a1.dist-info/WHEEL +4 -0
  49. pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
  50. pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
@@ -0,0 +1,51 @@
1
+ """General descriptive summary tables — equivalent to ``gtsummary::tbl_summary``.
2
+
3
+ ``tbl_summary`` is broadly the same engine as :func:`tbl_one` but with more
4
+ flexible knobs: you can pass custom statistic templates, override missing
5
+ handling per variable, and produce summaries without a stratification
6
+ variable as the natural default.
7
+
8
+ For the MVP, ``tbl_summary`` delegates to the same :func:`_build` engine
9
+ under the hood — there is genuinely one statistical computation; only the
10
+ defaults differ between the two front doors.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import pandas as pd
16
+
17
+ from ..core.table import SofraTable
18
+ from .tbl_one import tbl_one
19
+ from .typing import VarKind
20
+
21
+
22
def tbl_summary(
    data: pd.DataFrame,
    *,
    by: str | None = None,
    variables: list[str] | None = None,
    labels: dict[str, str] | None = None,
    types: dict[str, VarKind] | None = None,
    nonnormal: list[str] | None = None,
    digits: int = 2,
    pct_digits: int = 1,
    missing: str = "ifany",
) -> SofraTable:
    """Build a general descriptive summary table.

    This is a thin front door over :func:`pysofra.tbl_one`: every argument
    is forwarded unchanged and the resulting table is returned as-is. See
    :func:`pysofra.tbl_one` for the meaning of each parameter. The separate
    name exists because the *intent* differs (arbitrary descriptive summary
    vs. Table 1 baseline) and the defaults may diverge in future releases.
    """
    # Collect the keyword-only options once so the delegation is a single
    # pass-through call.
    forwarded = {
        "by": by,
        "variables": variables,
        "labels": labels,
        "types": types,
        "nonnormal": nonnormal,
        "digits": digits,
        "pct_digits": pct_digits,
        "missing": missing,
    }
    return tbl_one(data, **forwarded)
@@ -0,0 +1,370 @@
1
+ """Statistical test selection for Table 1 / summary tables.
2
+
3
+ Two layers:
4
+
5
+ * **Defaults** — :func:`continuous_test` and :func:`categorical_test` choose
6
+ a sensible test for a variable given its kind, following the
7
+ ``tableone`` / ``gtsummary`` conventions:
8
+
9
+ Continuous, 2 groups → Welch's t-test (Wilcoxon if ``nonnormal``)
10
+ Continuous, 3+ groups → one-way ANOVA (Kruskal–Wallis if ``nonnormal``)
11
+ Categorical, 2×2 → Fisher's exact
12
+ Categorical, larger → Pearson chi-square (flagged sparse if any expected < 5)
13
+
14
+ * **Per-variable overrides** — :func:`run_named_test` dispatches a named
15
+ test by string key. Builders accept a ``tests={'age': 'wilcoxon'}`` map
16
+ and call into this dispatcher, falling back to the defaults otherwise.
17
+
18
+ Returns a small :class:`TestResult` so callers can render both the p-value
19
+ and the test name for the footnote.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import warnings
25
+ from collections.abc import Callable
26
+ from contextlib import contextmanager
27
+ from dataclasses import dataclass
28
+
29
+ import numpy as np
30
+ import pandas as pd
31
+ from scipy import stats as sp_stats
32
+
33
+
34
@dataclass(frozen=True)
class TestResult:
    """Immutable outcome of a single hypothesis test.

    Carries the p-value together with a short test name so callers can
    render both the value and the footnote.
    """

    # The class name starts with "Test", so pytest would try to collect it
    # (and warn about the dataclass ``__init__``) whenever it is imported
    # into a test module. Un-annotated, so this is not a dataclass field.
    __test__ = False

    # p-value of the test; ``None`` when no test could be run.
    p_value: float | None
    # Short human-readable name; used in the footnote.
    test: str
    # Test statistic, when the underlying test reports one.
    statistic: float | None = None


# Shared "no test available" sentinel returned for degenerate inputs.
_NA = TestResult(p_value=None, test="—")
42
+
43
+
44
@contextmanager
def _quiet_scipy():  # type: ignore[no-untyped-def]
    """Run the wrapped block with numeric and ``RuntimeWarning`` noise muted.

    scipy hypothesis tests emit advisory ``RuntimeWarning``s (for example
    ``Precision loss occurred in moment calculation`` from ``ttest_ind`` on
    near-constant arrays) when handed degenerate data. The ``nan`` /
    boundary p-value they return is the output we want to surface, and a
    caller running with ``-W error::RuntimeWarning`` would otherwise see the
    test crash instead. Both numpy's floating-point error state and the
    warnings filters are restored when the block exits.
    """
    # Silence numpy's invalid / overflow / divide-by-zero reporting first...
    with np.errstate(invalid="ignore", over="ignore", divide="ignore"):
        # ...then drop RuntimeWarning inside a scoped copy of the filters.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            yield
62
+
63
+
64
def _group_arrays(values: pd.Series, groups: pd.Series) -> list[np.ndarray]:
    """Split ``values`` into one numeric array per level of ``groups``.

    Values are coerced to numeric (unparseable entries become NaN); rows
    with a missing value or a missing group are dropped. Returns an empty
    list when nothing is left.
    """
    numeric = pd.to_numeric(values, errors="coerce")
    frame = pd.DataFrame({"v": numeric, "g": groups}).dropna(subset=["v", "g"])
    if frame.empty:
        return []
    per_group = (
        chunk["v"].to_numpy()
        for _, chunk in frame.groupby("g", observed=True)
    )
    # Keep only groups that actually contain observations.
    return [arr for arr in per_group if arr.size > 0]
71
+
72
+
73
+ # ----------------------------------------------------------------------
74
+ # Continuous
75
+ # ----------------------------------------------------------------------
76
+
77
def continuous_test(
    values: pd.Series,
    groups: pd.Series,
    nonnormal: bool = False,
) -> TestResult:
    """Pick and run the default test for a continuous variable.

    Two groups use Welch's t-test (Wilcoxon rank-sum if ``nonnormal``);
    three or more use one-way ANOVA (Kruskal–Wallis if ``nonnormal``).
    Fewer than two non-empty groups yields the "not available" result.
    """
    arrs = _group_arrays(values, groups)
    n_groups = len(arrs)
    if n_groups < 2:
        return _NA
    if n_groups == 2:
        chosen = _wilcoxon if nonnormal else _welch
    else:
        chosen = _kruskal if nonnormal else _anova
    return chosen(arrs)
89
+
90
+
91
def _welch(arrs: list[np.ndarray]) -> TestResult:
    """Welch's unequal-variance t-test between exactly two groups.

    Raises:
        ValueError: if ``arrs`` does not hold exactly two groups. This can
            happen when the test is requested by name for a grouping
            variable with three or more levels.
    """
    # ``ttest_ind`` is strictly two-sample. Splatting more arrays would pass
    # the extras positionally, producing an obscure error or (on scipy
    # versions where ``axis`` is positional) silently misusing one as it.
    if len(arrs) != 2:
        raise ValueError(
            f"Welch's t-test compares exactly two groups; got {len(arrs)}."
        )
    first, second = arrs
    with _quiet_scipy():
        stat, p = sp_stats.ttest_ind(first, second, equal_var=False, nan_policy="omit")
    return TestResult(p_value=float(p), test="Welch's t-test", statistic=float(stat))
95
+
96
+
97
def _student_t(arrs: list[np.ndarray]) -> TestResult:
    """Student's equal-variance t-test between exactly two groups.

    Raises:
        ValueError: if ``arrs`` does not hold exactly two groups. This can
            happen when the test is requested by name for a grouping
            variable with three or more levels.
    """
    # ``ttest_ind`` is strictly two-sample; extra arrays must not be
    # splatted into its remaining positional parameters.
    if len(arrs) != 2:
        raise ValueError(
            f"Student's t-test compares exactly two groups; got {len(arrs)}."
        )
    first, second = arrs
    with _quiet_scipy():
        stat, p = sp_stats.ttest_ind(first, second, equal_var=True, nan_policy="omit")
    return TestResult(p_value=float(p), test="Student's t-test", statistic=float(stat))
101
+
102
+
103
def _wilcoxon(arrs: list[np.ndarray]) -> TestResult:
    """Two-sided Wilcoxon rank-sum (Mann–Whitney U) test between two groups.

    Raises:
        ValueError: if ``arrs`` does not hold exactly two groups. This can
            happen when the test is requested by name for a grouping
            variable with three or more levels.
    """
    # ``mannwhitneyu`` takes exactly two samples; a third splatted array
    # would land in its ``use_continuity`` parameter.
    if len(arrs) != 2:
        raise ValueError(
            f"Wilcoxon rank-sum compares exactly two groups; got {len(arrs)}."
        )
    first, second = arrs
    with _quiet_scipy():
        stat, p = sp_stats.mannwhitneyu(first, second, alternative="two-sided")
    return TestResult(p_value=float(p), test="Wilcoxon rank-sum", statistic=float(stat))
107
+
108
+
109
def _anova(arrs: list[np.ndarray]) -> TestResult:
    """One-way ANOVA (F-test) across all supplied groups."""
    with _quiet_scipy():
        outcome = sp_stats.f_oneway(*arrs)
    return TestResult(
        p_value=float(outcome.pvalue),
        test="One-way ANOVA",
        statistic=float(outcome.statistic),
    )
113
+
114
+
115
def _kruskal(arrs: list[np.ndarray]) -> TestResult:
    """Kruskal–Wallis rank test across all supplied groups."""
    with _quiet_scipy():
        outcome = sp_stats.kruskal(*arrs)
    return TestResult(
        p_value=float(outcome.pvalue),
        test="Kruskal–Wallis",
        statistic=float(outcome.statistic),
    )
119
+
120
+
121
+ # ----------------------------------------------------------------------
122
+ # Categorical
123
+ # ----------------------------------------------------------------------
124
+
125
def categorical_test(values: pd.Series, groups: pd.Series) -> TestResult:
    """Pick and run the default test for a categorical variable.

    A 2×2 table goes to Fisher's exact test; anything larger goes to the
    Pearson chi-square. Returns the "not available" result when no usable
    contingency table can be built.
    """
    table = _crosstab(values, groups)
    if table is None:
        return _NA
    counts = table.to_numpy()
    chosen = _fisher if counts.shape == (2, 2) else _chisq
    return chosen(counts)
134
+
135
+
136
def svyttest(
    values: pd.Series,
    groups: pd.Series,
    weights: pd.Series,
    *,
    strata: pd.Series | None = None,
    cluster: pd.Series | None = None,
) -> TestResult:
    """Design-adjusted two-sample t-test (svyttest analogue).

    For two groups, the test statistic is

        t = (ȳ₂_w − ȳ₁_w) / SE(diff)

    where ȳᵢ_w and its variance come from
    :func:`pysofra.summary.design.design_mean_var`, called once per group;
    the variance of the difference is the sum of the two group variances.
    The statistic is referred to a ``t`` distribution with ``n − H``
    degrees of freedom, floored at 1. ``n`` is the number of distinct
    sampling units (clusters, when ``cluster`` is given, otherwise
    observations) and ``H`` is the number of strata (1 when unstratified).
    Both are counted on the rows that survive filtering.

    Rows with a missing value, group or weight, or a non-positive weight,
    are dropped. ``strata`` and ``cluster`` are attached positionally, so
    they must be in the same row order as ``values``.

    Returns the "not available" result when no rows remain, when the
    grouping does not have exactly two levels, or when the standard error
    is undefined or zero.
    """
    from .design import design_mean_var

    df_ = pd.DataFrame({
        "v": pd.to_numeric(values, errors="coerce"),
        "g": groups,
        "w": pd.to_numeric(weights, errors="coerce"),
    })
    if strata is not None:
        df_["strata"] = strata.values
    if cluster is not None:
        df_["cluster"] = cluster.values
    df_ = df_.dropna(subset=["v", "g", "w"])
    df_ = df_[df_["w"] > 0]
    if df_.empty:
        return _NA

    levels = sorted(df_["g"].unique(), key=str)
    if len(levels) != 2:
        return _NA

    means: list[float] = []
    vars_: list[float] = []
    for lvl in levels:
        sub = df_[df_["g"] == lvl]
        # ``DataFrame.get`` yields None when the design column is absent.
        m, v, _ = design_mean_var(
            sub["v"], sub["w"],
            strata=sub.get("strata"),
            cluster=sub.get("cluster"),
        )
        means.append(m)
        vars_.append(v)

    # NaN / inf propagate through the sum, so one finiteness check covers
    # both groups. Bailing out on a non-positive sum also avoids taking the
    # square root of a negative number.
    var_diff = vars_[0] + vars_[1]
    if not np.isfinite(var_diff) or var_diff <= 0:
        return _NA
    se = float(np.sqrt(var_diff))
    t_stat = float((means[1] - means[0]) / se)

    # Degrees of freedom are counted on the analysed rows: a stratum whose
    # rows were all filtered out must not be subtracted.
    has_strata = "strata" in df_.columns
    n_strata = max(1, int(df_["strata"].nunique())) if has_strata else 1
    if "cluster" in df_.columns:
        # With cluster sampling the independent units are the clusters
        # (nested within strata), not the individual observations.
        psu_cols = ["strata", "cluster"] if has_strata else ["cluster"]
        n_units = int(len(df_[psu_cols].drop_duplicates()))
    else:
        n_units = int(len(df_))
    df_deg = max(1, n_units - n_strata)
    p = 2 * float(sp_stats.t.sf(abs(t_stat), df=df_deg))

    return TestResult(p_value=p, test="Design-adjusted t-test", statistic=t_stat)
204
+
205
+
206
def rao_scott_chisq(
    values: pd.Series,
    groups: pd.Series,
    weights: pd.Series,
) -> TestResult:
    """Rao–Scott first-order corrected chi-square for survey-weighted data.

    Builds the *weighted* contingency table, rescales it so its total equals
    the number of contributing observations ``n``, computes the Pearson
    chi-square on it, and divides the statistic by an estimated design
    effect (DEFF) derived from the weights:

        DEFF ≈ n * Σ w_i² / (Σ w_i)²

    Rescaling makes the result invariant to the overall scale of the
    weights: the Pearson statistic is proportional to the table total, so
    raw expansion weights would otherwise inflate it. With unit weights, or
    weights already summing to ``n``, the rescaling is a no-op.

    The corrected statistic is referred to a χ² distribution with the usual
    ``(R-1)(C-1)`` degrees of freedom. This is the first-order Rao–Scott
    correction; full second-order accuracy needs a generalised design
    matrix and is left to dedicated survey packages.

    Rows with any missing entry or a non-positive weight are dropped.
    Returns the "not available" result when no rows remain or the table has
    fewer than two rows or columns.
    """
    df = pd.DataFrame({
        "v": values,
        "g": groups,
        "w": pd.to_numeric(weights, errors="coerce"),
    }).dropna()
    df = df[df["w"] > 0]
    if df.empty:
        return _NA

    # Weighted contingency table: one cell per (value, group) pair.
    ctab = df.groupby(["v", "g"], observed=True)["w"].sum().unstack(
        fill_value=0,
    ).astype(float)
    if ctab.shape[0] < 2 or ctab.shape[1] < 2:
        return _NA

    w = df["w"].to_numpy(dtype=float)
    n = float(len(w))
    total_w = float(w.sum())  # strictly positive: rows with w <= 0 were dropped

    # Normalise to the sample size so the Pearson statistic reflects n
    # observations rather than the (arbitrary) magnitude of the weights.
    observed_w = ctab.to_numpy(dtype=float) * (n / total_w)

    # Pearson chi-square on the rescaled weighted table.
    with _quiet_scipy():
        chi2, _, dof, _expected = sp_stats.chi2_contingency(
            observed_w, correction=False,
        )

    deff = float(n * (w**2).sum() / (total_w**2))
    chi2_adj = float(chi2) / max(deff, 1e-12)
    p_adj = float(sp_stats.chi2.sf(chi2_adj, df=dof))

    return TestResult(
        p_value=p_adj,
        test="Rao–Scott chi-square",
        statistic=chi2_adj,
    )
258
+
259
+
260
def _crosstab(values: pd.Series, groups: pd.Series) -> pd.DataFrame | None:
    """Cross-tabulate ``values`` against ``groups`` on complete rows.

    Returns ``None`` when no complete rows exist or when the table has
    fewer than two rows or two columns, since no association test is
    possible then.
    """
    complete = pd.DataFrame({"v": values, "g": groups}).dropna()
    if complete.empty:
        return None
    table = pd.crosstab(complete["v"], complete["g"])
    n_rows, n_cols = table.shape
    return table if n_rows >= 2 and n_cols >= 2 else None
268
+
269
+
270
def _fisher(observed: np.ndarray) -> TestResult:
    """Fisher's exact test on a contingency table of counts."""
    # ``alternative`` is only passed for 2x2 tables; larger tables are
    # handed to scipy with its default settings.
    # NOTE(review): whether ``fisher_exact`` accepts tables larger than 2x2
    # depends on the installed scipy version — confirm the minimum pin.
    extra = {"alternative": "two-sided"} if observed.shape == (2, 2) else {}
    with _quiet_scipy():
        _, p = sp_stats.fisher_exact(observed, **extra)
    return TestResult(p_value=float(p), test="Fisher's exact")
279
+
280
+
281
def _chisq(observed: np.ndarray) -> TestResult:
    """Pearson chi-square test (no continuity correction).

    The test name is flagged ``"Chi-square (sparse)"`` when any expected
    cell count is below 5.
    """
    with _quiet_scipy():
        chi2, p, _, expected = sp_stats.chi2_contingency(observed, correction=False)
    label = "Chi-square (sparse)" if np.any(expected < 5) else "Pearson's chi-square"
    return TestResult(p_value=float(p), test=label, statistic=float(chi2))
287
+
288
+
289
+ # ----------------------------------------------------------------------
290
+ # Named-test dispatcher (per-variable overrides)
291
+ # ----------------------------------------------------------------------
292
+
293
# Signatures of the two families of test runners.
ContinuousFn = Callable[[list[np.ndarray]], TestResult]
CategoricalFn = Callable[[np.ndarray], TestResult]

# Alias -> runner for continuous variables. Each runner is listed once with
# all the names it answers to.
_CONTINUOUS_TESTS: dict[str, ContinuousFn] = {
    alias: runner
    for runner, aliases in (
        (_welch, ("welch", "welch_t", "t", "ttest")),
        (_student_t, ("student", "student_t", "equal_var_t")),
        (_wilcoxon, ("wilcoxon", "mannwhitney", "mwu", "rank_sum")),
        (_anova, ("anova", "oneway_anova")),
        (_kruskal, ("kruskal", "kruskal_wallis")),
    )
    for alias in aliases
}

# Alias -> runner for categorical variables.
_CATEGORICAL_TESTS: dict[str, CategoricalFn] = {
    alias: runner
    for runner, aliases in (
        (_fisher, ("fisher", "fisher_exact")),
        (_chisq, ("chisq", "chi_square", "chi2", "pearson")),
    )
    for alias in aliases
}
322
+
323
+
324
def available_tests() -> dict[str, list[str]]:
    """Return the sorted names of all registered tests, keyed by variable kind."""
    registries = (
        ("continuous", _CONTINUOUS_TESTS),
        ("categorical", _CATEGORICAL_TESTS),
    )
    return {kind: sorted(registry) for kind, registry in registries}
330
+
331
+
332
def run_named_test(
    name: str,
    values: pd.Series,
    groups: pd.Series,
    *,
    kind: str,
) -> TestResult:
    """Run the test registered under ``name`` against (values, groups).

    ``name`` is matched case-insensitively, ignoring surrounding
    whitespace. ``kind`` (``"continuous"`` or ``"categorical"``) selects
    which dispatch table is consulted.

    Returns the "not available" result when the data cannot support a test
    (fewer than two non-empty groups, or no usable contingency table).

    Raises:
        ValueError: if ``kind`` is not recognised, or ``name`` is not a
            registered test for that kind.
    """
    key = name.lower().strip()

    if kind == "continuous":
        try:
            cont_fn = _CONTINUOUS_TESTS[key]
        except KeyError:
            raise ValueError(
                f"Unknown continuous test {name!r}. Available: "
                + ", ".join(sorted(_CONTINUOUS_TESTS))
            ) from None
        arrs = _group_arrays(values, groups)
        return cont_fn(arrs) if len(arrs) >= 2 else _NA

    if kind == "categorical":
        try:
            cat_fn = _CATEGORICAL_TESTS[key]
        except KeyError:
            raise ValueError(
                f"Unknown categorical test {name!r}. Available: "
                + ", ".join(sorted(_CATEGORICAL_TESTS))
            ) from None
        ctab = _crosstab(values, groups)
        return _NA if ctab is None else cat_fn(ctab.to_numpy())

    raise ValueError(f"Unknown variable kind {kind!r}")
@@ -0,0 +1,129 @@
1
+ """Automatic variable typing for summary tables.
2
+
3
+ We classify each variable into one of four kinds:
4
+
5
+ * ``continuous`` — numeric and treated as continuous
6
+ * ``categorical`` — discrete factor variable (strings, booleans, low-cardinality ints)
7
+ * ``dichotomous`` — categorical with exactly two non-missing levels
8
+ * ``ordinal`` — pandas ``Categorical`` with ``ordered=True``
9
+
10
+ The classifier errs on the side of categorical when ambiguous (e.g. integer
11
+ columns with very few unique values) because mistakenly summarising a
12
+ factor as continuous produces nonsense output in publication tables.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from typing import Literal
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ from pandas.api.types import (
22
+ is_bool_dtype,
23
+ is_datetime64_any_dtype,
24
+ is_numeric_dtype,
25
+ is_object_dtype,
26
+ is_string_dtype,
27
+ is_timedelta64_dtype,
28
+ )
29
+
30
+
31
def _is_categorical(series: pd.Series) -> bool:
    """Return True when ``series`` is stored with a pandas categorical dtype."""
    dtype = series.dtype
    return isinstance(dtype, pd.CategoricalDtype)
33
+
34
# The closed set of variable kinds the summary builders understand.
VarKind = Literal[
    "continuous",
    "categorical",
    "dichotomous",
    "ordinal",
]

# Thresholds for treating an integer column as categorical: it must have at
# most ``_MAX_CAT_CARDINALITY_INT`` distinct values, all within
# ±``_MAX_CAT_INT_VALUE``. Real-world continuous variables (age, BMI,
# systolic BP, etc.) virtually always exceed this. The heuristic is kept
# deliberately conservative so a continuous variable is not accidentally
# summarised as n (%) per level.
_MAX_CAT_CARDINALITY_INT = 5
_MAX_CAT_INT_VALUE = 20
43
+
44
+
45
def infer_kind(series: pd.Series) -> VarKind:
    """Infer the variable kind of a pandas Series.

    Rules, applied in order to the non-missing values:

    * no values at all → ``"categorical"``;
    * pandas categorical dtype → ``"ordinal"`` if ordered, otherwise
      ``"dichotomous"`` for exactly two observed levels, else
      ``"categorical"``;
    * boolean dtype → ``"dichotomous"``;
    * string / object dtype → ``"dichotomous"`` for exactly two distinct
      values, else ``"categorical"``;
    * numeric dtype → ``"dichotomous"`` when the distinct values are exactly
      0 and 1; ``"dichotomous"`` / ``"categorical"`` for integer columns
      with few distinct small values; otherwise ``"continuous"``;
    * datetime / timedelta → ``"categorical"`` with a ``UserWarning``;
    * anything else → ``"categorical"``.
    """
    s = series.dropna()
    if s.empty:
        # No information — default to categorical so we render n (%) of NaNs.
        return "categorical"

    # Same check as the module's ``_is_categorical`` helper, written inline.
    if isinstance(series.dtype, pd.CategoricalDtype):
        if getattr(series.cat, "ordered", False):
            return "ordinal"
        return "dichotomous" if s.nunique() == 2 else "categorical"

    if is_bool_dtype(series):
        return "dichotomous"

    if is_string_dtype(series) or is_object_dtype(series):
        return "dichotomous" if s.nunique() == 2 else "categorical"

    if is_numeric_dtype(series):
        uniques = s.unique()
        # Exactly {0, 1} (or {0.0, 1.0}) is a strong dichotomous signal.
        # Compare the values themselves rather than truncating with
        # ``int()``: truncation would map any float column spanning (-1, 2),
        # e.g. 0.4–1.9, onto {0, 1} and misclassify a continuous variable.
        # ``inf`` simply compares unequal, so no exception handling is
        # needed.
        if len(uniques) == 2 and all(bool(u == 0) or bool(u == 1) for u in uniques):
            return "dichotomous"

        try:
            # ``s.dtype`` is ``np.dtype | ExtensionDtype`` under
            # pandas-stubs; silence the narrowed union for this site.
            is_int = bool(np.issubdtype(s.dtype, np.integer))  # type: ignore[arg-type]
        except TypeError:  # pragma: no cover — defensive: dtype numpy cannot interpret
            is_int = False
        if (
            is_int
            and len(uniques) <= _MAX_CAT_CARDINALITY_INT
            and float(np.min(uniques)) >= -_MAX_CAT_INT_VALUE
            and float(np.max(uniques)) <= _MAX_CAT_INT_VALUE
        ):
            return "dichotomous" if len(uniques) == 2 else "categorical"
        return "continuous"

    # Datetime / timedelta — PySofra doesn't summarise temporal columns
    # natively. Treating them as categorical puts every unique timestamp on
    # its own row, which is almost never what the user wants, so emit a
    # UserWarning pointing to a derived numeric column (e.g.
    # ``(df.date - ref).dt.days``), then still return categorical so the
    # call doesn't crash.
    if is_datetime64_any_dtype(series) or is_timedelta64_dtype(series):
        import warnings

        warnings.warn(
            f"Variable {series.name!r} has dtype {series.dtype!s}; PySofra "
            "does not summarise temporal columns. Convert it to a numeric "
            "duration (e.g. (df.date - reference).dt.days) and pass that "
            "instead.",
            UserWarning,
            stacklevel=2,
        )
    # Any other unrecognised dtype — treat as categorical for safety.
    return "categorical"
113
+
114
+
115
def apply_overrides(
    inferred: dict[str, VarKind],
    overrides: dict[str, VarKind] | None,
) -> dict[str, VarKind]:
    """Layer user-supplied kinds on top of the inferred ones.

    With no overrides the ``inferred`` mapping itself is returned.
    Otherwise a new dict is built and ``inferred`` is left untouched.

    Raises:
        ValueError: if any override value is not a recognised kind.
    """
    if not overrides:
        return inferred

    allowed = ("continuous", "categorical", "dichotomous", "ordinal")
    # Validate everything first; the first offending entry aborts the merge.
    for var, kind in overrides.items():
        if kind in allowed:
            continue
        raise ValueError(
            f"Invalid variable kind {kind!r} for {var!r}. "
            "Must be one of continuous, categorical, dichotomous, ordinal."
        )
    return {**inferred, **overrides}