pysofra 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysofra/__init__.py +82 -0
- pysofra/core/__init__.py +14 -0
- pysofra/core/compose.py +167 -0
- pysofra/core/format.py +155 -0
- pysofra/core/frames.py +69 -0
- pysofra/core/schema.py +128 -0
- pysofra/core/table.py +924 -0
- pysofra/io/__init__.py +1 -0
- pysofra/models/__init__.py +6 -0
- pysofra/models/extract.py +249 -0
- pysofra/models/pool.py +119 -0
- pysofra/models/regression.py +507 -0
- pysofra/models/survival.py +395 -0
- pysofra/models/uvregression.py +438 -0
- pysofra/notebook/__init__.py +6 -0
- pysofra/plot/__init__.py +23 -0
- pysofra/plot/_backend.py +32 -0
- pysofra/plot/forest.py +159 -0
- pysofra/plot/inline.py +171 -0
- pysofra/plot/km.py +249 -0
- pysofra/render/__init__.py +28 -0
- pysofra/render/_zip_determinism.py +57 -0
- pysofra/render/base.py +22 -0
- pysofra/render/docx.py +286 -0
- pysofra/render/html.py +442 -0
- pysofra/render/image.py +130 -0
- pysofra/render/latex.py +253 -0
- pysofra/render/markdown.py +128 -0
- pysofra/render/pptx.py +340 -0
- pysofra/render/xlsx.py +226 -0
- pysofra/summary/__init__.py +6 -0
- pysofra/summary/calibrate.py +214 -0
- pysofra/summary/design.py +246 -0
- pysofra/summary/effect_size.py +187 -0
- pysofra/summary/extras.py +745 -0
- pysofra/summary/smd.py +133 -0
- pysofra/summary/stats.py +135 -0
- pysofra/summary/tbl_cross.py +339 -0
- pysofra/summary/tbl_one.py +1220 -0
- pysofra/summary/tbl_summary.py +51 -0
- pysofra/summary/tests.py +370 -0
- pysofra/summary/typing.py +129 -0
- pysofra/summary/weights.py +161 -0
- pysofra/themes/__init__.py +5 -0
- pysofra/themes/registry.py +272 -0
- pysofra-0.1.0a1.dist-info/METADATA +301 -0
- pysofra-0.1.0a1.dist-info/RECORD +50 -0
- pysofra-0.1.0a1.dist-info/WHEEL +4 -0
- pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
- pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
|
@@ -0,0 +1,1220 @@
|
|
|
1
|
+
"""Table 1 builder — baseline characteristic tables.
|
|
2
|
+
|
|
3
|
+
Mirrors the workflow of R's ``tableone`` while staying Pythonic:
|
|
4
|
+
|
|
5
|
+
.. code-block:: python
|
|
6
|
+
|
|
7
|
+
import pysofra as ps
|
|
8
|
+
|
|
9
|
+
(
|
|
10
|
+
ps.tbl_one(df, by="treatment")
|
|
11
|
+
.add_p()
|
|
12
|
+
.add_smd()
|
|
13
|
+
.add_overall()
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
The function returns a :class:`~pysofra.core.SofraTable` that renders
|
|
17
|
+
beautifully in notebooks and exports to HTML/Markdown/DOCX.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import pandas as pd
|
|
25
|
+
|
|
26
|
+
from ..core.format import (
|
|
27
|
+
fmt_mean_sd,
|
|
28
|
+
fmt_median_iqr,
|
|
29
|
+
fmt_p_value,
|
|
30
|
+
fmt_smd,
|
|
31
|
+
)
|
|
32
|
+
from ..core.frames import to_pandas
|
|
33
|
+
from ..core.schema import Cell, HeaderCell, HeaderRow, Row, make_cell
|
|
34
|
+
from ..core.table import SofraTable, TableSpec
|
|
35
|
+
from .design import SurveyDesign, design_mean_var, replicate_mean_var
|
|
36
|
+
from .smd import categorical_smd, continuous_smd
|
|
37
|
+
from .stats import continuous_stats
|
|
38
|
+
from .tests import (
|
|
39
|
+
categorical_test,
|
|
40
|
+
continuous_test,
|
|
41
|
+
rao_scott_chisq,
|
|
42
|
+
run_named_test,
|
|
43
|
+
svyttest,
|
|
44
|
+
)
|
|
45
|
+
from .typing import VarKind, apply_overrides, infer_kind
|
|
46
|
+
from .weights import weighted_continuous_stats
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _is_cat_dtype(series: pd.Series) -> bool:
|
|
50
|
+
return isinstance(series.dtype, pd.CategoricalDtype)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Default header text for the pooled ("overall") column; ``tbl_one`` stores it
# in the spec as ``overall_label``.
DEFAULT_OVERALL_LABEL = "Overall"
# NOTE(review): presumably the row label used for missing-value counts; its
# use is not visible in this part of the file — confirm against the helpers.
MISSING_LABEL = "Missing"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def tbl_one(
    data: pd.DataFrame,
    *,
    by: str | None = None,
    variables: list[str] | None = None,
    labels: dict[str, str] | None = None,
    types: dict[str, VarKind] | None = None,
    nonnormal: list[str] | None = None,
    tests: dict[str, str] | None = None,
    weights: str | None = None,
    design: SurveyDesign | None = None,
    digits: int = 2,
    pct_digits: int = 1,
    missing: str = "ifany",
    include_missing: bool | None = None,
) -> SofraTable:
    """Build a Table 1.

    Parameters
    ----------
    data
        Source dataframe.
    by
        Optional column name to stratify on. If omitted, a single
        ``Overall`` column is produced.
    variables
        Explicit list of variables to include. Defaults to all columns
        other than ``by``.
    labels
        Mapping of column name → display label.
    types
        Override automatic variable typing on a per-column basis.
    nonnormal
        Continuous variables that should be summarised as
        ``median (Q1, Q3)`` and tested with rank-based tests.
    tests
        Per-variable statistical test overrides, e.g.
        ``{'age': 'wilcoxon', 'race': 'fisher'}``. See
        :func:`pysofra.summary.tests.available_tests` for the registry.
    weights
        Column name carrying non-negative frequency weights. When
        supplied, continuous summaries become weighted means / variances
        and categorical summaries become weighted proportions. The
        weights column is excluded from the variable list automatically.
    design
        A :class:`SurveyDesign` describing a complex sampling structure
        (weights + optional strata, clusters, and FPC). When provided,
        variance estimates use Taylor linearisation instead of the
        simple frequency-weighted formula. If both ``weights`` and
        ``design`` are passed, ``design`` wins.
    digits
        Decimal places for continuous summaries.
    pct_digits
        Decimal places for percentages.
    missing
        ``"ifany"`` (default) — include a *Missing* row only when there is
        missing data; ``"always"`` — always include; ``"never"``.
    include_missing
        Deprecated alias for ``missing``. ``True`` maps to ``"ifany"``,
        ``False`` to ``"never"``.

    Returns
    -------
    SofraTable
        The table produced by ``_build`` from the converted data and the
        assembled :class:`TableSpec`.

    Raises
    ------
    ValueError
        If ``missing`` is not one of the three accepted modes, or if the
        dataframe has duplicate column names.
    KeyError
        If ``by``, ``weights`` or any entry of ``variables`` is not a
        column of ``data``.
    """
    # Imported once here; several independent branches below emit warnings.
    import warnings

    if include_missing is not None:
        missing = "ifany" if include_missing else "never"
    if missing not in ("ifany", "always", "never"):
        raise ValueError("missing must be one of 'ifany', 'always', 'never'")

    data = to_pandas(data)
    # Duplicate-column-name check — without this, downstream pandas calls
    # raise a confusing ``AttributeError`` on the duplicated Series.
    # ``Index.duplicated`` is a single pass, unlike counting every column
    # name against the full column list.
    dup_mask = data.columns.duplicated()
    if dup_mask.any():
        raise ValueError(
            f"tbl_one cannot accept a DataFrame with duplicate column names; "
            f"got duplicates: {sorted(set(data.columns[dup_mask]))}."
        )
    if by is not None and by not in data.columns:
        raise KeyError(f"by column {by!r} not in data")
    if design is not None and weights is not None and weights != design.weights:
        warnings.warn(
            f"Both weights={weights!r} and design= were provided; "
            f"using design.weights={design.weights!r}. Pass only one.",
            UserWarning,
            stacklevel=2,
        )
    if design is not None:
        design.validate(data)
        weights = design.weights
    if weights is not None and weights not in data.columns:
        raise KeyError(f"weights column {weights!r} not in data")
    if weights is not None:
        # Warn — but don't refuse — on negative weights; the standard
        # behaviour is to drop them (matches survey::svydesign).
        w_col = pd.to_numeric(data[weights], errors="coerce")
        n_neg = int((w_col < 0).sum())
        if n_neg > 0:
            warnings.warn(
                f"weights column {weights!r} contains {n_neg} negative value(s); "
                "rows with negative weight are excluded from summaries.",
                UserWarning,
                stacklevel=2,
            )

    # Columns that describe the design rather than the subjects; they never
    # appear as table rows.
    excluded: set[str] = {c for c in (by, weights) if c is not None}
    if design is not None:
        if design.strata is not None:
            excluded.add(design.strata)
        if design.cluster is not None:
            if isinstance(design.cluster, tuple):
                excluded.update(design.cluster)
            else:
                excluded.add(design.cluster)
        if design.fpc is not None:
            excluded.add(design.fpc)
        if design.replicate_weights is not None:
            excluded.update(design.replicate_weights)
    if variables is None:
        variables = [c for c in data.columns if c not in excluded]
    else:
        missing_cols = [v for v in variables if v not in data.columns]
        if missing_cols:
            raise KeyError(f"variables not in data: {missing_cols}")
        # Warn when the user-supplied variables list overlaps the design /
        # stratification columns; silently dropping them is surprising.
        overlap = [v for v in variables if v in excluded]
        if overlap:
            warnings.warn(
                f"variables={overlap} overlap with the by= / weights / design "
                "columns and were excluded.",
                UserWarning,
                stacklevel=2,
            )
        variables = [v for v in variables if v not in excluded]

    labels = dict(labels or {})
    nonnormal_set = set(nonnormal or [])
    inferred = {v: infer_kind(data[v]) for v in variables}
    kinds = apply_overrides(inferred, types)
    tests_map = dict(tests or {})

    # Warn the user if labels / nonnormal / tests reference columns that
    # are NOT in the final variable list — those entries are silently
    # ignored otherwise, leading to wrong tests (a ``nonnormal=["hbac1"]``
    # typo → Welch instead of Wilcoxon, no warning), wrong row labels
    # (``labels={"hbac1": "HbA1c"}`` typo → raw column name in the table),
    # and wrong tests (``tests={"hbac1": "wilcoxon"}`` typo → default).
    _var_set = set(variables)
    _bad_labels = sorted(set(labels) - _var_set)
    _bad_nonnormal = sorted(nonnormal_set - _var_set)
    _bad_tests = sorted(set(tests_map) - _var_set)
    if _bad_labels or _bad_nonnormal or _bad_tests:
        msgs = []
        if _bad_labels:
            msgs.append(f"labels={_bad_labels}")
        if _bad_nonnormal:
            msgs.append(f"nonnormal={_bad_nonnormal}")
        if _bad_tests:
            msgs.append(f"tests={_bad_tests}")
        warnings.warn(
            f"Variables referenced but not in the table: {'; '.join(msgs)}. "
            f"Check for typos against {sorted(_var_set)!r}. The entries were "
            "ignored.",
            UserWarning,
            stacklevel=2,
        )

    spec = TableSpec(
        builder="tbl_one",
        options={
            "by": by,
            "variables": tuple(variables),
            "labels": labels,
            "kinds": kinds,
            "nonnormal": frozenset(nonnormal_set),
            "tests": tests_map,
            "weights": weights,
            "design": design,
            "digits": int(digits),
            "pct_digits": int(pct_digits),
            "missing": missing,
            "p_value": False,
            "smd": False,
            "overall": False,
            "overall_label": DEFAULT_OVERALL_LABEL,
            "q_value": False,
            "q_method": "fdr_bh",
        },
    )
    # We close over the *original data* so spec changes don't lose it.
    return _build(data, spec)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
# ----------------------------------------------------------------------
|
|
252
|
+
# Internals
|
|
253
|
+
# ----------------------------------------------------------------------
|
|
254
|
+
|
|
255
|
+
def _build(data: pd.DataFrame, spec: TableSpec) -> SofraTable:
    """Construct a SofraTable from a (data, spec) pair.

    Reads every option from ``spec.options``, derives the stratification
    groups and their (optionally weighted) sizes, builds the header row,
    delegates the body rows to ``_continuous_rows`` / ``_categorical_rows``
    and assembles the footnotes. The returned table carries a rebuild
    callback that re-runs this function on the same ``data`` with a new
    spec.
    """
    import warnings

    opts = spec.options
    by: str | None = opts["by"]
    variables: tuple[str, ...] = opts["variables"]
    labels: dict[str, str] = opts["labels"]
    kinds: dict[str, VarKind] = opts["kinds"]
    nonnormal: frozenset[str] = opts["nonnormal"]
    tests_map: dict[str, str] = opts.get("tests", {}) or {}
    weights_col: str | None = opts.get("weights")
    design: SurveyDesign | None = opts.get("design")
    digits: int = opts["digits"]
    pct_digits: int = opts["pct_digits"]
    missing_mode: str = opts["missing"]
    show_p: bool = bool(opts["p_value"])
    show_smd: bool = bool(opts["smd"])
    show_overall: bool = bool(opts["overall"])
    overall_label: str = opts["overall_label"]
    show_q: bool = bool(opts.get("q_value"))
    q_method: str = opts.get("q_method", "fdr_bh")
    bold_p_mode: bool = False
    bold_p_threshold: float = 0.05

    if by is None:
        group_keys: list[Any] = [overall_label]
        group_masks = {overall_label: pd.Series(True, index=data.index)}
        show_overall = False  # already overall-only
    else:
        # Rows whose ``by`` value is missing match none of the per-level
        # masks below, so they drop out of every stratum column.
        by_series = data[by]
        # Preserve categorical / sorted order
        if _is_cat_dtype(by_series):
            group_keys = [k for k in by_series.cat.categories if (by_series == k).any()]
        else:
            group_keys = sorted(by_series.dropna().unique(), key=_sort_key)
        # cast to plain list[Any]
        group_keys = list(group_keys)
        group_masks = {k: (by_series == k) for k in group_keys}
        # Guard against degenerate ``by=`` columns, which would otherwise
        # silently produce an unstratified or empty table. Emit a clear
        # UserWarning so the user knows to drop ``by=`` or fix the column.
        if len(group_keys) == 0:
            warnings.warn(
                f"by={by!r} has no non-missing values; the resulting table "
                "has no stratification columns. Pass by=None for an "
                "explicitly unstratified summary, or fix the column.",
                UserWarning,
                stacklevel=2,
            )
        elif len(group_keys) == 1:
            warnings.warn(
                f"by={by!r} has only one non-missing level "
                f"({group_keys[0]!r}); the resulting table has a single "
                "stratum column and no between-group statistics. Pass "
                "by=None for an explicitly unstratified summary.",
                UserWarning,
                stacklevel=2,
            )

    if weights_col is not None:
        w_series = pd.to_numeric(data[weights_col], errors="coerce").fillna(0.0)
        # ``tbl_one`` warns that rows with a negative weight are excluded;
        # give them zero weight so they cannot subtract from group sizes or
        # weighted sums (missing weights are already treated the same way).
        w_series = w_series.clip(lower=0.0)
        n_per_group = {k: float(w_series[group_masks[k]].sum()) for k in group_keys}
        n_overall = (
            float(w_series.sum())
            if by is None
            else float(w_series[data[by].notna()].sum())
        )
    else:
        w_series = None
        n_per_group = {k: int(group_masks[k].sum()) for k in group_keys}
        n_overall = int(len(data)) if by is None else int(data[by].notna().sum())

    # ------------------------------------------------------------------
    # Headers
    # ------------------------------------------------------------------
    header_cells: list[HeaderCell] = [HeaderCell(text="Characteristic", align="left")]

    def _fmt_n(val: float | int) -> str:
        # Weighted sizes are floats; show one decimal only when fractional.
        if isinstance(val, float):
            return f"{val:,.1f}" if val != int(val) else f"{int(val):,}"
        return f"{val:,}"

    if show_overall:
        header_cells.append(
            HeaderCell(text=f"{overall_label}\nN = {_fmt_n(n_overall)}")
        )
    for k in group_keys:
        header_cells.append(
            HeaderCell(text=f"{_fmt_level(k)}\nN = {_fmt_n(n_per_group[k])}")
        )
    if show_p:
        header_cells.append(HeaderCell(text="p-value"))
    if show_q:
        header_cells.append(HeaderCell(text="q-value"))
    if show_smd:
        header_cells.append(HeaderCell(text="SMD"))

    headers: tuple[HeaderRow, ...] = (HeaderRow(cells=tuple(header_cells)),)

    # ------------------------------------------------------------------
    # Body rows
    # ------------------------------------------------------------------
    rows: list[Row] = []
    test_names: set[str] = set()

    for var in variables:
        kind = kinds[var]
        label = labels.get(var, var)
        is_nonnormal = var in nonnormal

        test_override = tests_map.get(var)

        if kind == "continuous":
            row_blocks, test_used = _continuous_rows(
                data, var, label, by, group_keys, group_masks,
                digits=digits,
                pct_digits=pct_digits,
                show_overall=show_overall,
                show_p=show_p,
                show_q=show_q,
                show_smd=show_smd,
                nonnormal=is_nonnormal,
                missing_mode=missing_mode,
                bold_p_mode=bold_p_mode,
                bold_p_threshold=bold_p_threshold,
                test_override=test_override,
                weights=w_series,
                design=design,
            )
        else:
            row_blocks, test_used = _categorical_rows(
                data, var, label, by, group_keys, group_masks,
                kind=kind,
                pct_digits=pct_digits,
                show_overall=show_overall,
                show_p=show_p,
                show_q=show_q,
                show_smd=show_smd,
                missing_mode=missing_mode,
                bold_p_mode=bold_p_mode,
                bold_p_threshold=bold_p_threshold,
                test_override=test_override,
                weights=w_series,
            )
        rows.extend(row_blocks)
        if test_used:
            test_names.add(test_used)

    # ------------------------------------------------------------------
    # Footnotes
    # ------------------------------------------------------------------
    footnotes: list[str] = []
    # Continuous summary footnote
    cont_vars = [v for v in variables if kinds[v] == "continuous"]
    nn_vars = [v for v in cont_vars if v in nonnormal]
    nm_vars = [v for v in cont_vars if v not in nonnormal]
    # Mirror the variance path ``_continuous_rows`` takes: replicate weights
    # take precedence, then strata / cluster linearisation; a weights-only
    # design reports a plain weighted mean (SD).
    has_design_weights = design is not None and weights_col is not None
    uses_replicates = has_design_weights and design.replicate_weights is not None
    uses_linearisation = (
        has_design_weights
        and not uses_replicates
        and (design.strata is not None or design.cluster is not None)
    )
    if nm_vars and uses_replicates:
        footnotes.append(
            "Mean (SE) for continuous variables (design-based "
            "replicate-weight variance)."
        )
    elif nm_vars and uses_linearisation:
        footnotes.append(
            "Mean (SE) for continuous variables (design-based "
            "Taylor-linearised variance)."
        )
    elif nm_vars:
        footnotes.append("Mean (SD) for continuous variables.")
    if nn_vars:
        labelled = ", ".join(labels.get(v, v) for v in nn_vars)
        footnotes.append(f"Median (Q1, Q3) for: {labelled}.")
    cat_vars = [v for v in variables if kinds[v] != "continuous"]
    if cat_vars:
        footnotes.append("n (%) for categorical variables.")
    if show_p and test_names:
        footnotes.append("Tests: " + "; ".join(sorted(test_names)) + ".")
    if show_q:
        footnotes.append(f"q-value = {_q_method_label(q_method)} adjusted p-value.")
    if show_smd:
        footnotes.append("SMD = standardized mean difference (max pairwise).")

    if show_q:
        rows = _patch_q_values(rows, method=q_method)

    # ------------------------------------------------------------------
    # add_global_p() — joint Wald p-value per variable, fitted as
    # ``Logit(by == reference_level ~ variable + adjust_for)`` for
    # each variable. Adds a "global p" column to the right of any
    # existing p-value column. Requires a 2-level ``by=``; >2-level
    # ``by=`` is out of scope for v1 (would require multinomial
    # logit).
    # ------------------------------------------------------------------
    if opts.get("global_p"):
        rows, headers, footnotes_extra = _attach_global_p(
            data=data, by=by,
            variables=variables, kinds=kinds, labels=labels,
            rows=rows, headers=headers,
            adjust_for=tuple(opts.get("global_p_adjust_for", ()) or ()),
        )
        footnotes = list(footnotes) + list(footnotes_extra)

    def _rebuild_fn(new_spec: TableSpec) -> SofraTable:
        # Closes over ``data`` so spec changes rebuild from the same frame.
        return _build(data, new_spec)

    return SofraTable(
        rows=tuple(rows),
        headers=headers,
        footnotes=tuple(footnotes),
        metadata={"builder": "tbl_one", "tests": sorted(test_names)},
        _spec=spec,
        _rebuild=_rebuild_fn,
    )
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
# ----------------------------------------------------------------------
|
|
475
|
+
# q-value post-processing
|
|
476
|
+
# ----------------------------------------------------------------------
|
|
477
|
+
|
|
478
|
+
_Q_METHOD_LABELS = {
|
|
479
|
+
"fdr_bh": "Benjamini–Hochberg",
|
|
480
|
+
"fdr_by": "Benjamini–Yekutieli",
|
|
481
|
+
"bonferroni": "Bonferroni",
|
|
482
|
+
"holm": "Holm",
|
|
483
|
+
"hommel": "Hommel",
|
|
484
|
+
"sidak": "Šidák",
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def _q_method_label(method: str) -> str:
|
|
489
|
+
return _Q_METHOD_LABELS.get(method, method)
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def _patch_q_values(rows: list[Row], *, method: str) -> list[Row]:
    """Fill q-value placeholder cells with multiplicity-adjusted p-values.

    Only rows that carry both a numeric, non-NaN p-value cell and a q-value
    cell take part in the adjustment. A NaN p-value disqualifies its row:
    passing NaN to ``multipletests`` would turn every adjusted value into
    NaN and blank out the q-values of valid rows too. When no row
    qualifies, the input list is returned as-is; otherwise a new list is
    returned with the affected rows replaced.
    """
    import math
    from dataclasses import replace as dc_replace

    from statsmodels.stats.multitest import multipletests

    def _locate(row: Row) -> tuple[int, int] | None:
        # Column indices of the (p-value, q-value) cells, or None when the
        # row must be left alone.
        p_idx = q_idx = None
        for idx, cell in enumerate(row.cells):
            if cell.kind == "p_value" and isinstance(cell.value, (int, float)):
                if math.isnan(float(cell.value)):
                    return None  # poisoned row; skip it entirely
                p_idx = idx
            elif cell.kind == "q_value":
                q_idx = idx
        if p_idx is None or q_idx is None:
            return None
        return p_idx, q_idx

    targets: list[tuple[int, int]] = []  # (row_idx, q_col)
    pvals: list[float] = []
    for row_idx, row in enumerate(rows):
        found = _locate(row)
        if found is None:
            continue
        p_idx, q_idx = found
        targets.append((row_idx, q_idx))
        pvals.append(float(row.cells[p_idx].value))

    if not targets:
        return rows

    _reject, adjusted, _alpha_sidak, _alpha_bonf = multipletests(pvals, method=method)

    patched = list(rows)
    for (row_idx, q_idx), q in zip(targets, adjusted, strict=True):
        q_float = float(q)
        current = patched[row_idx]
        cells = list(current.cells)
        cells[q_idx] = dc_replace(
            cells[q_idx],
            text=fmt_p_value(q_float, digits=3),
            value=q_float,
        )
        patched[row_idx] = dc_replace(current, cells=tuple(cells))
    return patched
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def _fmt_level(k: Any) -> str:
|
|
537
|
+
if isinstance(k, bool):
|
|
538
|
+
return "Yes" if k else "No"
|
|
539
|
+
return str(k)
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def _fmt_weighted_n_pct(n: float, total: float, pct_digits: int) -> str:
|
|
543
|
+
"""Render ``n (xx.x%)`` with weighted (possibly non-integer) counts."""
|
|
544
|
+
if total <= 0:
|
|
545
|
+
n_str = f"{n:,.1f}" if n != int(n) else f"{int(n):,}"
|
|
546
|
+
return f"{n_str} (—)"
|
|
547
|
+
pct = 100.0 * n / total
|
|
548
|
+
n_str = f"{n:,.1f}" if n != int(n) else f"{int(n):,}"
|
|
549
|
+
return f"{n_str} ({pct:.{pct_digits}f}%)"
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def _sort_key(x: Any) -> tuple[int, Any]:
|
|
553
|
+
if isinstance(x, bool):
|
|
554
|
+
return (0, int(x))
|
|
555
|
+
if isinstance(x, (int, float)):
|
|
556
|
+
return (0, float(x))
|
|
557
|
+
if isinstance(x, str):
|
|
558
|
+
return (1, x)
|
|
559
|
+
return (2, repr(x))
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
# ----------------------------------------------------------------------
|
|
563
|
+
# Continuous rows
|
|
564
|
+
# ----------------------------------------------------------------------
|
|
565
|
+
|
|
566
|
+
def _continuous_rows(
    data: pd.DataFrame,
    var: str,
    label: str,
    by: str | None,
    group_keys: list[Any],
    group_masks: dict[Any, pd.Series],
    *,
    digits: int,
    pct_digits: int,
    show_overall: bool,
    show_p: bool,
    show_q: bool,
    show_smd: bool,
    nonnormal: bool,
    missing_mode: str,
    bold_p_mode: bool,
    bold_p_threshold: float,
    test_override: str | None = None,
    weights: pd.Series | None = None,
    design: SurveyDesign | None = None,
) -> tuple[list[Row], str | None]:
    """Produce 1 (+ optional missing) rows for one continuous variable.

    Returns
    -------
    tuple[list[Row], str | None]
        The rows for ``var`` and the name of the test that produced the
        p-value (``None`` when no p-value was computed).

    Notes
    -----
    * Variables flagged ``nonnormal`` are always summarised as
      median (Q1, Q3), including under a complex survey design; only the
      remaining variables take the design-based mean (SE) path.
    * The Overall cell is computed over rows with a non-missing ``by``
      value, the same population the header's Overall N counts.
    """
    import math

    # Design-based mean (SE) applies only to normally-summarised variables
    # under a complex design (strata, cluster or replicate weights).
    # Weight-only designs use the frequency-weighted mean (SD) path.
    use_design_variance = (
        not nonnormal
        and design is not None
        and weights is not None
        and (
            design.strata is not None
            or design.cluster is not None
            or design.replicate_weights is not None
        )
    )

    def _summary_for(mask: pd.Series) -> str:
        if use_design_variance:
            if design.replicate_weights is not None:
                rep_series = [data.loc[mask, c] for c in design.replicate_weights]
                mean, var_, n_eff = replicate_mean_var(
                    data.loc[mask, var],
                    weights.loc[mask],
                    rep_series,
                    replicate_type=design.replicate_type,
                )
            else:
                mean, var_, n_eff = design_mean_var(
                    data.loc[mask, var],
                    weights.loc[mask],
                    strata=(data.loc[mask, design.strata]
                            if design.strata else None),
                    cluster=(data.loc[mask, design.primary_cluster]
                             if design.cluster else None),
                    fpc=(data.loc[mask, design.fpc]
                         if design.fpc else None),
                )
            if n_eff <= 0:
                return "—"
            # Clamp tiny negative variances from rounding before the sqrt.
            se = math.sqrt(max(var_, 0.0)) if not math.isnan(var_) else float("nan")
            return fmt_mean_sd(mean, se, digits=digits)
        if weights is not None:
            st = weighted_continuous_stats(data.loc[mask, var], weights.loc[mask])
            if st.n_eff <= 0:
                return "—"
            if nonnormal:
                return fmt_median_iqr(st.median, st.q1, st.q3, digits=digits)
            return fmt_mean_sd(st.mean, st.sd, digits=digits)
        cs = continuous_stats(data.loc[mask, var])
        if cs.n == 0:
            return "—"
        if nonnormal:
            return fmt_median_iqr(cs.median, cs.q1, cs.q3, digits=digits)
        return fmt_mean_sd(cs.mean, cs.sd, digits=digits)

    p_value: float | None = None
    test_used: str | None = None
    if show_p and by is not None:
        if test_override is not None:
            res = run_named_test(test_override, data[var], data[by], kind="continuous")
        elif weights is not None:
            # Any weighted call gets the design-adjusted two-sample
            # t-test (Taylor-linearised; ``svyttest`` analogue) on the
            # 2-group case. Strata/cluster from ``design`` are honoured
            # when present; bare ``weights=`` falls through with
            # ``strata=None``/``cluster=None``, which still gives a
            # weighted SE rather than an unweighted Welch test.
            two_grp = len(set(data[by].dropna().unique())) == 2
            if two_grp:
                strata_col = (data[design.strata]
                              if design is not None and design.strata else None)
                cluster_col = (data[design.primary_cluster]
                               if design is not None and design.primary_cluster
                               else None)
                res = svyttest(
                    data[var], data[by], weights,
                    strata=strata_col,
                    cluster=cluster_col,
                )
            else:
                # >2 groups under weights: design-adjusted F-test is
                # out of scope (see the README "discussed but not
                # implemented" list); fall back to design-naive ANOVA /
                # Kruskal–Wallis with the existing footnote.
                res = continuous_test(data[var], data[by], nonnormal=nonnormal)
        else:
            res = continuous_test(data[var], data[by], nonnormal=nonnormal)
        p_value = res.p_value
        test_used = res.test if res.p_value is not None else None

    smd_val: float | None = None
    if show_smd and by is not None:
        smd_val = continuous_smd(data[var], data[by])

    bold_row = (
        bold_p_mode
        and p_value is not None
        and p_value < bold_p_threshold
    )

    cells: list[Cell] = [make_cell(label, align="left", bold=bold_row)]
    if show_overall:
        # Rows with a missing ``by`` value belong to no stratum, so they are
        # left out of the pooled summary as well.
        overall_mask = (
            data[by].notna()
            if by is not None
            else pd.Series(True, index=data.index)
        )
        cells.append(
            make_cell(_summary_for(overall_mask), kind="numeric", align="right")
        )
    for k in group_keys:
        cells.append(
            make_cell(_summary_for(group_masks[k]), kind="numeric", align="right")
        )
    if show_p:
        cells.append(make_cell(fmt_p_value(p_value), value=p_value, kind="p_value",
                               align="right", bold=bold_row))
    if show_q:
        # Placeholder; patched by _patch_q_values after build.
        cells.append(make_cell("", value=None, kind="q_value", align="right"))
    if show_smd:
        cells.append(make_cell(fmt_smd(smd_val), value=smd_val, kind="numeric", align="right"))

    rows: list[Row] = [Row(cells=tuple(cells))]

    _maybe_append_missing(rows, data, var, group_keys, group_masks,
                          show_overall, show_p, show_q, show_smd,
                          pct_digits=pct_digits, missing_mode=missing_mode,
                          weights=weights)
    return rows, test_used
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
# ----------------------------------------------------------------------
|
|
712
|
+
# Categorical rows
|
|
713
|
+
# ----------------------------------------------------------------------
|
|
714
|
+
|
|
715
|
+
def _categorical_rows(
    data: pd.DataFrame,
    var: str,
    label: str,
    by: str | None,
    group_keys: list[Any],
    group_masks: dict[Any, pd.Series],
    *,
    kind: VarKind,
    pct_digits: int,
    show_overall: bool,
    show_p: bool,
    show_q: bool,
    show_smd: bool,
    missing_mode: str,
    bold_p_mode: bool,
    bold_p_threshold: float,
    test_override: str | None = None,
    weights: pd.Series | None = None,
) -> tuple[list[Row], str | None]:
    """Build the table rows for one categorical / dichotomous variable.

    Three layouts are produced depending on the data:

    * no levels at all (e.g. an all-NaN column): a single bold row whose
      value cells are em-dashes;
    * ``kind == "dichotomous"`` with exactly two levels: a single row
      labelled ``"<label> = <second level>"`` carrying n (%) for that
      level;
    * anything else: a group-header row (label, p-value, SMD) followed
      by one indented row per level.

    In every layout ``_maybe_append_missing`` is given the chance to add
    a trailing missing-count row.

    Returns
    -------
    rows, test_used
        The built rows and the name of the test that produced the
        p-value (``None`` when no p-value was computed or available).
    """
    column = data[var]

    # Levels come from the full column so that every group shares them.
    levels: list[Any] = (
        list(column.cat.categories)
        if _is_cat_dtype(column)
        else sorted(column.dropna().unique(), key=_sort_key)
    )

    def _with_missing(built: list[Row]) -> list[Row]:
        # Let the shared helper append the missing-count row (if any).
        _maybe_append_missing(
            built, data, var, group_keys, group_masks,
            show_overall, show_p, show_q, show_smd,
            pct_digits=pct_digits, missing_mode=missing_mode,
            weights=weights,
        )
        return built

    # Nothing to tabulate: emit one "no data" row instead of a group
    # header followed by zero level rows.
    if not levels:
        dash_kinds: list[str] = []
        if show_overall:
            dash_kinds.append("numeric")
        dash_kinds.extend("numeric" for _ in group_keys)
        if show_p:
            dash_kinds.append("p_value")
        if show_q:
            dash_kinds.append("q_value")
        if show_smd:
            dash_kinds.append("numeric")
        placeholder: list[Cell] = [make_cell(label, align="left", bold=True)]
        placeholder += [
            make_cell("—", value=None, kind=cell_kind, align="right")
            for cell_kind in dash_kinds
        ]
        return _with_missing([Row(cells=tuple(placeholder))]), None

    # Between-group test: explicit override first, then the weighted
    # (Rao–Scott) variant, then the default categorical test.
    p_value: float | None = None
    test_used: str | None = None
    if show_p and by is not None:
        if test_override is not None:
            outcome = run_named_test(test_override, column, data[by], kind="categorical")
        elif weights is not None:
            outcome = rao_scott_chisq(column, data[by], weights)
        else:
            outcome = categorical_test(column, data[by])
        p_value = outcome.p_value
        test_used = outcome.test if outcome.p_value is not None else None

    smd_val: float | None = None
    if show_smd and by is not None:
        smd_val = categorical_smd(column, data[by], levels=levels)

    bold_row = bold_p_mode and p_value is not None and p_value < bold_p_threshold

    def _level_count(mask: pd.Series, target_value: Any) -> tuple[float, float]:
        # (matching count, non-missing total) within ``mask``; both are
        # sums of weights when ``weights`` is supplied.
        subset = data.loc[mask]
        observed = subset[var].notna()
        if weights is None:
            total = float(observed.sum())
            matched = float((subset[var] == target_value).sum())
            return matched, total
        w_subset = weights.loc[subset.index]
        total = float(w_subset[observed].sum())
        matched = float(w_subset[observed & (subset[var] == target_value)].sum())
        return matched, total

    def _count_cells(target_value: Any) -> list[Cell]:
        # n (%) cells for the overall column (optional) and every group.
        masks: list[pd.Series] = []
        if show_overall:
            masks.append(pd.Series(True, index=data.index))
        masks.extend(group_masks[key] for key in group_keys)
        out: list[Cell] = []
        for mask in masks:
            matched, total = _level_count(mask, target_value)
            out.append(make_cell(_fmt_weighted_n_pct(matched, total, pct_digits),
                                 kind="numeric", align="right"))
        return out

    def _stat_cells() -> list[Cell]:
        # p / q / SMD cells for the row that carries the statistics.
        tail: list[Cell] = []
        if show_p:
            tail.append(make_cell(fmt_p_value(p_value), value=p_value,
                                  kind="p_value", align="right", bold=bold_row))
        if show_q:
            # Placeholder; the q-value is filled in later, outside this function.
            tail.append(make_cell("", value=None, kind="q_value", align="right"))
        if show_smd:
            tail.append(make_cell(fmt_smd(smd_val), value=smd_val,
                                  kind="numeric", align="right"))
        return tail

    # Dichotomous: one row showing n (%) of the second level.
    if kind == "dichotomous" and len(levels) == 2:
        success = levels[1]
        single: list[Cell] = [
            make_cell(f"{label} = {_fmt_level(success)}", align="left", bold=bold_row)
        ]
        single.extend(_count_cells(success))
        single.extend(_stat_cells())
        return _with_missing([Row(cells=tuple(single))]), test_used

    # Multi-level: header row holds the statistics, level rows hold counts.
    header: list[Cell] = [make_cell(label, align="left", bold=True)]
    if show_overall:
        header.append(make_cell("", value=None))
    header.extend(make_cell("", value=None) for _ in group_keys)
    header.extend(_stat_cells())
    rows: list[Row] = [Row(cells=tuple(header), is_group_header=True)]

    for lvl in levels:
        level_cells = [make_cell(f"{_fmt_level(lvl)}", align="left", indent=1)]
        level_cells.extend(_count_cells(lvl))
        # Statistic columns stay blank on level rows.
        level_cells.extend(
            make_cell("", value=None)
            for shown in (show_p, show_q, show_smd)
            if shown
        )
        rows.append(Row(cells=tuple(level_cells)))

    return _with_missing(rows), test_used
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
def _maybe_append_missing(
    rows: list[Row],
    data: pd.DataFrame,
    var: str,
    group_keys: list[Any],
    group_masks: dict[Any, pd.Series],
    show_overall: bool,
    show_p: bool,
    show_q: bool,
    show_smd: bool,
    *,
    pct_digits: int,
    missing_mode: str,
    weights: pd.Series | None = None,
) -> None:
    """Append a missing-count row for ``var`` to ``rows`` (in place).

    Nothing is appended when ``missing_mode`` is ``"never"``, or when it
    is ``"ifany"`` and the overall missing count is zero. Counts and
    denominators are sums of ``weights`` when weights are supplied and
    plain row counts otherwise. Denominators include missing rows. The
    p / q / SMD columns, when shown, are left blank on this row.
    """
    if missing_mode == "never":
        return

    overall_na = data[var].isna()
    if weights is None:
        n_miss_overall = float(overall_na.sum())
    else:
        n_miss_overall = float(weights[overall_na].sum())

    if missing_mode == "ifany" and n_miss_overall == 0:
        return

    def _pct_cell(count: float, total: float) -> Cell:
        return make_cell(_fmt_weighted_n_pct(count, total, pct_digits),
                         kind="numeric", align="right")

    cells: list[Cell] = [make_cell(MISSING_LABEL, align="left", indent=1)]

    if show_overall:
        overall_total = float(len(data)) if weights is None else float(weights.sum())
        cells.append(_pct_cell(n_miss_overall, overall_total))

    for key in group_keys:
        mask = group_masks[key]
        group_na = data.loc[mask, var].isna()
        if weights is None:
            group_miss = float(group_na.sum())
            group_total = float(mask.sum())
        else:
            group_weights = weights.loc[mask]
            group_miss = float(group_weights[group_na].sum())
            group_total = float(group_weights.sum())
        cells.append(_pct_cell(group_miss, group_total))

    # One blank cell per visible statistic column keeps the row width aligned.
    cells.extend(
        make_cell("", value=None)
        for shown in (show_p, show_q, show_smd)
        if shown
    )
    rows.append(Row(cells=tuple(cells)))
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
# ----------------------------------------------------------------------
|
|
919
|
+
# add_global_p() — joint Wald p-value per variable for tbl_one /
|
|
920
|
+
# tbl_summary. Re-fits a logistic regression per variable on the
|
|
921
|
+
# source data (`Logit(by == ref ~ variable [+ adjust_for])`) and
|
|
922
|
+
# computes the joint p-value over the variable's coefficients. Adds
|
|
923
|
+
# a "global p" column to each row.
|
|
924
|
+
#
|
|
925
|
+
# Single-coefficient predictors (continuous, dichotomous) get the
|
|
926
|
+
# Wald p of that one coefficient. Multi-level categorical predictors
|
|
927
|
+
# (k levels → k-1 dummies) get the joint Wald F-test across all
|
|
928
|
+
# dummies — same statistic as gtsummary's ``add_global_p()``.
|
|
929
|
+
#
|
|
930
|
+
# v1 scope: 2-level ``by=``. With ≥3-level ``by=`` the joint test
|
|
931
|
+
# requires multinomial logit, which is left out of scope; the column
|
|
932
|
+
# is filled with em-dash and a footnote explains why.
|
|
933
|
+
# ----------------------------------------------------------------------
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
def _attach_global_p(
    *,
    data: pd.DataFrame,
    by: str | None,
    variables: tuple[str, ...],
    kinds: dict[str, VarKind],
    labels: dict[str, str],
    rows: list[Row],
    headers: tuple[HeaderRow, ...],
    adjust_for: tuple[str, ...],
) -> tuple[list[Row], tuple[HeaderRow, ...], list[str]]:
    """Add a "global p" column to already-built rows and headers.

    For each variable a p-value is obtained from ``_fit_global_p`` and
    placed on the row whose first-cell text identifies that variable;
    all other rows (level rows, missing rows) receive a blank cell.

    Parameters
    ----------
    data
        Source DataFrame the models are fitted on.
    by
        Stratifying column. When ``None`` no model is fitted: every row
        gets a blank cell, the header still gains the column, and a
        footnote records that the computation was skipped.
    variables, kinds, labels
        Variable names plus their kind and display-label maps.
    rows
        Existing body rows; each is re-emitted with one extra cell.
    headers
        Existing header rows; each is re-emitted with one extra cell.
    adjust_for
        Covariate column names forwarded to ``_fit_global_p``.

    Returns
    -------
    new_rows, new_headers, extra_footnotes

    Raises
    ------
    KeyError
        If any ``adjust_for`` name is not a column of ``data``.
    """
    extra_footnotes: list[str] = []

    # Fail early with a readable message instead of a pandas indexing
    # error raised from inside the model fit.
    missing_adj = [name for name in adjust_for if name not in data.columns]
    if missing_adj:
        raise KeyError(
            f"add_global_p: adjust_for column(s) {missing_adj!r} not in data"
        )

    if by is None:
        extra_footnotes.append(
            "add_global_p: skipped (no by= column).",
        )
        # Blank cells keep every row the same width as the header.
        blank_rows = _append_blank_column(rows)
        widened_headers = _append_header_column(headers, "global p")
        return blank_rows, widened_headers, extra_footnotes

    levels = sorted(data[by].dropna().unique(), key=_sort_key)
    if len(levels) != 2:
        # A by= column with other than two levels is not handled here.
        extra_footnotes.append(
            f"add_global_p: by={by!r} has {len(levels)} levels; "
            "v1 supports only 2-level stratification (multinomial "
            "logit not implemented).",
        )
        dashed_rows = _append_blank_column(rows, fill="—")
        widened_headers = _append_header_column(headers, "global p")
        return dashed_rows, widened_headers, extra_footnotes

    # One p-value per variable.
    p_per_var: dict[str, float | None] = {
        name: _fit_global_p(
            data=data, by=by, by_levels=levels,
            var=name, kind=kinds[name], adjust_for=adjust_for,
        )
        for name in variables
    }

    # Rows are tied back to variables through their first-cell text:
    # an exact match on the variable's label, or, for dichotomous
    # variables, a "<label> = " prefix.
    exact_labels = {labels.get(name, name): name for name in variables}
    dichot_prefixes = {
        f"{labels.get(name, name)} = ": name
        for name in variables
        if kinds[name] == "dichotomous"
    }

    def _owner(text: str) -> str | None:
        if text in exact_labels:
            return exact_labels[text]
        return next(
            (name for prefix, name in dichot_prefixes.items()
             if text.startswith(prefix)),
            None,
        )

    new_rows: list[Row] = []
    for row in rows:
        owner = _owner(row.cells[0].text)
        if owner is None:
            # Level / missing sub-row: left blank so the p-value sits
            # only on the variable's own row.
            extra = make_cell("", value=None)
        else:
            p = p_per_var.get(owner)
            extra = make_cell(
                "—" if p is None else fmt_p_value(p),
                value=p, kind="p_value", align="right",
            )
        new_rows.append(
            Row(cells=(*row.cells, extra), is_group_header=row.is_group_header),
        )

    new_headers = _append_header_column(headers, "global p")

    if adjust_for:
        extra_footnotes.append(
            "global p: joint Wald test on the variable's coefficients "
            f"from Logit({by} ~ variable + "
            f"{' + '.join(adjust_for)}).",
        )
    else:
        extra_footnotes.append(
            "global p: joint Wald test on the variable's coefficients "
            f"from Logit({by} ~ variable).",
        )
    return new_rows, new_headers, extra_footnotes
|
|
1086
|
+
|
|
1087
|
+
|
|
1088
|
+
def _fit_global_p(
    *,
    data: pd.DataFrame,
    by: str,
    by_levels: list[Any],
    var: str,
    kind: VarKind,
    adjust_for: tuple[str, ...],
) -> float | None:
    """Fit one logistic regression and return the joint p-value for
    ``var``'s coefficients.

    Implementation choices:
    * Outcome encoded as ``by == by_levels[1]`` (the second entry of
      the caller-supplied level list).
    * ``var`` is expanded into design columns by ``_design_columns``
      according to ``kind``.
    * Each adjustment column has its kind guessed by ``_quick_kind``
      and is expanded the same way; an adjustment column equal to
      ``var`` is skipped.
    * The joint hypothesis is a constraint string
      ``"x1 = 0, x2 = 0, ..."`` over ``var``'s columns, passed to the
      fitted result's ``f_test``.
    * Anything that prevents a trustworthy number returns ``None``
      (rendered as an em-dash by the caller): an empty working frame,
      a single-level outcome, a fit that raises, a fit that did not
      converge, or a non-finite p-value.

    Returns
    -------
    float | None
        The joint p-value, or ``None`` when it could not be computed.
    """
    # Working frame: the needed columns with incomplete rows dropped.
    # ``dict.fromkeys`` de-duplicates while preserving order, so a
    # variable that is also listed in ``adjust_for`` appears only once
    # (a repeated name would make ``data[cols]`` return duplicate
    # columns).
    cols = list(dict.fromkeys((by, var, *adjust_for)))
    sub = data[cols].dropna()
    if sub.empty or sub[by].nunique() < 2:
        return None

    y = (sub[by] == by_levels[1]).astype(int).to_numpy()

    var_cols = _design_columns(sub, var, kind)
    if not var_cols:
        return None
    adj_cols: list[tuple[str, Any]] = []
    for a in adjust_for:
        if a == var:
            continue  # already represented by var_cols
        adj_cols.extend(_design_columns(sub, a, _quick_kind(sub[a])))

    # Stack into a single design matrix with a leading intercept.
    import numpy as np
    import statsmodels.api as sm
    X = np.column_stack([c for _, c in var_cols] + [c for _, c in adj_cols])
    X = sm.add_constant(X, has_constant="add")

    # Column-name registry: index 0 is the const, then var_cols, then
    # adj_cols. The positions are used to build the constraint below.
    col_names = ["const"] + [n for n, _ in var_cols] + [n for n, _ in adj_cols]
    if X.shape[1] != len(col_names):  # pragma: no cover — defensive
        return None

    import warnings as _w
    try:
        with _w.catch_warnings():
            _w.simplefilter("ignore")  # statsmodels convergence chatter
            res = sm.Logit(y, X).fit(disp=False, method="newton",
                                     maxiter=100)
    except Exception:  # pragma: no cover — defensive: singular design etc.
        return None

    # With warnings silenced, a non-converged fit does not raise; the
    # only trace is the ``converged`` flag in ``mle_retvals``. Treat it
    # as a failure rather than reporting a p-value from unreliable
    # estimates.
    retvals = getattr(res, "mle_retvals", None) or {}
    if not retvals.get("converged", True):
        return None

    if not hasattr(res, "f_test"):  # pragma: no cover
        return None

    # Joint hypothesis: every coefficient belonging to ``var`` is zero.
    # For array input the non-constant columns are referred to by
    # position as ``x1``, ``x2``, ...
    constraint = ", ".join(
        f"x{col_names.index(n)} = 0" for n, _ in var_cols
    )
    try:
        p = float(res.f_test(constraint).pvalue)
    except Exception:  # pragma: no cover
        return None
    # NaN / inf (e.g. from NaNs left in the design by numeric coercion)
    # must not be shown as if it were a real p-value.
    return p if np.isfinite(p) else None
|
|
1180
|
+
|
|
1181
|
+
|
|
1182
|
+
def _design_columns(
    sub: pd.DataFrame, var: str, kind: VarKind,
) -> list[tuple[str, Any]]:
    """Return ``(column_name, numpy_array)`` pairs encoding ``sub[var]``.

    ``kind == "continuous"`` yields a single column: the values passed
    through ``pd.to_numeric(errors="coerce")``. Any other kind is
    dummy-coded with ``pd.get_dummies(prefix=var, drop_first=True)``,
    yielding one float column per level except the first.

    For categorical-dtype columns, categories that do not occur in
    ``sub`` are removed before encoding. ``pd.get_dummies`` would
    otherwise emit an all-zero column for each of them (or, when the
    dropped first category is the unobserved one, a set of dummies that
    sums to the intercept), producing a rank-deficient design.
    """
    s = sub[var]
    if kind == "continuous":
        return [(var, pd.to_numeric(s, errors="coerce").to_numpy())]
    if isinstance(s.dtype, pd.CategoricalDtype):
        s = s.cat.remove_unused_categories()
    # ``drop_first=True`` omits the reference level so the dummies are
    # not collinear with an intercept.
    dummies = pd.get_dummies(s, prefix=var, drop_first=True, dtype=float)
    return [(c, dummies[c].to_numpy()) for c in dummies.columns]
|
|
1194
|
+
|
|
1195
|
+
|
|
1196
|
+
def _quick_kind(s: pd.Series) -> VarKind:
    """Guess a variable kind for an adjustment column.

    At most two distinct non-null values → ``"dichotomous"``; otherwise
    ``"continuous"`` for numeric dtypes and ``"categorical"`` for
    everything else.
    """
    distinct = s.nunique()
    if distinct <= 2:
        return "dichotomous"
    # More than two distinct values from here on.
    return "continuous" if pd.api.types.is_numeric_dtype(s) else "categorical"
|
|
1203
|
+
|
|
1204
|
+
|
|
1205
|
+
def _append_header_column(
    headers: tuple[HeaderRow, ...], label: str,
) -> tuple[HeaderRow, ...]:
    """Return new header rows, each with one extra cell appended.

    The appended cell shows ``label``, centred and bold. Every returned
    ``HeaderRow`` is built from ``cells`` alone; the input rows are not
    modified.
    """
    return tuple(
        HeaderRow(
            cells=(*hr.cells, HeaderCell(text=label, align="center", bold=True)),
        )
        for hr in headers
    )
|
|
1213
|
+
|
|
1214
|
+
|
|
1215
|
+
def _append_blank_column(rows: list[Row], fill: str = "") -> list[Row]:
    """Return new rows, each with one value-less cell appended.

    The appended cell displays ``fill`` (empty by default). Each row's
    ``is_group_header`` flag is carried over; the input rows are not
    modified.
    """
    padded: list[Row] = []
    for row in rows:
        filler = make_cell(fill, value=None)
        padded.append(
            Row(cells=(*row.cells, filler), is_group_header=row.is_group_header)
        )
    return padded
|