pysofra 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysofra/__init__.py +82 -0
- pysofra/core/__init__.py +14 -0
- pysofra/core/compose.py +167 -0
- pysofra/core/format.py +155 -0
- pysofra/core/frames.py +69 -0
- pysofra/core/schema.py +128 -0
- pysofra/core/table.py +924 -0
- pysofra/io/__init__.py +1 -0
- pysofra/models/__init__.py +6 -0
- pysofra/models/extract.py +249 -0
- pysofra/models/pool.py +119 -0
- pysofra/models/regression.py +507 -0
- pysofra/models/survival.py +395 -0
- pysofra/models/uvregression.py +438 -0
- pysofra/notebook/__init__.py +6 -0
- pysofra/plot/__init__.py +23 -0
- pysofra/plot/_backend.py +32 -0
- pysofra/plot/forest.py +159 -0
- pysofra/plot/inline.py +171 -0
- pysofra/plot/km.py +249 -0
- pysofra/render/__init__.py +28 -0
- pysofra/render/_zip_determinism.py +57 -0
- pysofra/render/base.py +22 -0
- pysofra/render/docx.py +286 -0
- pysofra/render/html.py +442 -0
- pysofra/render/image.py +130 -0
- pysofra/render/latex.py +253 -0
- pysofra/render/markdown.py +128 -0
- pysofra/render/pptx.py +340 -0
- pysofra/render/xlsx.py +226 -0
- pysofra/summary/__init__.py +6 -0
- pysofra/summary/calibrate.py +214 -0
- pysofra/summary/design.py +246 -0
- pysofra/summary/effect_size.py +187 -0
- pysofra/summary/extras.py +745 -0
- pysofra/summary/smd.py +133 -0
- pysofra/summary/stats.py +135 -0
- pysofra/summary/tbl_cross.py +339 -0
- pysofra/summary/tbl_one.py +1220 -0
- pysofra/summary/tbl_summary.py +51 -0
- pysofra/summary/tests.py +370 -0
- pysofra/summary/typing.py +129 -0
- pysofra/summary/weights.py +161 -0
- pysofra/themes/__init__.py +5 -0
- pysofra/themes/registry.py +272 -0
- pysofra-0.1.0a1.dist-info/METADATA +301 -0
- pysofra-0.1.0a1.dist-info/RECORD +50 -0
- pysofra-0.1.0a1.dist-info/WHEEL +4 -0
- pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
- pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
|
@@ -0,0 +1,745 @@
|
|
|
1
|
+
"""Extra modifiers — joint Type-III tests, between-group differences,
|
|
2
|
+
descriptive confidence intervals, formatter overrides.
|
|
3
|
+
|
|
4
|
+
These are smaller add-ons to the core ``tbl_one`` / ``tbl_summary``
|
|
5
|
+
output, modelled on the corresponding ``gtsummary`` functions
|
|
6
|
+
(``add_global_p``, ``add_difference``, ``add_ci``, ``estimate_fun=``,
|
|
7
|
+
``pvalue_fun=``).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import math
|
|
13
|
+
from collections.abc import Callable
|
|
14
|
+
from dataclasses import replace
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
from scipy import stats as sp_stats
|
|
19
|
+
|
|
20
|
+
from ..core.format import fmt_number
|
|
21
|
+
from ..core.schema import Cell, HeaderCell, HeaderRow, Row, make_cell
|
|
22
|
+
from ..core.table import SofraTable
|
|
23
|
+
|
|
24
|
+
# ----------------------------------------------------------------------
|
|
25
|
+
# Column add-ons — significance stars, N, statistic labels, colour scale; add_global_p (Type-III joint test per categorical variable) follows below
|
|
26
|
+
# ----------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
def add_significance_stars(
    table: SofraTable,
    *,
    thresholds: tuple[tuple[float, str], ...] = (
        (0.001, "***"),
        (0.01, "**"),
        (0.05, "*"),
    ),
) -> SofraTable:
    """Insert a significance-marker column (``*** / ** / *``) after the p-value.

    Parameters
    ----------
    table
        Table whose rows may contain a cell of kind ``"p_value"``.
    thresholds
        ``(cutoff, marker)`` pairs.  A p-value receives the marker of the
        smallest cutoff it falls strictly below.  The pairs are sorted by
        cutoff here, so callers may pass them in any order.

    Returns
    -------
    SofraTable
        A copy of ``table`` with one extra column.  The new header cell is
        blank; rows without a numeric, non-NaN p-value get an empty marker.
    """
    # Evaluate the strictest cutoff first regardless of the order supplied;
    # otherwise ``((0.05, "*"), (0.001, "***"))`` would mark p = 0.0001 as "*".
    ordered = sorted(thresholds, key=lambda pair: pair[0])

    new_rows: list[Row] = []
    for r in table.rows:
        p_cell = next(
            (c for c in r.cells
             if c.kind == "p_value" and isinstance(c.value, (int, float))),
            None,
        )
        marker = ""
        if p_cell is not None:
            p = float(p_cell.value)
            if not _isnan(p):
                marker = next((m for cutoff, m in ordered if p < cutoff), "")
        new_rows.append(_insert_after_pvalue_cell(r, marker, value=None))

    # The stars column deliberately carries an empty header label.
    new_headers = _insert_after_pvalue_header(table.headers, "")
    return replace(table, headers=tuple(new_headers), rows=tuple(new_rows))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def add_n(table: SofraTable) -> SofraTable:
    """Insert an ``N`` column (non-missing sample size) after the label column.

    The source DataFrame is recovered from the table's rebuild closure and
    the non-missing observations of every variable are counted.  Every row
    that maps back to a variable shows that variable's overall N (not a
    per-level N); rows that map to no variable get a blank cell.

    Raises
    ------
    ValueError
        If the table carries no spec / rebuild closure, or no DataFrame can
        be recovered from the closure.
    """
    if table._spec is None or table._rebuild is None:
        raise ValueError(
            "add_n needs access to the source data — only tables built "
            "directly by tbl_one / tbl_summary qualify."
        )
    frame = _data_from_rebuild(table._rebuild)
    if frame is None:
        raise ValueError("Could not recover source data from table closure.")

    opts = table._spec.options
    variables = opts["variables"]
    kinds = opts["kinds"]

    counts: dict[str, int] = {
        name: int(frame[name].notna().sum()) for name in variables
    }

    new_headers = _insert_after_label_header(table.headers, "N")

    def with_count(row: Row) -> Row:
        # Map the displayed label back to its source variable, if any.
        var = _find_variable_for_row(
            row.cells[0].text, variables, kinds, labels=opts.get("labels"),
        )
        if var is not None and var in counts:
            shown = f"{counts[var]:,}"
        else:
            shown = ""
        raw = counts.get(var) if var else None
        return _insert_after_label_cell(row, shown, value=raw)

    return replace(
        table,
        headers=new_headers,
        rows=tuple(with_count(r) for r in table.rows),
    )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def add_stat_label(table: SofraTable) -> SofraTable:
    """Insert a ``Statistic`` column naming each row's summary form.

    The column is placed right after the label column.  Continuous
    variables read "Mean (SD)", or "Median (Q1, Q3)" when listed in the
    spec's ``nonnormal`` option; every other kind reads "n (%)".  Rows that
    cannot be matched to a variable get a blank cell.

    Raises
    ------
    ValueError
        If the table carries no spec / rebuild closure.
    """
    if table._spec is None or table._rebuild is None:
        raise ValueError(
            "add_stat_label needs a tbl_one / tbl_summary source table."
        )
    opts = table._spec.options
    variables = opts["variables"]
    kinds = opts["kinds"]
    nonnormal = set(opts.get("nonnormal", set()))

    def describe(name: str) -> str:
        if kinds[name] != "continuous":
            return "n (%)"
        return "Median (Q1, Q3)" if name in nonnormal else "Mean (SD)"

    label_for: dict[str, str] = {name: describe(name) for name in variables}

    new_headers = _insert_after_label_header(table.headers, "Statistic")
    patched: list[Row] = []
    for row in table.rows:
        var = _find_variable_for_row(
            row.cells[0].text, variables, kinds, labels=opts.get("labels"),
        )
        shown = label_for.get(var, "") if var else ""
        patched.append(_insert_after_label_cell(row, shown, value=None))
    return replace(table, headers=new_headers, rows=tuple(patched))
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def color_scale_if(
    table: SofraTable,
    *,
    column: int,
    palette: tuple[str, str, str] = ("#fff5f0", "#fcae91", "#cb181d"),
    skip_blank: bool = True,
) -> SofraTable:
    """Heatmap-style cell colouring for a numeric column (HTML style only).

    Collects the numeric, non-NaN ``value`` of the cell at ``column`` in every
    body row, rescales those values to ``[0, 1]`` over their min/max, and
    stores a ``background:<colour>`` declaration under the ``"html"`` key of
    each such cell's ``style`` mapping.  The colour is interpolated across
    the three-stop ``palette`` (low → mid → high).

    Parameters
    ----------
    table
        Table to colour.
    column
        Index of the cell to inspect in each row.  Rows with fewer cells
        are skipped.
    palette
        ``(low, mid, high)`` hex colours.
    skip_blank
        Accepted for API compatibility; it is not consulted by the current
        implementation — cells without a numeric value are always left
        untouched.

    Returns
    -------
    SofraTable
        ``table`` itself when no numeric cell was found, otherwise a copy
        with the styled cells.
    """
    # Pass 1: collect the numeric values, keyed by row position.
    numeric: dict[int, float] = {}
    for i, r in enumerate(table.rows):
        if column >= len(r.cells):
            continue
        v = r.cells[column].value
        if isinstance(v, (int, float)) and not _isnan(v):
            numeric[i] = float(v)

    if not numeric:
        return table  # nothing to colour

    lo = min(numeric.values())
    hi = max(numeric.values())
    # A constant column has zero range; a unit span maps every cell to t = 0.
    span = hi - lo if hi > lo else 1.0
    lo_color, mid_color, hi_color = palette[0], palette[1], palette[2]

    def interp(value: float) -> str:
        t = (value - lo) / span
        # Two segments: lo→mid for t < 0.5, mid→hi for t >= 0.5.
        if t < 0.5:
            return _mix_hex(lo_color, mid_color, t / 0.5)
        return _mix_hex(mid_color, hi_color, (t - 0.5) / 0.5)

    # Pass 2: rebuild the rows, styling only the cells collected above.
    new_rows: list[Row] = []
    for i, r in enumerate(table.rows):
        if i not in numeric:
            new_rows.append(r)
            continue
        cells = list(r.cells)
        c = cells[column]
        declaration = f"background:{interp(numeric[i])}"
        style = dict(c.style or {})
        # Join with an explicit ";" so an existing declaration that lacks a
        # trailing semicolon (e.g. "color:red", or the output of an earlier
        # call) is not fused with the new one into invalid CSS.
        existing = style.get("html", "").strip("; ")
        style["html"] = f"{existing};{declaration}" if existing else declaration
        cells[column] = replace(c, style=style)
        new_rows.append(replace(r, cells=tuple(cells)))
    return replace(table, rows=tuple(new_rows))
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _mix_hex(a: str, b: str, t: float) -> str:
    """Linear-interpolate two hex colours at parameter ``t``.

    Parameters
    ----------
    a, b
        Colours written as ``#rrggbb`` or the ``#rgb`` shorthand; the
        leading ``#`` is optional.  Digits beyond the sixth are ignored.
    t
        Blend position, clamped to ``[0, 1]``; ``0`` yields ``a`` and ``1``
        yields ``b``.

    Returns
    -------
    str
        The blended colour as lowercase ``#rrggbb``.

    Raises
    ------
    ValueError
        If a colour has too few digits or contains non-hex characters.
    """
    t = max(0.0, min(1.0, t))

    def channels(color: str) -> tuple[int, int, int]:
        digits = color.lstrip("#")
        if len(digits) == 3:
            # CSS shorthand: "#abc" means "#aabbcc".
            digits = "".join(ch * 2 for ch in digits)
        if len(digits) < 6:
            raise ValueError(
                f"expected a #rgb or #rrggbb colour, got {color!r}"
            )
        return int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16)

    start = channels(a)
    end = channels(b)
    r, g, bl = (
        int(round(lo + (hi - lo) * t)) for lo, hi in zip(start, end)
    )
    return f"#{r:02x}{g:02x}{bl:02x}"
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _insert_after_label_header(
    headers: tuple[HeaderRow, ...], label: str,
) -> tuple[HeaderRow, ...]:
    """Return ``headers`` with a ``label`` cell added at position 1 of every row.

    Position 1 is the slot right after the first (label) column.  Each
    header row is rebuilt as a fresh ``HeaderRow`` from its cells.
    """
    def widen(header_row: HeaderRow) -> HeaderRow:
        existing = tuple(header_row.cells)
        return HeaderRow(
            cells=(*existing[:1], HeaderCell(text=label), *existing[1:]),
        )

    return tuple(widen(hr) for hr in headers)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _insert_after_label_cell(
    row: Row, text: str, *, value: Any,
) -> Row:
    """Return a copy of ``row`` with a right-aligned cell added at position 1.

    The new cell shows ``text`` and carries ``value`` as its raw value.
    """
    existing = tuple(row.cells)
    extra = make_cell(text, value=value, align="right")
    return replace(row, cells=existing[:1] + (extra,) + existing[1:])
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def add_global_p(table: SofraTable) -> SofraTable:
    """Add a joint-test ``global p`` column to a :func:`tbl_regression` table.

    Body rows are grouped by the stem of their label (see
    :func:`_coef_stem`).  For each stem, every model parameter with the same
    stem is constrained to zero simultaneously and tested with
    ``model.f_test()``; all rows of that stem share the resulting p-value.
    A stem with a single coefficient is tested the same way, with one
    constraint.  A stem with no matching parameter, or whose test raises,
    shows an em-dash and carries ``None`` as its value.

    Raises
    ------
    NotImplementedError
        If the table metadata holds no ``model`` exposing ``f_test``.  Joint
        tests on a :func:`tbl_one` table would need per-variable re-fits on
        the source data, which is not implemented; raising avoids emitting a
        misleading column of em-dashes.

    Notes
    -----
    Stems come from :func:`_coef_stem`, which strips statsmodels-style level
    markers such as ``C(race)[T.B]`` / ``arm[T.Treatment]``.
    """
    model = (table.metadata or {}).get("model")
    if model is None or not hasattr(model, "f_test"):
        raise NotImplementedError(
            "add_global_p currently supports tbl_regression tables only. "
            "For a tbl_one / tbl_summary table, joint Type-III tests would "
            "require re-fitting per-variable regressions on the source "
            "data — that path is not implemented yet. Track the issue "
            "before using `add_global_p` on a non-regression table."
        )

    new_headers = _insert_after_pvalue_header(table.headers, "global p")

    # One stem per body row, in row order.
    row_stems: list[str] = [_coef_stem(r.cells[0].text) for r in table.rows]

    params = getattr(model, "params", None)
    if params is not None and hasattr(params, "index"):
        param_names = list(params.index)
    else:
        param_names = []

    # One joint p-value per distinct stem (first-seen order).
    joint_p: dict[str, float | None] = {}
    for stem in dict.fromkeys(row_stems):
        members = [name for name in param_names if _coef_stem(name) == stem]
        if not members:
            joint_p[stem] = None
            continue
        # Constraint string of the form "c1 = 0, c2 = 0, ...".
        hypothesis = ", ".join(f"{name} = 0" for name in members)
        try:
            outcome = model.f_test(hypothesis)
            joint_p[stem] = float(outcome.pvalue)
        except Exception:  # pragma: no cover — exotic models / singular cov
            joint_p[stem] = None

    from ..core.format import fmt_p_value

    new_rows = []
    for r, stem in zip(table.rows, row_stems):
        p = joint_p.get(stem)
        shown = "—" if p is None else fmt_p_value(p)
        new_rows.append(_insert_after_pvalue_cell(r, shown, value=p))
    return replace(table, headers=new_headers, rows=tuple(new_rows))
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _coef_stem(name: str) -> str:
    """Return the part of a coefficient name before its level marker.

    The markers ``"[T."``, ``"["`` and ``"_T_"`` are tried in that order; the
    name is cut at the first occurrence of the first marker it contains.
    Names with none of the markers are returned unchanged.

    ``C(race)[T.B]``      → ``C(race)``
    ``arm[T.Treatment]``  → ``arm``
    ``age``               → ``age``
    """
    found = next((m for m in ("[T.", "[", "_T_") if m in name), None)
    if found is None:
        return name
    return name.partition(found)[0]
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
# ----------------------------------------------------------------------
|
|
325
|
+
# add_difference — between-group mean / proportion differences
|
|
326
|
+
# ----------------------------------------------------------------------
|
|
327
|
+
|
|
328
|
+
def add_difference(
    table: SofraTable,
    *,
    digits: int = 2,
    conf_level: float = 0.95,
) -> SofraTable:
    """Add an absolute-difference column with CI for a 2-group Table 1.

    For each continuous variable the column shows ``mean_2 - mean_1`` with
    its Welch confidence interval.  For each dichotomous variable it shows
    ``prop_2 - prop_1`` with the **Newcombe hybrid-score CI** (Newcombe 1998,
    *Stat Med* 17:873–890, Method 10), which combines two single-sample
    Wilson intervals and does not collapse at extreme proportions the way
    the Wald interval does.  Variables of any other kind, and variables
    whose estimate cannot be computed, show ``—``.

    Group 1 / group 2 follow the category order when ``by`` is categorical
    (the order :func:`_resolve_groups` uses) and string-sorted order
    otherwise.  Proportions use the non-missing observations of the
    variable within each group as denominator, as :func:`add_ci` does.

    Parameters
    ----------
    table
        Table built by ``tbl_one`` with a ``by=`` variable that has exactly
        two observed levels.
    digits
        Decimal places passed to ``fmt_number`` for the estimate and bounds.
    conf_level
        Confidence level of the interval.

    Raises
    ------
    ValueError
        If the table was not built by ``tbl_one``, has no ``by`` variable,
        has no recoverable source data, or ``by`` does not have exactly two
        observed levels.
    """
    if table._spec is None or table._spec.builder not in ("tbl_one",):
        raise ValueError(
            "add_difference is only supported on tbl_one / tbl_summary tables."
        )
    opts = table._spec.options
    by = opts["by"]
    if by is None:
        raise ValueError("add_difference requires a stratification variable (by=).")

    # The rebuild closure is the only handle we have on the original data.
    rebuild = table._rebuild
    if rebuild is None:
        raise ValueError(
            "add_difference needs access to the original data — only tables "
            "built directly by tbl_one / tbl_summary qualify."
        )
    data = _data_from_rebuild(rebuild)
    if data is None:
        raise ValueError("Could not recover source data from table closure.")

    by_series = data[by]
    levels = _observed_levels(by_series)
    if len(levels) != 2:
        raise ValueError(
            f"add_difference requires exactly 2 groups; got {len(levels)}."
        )
    g1, g2 = levels
    mask1 = by_series == g1
    mask2 = by_series == g2

    kinds = opts["kinds"]
    variables = opts["variables"]

    no_estimate: tuple[None, None, None] = (None, None, None)
    diffs: dict[str, tuple[float | None, float | None, float | None]] = {}
    for var in variables:
        if kinds[var] == "continuous":
            a = pd.to_numeric(data.loc[mask1, var], errors="coerce").dropna()
            b = pd.to_numeric(data.loc[mask2, var], errors="coerce").dropna()
            diffs[var] = _welch_diff_ci(a, b, conf_level)
        elif kinds[var] == "dichotomous":
            s = data[var]
            if isinstance(s.dtype, pd.CategoricalDtype):
                lvls = list(s.cat.categories)
            else:
                lvls = sorted(s.dropna().unique(), key=str)
            if len(lvls) != 2:
                diffs[var] = no_estimate
                continue
            success = lvls[1]
            col1 = data.loc[mask1, var]
            col2 = data.loc[mask2, var]
            # Denominators count non-missing observations only, so a missing
            # value is not treated as a "failure".
            diffs[var] = _newcombe_diff_ci(
                int((col1 == success).sum()), int(col1.notna().sum()),
                int((col2 == success).sum()), int(col2.notna().sum()),
                conf_level,
            )
        else:
            diffs[var] = no_estimate

    # Insert the new column right before any p-value column.
    new_headers = _insert_after_groups_header(
        table.headers,
        f"Diff ({int(round(conf_level * 100))}% CI)",
    )

    new_rows: list[Row] = []
    for r in table.rows:
        label = r.cells[0].text
        var = _find_variable_for_row(label, variables, kinds, labels=opts.get("labels"))
        text: str = ""
        value: Any = None
        if var is not None and var in diffs:
            d, lo, hi = diffs[var]
            if (
                d is None or lo is None or hi is None
                or any(_isnan(x) for x in (d, lo, hi))
            ):
                text = "—"
            else:
                text = (
                    f"{fmt_number(d, digits)} "
                    f"({fmt_number(lo, digits)}, {fmt_number(hi, digits)})"
                )
                value = (d, lo, hi)
        new_rows.append(_insert_after_groups_cell(r, text, value=value,
                                                  kind="ci"))
    return replace(table, headers=new_headers, rows=tuple(new_rows))


def _observed_levels(series: pd.Series) -> list[Any]:
    """Return the observed levels of ``series`` in display order."""
    if isinstance(series.dtype, pd.CategoricalDtype):
        # Keep the declared category order; drop categories never observed.
        return [c for c in series.cat.categories if (series == c).any()]
    return sorted(series.dropna().unique(), key=str)


def _welch_diff_ci(
    a: pd.Series, b: pd.Series, conf_level: float,
) -> tuple[float | None, float | None, float | None]:
    """Return ``(diff, lo, hi)`` for ``mean(b) - mean(a)`` with a Welch t interval."""
    if len(a) < 2 or len(b) < 2:
        return (None, None, None)
    # Squared standard error contributed by each group.
    va = a.var(ddof=1) / len(a)
    vb = b.var(ddof=1) / len(b)
    denom = vb ** 2 / (len(b) - 1) + va ** 2 / (len(a) - 1)
    if denom == 0:
        # Both groups are constant: the Welch–Satterthwaite df is 0/0.
        return (None, None, None)
    diff = float(b.mean() - a.mean())
    se = math.sqrt(vb + va)
    df_w = (vb + va) ** 2 / denom
    tcrit = float(sp_stats.t.ppf(0.5 + conf_level / 2, df=df_w))
    return (diff, diff - tcrit * se, diff + tcrit * se)


def _newcombe_diff_ci(
    x1: int, n1: int, x2: int, n2: int, conf_level: float,
) -> tuple[float | None, float | None, float | None]:
    """Return ``(diff, lo, hi)`` for ``x2/n2 - x1/n1`` using Newcombe's Method 10."""
    if n1 == 0 or n2 == 0:
        return (None, None, None)
    p1, p2 = x1 / n1, x2 / n2
    diff = p2 - p1
    zcrit = float(sp_stats.norm.ppf(0.5 + conf_level / 2))
    lo1, hi1 = _wilson_ci(x1, n1, z=zcrit)
    lo2, hi2 = _wilson_ci(x2, n2, z=zcrit)
    # With diff = p2 - p1 the lower bound pairs the upper Wilson limit of
    # group 1 with the lower limit of group 2, and vice versa for the upper.
    lo = diff - math.sqrt((hi1 - p1) ** 2 + (p2 - lo2) ** 2)
    hi = diff + math.sqrt((p1 - lo1) ** 2 + (hi2 - p2) ** 2)
    return (diff, lo, hi)
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
# ----------------------------------------------------------------------
|
|
474
|
+
# add_ci — confidence intervals for each summary cell
|
|
475
|
+
# ----------------------------------------------------------------------
|
|
476
|
+
|
|
477
|
+
def add_ci(
    table: SofraTable,
    *,
    conf_level: float = 0.95,
) -> SofraTable:
    """Append a bracketed confidence interval to each summary cell.

    For rows of a continuous variable the group cell text gains
    ``[lo, hi]``, the one-sample Student-t interval for the mean
    (``mean ± t * sd / sqrt(n)`` with ``n - 1`` degrees of freedom).  For
    dichotomous rows whose label contains ``=`` the cell gains a
    Wilson-score interval for the proportion of the second level, computed
    over the non-missing observations of the group.  All other rows are
    left unchanged.  A footnote describing the intervals is appended.

    Group cells are taken to be columns ``1 .. len(groups)`` of each row,
    with the overall column first when the spec's ``overall`` option is set.
    NOTE(review): this assumes no column was inserted after the label
    (e.g. by ``add_n``) before calling ``add_ci`` — confirm against callers.

    Raises
    ------
    ValueError
        If the table carries no spec / rebuild closure, or no DataFrame can
        be recovered from the closure.
    """
    if table._spec is None or table._rebuild is None:
        raise ValueError(
            "add_ci needs access to the source data — only tables built "
            "directly by tbl_one / tbl_summary qualify."
        )
    data = _data_from_rebuild(table._rebuild)
    if data is None:
        raise ValueError("Could not recover source data from table closure.")

    opts = table._spec.options
    by = opts["by"]
    kinds = opts["kinds"]
    variables = opts["variables"]

    group_keys, group_masks = _resolve_groups(data, by)
    if opts.get("overall"):
        overall_key = opts.get("overall_label", "Overall")
        group_keys = [overall_key, *group_keys]
        group_masks = {overall_key: pd.Series(True, index=data.index),
                       **group_masks}

    new_rows: list[Row] = []
    upper_tail = 0.5 + conf_level / 2
    z = float(sp_stats.norm.ppf(upper_tail))

    for r in table.rows:
        label = r.cells[0].text
        var = _find_variable_for_row(label, variables, kinds, labels=opts.get("labels"))
        if var is None:
            new_rows.append(r)
            continue
        kind = kinds[var]

        # Dichotomous rows are labelled "label = success_level".  The level
        # pair does not depend on the group, so resolve it once per row.
        dichotomous_row = kind == "dichotomous" and "=" in label
        lvls: list[Any] = []
        if dichotomous_row:
            s = data[var]
            lvls = (list(s.cat.categories)
                    if isinstance(s.dtype, pd.CategoricalDtype)
                    else sorted(s.dropna().unique(), key=str))

        # Patch group cells (columns 1..len(group_keys)).
        new_cells = list(r.cells)
        for offset, k in enumerate(group_keys):
            col = 1 + offset
            if col >= len(new_cells):
                break
            old = new_cells[col]
            mask = group_masks[k]
            if kind == "continuous":
                v = pd.to_numeric(data.loc[mask, var], errors="coerce").dropna()
                if len(v) < 2:
                    continue
                m = float(v.mean())
                se = float(v.std(ddof=1)) / math.sqrt(len(v))
                tcrit = float(sp_stats.t.ppf(upper_tail, df=len(v) - 1))
                lo, hi = m - tcrit * se, m + tcrit * se
                ci = f" [{fmt_number(lo, 2)}, {fmt_number(hi, 2)}]"
                new_cells[col] = replace(old, text=old.text + ci)
            elif dichotomous_row:
                if len(lvls) != 2:
                    continue
                success = lvls[1]
                observed = data.loc[mask, var]
                n = int(observed.notna().sum())
                x = int((observed == success).sum())
                if n == 0:
                    continue
                lo, hi = _wilson_ci(x, n, z=z)
                ci = f" [{fmt_number(100*lo, 1)}%, {fmt_number(100*hi, 1)}%]"
                new_cells[col] = replace(old, text=old.text + ci)
        new_rows.append(replace(r, cells=tuple(new_cells)))

    # The mean interval is a one-sample Student-t interval; "Welch" names the
    # two-sample unequal-variance procedure and would mislabel it.
    fn = (
        f"Bracketed intervals: {int(round(conf_level*100))}% confidence "
        "interval (Student-t for means, Wilson-score for proportions)."
    )
    return replace(
        table,
        rows=tuple(new_rows),
        footnotes=(*table.footnotes, fn),
    )
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def _wilson_ci(x: int, n: int, *, z: float) -> tuple[float, float]:
    """Return the Wilson score interval ``(lower, upper)`` for ``x`` of ``n``.

    ``z`` is the normal critical value.  The bounds are clipped to
    ``[0, 1]``; with ``n == 0`` both bounds are NaN.
    """
    if n == 0:
        undefined = float("nan")
        return undefined, undefined
    p = x / n
    scale = 1 + z * z / n
    midpoint = (p + z * z / (2 * n)) / scale
    margin = (z * math.sqrt(p * (1 - p) / n + z * z / (4 * n * n))) / scale
    lower = midpoint - margin
    upper = midpoint + margin
    return max(0.0, lower), min(1.0, upper)
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
# ----------------------------------------------------------------------
|
|
578
|
+
# Formatter override modifiers
|
|
579
|
+
# ----------------------------------------------------------------------
|
|
580
|
+
|
|
581
|
+
def with_pvalue_fmt(
    table: SofraTable,
    fn: Callable[[float], str],
) -> SofraTable:
    """Return ``table`` with every p-value cell's text replaced by ``fn(value)``.

    Only cells of kind ``"p_value"`` holding a numeric, non-NaN value are
    rewritten (see :func:`_apply_formatter`).
    """
    return _apply_formatter(table, fn=fn, kind="p_value")
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
def with_estimate_fmt(
    table: SofraTable,
    fn: Callable[[float], str],
) -> SofraTable:
    """Return ``table`` with every numeric cell's text replaced by ``fn(value)``.

    Only cells of kind ``"numeric"`` holding a numeric, non-NaN value are
    rewritten (see :func:`_apply_formatter`).
    """
    return _apply_formatter(table, fn=fn, kind="numeric")
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
def _apply_formatter(
    table: SofraTable,
    *,
    kind: str,
    fn: Callable[[float], str],
) -> SofraTable:
    """Re-render the text of every cell of ``kind`` through ``fn``.

    A cell is rewritten only when its ``kind`` matches and its ``value`` is
    an ``int`` / ``float`` that is not NaN; ``fn`` receives the value as a
    ``float``.  All other cells are carried over unchanged.
    """
    def reformat(cell: Cell) -> Cell:
        eligible = (
            cell.kind == kind
            and isinstance(cell.value, (int, float))
            and not _isnan(cell.value)
        )
        if not eligible:
            return cell
        return replace(cell, text=fn(float(cell.value)))

    return replace(
        table,
        rows=tuple(
            replace(row, cells=tuple(reformat(c) for c in row.cells))
            for row in table.rows
        ),
    )
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
# ----------------------------------------------------------------------
|
|
617
|
+
# Helpers
|
|
618
|
+
# ----------------------------------------------------------------------
|
|
619
|
+
|
|
620
|
+
def _isnan(x: Any) -> bool:
    """Return ``True`` when ``x`` converts to a float NaN, ``False`` otherwise.

    Values that cannot be converted to ``float`` at all — wrong type,
    unparsable string, or an integer too large for a float — are not NaN and
    yield ``False`` rather than raising.
    """
    try:
        return math.isnan(float(x))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() of an int beyond the float range.
        return False
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
def _data_from_rebuild(rebuild: Callable[..., Any]) -> pd.DataFrame | None:
    """Recover the source DataFrame captured by a builder's rebuild closure.

    Scans the cells of ``rebuild.__closure__`` in order and returns the
    first ``pandas.DataFrame`` found.  Returns ``None`` when ``rebuild`` has
    no closure or none of its cells holds a DataFrame.
    """
    for cell in getattr(rebuild, "__closure__", None) or ():
        try:
            contents = cell.cell_contents
        except ValueError:
            # An empty cell (captured name not bound) raises on access;
            # skip it instead of aborting the whole scan.
            continue
        if isinstance(contents, pd.DataFrame):
            return contents
    return None
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def _find_variable_for_row(
    label: str,
    variables: tuple[str, ...],
    kinds: dict[str, str],
    *,
    labels: dict[str, str] | None = None,
) -> str | None:
    """Map a body row's displayed label back to its source variable.

    A name matches when the label equals it or starts with ``"<name> ="``
    (the dichotomous form, e.g. ``"sex = Male"``).  Raw variable names are
    tried first, in ``variables`` order; then the non-empty display names in
    ``labels`` (e.g. ``"Patient sex = Male"`` for
    ``labels={"sex": "Patient sex"}``).  Returns ``None`` when nothing
    matches.  ``kinds`` is accepted but not consulted.
    """
    def names_row(candidate: str) -> bool:
        return label == candidate or label.startswith(f"{candidate} =")

    by_raw_name = next((v for v in variables if names_row(v)), None)
    if by_raw_name is not None:
        return by_raw_name

    # Display-relabelled rows: fall back to the labels mapping.
    for source, shown in (labels or {}).items():
        if shown and names_row(shown):
            return source
    return None
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
def _resolve_groups(data: pd.DataFrame, by: str | None) -> tuple[list[Any], dict[Any, pd.Series]]:
    """Return the group keys of ``data[by]`` and a boolean row mask per key.

    With ``by=None`` there is a single ``"Overall"`` group covering every
    row.  Otherwise the keys are the categories of a categorical column (in
    category order) or the string-sorted non-missing unique values.
    """
    if by is None:
        everyone = pd.Series(True, index=data.index)
        return ["Overall"], {"Overall": everyone}

    column = data[by]
    if isinstance(column.dtype, pd.CategoricalDtype):
        keys = list(column.cat.categories)
    else:
        keys = sorted(column.dropna().unique(), key=str)

    masks: dict[Any, pd.Series] = {}
    for key in keys:
        masks[key] = column == key
    return list(keys), masks
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
def _insert_after_pvalue_header(headers: tuple[HeaderRow, ...], label: str) -> tuple[HeaderRow, ...]:
    """Add a ``label`` header cell right after the first p-value header.

    In each header row the first cell whose lowercased text starts with
    ``"p-value"`` or equals ``"p"`` marks the p-value column.  Rows without
    such a cell get the new cell appended at the end.
    """
    def is_p_header(cell: HeaderCell) -> bool:
        lowered = cell.text.lower()
        return lowered.startswith("p-value") or lowered == "p"

    widened: list[HeaderRow] = []
    for header_row in headers:
        cells = list(header_row.cells)
        hit = next((j for j, c in enumerate(cells) if is_p_header(c)), None)
        target = len(cells) if hit is None else hit + 1
        cells.insert(target, HeaderCell(text=label))
        widened.append(HeaderRow(cells=tuple(cells)))
    return tuple(widened)
|
|
696
|
+
|
|
697
|
+
|
|
698
|
+
def _insert_after_pvalue_cell(row: Row, text: str, *, value: Any) -> Row:
    """Return a copy of ``row`` with a new cell after its first p-value cell.

    The new right-aligned cell shows ``text`` and carries ``value``.  Rows
    without a cell of kind ``"p_value"`` get it appended at the end.
    """
    cells = list(row.cells)
    target = next(
        (j + 1 for j, c in enumerate(cells) if c.kind == "p_value"),
        len(cells),
    )
    cells.insert(target, make_cell(text, value=value, align="right"))
    return replace(row, cells=tuple(cells))
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
def _insert_after_groups_header(
    headers: tuple[HeaderRow, ...], label: str,
) -> tuple[HeaderRow, ...]:
    """Insert a header cell named ``label`` right before any p-value / SMD column.

    In each header row the new cell goes in front of the first cell that
    :func:`_is_trailing_stat_header` recognises, or at the end of the row
    when there is none.
    """
    new_headers: list[HeaderRow] = []
    for hr in headers:
        new_cells = list(hr.cells)
        insert_at = next(
            (j for j, c in enumerate(new_cells)
             if _is_trailing_stat_header(c.text)),
            len(new_cells),
        )
        new_cells.insert(insert_at, HeaderCell(text=label))
        new_headers.append(HeaderRow(cells=tuple(new_cells)))
    return tuple(new_headers)


def _is_trailing_stat_header(text: str) -> bool:
    """Return whether a header text names a p-value or SMD column."""
    lowered = text.lower()
    if lowered.startswith("smd"):
        return True
    # "p", "p-value", "p value", "p (adj)" qualify; a word that merely begins
    # with "p" — e.g. a group column titled "Placebo" — must not, or the new
    # header would land in front of a group column.
    return lowered.startswith("p") and not lowered[1:2].isalpha()
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
def _insert_after_groups_cell(
    row: Row,
    text: str,
    *,
    value: Any,
    kind: Any = "text",
) -> Row:
    """Return a copy of ``row`` with a new cell placed after the group cells.

    The new right-aligned cell goes in front of the first cell that starts
    the trailing statistics: a cell of kind ``"p_value"`` / ``"q_value"``,
    or a final ``"numeric"`` cell holding a non-NaN number whose text is
    purely digits once ``.`` and ``-`` are removed.  Without such a cell the
    new cell is appended at the end.
    """
    cells: list[Cell] = list(row.cells)
    final = len(cells) - 1

    def starts_trailing_stats(position: int, cell: Cell) -> bool:
        if cell.kind in ("p_value", "q_value"):
            return True
        if cell.kind != "numeric" or position != final:
            return False
        if not isinstance(cell.value, (int, float)):
            return False
        if _isnan(cell.value or 0):
            return False
        return bool(
            cell.text
            and cell.text.replace(".", "").replace("-", "").isdigit()
        )

    target = next(
        (j for j, c in enumerate(cells) if starts_trailing_stats(j, c)),
        len(cells),
    )
    cells.insert(target, make_cell(text, value=value, kind=kind, align="right"))
    return replace(row, cells=tuple(cells))
|