pysofra 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. pysofra/__init__.py +82 -0
  2. pysofra/core/__init__.py +14 -0
  3. pysofra/core/compose.py +167 -0
  4. pysofra/core/format.py +155 -0
  5. pysofra/core/frames.py +69 -0
  6. pysofra/core/schema.py +128 -0
  7. pysofra/core/table.py +924 -0
  8. pysofra/io/__init__.py +1 -0
  9. pysofra/models/__init__.py +6 -0
  10. pysofra/models/extract.py +249 -0
  11. pysofra/models/pool.py +119 -0
  12. pysofra/models/regression.py +507 -0
  13. pysofra/models/survival.py +395 -0
  14. pysofra/models/uvregression.py +438 -0
  15. pysofra/notebook/__init__.py +6 -0
  16. pysofra/plot/__init__.py +23 -0
  17. pysofra/plot/_backend.py +32 -0
  18. pysofra/plot/forest.py +159 -0
  19. pysofra/plot/inline.py +171 -0
  20. pysofra/plot/km.py +249 -0
  21. pysofra/render/__init__.py +28 -0
  22. pysofra/render/_zip_determinism.py +57 -0
  23. pysofra/render/base.py +22 -0
  24. pysofra/render/docx.py +286 -0
  25. pysofra/render/html.py +442 -0
  26. pysofra/render/image.py +130 -0
  27. pysofra/render/latex.py +253 -0
  28. pysofra/render/markdown.py +128 -0
  29. pysofra/render/pptx.py +340 -0
  30. pysofra/render/xlsx.py +226 -0
  31. pysofra/summary/__init__.py +6 -0
  32. pysofra/summary/calibrate.py +214 -0
  33. pysofra/summary/design.py +246 -0
  34. pysofra/summary/effect_size.py +187 -0
  35. pysofra/summary/extras.py +745 -0
  36. pysofra/summary/smd.py +133 -0
  37. pysofra/summary/stats.py +135 -0
  38. pysofra/summary/tbl_cross.py +339 -0
  39. pysofra/summary/tbl_one.py +1220 -0
  40. pysofra/summary/tbl_summary.py +51 -0
  41. pysofra/summary/tests.py +370 -0
  42. pysofra/summary/typing.py +129 -0
  43. pysofra/summary/weights.py +161 -0
  44. pysofra/themes/__init__.py +5 -0
  45. pysofra/themes/registry.py +272 -0
  46. pysofra-0.1.0a1.dist-info/METADATA +301 -0
  47. pysofra-0.1.0a1.dist-info/RECORD +50 -0
  48. pysofra-0.1.0a1.dist-info/WHEEL +4 -0
  49. pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
  50. pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
@@ -0,0 +1,745 @@
1
+ """Extra modifiers — joint Type-III tests, between-group differences,
2
+ descriptive confidence intervals, formatter overrides.
3
+
4
+ These are smaller add-ons to the core ``tbl_one`` / ``tbl_summary``
5
+ output, modelled on the corresponding ``gtsummary`` functions
6
+ (``add_global_p``, ``add_difference``, ``add_ci``, ``estimate_fun=``,
7
+ ``pvalue_fun=``).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ from collections.abc import Callable
14
+ from dataclasses import replace
15
+ from typing import Any
16
+
17
+ import pandas as pd
18
+ from scipy import stats as sp_stats
19
+
20
+ from ..core.format import fmt_number
21
+ from ..core.schema import Cell, HeaderCell, HeaderRow, Row, make_cell
22
+ from ..core.table import SofraTable
23
+
24
+ # ----------------------------------------------------------------------
25
+ # Column add-ons — significance stars, N, statistic label, colour scale, add_global_p
26
+ # ----------------------------------------------------------------------
27
+
28
def add_significance_stars(
    table: SofraTable,
    *,
    thresholds: tuple[tuple[float, str], ...] = (
        (0.001, "***"),
        (0.01, "**"),
        (0.05, "*"),
    ),
) -> SofraTable:
    """Append an unlabelled column of ``*** / ** / *`` significance markers.

    Parameters
    ----------
    table
        Table whose rows may contain a cell of kind ``"p_value"``.
    thresholds
        ``(cutoff, marker)`` pairs.  A p-value receives the marker of the
        smallest cutoff it falls strictly below.  The pairs are sorted by
        cutoff internally, so callers do not have to pass them in order.

    Returns
    -------
    SofraTable
        A copy of ``table`` with one extra column placed after the p-value
        column (or at the end when no p-value column is found).  Rows with
        no numeric, non-NaN p-value get an empty marker.
    """
    # Sort defensively: with unsorted input a loose cutoff listed first
    # would otherwise shadow every stricter one.
    ordered = sorted(thresholds, key=lambda pair: pair[0])

    new_rows: list[Row] = []
    for r in table.rows:
        p_cell = next(
            (c for c in r.cells
             if c.kind == "p_value" and isinstance(c.value, (int, float))),
            None,
        )
        marker = ""
        if p_cell is not None:
            p = float(p_cell.value)
            if not _isnan(p):
                marker = next((m for cutoff, m in ordered if p < cutoff), "")
        new_rows.append(_insert_after_pvalue_cell(r, marker, value=None))

    # The stars column deliberately carries an empty header label.
    new_headers = _insert_after_pvalue_header(table.headers, "")
    return replace(table, headers=tuple(new_headers), rows=tuple(new_rows))
67
+
68
+
69
def add_n(table: SofraTable) -> SofraTable:
    """Insert an ``N`` column (non-missing count per variable) after the label.

    The source DataFrame is recovered from the table's rebuild closure and
    each variable's non-missing observations are counted once.  Every body
    row that maps back to a variable shows that variable's count, formatted
    with thousands separators; rows that do not map to a variable get an
    empty cell.

    Raises
    ------
    ValueError
        If the table has no spec / rebuild closure, or no DataFrame can be
        recovered from the closure.
    """
    spec = table._spec
    rebuild = table._rebuild
    if spec is None or rebuild is None:
        raise ValueError(
            "add_n needs access to the source data — only tables built "
            "directly by tbl_one / tbl_summary qualify."
        )
    frame = _data_from_rebuild(rebuild)
    if frame is None:
        raise ValueError("Could not recover source data from table closure.")

    options = spec.options
    variables = options["variables"]
    kinds = options["kinds"]

    counts: dict[str, int] = {
        name: int(frame[name].notna().sum()) for name in variables
    }

    def with_count(row: Row) -> Row:
        # Map the displayed label back to its source variable, if any.
        var = _find_variable_for_row(
            row.cells[0].text, variables, kinds, labels=options.get("labels"),
        )
        shown = f"{counts[var]:,}" if var is not None and var in counts else ""
        raw = counts.get(var) if var else None
        return _insert_after_label_cell(row, shown, value=raw)

    widened_headers = _insert_after_label_header(table.headers, "N")
    widened_rows = tuple(with_count(row) for row in table.rows)
    return replace(table, headers=widened_headers, rows=widened_rows)
103
+
104
+
105
def add_stat_label(table: SofraTable) -> SofraTable:
    """Insert a ``Statistic`` column naming the summary shown in each row.

    Variables whose kind is ``"continuous"`` are labelled "Mean (SD)", or
    "Median (Q1, Q3)" when listed in the spec's ``nonnormal`` option; every
    other kind is labelled "n (%)".  Rows that cannot be matched to a
    variable get an empty cell.

    Raises
    ------
    ValueError
        If the table carries no spec or no rebuild closure.
    """
    spec = table._spec
    if spec is None or table._rebuild is None:
        raise ValueError(
            "add_stat_label needs a tbl_one / tbl_summary source table."
        )
    options = spec.options
    variables = options["variables"]
    kinds = options["kinds"]
    skewed = set(options.get("nonnormal", set()))

    def describe(name: str) -> str:
        if kinds[name] != "continuous":
            return "n (%)"
        return "Median (Q1, Q3)" if name in skewed else "Mean (SD)"

    stat_text: dict[str, str] = {name: describe(name) for name in variables}

    widened_headers = _insert_after_label_header(table.headers, "Statistic")
    widened_rows: list[Row] = []
    for row in table.rows:
        var = _find_variable_for_row(
            row.cells[0].text, variables, kinds, labels=options.get("labels"),
        )
        shown = stat_text.get(var, "") if var else ""
        widened_rows.append(_insert_after_label_cell(row, shown, value=None))
    return replace(table, headers=widened_headers, rows=tuple(widened_rows))
137
+
138
+
139
def color_scale_if(
    table: SofraTable,
    *,
    column: int,
    palette: tuple[str, str, str] = ("#fff5f0", "#fcae91", "#cb181d"),
    skip_blank: bool = True,
) -> SofraTable:
    """Heatmap-style cell colouring for a numeric column (HTML only).

    Parameters
    ----------
    table
        Table whose body rows are scanned.
    column
        Index of the cell to colour in each row.  Rows with fewer cells
        are left untouched.
    palette
        Three ``#rrggbb`` stops: the colour for the column minimum, the
        midpoint, and the maximum.
    skip_blank
        Accepted for API compatibility; this function does not currently
        read it.  Cells whose ``value`` is not a non-NaN ``int`` / ``float``
        are always left unchanged.

    Returns
    -------
    SofraTable
        ``table`` itself when no numeric value is found in ``column``;
        otherwise a copy in which each numeric cell's ``style["html"]``
        gains a ``background:<colour>`` declaration.
    """
    # Pass 1: collect the numeric values and the rows they belong to.
    vals: list[tuple[int, float]] = []
    for i, r in enumerate(table.rows):
        if column >= len(r.cells):
            continue
        v = r.cells[column].value
        if isinstance(v, (int, float)) and not _isnan(v):
            vals.append((i, float(v)))

    if not vals:
        return table  # nothing to colour

    lo = min(v for _, v in vals)
    hi = max(v for _, v in vals)
    # A constant column would give a zero span; use 1.0 so every cell maps
    # to the low end of the palette instead of dividing by zero.
    span = hi - lo if hi > lo else 1.0
    lo_color, mid_color, hi_color = palette

    def interp(value: float) -> str:
        t = (value - lo) / span
        # Two segments: lo→mid for t < 0.5, mid→hi for t >= 0.5.
        if t < 0.5:
            return _mix_hex(lo_color, mid_color, t / 0.5)
        return _mix_hex(mid_color, hi_color, (t - 0.5) / 0.5)

    val_dict = dict(vals)
    new_rows: list[Row] = []
    for i, r in enumerate(table.rows):
        if i not in val_dict:
            new_rows.append(r)
            continue
        cells = list(r.cells)
        c = cells[column]
        declaration = f"background:{interp(val_dict[i])}"
        style = dict(c.style or {})
        # Join with an explicit ';' so a pre-existing declaration that lacks
        # a trailing semicolon is not fused with the new one.
        existing = style.get("html", "").strip().strip(";")
        style["html"] = f"{existing};{declaration}" if existing else declaration
        cells[column] = replace(c, style=style)
        new_rows.append(replace(r, cells=tuple(cells)))
    return replace(table, rows=tuple(new_rows))
194
+
195
+
196
+ def _mix_hex(a: str, b: str, t: float) -> str:
197
+ """Linear-interpolate two ``#rrggbb`` colours at parameter ``t``."""
198
+ t = max(0.0, min(1.0, t))
199
+ a = a.lstrip("#")
200
+ b = b.lstrip("#")
201
+ ar, ag, ab = int(a[0:2], 16), int(a[2:4], 16), int(a[4:6], 16)
202
+ br, bg, bb = int(b[0:2], 16), int(b[2:4], 16), int(b[4:6], 16)
203
+ r = int(round(ar + (br - ar) * t))
204
+ g = int(round(ag + (bg - ag) * t))
205
+ bl = int(round(ab + (bb - ab) * t))
206
+ return f"#{r:02x}{g:02x}{bl:02x}"
207
+
208
+
209
def _insert_after_label_header(
    headers: tuple[HeaderRow, ...], label: str,
) -> tuple[HeaderRow, ...]:
    """Return ``headers`` with a ``label`` cell added at position 1 of every row.

    Position 1 is the slot right after the first (label) column.
    """
    # NOTE(review): each row is rebuilt as HeaderRow(cells=...), so any other
    # HeaderRow field would not be carried over — confirm there is none.
    def widen(header_row: HeaderRow) -> HeaderRow:
        cells = list(header_row.cells)
        widened = [*cells[:1], HeaderCell(text=label), *cells[1:]]
        return HeaderRow(cells=tuple(widened))

    return tuple(widen(header_row) for header_row in headers)
219
+
220
+
221
def _insert_after_label_cell(
    row: Row, text: str, *, value: Any,
) -> Row:
    """Return a copy of ``row`` with a right-aligned cell added at position 1."""
    cells = list(row.cells)
    extra = make_cell(text, value=value, align="right")
    return replace(row, cells=(*cells[:1], extra, *cells[1:]))
227
+
228
+
229
def add_global_p(table: SofraTable) -> SofraTable:
    """Add a joint-test ``global p`` column to a regression table.

    Body rows are grouped by the stem of their first-cell text (see
    :func:`_coef_stem`).  For each stem, every model parameter with the
    same stem is constrained to zero in a single ``model.f_test()`` call,
    and all rows of that stem share the resulting p-value.  A stem with
    only one matching parameter is tested the same way, with a
    one-constraint ``f_test``.  Stems with no matching parameter, or whose
    test raises, show an em-dash.

    Raises
    ------
    NotImplementedError
        If ``table.metadata`` holds no ``"model"`` entry exposing
        ``f_test``.  Raising avoids silently emitting a column of
        em-dashes for tables that were not built from a fitted model.
    """
    model = (table.metadata or {}).get("model")
    if model is None or not hasattr(model, "f_test"):
        raise NotImplementedError(
            "add_global_p currently supports tbl_regression tables only. "
            "For a tbl_one / tbl_summary table, joint Type-III tests would "
            "require re-fitting per-variable regressions on the source "
            "data — that path is not implemented yet. Track the issue "
            "before using `add_global_p` on a non-regression table."
        )

    new_headers = _insert_after_pvalue_header(table.headers, "global p")

    # One stem per body row, in row order.
    row_stems = [_coef_stem(row.cells[0].text) for row in table.rows]

    params = getattr(model, "params", None)
    if params is not None and hasattr(params, "index"):
        param_names = list(params.index)
    else:
        param_names = []

    # One joint p-value per distinct stem (first-seen order).
    joint_p: dict[str, float | None] = {}
    for stem in dict.fromkeys(row_stems):
        members = [name for name in param_names if _coef_stem(name) == stem]
        if not members:
            joint_p[stem] = None
            continue
        # Constraint string of the form "c1 = 0, c2 = 0, ...".
        constraint = ", ".join(f"{c} = 0" for c in members)
        try:
            outcome = model.f_test(constraint)
            joint_p[stem] = float(outcome.pvalue)
        except Exception:  # pragma: no cover — exotic models / singular cov
            joint_p[stem] = None

    from ..core.format import fmt_p_value

    new_rows = []
    for row, stem in zip(table.rows, row_stems):
        p = joint_p.get(stem)
        shown = fmt_p_value(p) if p is not None else "—"
        new_rows.append(_insert_after_pvalue_cell(row, shown, value=p))
    return replace(table, headers=new_headers, rows=tuple(new_rows))
309
+
310
+
311
+ def _coef_stem(name: str) -> str:
312
+ """Strip statsmodels-style level markers from a coefficient name.
313
+
314
+ ``C(race)[T.B]`` → ``C(race)``
315
+ ``arm[T.Treatment]`` → ``arm``
316
+ ``age`` → ``age``
317
+ """
318
+ for marker in ("[T.", "[", "_T_"):
319
+ if marker in name:
320
+ return name.split(marker, 1)[0]
321
+ return name
322
+
323
+
324
+ # ----------------------------------------------------------------------
325
+ # add_difference — between-group mean / proportion differences
326
+ # ----------------------------------------------------------------------
327
+
328
def _welch_diff_ci(
    a: pd.Series, b: pd.Series, conf_level: float,
) -> tuple[float | None, float | None, float | None]:
    """Return ``(diff, lo, hi)`` for ``mean(b) - mean(a)`` with a Welch CI."""
    if len(a) < 2 or len(b) < 2:
        return (None, None, None)
    # Per-group variance of the mean, computed once and reused below.
    term_a = a.var(ddof=1) / len(a)
    term_b = b.var(ddof=1) / len(b)
    diff = float(b.mean() - a.mean())
    se = math.sqrt(term_b + term_a)
    # Welch–Satterthwaite degrees of freedom.
    df_w = (term_b + term_a) ** 2 / (
        term_b ** 2 / (len(b) - 1) + term_a ** 2 / (len(a) - 1)
    )
    tcrit = float(sp_stats.t.ppf(0.5 + conf_level / 2, df=df_w))
    return (diff, diff - tcrit * se, diff + tcrit * se)


def _newcombe_diff_ci(
    x1: int, n1: int, x2: int, n2: int, conf_level: float,
) -> tuple[float | None, float | None, float | None]:
    """Return ``(diff, lo, hi)`` for ``x2/n2 - x1/n1`` with Newcombe's hybrid CI."""
    if n1 == 0 or n2 == 0:
        return (None, None, None)
    p1, p2 = x1 / n1, x2 / n2
    diff = p2 - p1
    zcrit = float(sp_stats.norm.ppf(0.5 + conf_level / 2))
    # Newcombe (1998) Method 10 combines the two single-sample Wilson
    # intervals.  With diff = p2 - p1 the lower bound pairs the upper limit
    # of p1 with the lower limit of p2, and vice versa for the upper bound.
    lo1, hi1 = _wilson_ci(x1, n1, z=zcrit)
    lo2, hi2 = _wilson_ci(x2, n2, z=zcrit)
    lo = diff - math.sqrt((hi1 - p1) ** 2 + (p2 - lo2) ** 2)
    hi = diff + math.sqrt((p1 - lo1) ** 2 + (hi2 - p2) ** 2)
    return (diff, lo, hi)


def add_difference(
    table: SofraTable,
    *,
    digits: int = 2,
    conf_level: float = 0.95,
) -> SofraTable:
    """Add an absolute-difference column with CI for a 2-group Table 1.

    For each continuous variable the column shows ``mean_2 - mean_1`` with
    its Welch confidence interval.  For each dichotomous variable it shows
    ``prop_2 - prop_1`` with the Newcombe hybrid-score CI (Newcombe 1998,
    *Stat Med* 17:873–890, Method 10).  Proportions use the non-missing
    observations of the variable in each group as the denominator.  Rows
    of any other kind, and rows whose interval cannot be computed, show
    ``—``; rows that do not map to a variable get an empty cell.

    Groups are the two distinct non-missing values of the ``by`` column,
    ordered by their string form; group 1 is the first of them.

    Parameters
    ----------
    table
        Table whose spec builder is ``"tbl_one"`` and whose rebuild closure
        still holds the source DataFrame.
    digits
        Decimal places passed to ``fmt_number`` for the estimate and bounds.
    conf_level
        Confidence level of the interval, e.g. ``0.95``.

    Raises
    ------
    ValueError
        If the table lacks a qualifying spec, has no ``by`` variable, the
        source data cannot be recovered, or ``by`` does not have exactly
        two levels.
    """
    if table._spec is None or table._spec.builder not in ("tbl_one",):
        raise ValueError(
            "add_difference is only supported on tbl_one / tbl_summary tables."
        )
    opts = table._spec.options
    by = opts["by"]
    if by is None:
        raise ValueError("add_difference requires a stratification variable (by=).")

    # The rebuild closure is the only handle we have on the original data.
    rebuild = table._rebuild
    if rebuild is None:
        raise ValueError(
            "add_difference needs access to the original data — only tables "
            "built directly by tbl_one / tbl_summary qualify."
        )
    data = _data_from_rebuild(rebuild)
    if data is None:
        raise ValueError("Could not recover source data from table closure.")

    by_series = data[by]
    levels = sorted(by_series.dropna().unique(), key=str)
    if len(levels) != 2:
        raise ValueError(
            f"add_difference requires exactly 2 groups; got {len(levels)}."
        )
    g1, g2 = levels
    mask1 = by_series == g1
    mask2 = by_series == g2

    kinds = opts["kinds"]
    variables = opts["variables"]

    diffs: dict[str, tuple[float | None, float | None, float | None]] = {}
    for var in variables:
        if kinds[var] == "continuous":
            a = pd.to_numeric(data.loc[mask1, var], errors="coerce").dropna()
            b = pd.to_numeric(data.loc[mask2, var], errors="coerce").dropna()
            diffs[var] = _welch_diff_ci(a, b, conf_level)
        elif kinds[var] == "dichotomous":
            s = data[var]
            if isinstance(s.dtype, pd.CategoricalDtype):
                lvls = list(s.cat.categories)
            else:
                lvls = sorted(s.dropna().unique(), key=str)
            if len(lvls) != 2:
                diffs[var] = (None, None, None)
                continue
            success = lvls[1]
            col1 = data.loc[mask1, var]
            col2 = data.loc[mask2, var]
            # Denominators exclude missing values of this variable, matching
            # how add_ci counts observations for the same rows.
            n1 = int(col1.notna().sum())
            n2 = int(col2.notna().sum())
            x1 = int((col1 == success).sum())
            x2 = int((col2 == success).sum())
            diffs[var] = _newcombe_diff_ci(x1, n1, x2, n2, conf_level)
        else:
            diffs[var] = (None, None, None)

    # Insert a new column right before any p-value column.
    new_headers = _insert_after_groups_header(
        table.headers,
        f"Diff ({int(round(conf_level * 100))}% CI)",
    )

    new_rows: list[Row] = []
    for r in table.rows:
        label = r.cells[0].text
        var = _find_variable_for_row(label, variables, kinds, labels=opts.get("labels"))
        text: str
        value: Any
        if var is not None and var in diffs:
            d_opt, lo_opt, hi_opt = diffs[var]
            if (
                d_opt is None or lo_opt is None or hi_opt is None
                or any(_isnan(x) for x in (d_opt, lo_opt, hi_opt))
            ):
                text = "—"
                value = None
            else:
                text = (
                    f"{fmt_number(d_opt, digits)} "
                    f"({fmt_number(lo_opt, digits)}, {fmt_number(hi_opt, digits)})"
                )
                value = (d_opt, lo_opt, hi_opt)
        else:
            text = ""
            value = None
        new_rows.append(_insert_after_groups_cell(r, text, value=value,
                                                  kind="ci"))
    return replace(table, headers=new_headers, rows=tuple(new_rows))
471
+
472
+
473
+ # ----------------------------------------------------------------------
474
+ # add_ci — confidence intervals for each summary cell
475
+ # ----------------------------------------------------------------------
476
+
477
def add_ci(
    table: SofraTable,
    *,
    conf_level: float = 0.95,
) -> SofraTable:
    """Append a bracketed confidence interval to each summary cell.

    For continuous rows the group cell's text gains ``[lo, hi]``, the
    one-sample Student's *t* interval for the group mean.  For dichotomous
    rows (those whose label contains ``=``) the cell gains a Wilson-score
    interval for the proportion, in percent.  Other rows are unchanged.

    Group cells are assumed to occupy columns ``1 .. len(groups)`` of each
    row, with the overall column first when the spec's ``overall`` option
    is set.

    Parameters
    ----------
    table
        Table with a spec and a rebuild closure holding the source data.
    conf_level
        Confidence level of the intervals, e.g. ``0.95``.

    Raises
    ------
    ValueError
        If the table has no spec / rebuild closure, or the source data
        cannot be recovered from the closure.
    """
    if table._spec is None or table._rebuild is None:
        raise ValueError(
            "add_ci needs access to the source data — only tables built "
            "directly by tbl_one / tbl_summary qualify."
        )
    data = _data_from_rebuild(table._rebuild)
    if data is None:
        raise ValueError("Could not recover source data from table closure.")

    opts = table._spec.options
    by = opts["by"]
    kinds = opts["kinds"]
    variables = opts["variables"]

    group_keys, group_masks = _resolve_groups(data, by)
    # Without a `by` variable _resolve_groups already yields the single
    # all-rows group; prepending another one would make the loop below
    # patch a second, non-group column.
    if by is not None and opts.get("overall"):
        overall_key = opts.get("overall_label", "Overall")
        group_keys = [overall_key, *group_keys]
        group_masks = {overall_key: pd.Series(True, index=data.index),
                       **group_masks}

    new_rows: list[Row] = []
    z = float(sp_stats.norm.ppf(0.5 + conf_level / 2))

    for r in table.rows:
        label = r.cells[0].text
        var = _find_variable_for_row(label, variables, kinds, labels=opts.get("labels"))
        if var is None:
            new_rows.append(r)
            continue
        kind = kinds[var]

        # Resolve the dichotomous "success" level once per row rather than
        # once per group column.
        is_dichotomous_row = False
        success: Any = None
        if kind == "dichotomous" and "=" in label:
            s = data[var]
            lvls = (list(s.cat.categories)
                    if isinstance(s.dtype, pd.CategoricalDtype)
                    else sorted(s.dropna().unique(), key=str))
            if len(lvls) == 2:
                is_dichotomous_row = True
                success = lvls[1]

        # Patch group cells (columns 1..len(group_keys)).
        new_cells = list(r.cells)
        for offset, k in enumerate(group_keys):
            col = 1 + offset
            if col >= len(new_cells):
                break
            old = new_cells[col]
            mask = group_masks[k]
            if kind == "continuous":
                v = pd.to_numeric(data.loc[mask, var], errors="coerce").dropna()
                if len(v) < 2:
                    continue
                m = float(v.mean())
                se = float(v.std(ddof=1)) / math.sqrt(len(v))
                tcrit = float(sp_stats.t.ppf(0.5 + conf_level / 2, df=len(v) - 1))
                lo, hi = m - tcrit * se, m + tcrit * se
                ci = f" [{fmt_number(lo, 2)}, {fmt_number(hi, 2)}]"
                new_cells[col] = replace(old, text=old.text + ci)
            elif is_dichotomous_row:
                observed = data.loc[mask, var]
                n = int(observed.notna().sum())
                x = int((observed == success).sum())
                if n == 0:
                    continue
                lo, hi = _wilson_ci(x, n, z=z)
                ci = f" [{fmt_number(100*lo, 1)}%, {fmt_number(100*hi, 1)}%]"
                new_cells[col] = replace(old, text=old.text + ci)
        new_rows.append(replace(r, cells=tuple(new_cells)))

    # The mean interval is a one-sample t interval; "Welch" refers to the
    # two-sample unequal-variance procedure, so the footnote names it
    # accordingly.
    fn = (
        f"Bracketed intervals: {int(round(conf_level*100))}% confidence "
        "interval (Student's t for means, Wilson-score for proportions)."
    )
    return replace(
        table,
        rows=tuple(new_rows),
        footnotes=tuple([*table.footnotes, fn]),
    )
564
+
565
+
566
+ def _wilson_ci(x: int, n: int, *, z: float) -> tuple[float, float]:
567
+ """Wilson score CI for a proportion."""
568
+ if n == 0:
569
+ return float("nan"), float("nan")
570
+ p = x / n
571
+ denom = 1 + z * z / n
572
+ center = (p + z * z / (2 * n)) / denom
573
+ half = (z * math.sqrt(p * (1 - p) / n + z * z / (4 * n * n))) / denom
574
+ return max(0.0, center - half), min(1.0, center + half)
575
+
576
+
577
+ # ----------------------------------------------------------------------
578
+ # Formatter override modifiers
579
+ # ----------------------------------------------------------------------
580
+
581
def with_pvalue_fmt(
    table: SofraTable,
    fn: Callable[[float], str],
) -> SofraTable:
    """Return ``table`` with each ``"p_value"`` cell's text set to ``fn(value)``."""
    target_kind = "p_value"
    return _apply_formatter(table, kind=target_kind, fn=fn)
587
+
588
+
589
def with_estimate_fmt(
    table: SofraTable,
    fn: Callable[[float], str],
) -> SofraTable:
    """Return ``table`` with each ``"numeric"`` cell's text set to ``fn(value)``."""
    target_kind = "numeric"
    return _apply_formatter(table, kind=target_kind, fn=fn)
595
+
596
+
597
def _apply_formatter(
    table: SofraTable,
    *,
    kind: str,
    fn: Callable[[float], str],
) -> SofraTable:
    """Re-render the text of every cell of the given ``kind`` with ``fn``.

    Only cells whose ``value`` is a non-NaN ``int`` / ``float`` are
    touched; ``fn`` receives the value converted to ``float``.  Every other
    cell is kept as is.
    """
    def reformat(cell: Cell) -> Cell:
        eligible = (
            cell.kind == kind
            and isinstance(cell.value, (int, float))
            and not _isnan(cell.value)
        )
        if not eligible:
            return cell
        return replace(cell, text=fn(float(cell.value)))

    rebuilt = tuple(
        replace(row, cells=tuple(reformat(cell) for cell in row.cells))
        for row in table.rows
    )
    return replace(table, rows=rebuilt)
614
+
615
+
616
+ # ----------------------------------------------------------------------
617
+ # Helpers
618
+ # ----------------------------------------------------------------------
619
+
620
+ def _isnan(x: Any) -> bool:
621
+ try:
622
+ return math.isnan(float(x))
623
+ except (TypeError, ValueError):
624
+ return False
625
+
626
+
627
+ def _data_from_rebuild(rebuild: Callable[..., Any]) -> pd.DataFrame | None:
628
+ """Recover the source DataFrame captured by a builder's rebuild closure."""
629
+ closure = getattr(rebuild, "__closure__", None)
630
+ if not closure:
631
+ return None
632
+ for cell in closure:
633
+ contents = cell.cell_contents
634
+ if isinstance(contents, pd.DataFrame):
635
+ return contents
636
+ return None
637
+
638
+
639
+ def _find_variable_for_row(
640
+ label: str,
641
+ variables: tuple[str, ...],
642
+ kinds: dict[str, str],
643
+ *,
644
+ labels: dict[str, str] | None = None,
645
+ ) -> str | None:
646
+ """Match a body-row's displayed text back to its source variable.
647
+
648
+ Handles three cases:
649
+
650
+ * Raw variable name (``"age"``)
651
+ * Dichotomous renaming (``"sex = Male"``)
652
+ * Display-relabelled rows via the ``labels={...}`` argument to
653
+ ``tbl_one`` (``"Patient sex = Male"`` for ``labels={"sex":
654
+ "Patient sex"}``)
655
+ """
656
+ labels = labels or {}
657
+ for v in variables:
658
+ if label == v:
659
+ return v
660
+ if label.startswith(f"{v} ="):
661
+ return v
662
+ # Display-relabelled rows: scan the labels mapping.
663
+ for src, disp in labels.items():
664
+ if not disp:
665
+ continue
666
+ if label == disp:
667
+ return src
668
+ if label.startswith(f"{disp} ="):
669
+ return src
670
+ return None
671
+
672
+
673
+ def _resolve_groups(data: pd.DataFrame, by: str | None) -> tuple[list[Any], dict[Any, pd.Series]]:
674
+ if by is None:
675
+ return ["Overall"], {"Overall": pd.Series(True, index=data.index)}
676
+ s = data[by]
677
+ levels = (list(s.cat.categories)
678
+ if isinstance(s.dtype, pd.CategoricalDtype)
679
+ else sorted(s.dropna().unique(), key=str))
680
+ return list(levels), {k: (s == k) for k in levels}
681
+
682
+
683
def _insert_after_pvalue_header(headers: tuple[HeaderRow, ...], label: str) -> tuple[HeaderRow, ...]:
    """Add a ``label`` header cell right after the first p-value header.

    A header counts as the p-value column when its lower-cased text starts
    with ``"p-value"`` or equals ``"p"``.  Rows without such a header get
    the new cell appended at the end.
    """
    def is_pvalue(cell: HeaderCell) -> bool:
        return cell.text.lower().startswith("p-value") or cell.text.lower() == "p"

    widened: list[HeaderRow] = []
    for header_row in headers:
        cells = list(header_row.cells)
        hit = next((j for j, c in enumerate(cells) if is_pvalue(c)), None)
        position = len(cells) if hit is None else hit + 1
        cells.insert(position, HeaderCell(text=label))
        widened.append(HeaderRow(cells=tuple(cells)))
    return tuple(widened)
696
+
697
+
698
def _insert_after_pvalue_cell(row: Row, text: str, *, value: Any) -> Row:
    """Add a right-aligned cell right after the row's first ``"p_value"`` cell.

    Rows without a ``"p_value"`` cell get the new cell appended at the end.
    """
    cells = list(row.cells)
    hit = next((j for j, c in enumerate(cells) if c.kind == "p_value"), None)
    position = len(cells) if hit is None else hit + 1
    cells.insert(position, make_cell(text, value=value, align="right"))
    return replace(row, cells=tuple(cells))
707
+
708
+
709
def _insert_after_groups_header(
    headers: tuple[HeaderRow, ...], label: str,
) -> tuple[HeaderRow, ...]:
    """Insert a ``label`` header cell right before the first statistic column.

    A header is treated as a statistic column when its lower-cased text
    starts with ``"smd"`` or ``"pvalue"``, or is a standalone ``p`` — the
    letter ``p`` on its own or followed by a non-alphanumeric character
    (``"p"``, ``"p-value"``, ``"p value"``).  Headers that merely begin
    with the letter, such as a group called ``"Placebo"``, are not matched,
    so the new header is not placed in front of a group column.  Rows with
    no statistic header get the cell appended.
    """
    def is_stat_header(text: str) -> bool:
        lowered = text.strip().lower()
        if lowered.startswith(("pvalue", "smd")):
            return True
        following = lowered[1:2]
        continues_word = following.isascii() and following.isalnum()
        return lowered[:1] == "p" and not continues_word

    new_headers: list[HeaderRow] = []
    for hr in headers:
        new_cells = list(hr.cells)
        insert_at = next(
            (j for j, c in enumerate(new_cells) if is_stat_header(c.text)),
            len(new_cells),
        )
        new_cells.insert(insert_at, HeaderCell(text=label))
        new_headers.append(HeaderRow(cells=tuple(new_cells)))
    return tuple(new_headers)
724
+
725
+
726
def _insert_after_groups_cell(
    row: Row,
    text: str,
    *,
    value: Any,
    kind: Any = "text",
) -> Row:
    """Insert a right-aligned cell before the row's first trailing statistic.

    The insertion point is the first cell of kind ``"p_value"`` or
    ``"q_value"``, or a last-position ``"numeric"`` cell whose value is a
    non-NaN number and whose text consists of digits once ``.`` and ``-``
    are removed.  Without such a cell the new one is appended.
    """
    cells: list[Cell] = list(row.cells)
    last = len(cells) - 1

    def is_trailing_number(pos: int, cell: Cell) -> bool:
        # Plain numeric cell in the final position, e.g. "0.12".
        return bool(
            cell.kind == "numeric" and pos == last
            and isinstance(cell.value, (int, float))
            and not _isnan(cell.value or 0)
            and cell.text and cell.text.replace(".", "").replace("-", "").isdigit()
        )

    target = next(
        (
            pos for pos, cell in enumerate(cells)
            if cell.kind in ("p_value", "q_value") or is_trailing_number(pos, cell)
        ),
        len(cells),
    )
    cells.insert(target, make_cell(text, value=value, kind=kind, align="right"))
    return replace(row, cells=tuple(cells))