pysofra 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. pysofra/__init__.py +82 -0
  2. pysofra/core/__init__.py +14 -0
  3. pysofra/core/compose.py +167 -0
  4. pysofra/core/format.py +155 -0
  5. pysofra/core/frames.py +69 -0
  6. pysofra/core/schema.py +128 -0
  7. pysofra/core/table.py +924 -0
  8. pysofra/io/__init__.py +1 -0
  9. pysofra/models/__init__.py +6 -0
  10. pysofra/models/extract.py +249 -0
  11. pysofra/models/pool.py +119 -0
  12. pysofra/models/regression.py +507 -0
  13. pysofra/models/survival.py +395 -0
  14. pysofra/models/uvregression.py +438 -0
  15. pysofra/notebook/__init__.py +6 -0
  16. pysofra/plot/__init__.py +23 -0
  17. pysofra/plot/_backend.py +32 -0
  18. pysofra/plot/forest.py +159 -0
  19. pysofra/plot/inline.py +171 -0
  20. pysofra/plot/km.py +249 -0
  21. pysofra/render/__init__.py +28 -0
  22. pysofra/render/_zip_determinism.py +57 -0
  23. pysofra/render/base.py +22 -0
  24. pysofra/render/docx.py +286 -0
  25. pysofra/render/html.py +442 -0
  26. pysofra/render/image.py +130 -0
  27. pysofra/render/latex.py +253 -0
  28. pysofra/render/markdown.py +128 -0
  29. pysofra/render/pptx.py +340 -0
  30. pysofra/render/xlsx.py +226 -0
  31. pysofra/summary/__init__.py +6 -0
  32. pysofra/summary/calibrate.py +214 -0
  33. pysofra/summary/design.py +246 -0
  34. pysofra/summary/effect_size.py +187 -0
  35. pysofra/summary/extras.py +745 -0
  36. pysofra/summary/smd.py +133 -0
  37. pysofra/summary/stats.py +135 -0
  38. pysofra/summary/tbl_cross.py +339 -0
  39. pysofra/summary/tbl_one.py +1220 -0
  40. pysofra/summary/tbl_summary.py +51 -0
  41. pysofra/summary/tests.py +370 -0
  42. pysofra/summary/typing.py +129 -0
  43. pysofra/summary/weights.py +161 -0
  44. pysofra/themes/__init__.py +5 -0
  45. pysofra/themes/registry.py +272 -0
  46. pysofra-0.1.0a1.dist-info/METADATA +301 -0
  47. pysofra-0.1.0a1.dist-info/RECORD +50 -0
  48. pysofra-0.1.0a1.dist-info/WHEEL +4 -0
  49. pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
  50. pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
@@ -0,0 +1,1220 @@
1
+ """Table 1 builder — baseline characteristic tables.
2
+
3
+ Mirrors the workflow of R's ``tableone`` while staying Pythonic:
4
+
5
+ .. code-block:: python
6
+
7
+ import pysofra as ps
8
+
9
+ (
10
+ ps.tbl_one(df, by="treatment")
11
+ .add_p()
12
+ .add_smd()
13
+ .add_overall()
14
+ )
15
+
16
+ The function returns a :class:`~pysofra.core.SofraTable` that renders
17
+ beautifully in notebooks and exports to HTML/Markdown/DOCX.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from typing import Any
23
+
24
+ import pandas as pd
25
+
26
+ from ..core.format import (
27
+ fmt_mean_sd,
28
+ fmt_median_iqr,
29
+ fmt_p_value,
30
+ fmt_smd,
31
+ )
32
+ from ..core.frames import to_pandas
33
+ from ..core.schema import Cell, HeaderCell, HeaderRow, Row, make_cell
34
+ from ..core.table import SofraTable, TableSpec
35
+ from .design import SurveyDesign, design_mean_var, replicate_mean_var
36
+ from .smd import categorical_smd, continuous_smd
37
+ from .stats import continuous_stats
38
+ from .tests import (
39
+ categorical_test,
40
+ continuous_test,
41
+ rao_scott_chisq,
42
+ run_named_test,
43
+ svyttest,
44
+ )
45
+ from .typing import VarKind, apply_overrides, infer_kind
46
+ from .weights import weighted_continuous_stats
47
+
48
+
49
+ def _is_cat_dtype(series: pd.Series) -> bool:
50
+ return isinstance(series.dtype, pd.CategoricalDtype)
51
+
52
+
53
# Default header text for the pooled ("all rows") column.
DEFAULT_OVERALL_LABEL = "Overall"
# Row label used for the per-variable missing-data row.
MISSING_LABEL = "Missing"
55
+
56
+
57
def tbl_one(
    data: pd.DataFrame,
    *,
    by: str | None = None,
    variables: list[str] | None = None,
    labels: dict[str, str] | None = None,
    types: dict[str, VarKind] | None = None,
    nonnormal: list[str] | None = None,
    tests: dict[str, str] | None = None,
    weights: str | None = None,
    design: SurveyDesign | None = None,
    digits: int = 2,
    pct_digits: int = 1,
    missing: str = "ifany",
    include_missing: bool | None = None,
) -> SofraTable:
    """Build a Table 1.

    Parameters
    ----------
    data
        Source dataframe.
    by
        Optional column name to stratify on. If omitted, a single
        ``Overall`` column is produced.
    variables
        Explicit list of variables to include. Defaults to all columns
        other than ``by``.
    labels
        Mapping of column name → display label.
    types
        Override automatic variable typing on a per-column basis.
    nonnormal
        Continuous variables that should be summarised as
        ``median (Q1, Q3)`` and tested with rank-based tests.
    tests
        Per-variable statistical test overrides, e.g.
        ``{'age': 'wilcoxon', 'race': 'fisher'}``. See
        :func:`pysofra.summary.tests.available_tests` for the registry.
    weights
        Column name carrying non-negative frequency weights. When
        supplied, continuous summaries become weighted means / variances
        and categorical summaries become weighted proportions. The
        weights column is excluded from the variable list automatically.
    design
        A :class:`SurveyDesign` describing a complex sampling structure
        (weights + optional strata, clusters, and FPC). When provided,
        variance estimates use Taylor linearisation instead of the
        simple frequency-weighted formula. If both ``weights`` and
        ``design`` are passed, ``design`` wins.
    digits
        Decimal places for continuous summaries.
    pct_digits
        Decimal places for percentages.
    missing
        ``"ifany"`` (default) — include a *Missing* row only when there is
        missing data; ``"always"`` — always include; ``"never"``.
    include_missing
        Deprecated alias for ``missing``. ``True`` maps to ``"ifany"``,
        ``False`` to ``"never"``.

    Returns
    -------
    SofraTable
        The table built from ``data`` and the validated options. The
        p-value, q-value, SMD and overall columns start disabled in the
        table's spec.

    Raises
    ------
    ValueError
        If ``missing`` is not one of the three accepted modes, or if
        ``data`` has duplicate column names.
    KeyError
        If ``by``, the weights column, or any entry of ``variables`` is
        not a column of ``data``.

    Warns
    -----
    UserWarning
        When ``weights`` and ``design`` disagree on the weights column,
        when the weights column holds negative values, when ``variables``
        overlaps the ``by`` / weights / design columns, or when
        ``labels`` / ``nonnormal`` / ``tests`` name columns that are not
        in the final variable list.
    """
    import warnings
    from collections import Counter

    if include_missing is not None:
        missing = "ifany" if include_missing else "never"
    if missing not in ("ifany", "always", "never"):
        raise ValueError("missing must be one of 'ifany', 'always', 'never'")

    data = to_pandas(data)
    # Duplicate-column-name check — without this, downstream pandas calls
    # raise a confusing ``AttributeError`` on the duplicated Series.
    # A single counting pass keeps this linear in the number of columns.
    column_counts = Counter(data.columns)
    duplicate_cols = [c for c, count in column_counts.items() if count > 1]
    if duplicate_cols:
        raise ValueError(
            f"tbl_one cannot accept a DataFrame with duplicate column names; "
            f"got duplicates: {sorted(duplicate_cols)}."
        )
    if by is not None and by not in data.columns:
        raise KeyError(f"by column {by!r} not in data")
    if design is not None and weights is not None and weights != design.weights:
        warnings.warn(
            f"Both weights={weights!r} and design= were provided; "
            f"using design.weights={design.weights!r}. Pass only one.",
            UserWarning,
            stacklevel=2,
        )
    if design is not None:
        design.validate(data)
        weights = design.weights
    if weights is not None and weights not in data.columns:
        raise KeyError(f"weights column {weights!r} not in data")
    if weights is not None:
        # Warn — but don't refuse — on negative weights; the standard
        # behaviour is to drop them (matches survey::svydesign).
        w_col = pd.to_numeric(data[weights], errors="coerce")
        n_neg = int((w_col < 0).sum())
        if n_neg > 0:
            warnings.warn(
                f"weights column {weights!r} contains {n_neg} negative value(s); "
                "rows with negative weight are excluded from summaries.",
                UserWarning,
                stacklevel=2,
            )

    # Columns that describe the stratification / sampling design are never
    # summarised as table variables.
    excluded: set[str] = {c for c in (by, weights) if c is not None}
    if design is not None:
        if design.strata is not None:
            excluded.add(design.strata)
        if design.cluster is not None:
            if isinstance(design.cluster, tuple):
                excluded.update(design.cluster)
            else:
                excluded.add(design.cluster)
        if design.fpc is not None:
            excluded.add(design.fpc)
        if design.replicate_weights is not None:
            excluded.update(design.replicate_weights)

    if variables is None:
        variables = [c for c in data.columns if c not in excluded]
    else:
        missing_cols = [v for v in variables if v not in data.columns]
        if missing_cols:
            raise KeyError(f"variables not in data: {missing_cols}")
        # Warn when the user-supplied variables list overlaps the design /
        # stratification columns; silently dropping them is surprising.
        overlap = [v for v in variables if v in excluded]
        if overlap:
            warnings.warn(
                f"variables={overlap} overlap with the by= / weights / design "
                "columns and were excluded.",
                UserWarning,
                stacklevel=2,
            )
        variables = [v for v in variables if v not in excluded]

    labels = dict(labels or {})
    nonnormal_set = set(nonnormal or [])
    inferred = {v: infer_kind(data[v]) for v in variables}
    kinds = apply_overrides(inferred, types)
    tests_map = dict(tests or {})

    # Warn the user if labels / nonnormal / tests reference columns that
    # are NOT in the final variable list — those entries are silently
    # ignored otherwise, leading to wrong tests (a ``nonnormal=["hbac1"]``
    # typo → Welch instead of Wilcoxon, no warning), wrong row labels
    # (``labels={"hbac1": "HbA1c"}`` typo → raw column name in the table),
    # and wrong tests (``tests={"hbac1": "wilcoxon"}`` typo → default).
    _var_set = set(variables)
    _bad_labels = sorted(set(labels) - _var_set)
    _bad_nonnormal = sorted(nonnormal_set - _var_set)
    _bad_tests = sorted(set(tests_map) - _var_set)
    if _bad_labels or _bad_nonnormal or _bad_tests:
        msgs = []
        if _bad_labels:
            msgs.append(f"labels={_bad_labels}")
        if _bad_nonnormal:
            msgs.append(f"nonnormal={_bad_nonnormal}")
        if _bad_tests:
            msgs.append(f"tests={_bad_tests}")
        warnings.warn(
            f"Variables referenced but not in the table: {'; '.join(msgs)}. "
            f"Check for typos against {sorted(_var_set)!r}. The entries were "
            "ignored.",
            UserWarning,
            stacklevel=2,
        )

    spec = TableSpec(
        builder="tbl_one",
        options={
            "by": by,
            "variables": tuple(variables),
            "labels": labels,
            "kinds": kinds,
            "nonnormal": frozenset(nonnormal_set),
            "tests": tests_map,
            "weights": weights,
            "design": design,
            "digits": int(digits),
            "pct_digits": int(pct_digits),
            "missing": missing,
            "p_value": False,
            "smd": False,
            "overall": False,
            "overall_label": DEFAULT_OVERALL_LABEL,
            "q_value": False,
            "q_method": "fdr_bh",
        },
    )
    # We close over the *original data* so spec changes don't lose it.
    return _build(data, spec)
249
+
250
+
251
+ # ----------------------------------------------------------------------
252
+ # Internals
253
+ # ----------------------------------------------------------------------
254
+
255
def _build(data: pd.DataFrame, spec: TableSpec) -> SofraTable:
    """Construct a SofraTable from a (data, spec) pair.

    Reads every rendering option from ``spec.options``, derives the
    stratification groups from the ``by`` column (or a single overall
    group when ``by`` is ``None``), then assembles the header row, one
    block of body rows per variable, and the explanatory footnotes.

    The returned table carries ``spec`` and a rebuild callback that calls
    this function again with the same ``data`` and a new spec.

    Warns
    -----
    UserWarning
        When the ``by`` column has no non-missing level, or only one.
    """
    import warnings

    opts = spec.options
    by: str | None = opts["by"]
    variables: tuple[str, ...] = opts["variables"]
    labels: dict[str, str] = opts["labels"]
    kinds: dict[str, VarKind] = opts["kinds"]
    nonnormal: frozenset[str] = opts["nonnormal"]
    tests_map: dict[str, str] = opts.get("tests", {}) or {}
    weights_col: str | None = opts.get("weights")
    design: SurveyDesign | None = opts.get("design")
    digits: int = opts["digits"]
    pct_digits: int = opts["pct_digits"]
    missing_mode: str = opts["missing"]
    show_p: bool = bool(opts["p_value"])
    show_smd: bool = bool(opts["smd"])
    show_overall: bool = bool(opts["overall"])
    overall_label: str = opts["overall_label"]
    show_q: bool = bool(opts.get("q_value"))
    q_method: str = opts.get("q_method", "fdr_bh")
    # Bold-on-significance is switched off here; the row builders still
    # take both values as parameters.
    bold_p_mode: bool = False
    bold_p_threshold: float = 0.05

    if by is None:
        group_keys: list[Any] = [overall_label]
        group_masks = {overall_label: pd.Series(True, index=data.index)}
        show_overall = False  # already overall-only
    else:
        # Rows whose stratification value is missing match no group mask,
        # so they drop out of every per-group summary.
        by_series = data[by]
        # Preserve categorical / sorted order
        if _is_cat_dtype(by_series):
            group_keys = [k for k in by_series.cat.categories if (by_series == k).any()]
        else:
            group_keys = sorted(by_series.dropna().unique(), key=_sort_key)
        # cast to plain list[Any]
        group_keys = list(group_keys)
        group_masks = {k: (by_series == k) for k in group_keys}
        # Guard against degenerate ``by=`` columns, which would otherwise
        # silently produce an unstratified or empty table. A clear
        # UserWarning tells the user to drop ``by=`` or fix the column.
        if len(group_keys) == 0:
            warnings.warn(
                f"by={by!r} has no non-missing values; the resulting table "
                "has no stratification columns. Pass by=None for an "
                "explicitly unstratified summary, or fix the column.",
                UserWarning,
                stacklevel=2,
            )
        elif len(group_keys) == 1:
            warnings.warn(
                f"by={by!r} has only one non-missing level "
                f"({group_keys[0]!r}); the resulting table has a single "
                "stratum column and no between-group statistics. Pass "
                "by=None for an explicitly unstratified summary.",
                UserWarning,
                stacklevel=2,
            )

    if weights_col is not None:
        w_series = pd.to_numeric(data[weights_col], errors="coerce").fillna(0.0)
        # ``tbl_one`` warns that rows with a negative weight are excluded
        # from summaries; zeroing them here keeps them out of the header N
        # and of every weighted statistic computed from ``w_series``.
        w_series = w_series.clip(lower=0.0)
        n_per_group = {k: float(w_series[group_masks[k]].sum()) for k in group_keys}
        n_overall = (
            float(w_series.sum())
            if by is None
            else float(w_series[data[by].notna()].sum())
        )
    else:
        w_series = None
        n_per_group = {k: int(group_masks[k].sum()) for k in group_keys}
        n_overall = int(len(data)) if by is None else int(data[by].notna().sum())

    # ------------------------------------------------------------------
    # Headers
    # ------------------------------------------------------------------
    header_cells: list[HeaderCell] = [HeaderCell(text="Characteristic", align="left")]

    def _fmt_n(val: float | int) -> str:
        # Weighted Ns may be fractional: show one decimal only when needed.
        # ``is_integer`` is False for NaN / inf, so those never reach int().
        if isinstance(val, float) and not val.is_integer():
            return f"{val:,.1f}"
        return f"{int(val):,}"

    if show_overall:
        header_cells.append(
            HeaderCell(text=f"{overall_label}\nN = {_fmt_n(n_overall)}")
        )
    for k in group_keys:
        header_cells.append(
            HeaderCell(text=f"{_fmt_level(k)}\nN = {_fmt_n(n_per_group[k])}")
        )
    if show_p:
        header_cells.append(HeaderCell(text="p-value"))
    if show_q:
        header_cells.append(HeaderCell(text="q-value"))
    if show_smd:
        header_cells.append(HeaderCell(text="SMD"))

    headers: tuple[HeaderRow, ...] = (HeaderRow(cells=tuple(header_cells)),)

    # ------------------------------------------------------------------
    # Body rows
    # ------------------------------------------------------------------
    rows: list[Row] = []
    test_names: set[str] = set()

    for var in variables:
        kind = kinds[var]
        label = labels.get(var, var)
        test_override = tests_map.get(var)

        if kind == "continuous":
            row_blocks, test_used = _continuous_rows(
                data, var, label, by, group_keys, group_masks,
                digits=digits,
                pct_digits=pct_digits,
                show_overall=show_overall,
                show_p=show_p,
                show_q=show_q,
                show_smd=show_smd,
                nonnormal=var in nonnormal,
                missing_mode=missing_mode,
                bold_p_mode=bold_p_mode,
                bold_p_threshold=bold_p_threshold,
                test_override=test_override,
                weights=w_series,
                design=design,
            )
        else:
            row_blocks, test_used = _categorical_rows(
                data, var, label, by, group_keys, group_masks,
                kind=kind,
                pct_digits=pct_digits,
                show_overall=show_overall,
                show_p=show_p,
                show_q=show_q,
                show_smd=show_smd,
                missing_mode=missing_mode,
                bold_p_mode=bold_p_mode,
                bold_p_threshold=bold_p_threshold,
                test_override=test_override,
                weights=w_series,
            )
        rows.extend(row_blocks)
        if test_used:
            test_names.add(test_used)

    # ------------------------------------------------------------------
    # Footnotes
    # ------------------------------------------------------------------
    footnotes: list[str] = []
    # Continuous summary footnote
    cont_vars = [v for v in variables if kinds[v] == "continuous"]
    nn_vars = [v for v in cont_vars if v in nonnormal]
    nm_vars = [v for v in cont_vars if v not in nonnormal]
    # Same condition ``_continuous_rows`` uses to render mean (SE): a design
    # with strata, clusters, or replicate weights.
    uses_replicates = design is not None and design.replicate_weights is not None
    design_with_variance = (
        design is not None and weights_col is not None
        and (
            design.strata is not None
            or design.cluster is not None
            or uses_replicates
        )
    )
    if nm_vars and design_with_variance:
        if uses_replicates:
            # Replicate weights take precedence over Taylor linearisation
            # when the cells are computed, so label the footnote to match.
            footnotes.append(
                "Mean (SE) for continuous variables (design-based "
                "replicate-weight variance)."
            )
        else:
            footnotes.append(
                "Mean (SE) for continuous variables (design-based "
                "Taylor-linearised variance)."
            )
    elif nm_vars:
        footnotes.append("Mean (SD) for continuous variables.")
    if nn_vars:
        labelled = ", ".join(labels.get(v, v) for v in nn_vars)
        footnotes.append(f"Median (Q1, Q3) for: {labelled}.")
    cat_vars = [v for v in variables if kinds[v] != "continuous"]
    if cat_vars:
        footnotes.append("n (%) for categorical variables.")
    if show_p and test_names:
        footnotes.append("Tests: " + "; ".join(sorted(test_names)) + ".")
    if show_q:
        footnotes.append(f"q-value = {_q_method_label(q_method)} adjusted p-value.")
    if show_smd:
        footnotes.append("SMD = standardized mean difference (max pairwise).")

    if show_q:
        rows = _patch_q_values(rows, method=q_method)

    # ------------------------------------------------------------------
    # add_global_p() — joint Wald p-value per variable, fitted as
    # ``Logit(by == reference_level ~ variable + adjust_for)`` for
    # each variable. Adds a "global p" column to the right of any
    # existing p-value column. Requires a 2-level ``by=``; >2-level
    # ``by=`` is out of scope for v1 (would require multinomial
    # logit).
    # ------------------------------------------------------------------
    if opts.get("global_p"):
        rows, headers, footnotes_extra = _attach_global_p(
            data=data, by=by,
            variables=variables, kinds=kinds, labels=labels,
            rows=rows, headers=headers,
            adjust_for=tuple(opts.get("global_p_adjust_for", ()) or ()),
        )
        footnotes = list(footnotes) + list(footnotes_extra)

    def _rebuild_fn(new_spec: TableSpec) -> SofraTable:
        return _build(data, new_spec)

    return SofraTable(
        rows=tuple(rows),
        headers=headers,
        footnotes=tuple(footnotes),
        metadata={"builder": "tbl_one", "tests": sorted(test_names)},
        _spec=spec,
        _rebuild=_rebuild_fn,
    )
472
+
473
+
474
+ # ----------------------------------------------------------------------
475
+ # q-value post-processing
476
+ # ----------------------------------------------------------------------
477
+
478
+ _Q_METHOD_LABELS = {
479
+ "fdr_bh": "Benjamini–Hochberg",
480
+ "fdr_by": "Benjamini–Yekutieli",
481
+ "bonferroni": "Bonferroni",
482
+ "holm": "Holm",
483
+ "hommel": "Hommel",
484
+ "sidak": "Šidák",
485
+ }
486
+
487
+
488
+ def _q_method_label(method: str) -> str:
489
+ return _Q_METHOD_LABELS.get(method, method)
490
+
491
+
492
+ def _patch_q_values(rows: list[Row], *, method: str) -> list[Row]:
493
+ """Walk rows, collect p-values, compute q-values, patch q-value cells."""
494
+ # Find rows that have *both* a numeric p-value cell and a q-value placeholder.
495
+ # NaN p-values are silently skipped — feeding them to ``multipletests``
496
+ # contaminates the entire adjustment (statsmodels returns NaN for every
497
+ # output), which would wrongly null out the q-values of valid rows.
498
+ import math
499
+ from dataclasses import replace as dc_replace
500
+
501
+ from statsmodels.stats.multitest import multipletests
502
+
503
+ pairs: list[tuple[int, int, int, float]] = [] # (row_idx, p_col, q_col, p_val)
504
+ for i, r in enumerate(rows):
505
+ p_col = q_col = None
506
+ for j, c in enumerate(r.cells):
507
+ if c.kind == "p_value" and isinstance(c.value, (int, float)):
508
+ if math.isnan(float(c.value)):
509
+ p_col = None # poison; skip this row entirely
510
+ break
511
+ p_col = j
512
+ elif c.kind == "q_value":
513
+ q_col = j
514
+ if p_col is not None and q_col is not None:
515
+ pairs.append((i, p_col, q_col, float(rows[i].cells[p_col].value)))
516
+
517
+ if not pairs:
518
+ return rows
519
+
520
+ pvals = [p for _, _, _, p in pairs]
521
+ _, qvals, _, _ = multipletests(pvals, method=method)
522
+
523
+ new_rows = list(rows)
524
+ for (i, _p_col, q_col, _p), q in zip(pairs, qvals, strict=True):
525
+ old_row = new_rows[i]
526
+ new_cells = list(old_row.cells)
527
+ new_cells[q_col] = dc_replace(
528
+ new_cells[q_col],
529
+ text=fmt_p_value(float(q), digits=3),
530
+ value=float(q),
531
+ )
532
+ new_rows[i] = dc_replace(old_row, cells=tuple(new_cells))
533
+ return new_rows
534
+
535
+
536
+ def _fmt_level(k: Any) -> str:
537
+ if isinstance(k, bool):
538
+ return "Yes" if k else "No"
539
+ return str(k)
540
+
541
+
542
+ def _fmt_weighted_n_pct(n: float, total: float, pct_digits: int) -> str:
543
+ """Render ``n (xx.x%)`` with weighted (possibly non-integer) counts."""
544
+ if total <= 0:
545
+ n_str = f"{n:,.1f}" if n != int(n) else f"{int(n):,}"
546
+ return f"{n_str} (—)"
547
+ pct = 100.0 * n / total
548
+ n_str = f"{n:,.1f}" if n != int(n) else f"{int(n):,}"
549
+ return f"{n_str} ({pct:.{pct_digits}f}%)"
550
+
551
+
552
+ def _sort_key(x: Any) -> tuple[int, Any]:
553
+ if isinstance(x, bool):
554
+ return (0, int(x))
555
+ if isinstance(x, (int, float)):
556
+ return (0, float(x))
557
+ if isinstance(x, str):
558
+ return (1, x)
559
+ return (2, repr(x))
560
+
561
+
562
+ # ----------------------------------------------------------------------
563
+ # Continuous rows
564
+ # ----------------------------------------------------------------------
565
+
566
def _continuous_rows(
    data: pd.DataFrame,
    var: str,
    label: str,
    by: str | None,
    group_keys: list[Any],
    group_masks: dict[Any, pd.Series],
    *,
    digits: int,
    pct_digits: int,
    show_overall: bool,
    show_p: bool,
    show_q: bool,
    show_smd: bool,
    nonnormal: bool,
    missing_mode: str,
    bold_p_mode: bool,
    bold_p_threshold: float,
    test_override: str | None = None,
    weights: pd.Series | None = None,
    design: SurveyDesign | None = None,
) -> tuple[list[Row], str | None]:
    """Produce 1 (+ optional missing) rows for one continuous variable.

    Parameters
    ----------
    data, var, label
        Source frame, the column to summarise, and its row label.
    by, group_keys, group_masks
        Stratification column (or ``None``), the ordered group levels, and
        one boolean row mask per level.
    digits, pct_digits
        Decimal places for the summary cells and for the missing-row
        percentages.
    show_overall, show_p, show_q, show_smd
        Which optional columns to emit, in that left-to-right order.
    nonnormal
        Summarise as median (Q1, Q3) instead of mean (SD).
    missing_mode
        Forwarded to ``_maybe_append_missing``.
    bold_p_mode, bold_p_threshold
        When enabled, bold the label and p-value cell if ``p < threshold``.
    test_override
        Name of a test to run instead of the default choice.
    weights
        Per-row weights aligned with ``data``, or ``None``.
    design
        Survey design; selects the mean (SE) summary when it carries
        strata, clusters, or replicate weights.

    Returns
    -------
    tuple[list[Row], str | None]
        The rows for this variable, and the name of the test that produced
        the p-value (``None`` when no p-value was computed).
    """
    import math

    # Design-based: report mean (SE) when the user has opted into a
    # complex design (strata, cluster, or replicate weights). Weight-only
    # designs use the frequency-weighted mean (SD) path instead. Variables
    # flagged ``nonnormal`` also skip this path so that they are shown as
    # median (Q1, Q3), as documented.
    design_variance = (
        design is not None
        and weights is not None
        and not nonnormal
        and (
            design.strata is not None
            or design.cluster is not None
            or design.replicate_weights is not None
        )
    )

    def _summary_for(mask: pd.Series) -> str:
        values = data.loc[mask, var]
        if design_variance:
            if design.replicate_weights is not None:
                rep_series = [data.loc[mask, c] for c in design.replicate_weights]
                mean, var_, n_eff = replicate_mean_var(
                    values,
                    weights.loc[mask],
                    rep_series,
                    replicate_type=design.replicate_type,
                )
            else:
                mean, var_, n_eff = design_mean_var(
                    values,
                    weights.loc[mask],
                    strata=(data.loc[mask, design.strata]
                            if design.strata else None),
                    cluster=(data.loc[mask, design.primary_cluster]
                             if design.cluster else None),
                    fpc=(data.loc[mask, design.fpc]
                         if design.fpc else None),
                )
            if n_eff <= 0:
                return "—"
            # Clamp a negative variance to zero before the square root;
            # a NaN variance is passed through as a NaN SE.
            se = math.sqrt(max(var_, 0.0)) if not math.isnan(var_) else float("nan")
            return fmt_mean_sd(mean, se, digits=digits)
        if weights is not None:
            st = weighted_continuous_stats(values, weights.loc[mask])
            if st.n_eff <= 0:
                return "—"
            if nonnormal:
                return fmt_median_iqr(st.median, st.q1, st.q3, digits=digits)
            return fmt_mean_sd(st.mean, st.sd, digits=digits)
        cs = continuous_stats(values)
        if cs.n == 0:
            return "—"
        if nonnormal:
            return fmt_median_iqr(cs.median, cs.q1, cs.q3, digits=digits)
        return fmt_mean_sd(cs.mean, cs.sd, digits=digits)

    p_value: float | None = None
    test_used: str | None = None
    if show_p and by is not None:
        if test_override is not None:
            res = run_named_test(test_override, data[var], data[by], kind="continuous")
        elif weights is not None:
            # Any weighted call gets the design-adjusted two-sample
            # t-test (Taylor-linearised; ``svyttest`` analogue) on the
            # 2-group case. Strata/cluster from ``design`` are honoured
            # when present; bare ``weights=`` falls through with
            # ``strata=None``/``cluster=None``, which still gives a
            # weighted SE rather than the unweighted Welch fallback the
            # earlier behaviour silently produced.
            two_grp = len(set(data[by].dropna().unique())) == 2
            if two_grp:
                strata_col = (data[design.strata]
                              if design is not None and design.strata else None)
                cluster_col = (data[design.primary_cluster]
                               if design is not None and design.primary_cluster
                               else None)
                res = svyttest(
                    data[var], data[by], weights,
                    strata=strata_col,
                    cluster=cluster_col,
                )
            else:
                # >2 groups under weights: design-adjusted F-test is
                # out of scope (see the README "discussed but not
                # implemented" list); fall back to design-naive ANOVA /
                # Kruskal–Wallis with the existing footnote.
                res = continuous_test(data[var], data[by], nonnormal=nonnormal)
        else:
            res = continuous_test(data[var], data[by], nonnormal=nonnormal)
        p_value = res.p_value
        test_used = res.test if res.p_value is not None else None

    smd_val: float | None = None
    if show_smd and by is not None:
        smd_val = continuous_smd(data[var], data[by])

    bold_row = (
        bold_p_mode
        and p_value is not None
        and p_value < bold_p_threshold
    )

    cells: list[Cell] = [make_cell(label, align="left", bold=bold_row)]
    if show_overall:
        # NOTE(review): this mask covers every row, including rows whose
        # ``by`` value is missing — confirm that matches the N shown in the
        # overall column header.
        cells.append(
            make_cell(_summary_for(pd.Series(True, index=data.index)),
                      kind="numeric", align="right")
        )
    for k in group_keys:
        cells.append(
            make_cell(_summary_for(group_masks[k]), kind="numeric", align="right")
        )
    if show_p:
        cells.append(make_cell(fmt_p_value(p_value), value=p_value, kind="p_value",
                               align="right", bold=bold_row))
    if show_q:
        # Placeholder; patched by _patch_q_values after build.
        cells.append(make_cell("", value=None, kind="q_value", align="right"))
    if show_smd:
        cells.append(make_cell(fmt_smd(smd_val), value=smd_val, kind="numeric", align="right"))

    rows: list[Row] = [Row(cells=tuple(cells))]

    _maybe_append_missing(rows, data, var, group_keys, group_masks,
                          show_overall, show_p, show_q, show_smd,
                          pct_digits=pct_digits, missing_mode=missing_mode,
                          weights=weights)
    return rows, test_used
709
+
710
+
711
+ # ----------------------------------------------------------------------
712
+ # Categorical rows
713
+ # ----------------------------------------------------------------------
714
+
715
+ def _categorical_rows(
716
+ data: pd.DataFrame,
717
+ var: str,
718
+ label: str,
719
+ by: str | None,
720
+ group_keys: list[Any],
721
+ group_masks: dict[Any, pd.Series],
722
+ *,
723
+ kind: VarKind,
724
+ pct_digits: int,
725
+ show_overall: bool,
726
+ show_p: bool,
727
+ show_q: bool,
728
+ show_smd: bool,
729
+ missing_mode: str,
730
+ bold_p_mode: bool,
731
+ bold_p_threshold: float,
732
+ test_override: str | None = None,
733
+ weights: pd.Series | None = None,
734
+ ) -> tuple[list[Row], str | None]:
735
+ """Produce a header row + one row per level (+ optional missing)."""
736
+ s_all = data[var]
737
+ # Determine levels from full data so all groups share them.
738
+ if _is_cat_dtype(s_all):
739
+ levels: list[Any] = list(s_all.cat.categories)
740
+ else:
741
+ levels = sorted(s_all.dropna().unique(), key=_sort_key)
742
+
743
+ # All-NaN variable: emit a single "no data" row and any missing-row
744
+ # follow-up. Without this short-circuit the multi-level path produces
745
+ # a confusing group header with empty group cells.
746
+ if len(levels) == 0:
747
+ empty_cells: list[Cell] = [make_cell(label, align="left", bold=True)]
748
+ if show_overall:
749
+ empty_cells.append(make_cell("—", value=None, kind="numeric", align="right"))
750
+ for _ in group_keys:
751
+ empty_cells.append(make_cell("—", value=None, kind="numeric", align="right"))
752
+ if show_p:
753
+ empty_cells.append(make_cell("—", value=None, kind="p_value", align="right"))
754
+ if show_q:
755
+ empty_cells.append(make_cell("—", value=None, kind="q_value", align="right"))
756
+ if show_smd:
757
+ empty_cells.append(make_cell("—", value=None, kind="numeric", align="right"))
758
+ empty_rows: list[Row] = [Row(cells=tuple(empty_cells))]
759
+ _maybe_append_missing(empty_rows, data, var, group_keys, group_masks,
760
+ show_overall, show_p, show_q, show_smd,
761
+ pct_digits=pct_digits, missing_mode=missing_mode,
762
+ weights=weights)
763
+ return empty_rows, None
764
+
765
+ p_value: float | None = None
766
+ test_used: str | None = None
767
+ if show_p and by is not None:
768
+ if test_override is not None:
769
+ res = run_named_test(test_override, s_all, data[by], kind="categorical")
770
+ elif weights is not None:
771
+ # Survey-weighted data → Rao–Scott corrected chi-square.
772
+ res = rao_scott_chisq(s_all, data[by], weights)
773
+ else:
774
+ res = categorical_test(s_all, data[by])
775
+ p_value = res.p_value
776
+ test_used = res.test if res.p_value is not None else None
777
+
778
+ smd_val: float | None = None
779
+ if show_smd and by is not None:
780
+ smd_val = categorical_smd(s_all, data[by], levels=levels)
781
+
782
+ bold_row = bold_p_mode and p_value is not None and p_value < bold_p_threshold
783
+
784
+ def _weighted_n_tot(mask: pd.Series, target_value: Any) -> tuple[float, float]:
785
+ sub = data.loc[mask]
786
+ valid = sub[var].notna()
787
+ if weights is not None:
788
+ w_sub = weights.loc[sub.index]
789
+ tot = float(w_sub[valid].sum())
790
+ n_match = float(w_sub[valid & (sub[var] == target_value)].sum())
791
+ else:
792
+ tot = float(valid.sum())
793
+ n_match = float((sub[var] == target_value).sum())
794
+ return n_match, tot
795
+
796
+ # Dichotomous: render as a single row "var, level = level1" with n (%)
797
+ # for the second (success) level. This matches gtsummary defaults.
798
+ if kind == "dichotomous" and len(levels) == 2:
799
+ success = levels[1]
800
+ success_label = _fmt_level(success)
801
+ row_label = f"{label} = {success_label}"
802
+ cells: list[Cell] = [make_cell(row_label, align="left", bold=bold_row)]
803
+ if show_overall:
804
+ n, tot = _weighted_n_tot(pd.Series(True, index=data.index), success)
805
+ cells.append(make_cell(_fmt_weighted_n_pct(n, tot, pct_digits),
806
+ kind="numeric", align="right"))
807
+ for k in group_keys:
808
+ n, tot = _weighted_n_tot(group_masks[k], success)
809
+ cells.append(make_cell(_fmt_weighted_n_pct(n, tot, pct_digits),
810
+ kind="numeric", align="right"))
811
+ if show_p:
812
+ cells.append(make_cell(fmt_p_value(p_value), value=p_value,
813
+ kind="p_value", align="right", bold=bold_row))
814
+ if show_q:
815
+ cells.append(make_cell("", value=None, kind="q_value", align="right"))
816
+ if show_smd:
817
+ cells.append(make_cell(fmt_smd(smd_val), value=smd_val,
818
+ kind="numeric", align="right"))
819
+ rows: list[Row] = [Row(cells=tuple(cells))]
820
+ _maybe_append_missing(rows, data, var, group_keys, group_masks,
821
+ show_overall, show_p, show_q, show_smd,
822
+ pct_digits=pct_digits, missing_mode=missing_mode,
823
+ weights=weights)
824
+ return rows, test_used
825
+
826
+ # Multi-level categorical: header row with overall N + p-value + SMD,
827
+ # then one indented row per level.
828
+ rows = []
829
+ hdr: list[Cell] = [make_cell(label, align="left", bold=True)]
830
+ if show_overall:
831
+ hdr.append(make_cell("", value=None))
832
+ for _ in group_keys:
833
+ hdr.append(make_cell("", value=None))
834
+ if show_p:
835
+ hdr.append(make_cell(fmt_p_value(p_value), value=p_value,
836
+ kind="p_value", align="right",
837
+ bold=bold_row))
838
+ if show_q:
839
+ hdr.append(make_cell("", value=None, kind="q_value", align="right"))
840
+ if show_smd:
841
+ hdr.append(make_cell(fmt_smd(smd_val), value=smd_val,
842
+ kind="numeric", align="right"))
843
+ rows.append(Row(cells=tuple(hdr), is_group_header=True))
844
+
845
+ for lvl in levels:
846
+ cells = [make_cell(f"{_fmt_level(lvl)}", align="left", indent=1)]
847
+ if show_overall:
848
+ n, tot = _weighted_n_tot(pd.Series(True, index=data.index), lvl)
849
+ cells.append(make_cell(_fmt_weighted_n_pct(n, tot, pct_digits),
850
+ kind="numeric", align="right"))
851
+ for k in group_keys:
852
+ n, tot = _weighted_n_tot(group_masks[k], lvl)
853
+ cells.append(make_cell(_fmt_weighted_n_pct(n, tot, pct_digits),
854
+ kind="numeric", align="right"))
855
+ if show_p:
856
+ cells.append(make_cell("", value=None))
857
+ if show_q:
858
+ cells.append(make_cell("", value=None))
859
+ if show_smd:
860
+ cells.append(make_cell("", value=None))
861
+ rows.append(Row(cells=tuple(cells)))
862
+
863
+ _maybe_append_missing(rows, data, var, group_keys, group_masks,
864
+ show_overall, show_p, show_q, show_smd,
865
+ pct_digits=pct_digits, missing_mode=missing_mode,
866
+ weights=weights)
867
+ return rows, test_used
868
+
869
+
870
+ def _maybe_append_missing(
871
+ rows: list[Row],
872
+ data: pd.DataFrame,
873
+ var: str,
874
+ group_keys: list[Any],
875
+ group_masks: dict[Any, pd.Series],
876
+ show_overall: bool,
877
+ show_p: bool,
878
+ show_q: bool,
879
+ show_smd: bool,
880
+ *,
881
+ pct_digits: int,
882
+ missing_mode: str,
883
+ weights: pd.Series | None = None,
884
+ ) -> None:
885
+ if missing_mode == "never":
886
+ return
887
+ if weights is not None:
888
+ n_miss_overall = float(weights[data[var].isna()].sum())
889
+ else:
890
+ n_miss_overall = float(data[var].isna().sum())
891
+ if missing_mode == "ifany" and n_miss_overall == 0:
892
+ return
893
+
894
+ cells: list[Cell] = [make_cell(MISSING_LABEL, align="left", indent=1)]
895
+ if show_overall:
896
+ tot = float(weights.sum()) if weights is not None else float(len(data))
897
+ cells.append(make_cell(_fmt_weighted_n_pct(n_miss_overall, tot, pct_digits),
898
+ kind="numeric", align="right"))
899
+ for k in group_keys:
900
+ mask = group_masks[k]
901
+ if weights is not None:
902
+ n_miss = float(weights.loc[mask][data.loc[mask, var].isna()].sum())
903
+ tot = float(weights.loc[mask].sum())
904
+ else:
905
+ n_miss = float(data.loc[mask, var].isna().sum())
906
+ tot = float(mask.sum())
907
+ cells.append(make_cell(_fmt_weighted_n_pct(n_miss, tot, pct_digits),
908
+ kind="numeric", align="right"))
909
+ if show_p:
910
+ cells.append(make_cell("", value=None))
911
+ if show_q:
912
+ cells.append(make_cell("", value=None))
913
+ if show_smd:
914
+ cells.append(make_cell("", value=None))
915
+ rows.append(Row(cells=tuple(cells)))
916
+
917
+
918
+ # ----------------------------------------------------------------------
919
+ # add_global_p() — joint Wald p-value per variable for tbl_one /
920
+ # tbl_summary. Re-fits a logistic regression per variable on the
921
+ # source data (`Logit(by == ref ~ variable [+ adjust_for])`) and
922
+ # computes the joint p-value over the variable's coefficients. Adds
923
+ # a "global p" column to each row.
924
+ #
925
+ # Single-coefficient predictors (continuous, dichotomous) get the
926
+ # Wald p of that one coefficient. Multi-level categorical predictors
927
+ # (k levels → k-1 dummies) get the joint Wald F-test across all
928
+ # dummies — same statistic as gtsummary's ``add_global_p()``.
929
+ #
930
+ # v1 scope: 2-level ``by=``. With ≥3-level ``by=`` the joint test
931
+ # requires multinomial logit, which is left out of scope; the column
932
+ # is filled with em-dash and a footnote explains why.
933
+ # ----------------------------------------------------------------------
934
+
935
+
936
+ def _attach_global_p(
937
+ *,
938
+ data: pd.DataFrame,
939
+ by: str | None,
940
+ variables: tuple[str, ...],
941
+ kinds: dict[str, VarKind],
942
+ labels: dict[str, str],
943
+ rows: list[Row],
944
+ headers: tuple[HeaderRow, ...],
945
+ adjust_for: tuple[str, ...],
946
+ ) -> tuple[list[Row], tuple[HeaderRow, ...], list[str]]:
947
+ """Attach a joint-p column to a tbl_one table.
948
+
949
+ Walks the existing rows, identifies which rows belong to which
950
+ variable (by label-matching the first cell), and inserts a new
951
+ "global p" column carrying the joint Wald p-value for each
952
+ variable. Single-coefficient predictors get the Wald p directly;
953
+ multi-coefficient (categorical) predictors get the F-test joint p.
954
+
955
+ Parameters
956
+ ----------
957
+ data
958
+ The source DataFrame closed over by the ``tbl_one`` rebuild.
959
+ by
960
+ The stratifying column name; ``None`` is unsupported and
961
+ causes the function to return rows unchanged with a footnote
962
+ explaining the column was skipped.
963
+ variables, kinds, labels
964
+ Variable list + kind / label maps from the spec.
965
+ rows
966
+ The already-built body rows (will be re-emitted with one
967
+ extra cell appended).
968
+ headers
969
+ The already-built header rows (will be re-emitted with one
970
+ extra header cell appended).
971
+ adjust_for
972
+ Tuple of column names to include as covariates. Each is
973
+ treated as continuous if numeric and categorical (dummy
974
+ coded) otherwise.
975
+
976
+ Returns
977
+ -------
978
+ new_rows, new_headers, extra_footnotes
979
+ Rows / headers with the new column inserted, plus any
980
+ explanatory footnotes (e.g. for variables that couldn't be
981
+ fit).
982
+ """
983
+ extra_footnotes: list[str] = []
984
+
985
+ # Validate adjust_for columns up-front — fail fast with a clear
986
+ # ``KeyError`` rather than letting pandas raise a generic
987
+ # ``KeyError: ['NOPE'] not in index`` from deep inside the fit.
988
+ # Matches the validation pattern for ``by=`` and ``weights=``.
989
+ missing_adj = [c for c in adjust_for if c not in data.columns]
990
+ if missing_adj:
991
+ raise KeyError(
992
+ f"add_global_p: adjust_for column(s) {missing_adj!r} not in data"
993
+ )
994
+
995
+ if by is None:
996
+ extra_footnotes.append(
997
+ "add_global_p: skipped (no by= column).",
998
+ )
999
+ # Append blank cells so the column shape stays consistent.
1000
+ return (
1001
+ _append_blank_column(rows),
1002
+ _append_header_column(headers, "global p"),
1003
+ extra_footnotes,
1004
+ )
1005
+
1006
+ by_series = data[by]
1007
+ levels = sorted(by_series.dropna().unique(), key=_sort_key)
1008
+ if len(levels) != 2:
1009
+ # Multinomial logit is out of scope for v1.
1010
+ extra_footnotes.append(
1011
+ f"add_global_p: by={by!r} has {len(levels)} levels; "
1012
+ "v1 supports only 2-level stratification (multinomial "
1013
+ "logit not implemented).",
1014
+ )
1015
+ return (
1016
+ _append_blank_column(rows, fill="—"),
1017
+ _append_header_column(headers, "global p"),
1018
+ extra_footnotes,
1019
+ )
1020
+
1021
+ # Compute one joint p-value per variable.
1022
+ p_per_var: dict[str, float | None] = {}
1023
+ for var in variables:
1024
+ p_per_var[var] = _fit_global_p(
1025
+ data=data, by=by, by_levels=levels,
1026
+ var=var, kind=kinds[var], adjust_for=adjust_for,
1027
+ )
1028
+
1029
+ # Walk rows, map each to its variable, and append a cell. The map
1030
+ # uses the existing row labels:
1031
+ # - Continuous / categorical parent row: label = labels.get(var, var)
1032
+ # - Dichotomous: label = "varlabel = displayed_level"
1033
+ # - Categorical level rows: indented level text (parent row above)
1034
+ # - Missing sub-row: label = MISSING_LABEL
1035
+ # We rely on the build order: variables are processed sequentially
1036
+ # and the parent row of each variable always appears before its
1037
+ # level / missing sub-rows.
1038
+ var_label_to_var = {labels.get(v, v): v for v in variables}
1039
+ # For dichotomous "var = Level" rows, also map the prefix.
1040
+ dichot_label_to_var = {
1041
+ f"{labels.get(v, v)} = ": v for v in variables if kinds[v] == "dichotomous"
1042
+ }
1043
+
1044
+ new_rows: list[Row] = []
1045
+ for r in rows:
1046
+ first = r.cells[0].text
1047
+ # Identify variable for this row.
1048
+ matched_var: str | None = None
1049
+ if first in var_label_to_var:
1050
+ matched_var = var_label_to_var[first]
1051
+ else:
1052
+ for prefix, v in dichot_label_to_var.items():
1053
+ if first.startswith(prefix):
1054
+ matched_var = v
1055
+ break
1056
+
1057
+ if matched_var is not None:
1058
+ p = p_per_var.get(matched_var)
1059
+ cell = make_cell(
1060
+ fmt_p_value(p) if p is not None else "—",
1061
+ value=p, kind="p_value", align="right",
1062
+ )
1063
+ else:
1064
+ # Sub-row (categorical level, missing): blank so the
1065
+ # joint-p is visually anchored to the variable's parent row.
1066
+ cell = make_cell("", value=None)
1067
+
1068
+ new_rows.append(
1069
+ Row(cells=tuple(list(r.cells) + [cell]),
1070
+ is_group_header=r.is_group_header),
1071
+ )
1072
+
1073
+ new_headers = _append_header_column(headers, "global p")
1074
+ if adjust_for:
1075
+ extra_footnotes.append(
1076
+ "global p: joint Wald test on the variable's coefficients "
1077
+ f"from Logit({by} ~ variable + "
1078
+ f"{' + '.join(adjust_for)}).",
1079
+ )
1080
+ else:
1081
+ extra_footnotes.append(
1082
+ "global p: joint Wald test on the variable's coefficients "
1083
+ f"from Logit({by} ~ variable).",
1084
+ )
1085
+ return new_rows, new_headers, extra_footnotes
1086
+
1087
+
1088
+ def _fit_global_p(
1089
+ *,
1090
+ data: pd.DataFrame,
1091
+ by: str,
1092
+ by_levels: list[Any],
1093
+ var: str,
1094
+ kind: VarKind,
1095
+ adjust_for: tuple[str, ...],
1096
+ ) -> float | None:
1097
+ """Fit one logistic regression and return the joint Wald p-value
1098
+ for ``var``'s coefficients.
1099
+
1100
+ Implementation choices:
1101
+ * Outcome encoded as ``by == by_levels[1]`` (alphabetically
1102
+ second level) so the reference is well-defined.
1103
+ * Variable encoded based on inferred kind: continuous columns
1104
+ used as-is; dichotomous / categorical columns one-hot encoded
1105
+ via ``pd.get_dummies(drop_first=True)``.
1106
+ * Adjustment columns each encoded the same way (numeric →
1107
+ as-is; non-numeric → dummies).
1108
+ * Joint test built as a constraint string
1109
+ ``"c1 = 0, c2 = 0, ..."`` over the variable's columns and
1110
+ passed to ``model.f_test`` (matches what ``add_global_p()``
1111
+ does for ``tbl_regression``).
1112
+ * Singular design / convergence failure → ``None`` (renders as
1113
+ em-dash; never a misleading numeric).
1114
+ """
1115
+ # Build the working frame: drop rows with NaN in any required col.
1116
+ # Deduplicate column references — when the variable being tested is
1117
+ # also listed in ``adjust_for``, the duplicate would (a) make
1118
+ # ``data[cols]`` produce a 2-D selection that crashes
1119
+ # ``pd.to_numeric``, and (b) make the design matrix singular.
1120
+ # Variable always wins; the matching adjustment column is dropped.
1121
+ seen: set[str] = set()
1122
+ cols: list[str] = []
1123
+ for c in (by, var, *adjust_for):
1124
+ if c not in seen:
1125
+ seen.add(c)
1126
+ cols.append(c)
1127
+ sub = data[cols].dropna()
1128
+ if sub.empty or sub[by].nunique() < 2:
1129
+ return None
1130
+
1131
+ y = (sub[by] == by_levels[1]).astype(int).to_numpy()
1132
+
1133
+ var_cols = _design_columns(sub, var, kind)
1134
+ if not var_cols:
1135
+ return None
1136
+ adj_cols: list[tuple[str, Any]] = []
1137
+ for a in adjust_for:
1138
+ if a == var:
1139
+ continue # already in var_cols
1140
+ akind = _quick_kind(sub[a])
1141
+ for name, col in _design_columns(sub, a, akind):
1142
+ adj_cols.append((name, col))
1143
+
1144
+ # Stack into a single design matrix.
1145
+ import numpy as np
1146
+ import statsmodels.api as sm
1147
+ X_parts = [c for _, c in var_cols] + [c for _, c in adj_cols]
1148
+ X = np.column_stack(X_parts)
1149
+ # Add a constant column (intercept).
1150
+ X = sm.add_constant(X, has_constant="add")
1151
+
1152
+ # Column-name registry: index 0 is the const, then var_cols, then
1153
+ # adj_cols.
1154
+ col_names = ["const"] + [n for n, _ in var_cols] + [n for n, _ in adj_cols]
1155
+ if X.shape[1] != len(col_names): # pragma: no cover — defensive
1156
+ return None
1157
+
1158
+ import warnings as _w
1159
+ try:
1160
+ with _w.catch_warnings():
1161
+ _w.simplefilter("ignore") # statsmodels convergence chatter
1162
+ res = sm.Logit(y, X).fit(disp=False, method="newton",
1163
+ maxiter=100)
1164
+ except Exception: # pragma: no cover — defensive: singular design / no convergence
1165
+ return None
1166
+
1167
+ if not hasattr(res, "f_test"): # pragma: no cover
1168
+ return None
1169
+
1170
+ # Build the joint hypothesis: variable's dummies = 0.
1171
+ var_names = [n for n, _ in var_cols]
1172
+ constraint = ", ".join(
1173
+ f"x{col_names.index(n)} = 0" for n in var_names
1174
+ )
1175
+ try:
1176
+ ftest = res.f_test(constraint)
1177
+ return float(ftest.pvalue)
1178
+ except Exception: # pragma: no cover
1179
+ return None
1180
+
1181
+
1182
+ def _design_columns(
1183
+ sub: pd.DataFrame, var: str, kind: VarKind,
1184
+ ) -> list[tuple[str, Any]]:
1185
+ """Return a list of (column_name, numpy_array) pairs for a single
1186
+ variable, dummy-coded if categorical."""
1187
+ s = sub[var]
1188
+ if kind == "continuous":
1189
+ return [(var, pd.to_numeric(s, errors="coerce").to_numpy())]
1190
+ # Dichotomous and categorical both go through one-hot encoding;
1191
+ # ``drop_first=True`` keeps the design full-rank.
1192
+ dummies = pd.get_dummies(s, prefix=var, drop_first=True, dtype=float)
1193
+ return [(c, dummies[c].to_numpy()) for c in dummies.columns]
1194
+
1195
+
1196
+ def _quick_kind(s: pd.Series) -> VarKind:
1197
+ """Best-effort kind inference for adjustment columns."""
1198
+ if pd.api.types.is_numeric_dtype(s) and s.nunique() > 2:
1199
+ return "continuous"
1200
+ if s.nunique() <= 2:
1201
+ return "dichotomous"
1202
+ return "categorical"
1203
+
1204
+
1205
+ def _append_header_column(
1206
+ headers: tuple[HeaderRow, ...], label: str,
1207
+ ) -> tuple[HeaderRow, ...]:
1208
+ out = []
1209
+ for hr in headers:
1210
+ new_cells = list(hr.cells) + [HeaderCell(text=label, align="center", bold=True)]
1211
+ out.append(HeaderRow(cells=tuple(new_cells)))
1212
+ return tuple(out)
1213
+
1214
+
1215
+ def _append_blank_column(rows: list[Row], fill: str = "") -> list[Row]:
1216
+ return [
1217
+ Row(cells=tuple(list(r.cells) + [make_cell(fill, value=None)]),
1218
+ is_group_header=r.is_group_header)
1219
+ for r in rows
1220
+ ]