pysofra 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysofra/__init__.py +82 -0
- pysofra/core/__init__.py +14 -0
- pysofra/core/compose.py +167 -0
- pysofra/core/format.py +155 -0
- pysofra/core/frames.py +69 -0
- pysofra/core/schema.py +128 -0
- pysofra/core/table.py +924 -0
- pysofra/io/__init__.py +1 -0
- pysofra/models/__init__.py +6 -0
- pysofra/models/extract.py +249 -0
- pysofra/models/pool.py +119 -0
- pysofra/models/regression.py +507 -0
- pysofra/models/survival.py +395 -0
- pysofra/models/uvregression.py +438 -0
- pysofra/notebook/__init__.py +6 -0
- pysofra/plot/__init__.py +23 -0
- pysofra/plot/_backend.py +32 -0
- pysofra/plot/forest.py +159 -0
- pysofra/plot/inline.py +171 -0
- pysofra/plot/km.py +249 -0
- pysofra/render/__init__.py +28 -0
- pysofra/render/_zip_determinism.py +57 -0
- pysofra/render/base.py +22 -0
- pysofra/render/docx.py +286 -0
- pysofra/render/html.py +442 -0
- pysofra/render/image.py +130 -0
- pysofra/render/latex.py +253 -0
- pysofra/render/markdown.py +128 -0
- pysofra/render/pptx.py +340 -0
- pysofra/render/xlsx.py +226 -0
- pysofra/summary/__init__.py +6 -0
- pysofra/summary/calibrate.py +214 -0
- pysofra/summary/design.py +246 -0
- pysofra/summary/effect_size.py +187 -0
- pysofra/summary/extras.py +745 -0
- pysofra/summary/smd.py +133 -0
- pysofra/summary/stats.py +135 -0
- pysofra/summary/tbl_cross.py +339 -0
- pysofra/summary/tbl_one.py +1220 -0
- pysofra/summary/tbl_summary.py +51 -0
- pysofra/summary/tests.py +370 -0
- pysofra/summary/typing.py +129 -0
- pysofra/summary/weights.py +161 -0
- pysofra/themes/__init__.py +5 -0
- pysofra/themes/registry.py +272 -0
- pysofra-0.1.0a1.dist-info/METADATA +301 -0
- pysofra-0.1.0a1.dist-info/RECORD +50 -0
- pysofra-0.1.0a1.dist-info/WHEEL +4 -0
- pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
- pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
"""Univariable regression — one model per predictor, stacked vertically in one table.
|
|
2
|
+
|
|
3
|
+
Equivalent to R ``gtsummary::tbl_uvregression``. For each predictor,
|
|
4
|
+
a separate regression of ``outcome ~ predictor`` (optionally
|
|
5
|
+
``+ adjust_for``) is fit; results are stacked vertically into a single
|
|
6
|
+
table.
|
|
7
|
+
|
|
8
|
+
Categorical predictors are dummy-encoded (first level = reference). A
|
|
9
|
+
group-header row is emitted for each multi-level predictor; each
|
|
10
|
+
non-reference level becomes its own indented body row, matching the
|
|
11
|
+
gtsummary layout.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections.abc import Callable
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
21
|
+
from ..core.frames import to_pandas
|
|
22
|
+
from ..core.schema import Cell, HeaderCell, HeaderRow, Row, make_cell
|
|
23
|
+
from ..core.table import SofraTable, TableSpec
|
|
24
|
+
from .extract import extract
|
|
25
|
+
from .regression import _default_estimate_label
|
|
26
|
+
|
|
27
|
+
# Separator used when dummy-encoding a categorical predictor's columns.
|
|
28
|
+
# Triple underscore makes accidental collisions with real column names
|
|
29
|
+
# vanishingly rare.
|
|
30
|
+
_DUMMY_SEP = "___"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _is_continuous(col: pd.Series) -> bool:
    """Report whether ``col`` should be modelled as a continuous predictor.

    Any numeric dtype counts as continuous, with one exception: boolean
    columns are dichotomous and are therefore not continuous.
    """
    if pd.api.types.is_bool_dtype(col):
        return False
    return pd.api.types.is_numeric_dtype(col)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _expand_predictor(
    sub: pd.DataFrame, pred: str,
) -> tuple[pd.DataFrame, list[tuple[str | None, str, bool]]]:
    """Return ``(design_frame, level_specs)`` for one predictor.

    For a numeric predictor this is the identity: one design column,
    one output row, no reference level.

    For a categorical / boolean predictor the levels are dummy-encoded.
    The reference is the first level that is actually *observed* in
    ``sub`` (declared-but-empty categories are ignored), and every other
    observed level gets its own design column.

    Parameters
    ----------
    sub
        Frame containing the column ``pred``.
    pred
        Name of the predictor column to expand.

    Returns
    -------
    design_frame
        The columns to place in the design matrix for this predictor.
        Has no columns when fewer than two levels are observed.
    level_specs
        List of ``(design_column_name_or_None, display_label,
        is_reference)`` tuples — one tuple per *displayed* row, ordered
        top-to-bottom. Empty when fewer than two levels are observed.
    """
    col = sub[pred]
    if _is_continuous(col):
        return pd.DataFrame({pred: col}), [(pred, pred, False)]

    # Categorical / boolean — enumerate the declared levels in display order.
    if isinstance(col.dtype, pd.CategoricalDtype):
        declared: list[Any] = list(col.cat.categories)
    elif pd.api.types.is_bool_dtype(col):
        declared = [False, True]
    else:
        declared = sorted(col.dropna().unique(), key=str)

    dummies = pd.get_dummies(col, prefix=pred, prefix_sep=_DUMMY_SEP, dtype=float)

    def _dummy_name(level: Any) -> str:
        # Same "<prefix><sep><level>" pattern get_dummies was asked to use.
        return f"{pred}{_DUMMY_SEP}{level}"

    # Keep only levels with at least one observation. A categorical dtype
    # yields a dummy column for every declared category, and a boolean
    # column may contain a single value; an all-zero dummy is collinear
    # with the intercept. Choosing the reference among *observed* levels
    # also avoids dropping a phantom reference while every real level
    # keeps its dummy (which would be collinear with the intercept).
    populated = {c for c in dummies.columns if dummies[c].sum() != 0}
    levels = [lvl for lvl in declared if _dummy_name(lvl) in populated]

    if len(levels) < 2:
        # Fewer than two observed levels → nothing to fit.
        return pd.DataFrame(index=sub.index), []

    ref, *others = levels
    keep = {_dummy_name(lvl) for lvl in others}
    design = dummies[[c for c in dummies.columns if c in keep]]

    # Boolean levels get nicer display labels.
    def _fmt_level(x: Any) -> str:
        if isinstance(x, bool):
            return "Yes" if x else "No"
        return str(x)

    # Reference first (label only, no fit), then each non-reference level.
    level_specs: list[tuple[str | None, str, bool]] = [
        (None, _fmt_level(ref), True)
    ]
    level_specs.extend(
        (_dummy_name(lvl), _fmt_level(lvl), False) for lvl in others
    )

    return design, level_specs
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def tbl_uvregression(
    data: Any,
    *,
    outcome: str,
    predictors: list[str] | None = None,
    method: Callable[..., Any] | str = "OLS",
    method_kwargs: dict[str, Any] | None = None,
    adjust_for: list[str] | None = None,
    exponentiate: bool | None = None,
    conf_level: float = 0.95,
    digits: int = 2,
    labels: dict[str, str] | None = None,
) -> SofraTable:
    """Univariable regression — one model per predictor.

    Parameters
    ----------
    data
        Source dataframe (pandas or polars).
    outcome
        Column name of the response variable.
    predictors
        Predictor columns. Defaults to every column except ``outcome``
        and any ``adjust_for`` covariates (numeric *and* categorical).
    method
        Either one of the string aliases ``"OLS"``, ``"Logit"``,
        ``"Poisson"``, ``"GLM"``, or a callable. A callable is invoked
        as ``method(y, X, **method_kwargs)`` and must return a
        statsmodels-style *model* object; its ``.fit(disp=False)`` is
        called here to obtain the results.
    method_kwargs
        Extra keyword arguments forwarded to the model class / callable.
    adjust_for
        Optional list of covariates included in every univariable fit.
        Adjustment covariates are themselves dummy-encoded if
        categorical.
    exponentiate
        If ``True``, exponentiate point estimates and CI bounds.
        ``None`` (default) auto-selects based on the model family.
    conf_level
        Confidence level for the CI column.
    digits
        Decimal places for estimates and CI bounds.
    labels
        Mapping from predictor name → display label. Applied to the
        group-header row of categorical predictors and to the single
        row of continuous predictors.

    Returns
    -------
    SofraTable
        One block of rows per predictor. ``metadata["failed"]`` lists
        predictors (or ``"predictor=level"`` entries) that produced no
        coefficient.

    Raises
    ------
    ImportError
        If statsmodels is not installed.
    KeyError
        If ``outcome``, a predictor, or an ``adjust_for`` covariate is
        not a column of ``data``.
    ValueError
        If predictors / ``adjust_for`` / ``outcome`` overlap, ``method``
        is an unknown alias, or there were no predictors to fit.

    Notes
    -----
    For a categorical predictor with K levels the result has
    ``K + 1`` rows: a header naming the variable, plus ``K`` indented
    rows (the reference level rendered as ``— ref``, and one row
    per non-reference level with its estimate / CI / p-value).
    """
    try:
        import statsmodels.api as sm
    except ImportError as e:  # pragma: no cover
        raise ImportError(
            "tbl_uvregression requires statsmodels. "
            "Install with `pip install statsmodels`."
        ) from e

    df = to_pandas(data)
    if outcome not in df.columns:
        raise KeyError(f"outcome column {outcome!r} not in data")
    df = df.dropna(subset=[outcome])

    adjust_for = list(adjust_for or [])
    if predictors is None:
        excluded = {outcome, *adjust_for}
        predictors = [c for c in df.columns if c not in excluded]

    # Predictor / adjust_for overlap doesn't make sense ("regress y on x
    # adjusted for x") and would also break design-matrix assembly
    # because pandas returns a DataFrame, not a Series, when a column
    # name is duplicated in a slice.
    overlap = sorted(set(predictors) & set(adjust_for))
    if overlap:
        raise ValueError(
            f"Predictor(s) {overlap} also appear in adjust_for; remove "
            f"from one of the two."
        )
    if outcome in predictors:
        raise ValueError(
            f"outcome {outcome!r} must not appear in predictors."
        )
    if outcome in adjust_for:
        raise ValueError(
            f"outcome {outcome!r} must not appear in adjust_for."
        )

    # Fail early, and by name, on unknown columns instead of letting a
    # pandas KeyError surface from the middle of the fitting loop.
    missing = [c for c in (*predictors, *adjust_for) if c not in df.columns]
    if missing:
        raise KeyError(f"column(s) {missing} not in data")

    labels = dict(labels or {})
    method_kwargs = dict(method_kwargs or {})

    aliases: dict[str, Callable[..., Any]] = {
        "OLS": sm.OLS,
        "Logit": sm.Logit,
        "Poisson": sm.Poisson,
        "GLM": sm.GLM,
    }
    model_factory: Callable[..., Any]
    if callable(method):
        model_factory = method
    elif isinstance(method, str) and method in aliases:
        model_factory = aliases[method]
    else:
        raise ValueError(
            f"Unknown method {method!r}; pass a callable or one of "
            "'OLS', 'Logit', 'Poisson', 'GLM'."
        )

    # Pre-expand the adjust_for block once — it's shared across rows.
    if adjust_for:
        sub_adjust = df[adjust_for]
        adjust_design_blocks: list[pd.DataFrame] = [
            _expand_predictor(sub_adjust, a)[0] for a in adjust_for
        ]
        adjust_block_template = pd.concat(adjust_design_blocks, axis=1)
    else:
        adjust_block_template = pd.DataFrame(index=df.index)

    # ------------------------------------------------------------------
    # Fit one regression per predictor.
    # ------------------------------------------------------------------
    rows: list[Row] = []
    # Distinct model families in first-seen order (a list, not a set, so
    # "the first family" used for labels below is deterministic).
    families: list[str] = []
    exp_per: list[bool] = []
    failed: list[str] = []

    for pred in predictors:
        # Build the working sub-frame: outcome + adjust_for + this predictor.
        cols_needed = [outcome, pred, *adjust_for]
        sub = df[cols_needed].dropna()
        if sub.empty:
            failed.append(pred)
            continue

        pred_design, level_specs = _expand_predictor(sub, pred)
        if pred_design.empty or not level_specs:
            failed.append(pred)
            continue

        # A predictor column that is constant within the complete cases
        # cannot be told apart from the intercept: add_constant (with its
        # default has_constant="skip") would add no intercept and the
        # column's "coefficient" would be meaningless. Report it as
        # failed rather than emitting a bogus estimate.
        if (pred_design.nunique(dropna=True) < 2).any():
            failed.append(pred)
            continue

        # Stitch predictor + adjust into a single design matrix.
        if adjust_block_template.empty:
            design_X = pred_design.copy()
        else:
            adjust_block = adjust_block_template.loc[sub.index]
            if adjust_block.empty:
                design_X = pred_design.copy()
            else:
                design_X = pd.concat([pred_design, adjust_block], axis=1)
        X = sm.add_constant(design_X)

        try:
            fit = model_factory(sub[outcome], X, **method_kwargs).fit(disp=False)
        except Exception:
            # Deliberate best-effort: one predictor that cannot be fitted
            # must not abort the whole table; it is listed in the
            # footnote and in metadata["failed"] instead.
            failed.append(pred)
            continue

        summary = extract(fit, conf_level=conf_level)
        if summary.family not in families:
            families.append(summary.family)
        exp = summary.natural_exponentiate if exponentiate is None else bool(exponentiate)
        exp_per.append(exp)

        n_sub = int(len(sub))
        display_label = labels.get(pred, pred)
        n_levels = len(level_specs)
        is_categorical_predictor = n_levels > 1 or (
            n_levels == 1 and level_specs[0][2]  # reference-only edge case
        )

        if not is_categorical_predictor:
            # Numeric / continuous → single row, no header.
            design_col = level_specs[0][0]
            assert design_col is not None
            if design_col not in summary.estimates.index:
                failed.append(pred)
                continue
            rows.append(_one_predictor_row(
                design_col, summary, exp=exp, digits=digits,
                label=display_label, n=n_sub, indent=0,
                source_name=pred,
            ))
            continue

        # Multi-level categorical → group header + one row per level.
        rows.append(_group_header_row(
            display_label, source_name=pred, n=n_sub, n_cols=5,
        ))
        for design_col, lvl_label, is_ref in level_specs:
            if is_ref:
                rows.append(_reference_row(lvl_label, source_name=pred))
                continue
            if design_col is None or design_col not in summary.estimates.index:  # pragma: no cover — would require a singular fit dropping a non-ref column
                failed.append(f"{pred}={lvl_label}")
                continue
            # Count the level by summing the dummy column (avoids
            # string-vs-bool comparison pitfalls when reversing the
            # mangle).
            level_n = int(pred_design[design_col].sum())
            rows.append(_one_predictor_row(
                design_col, summary, exp=exp, digits=digits,
                label=lvl_label, n=level_n,
                indent=1, source_name=pred,
            ))

    if not rows and not failed:
        raise ValueError("No predictors produced a coefficient.")

    # Estimate label uses the first family / first exponentiate setting.
    family_label = families[0] if families else "?"
    est_label = _default_estimate_label(family_label, exp_per[0] if exp_per else False)
    ci_pct = int(round(conf_level * 100))

    headers = (HeaderRow(cells=(
        HeaderCell(text="Predictor", align="left"),
        HeaderCell(text="N"),
        HeaderCell(text=est_label),
        HeaderCell(text=f"{ci_pct}% CI"),
        HeaderCell(text="p-value"),
    )),)

    footnotes: list[str] = []
    if adjust_for:
        footnotes.append("Each variable's coefficient is adjusted for: "
                         f"{', '.join(adjust_for)}.")
    else:
        footnotes.append(
            "Each row is a univariable regression of the outcome on the "
            "named predictor."
        )
    if any(exp_per):
        footnotes.append(
            f"{est_label} = exponentiated coefficient; "
            f"CI = {ci_pct}% confidence interval."
        )
    else:
        footnotes.append(f"CI = {ci_pct}% confidence interval.")
    if families:
        footnotes.append(f"Model: {families[0]}.")
    if failed:
        footnotes.append(
            f"{len(failed)} predictor(s) / level(s) failed to converge or "
            f"had no data: {', '.join(failed)}."
        )

    # Callables such as functools.partial objects carry no __name__;
    # fall back to the type name instead of raising after all the fits.
    method_name = (
        method if isinstance(method, str)
        else getattr(method, "__name__", type(method).__name__)
    )
    spec = TableSpec(
        builder="tbl_uvregression",
        options={
            "outcome": outcome,
            "predictors": tuple(predictors),
            "method": method_name,
            "exponentiate": exponentiate,
            "conf_level": conf_level,
            "digits": digits,
        },
    )
    return SofraTable(
        rows=tuple(rows),
        headers=headers,
        footnotes=tuple(footnotes),
        metadata={
            "builder": "tbl_uvregression",
            "family": families[0] if families else None,
            "failed": failed,
        },
        _spec=spec,
    )
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _group_header_row(label: str, *, source_name: str, n: int, n_cols: int) -> Row:
    """Build the bold heading row that introduces a categorical predictor.

    The first cell holds ``label`` (bold, left-aligned), the second holds
    the count ``n``; the remaining ``n_cols - 2`` cells are empty
    placeholders. The row is flagged as a group header and tagged with
    ``source_name`` under the ``"variable"`` metadata key.
    """
    title_cell = make_cell(label, align="left", bold=True)
    count_cell = make_cell(str(n), value=n, kind="numeric", align="right")
    padding = [Cell(text="", value=None) for _ in range(n_cols - 2)]
    return Row(
        cells=(title_cell, count_cell, *padding),
        is_group_header=True,
        metadata={"variable": source_name},
    )
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _reference_row(level_label: str, *, source_name: str) -> Row:
    """Build the row for a reference level: label plus a '— ref' marker.

    The N, CI and p-value cells are left blank; only the estimate column
    carries text (the ``— ref`` marker, with no numeric value).
    """
    label_cell = make_cell(level_label, align="left", indent=1)
    blank_n = Cell(text="", value=None)
    ref_marker = make_cell("— ref", value=None, kind="numeric", align="right")
    blank_ci = Cell(text="", value=None, kind="ci")
    blank_p = Cell(text="", value=None, kind="p_value")
    return Row(
        cells=(label_cell, blank_n, ref_marker, blank_ci, blank_p),
        metadata={"variable": source_name},
    )
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def _one_predictor_row(
    design_col: str,
    summary: Any,  # ModelSummary
    *,
    exp: bool,
    digits: int,
    label: str,
    n: int,
    indent: int = 0,
    source_name: str | None = None,
) -> Row:
    """Build the body row (label, N, estimate, CI, p-value) for one coefficient.

    Parameters
    ----------
    design_col
        Key of the coefficient in ``summary.estimates`` (and, when
        present, in ``ci_lo`` / ``ci_hi`` / ``pvalues``).
    summary
        Object exposing ``estimates``, ``ci_lo``, ``ci_hi`` and
        ``pvalues`` indexed by design-column name.
    exp
        When true, the estimate and both CI bounds are exponentiated.
    digits
        Passed to ``fmt_number`` for the estimate and the CI bounds.
    label
        Text of the first cell.
    n
        Count shown in the N cell.
    indent
        Indent level of the label cell.
    source_name
        Stored under ``"variable"`` in the row metadata when truthy.
    """
    from math import exp as _exp

    from ..core.format import fmt_number, fmt_p_value

    def _optional(series: Any) -> float:
        # CI bounds / p-values may be absent for a coefficient → NaN.
        if design_col in series.index:
            return float(series[design_col])
        return float("nan")

    def _exp_or_limit(x: float) -> float:
        # math.exp overflows for large inputs; saturate instead of raising.
        try:
            return _exp(x)
        except OverflowError:
            return float("inf") if x > 0 else 0.0

    est = float(summary.estimates[design_col])
    lo = _optional(summary.ci_lo)
    hi = _optional(summary.ci_hi)
    p = _optional(summary.pvalues)

    if exp:
        est_d = _exp_or_limit(est)
        lo_d = _exp_or_limit(lo)
        hi_d = _exp_or_limit(hi)
    else:
        est_d, lo_d, hi_d = est, lo, hi

    label_cell = make_cell(label, align="left", indent=indent)
    n_cell = make_cell(str(n), value=n, kind="numeric", align="right")
    est_cell = make_cell(
        fmt_number(est_d, digits), value=est_d, kind="numeric", align="right",
    )
    ci_cell = make_cell(
        f"{fmt_number(lo_d, digits)}, {fmt_number(hi_d, digits)}",
        value=(lo_d, hi_d), kind="ci", align="right",
    )
    p_cell = make_cell(fmt_p_value(p), value=p, kind="p_value", align="right")

    row_meta = {"variable": source_name} if source_name else {}
    return Row(
        cells=(label_cell, n_cell, est_cell, ci_cell, p_cell),
        metadata=row_meta,
    )
|
pysofra/plot/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Matplotlib-backed plot generation for SofraTables.
|
|
2
|
+
|
|
3
|
+
Plot helpers return :class:`InlinePlot` objects carrying SVG, PNG, and
|
|
4
|
+
PDF serialisations of the same matplotlib figure so every renderer
|
|
5
|
+
(HTML, DOCX, PPTX, LaTeX) can embed the plot consistently.
|
|
6
|
+
|
|
7
|
+
* :func:`forest_plot` — point estimates + CIs from a regression table.
|
|
8
|
+
* :func:`km_curve` — Kaplan–Meier survival curves.
|
|
9
|
+
|
|
10
|
+
Both depend on ``matplotlib``, which is an optional dependency.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from .forest import forest_plot, forest_plot_svg
|
|
14
|
+
from .inline import InlinePlot
|
|
15
|
+
from .km import km_curve, km_curve_svg
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"InlinePlot",
|
|
19
|
+
"forest_plot",
|
|
20
|
+
"forest_plot_svg",
|
|
21
|
+
"km_curve",
|
|
22
|
+
"km_curve_svg",
|
|
23
|
+
]
|
pysofra/plot/_backend.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Headless-safe matplotlib backend setup.
|
|
2
|
+
|
|
3
|
+
PySofra never opens a window — every figure is serialised to bytes
|
|
4
|
+
(PNG, SVG, PDF). We therefore force matplotlib's ``Agg`` backend
|
|
5
|
+
before pyplot first creates a figure. Without this, the default
|
|
6
|
+
backend on macOS is ``MacOSX`` which calls into the GUI subsystem;
|
|
7
|
+
in sandboxed environments (HOME=/nonexistent, no display, container
|
|
8
|
+
without X) this aborts with a Cocoa error during figure creation.
|
|
9
|
+
|
|
10
|
+
This helper is idempotent: ``matplotlib.use("Agg", force=True)`` is
|
|
11
|
+
a cheap dictionary update once the backend is loaded. Calling it
|
|
12
|
+
from every plot entry point is the simplest way to guarantee Agg
|
|
13
|
+
is in effect regardless of import order.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def use_headless_backend() -> None:
    """Switch matplotlib to the non-interactive ``Agg`` backend.

    Meant to run right before ``import matplotlib.pyplot as plt`` in a
    plot entry point. It may be called any number of times. When
    matplotlib is not installed the function silently does nothing and
    leaves the friendlier import error to the caller.
    """
    try:
        import matplotlib
    except ImportError:  # pragma: no cover — caller raises a friendlier error
        return None
    else:
        matplotlib.use("Agg", force=True)
|
pysofra/plot/forest.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Forest plot rendering for regression SofraTables."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ..core.table import SofraTable
|
|
9
|
+
from .inline import InlinePlot, fig_to_svg, render_inline_plot
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def forest_plot(
    table: SofraTable,
    *,
    log_x: bool = True,
    null_line: float = 1.0,
    width_in: float = 6.5,
    height_per_row_in: float = 0.42,
    color: str = "#0b3d91",
) -> InlinePlot:
    """Render a forest plot as an :class:`InlinePlot` (SVG + PNG + PDF).

    Use this when you want the plot embedded across multiple renderers
    (HTML, DOCX, PPTX, LaTeX). For the HTML-only string form, use
    :func:`forest_plot_svg`.

    Parameters
    ----------
    table
        Regression table whose rows carry an estimate cell and a CI cell.
    log_x
        Plot on a log-scale x-axis.
    null_line
        x-coordinate of the dashed null reference line.
    width_in
        Figure width in inches.
    height_per_row_in
        Vertical space per plotted row, in inches.
    color
        Colour of the points and CI segments.

    Returns
    -------
    InlinePlot
        The serialised figure.
    """
    fig, height = _build_forest_figure(
        table, log_x=log_x, null_line=null_line,
        width_in=width_in, height_per_row_in=height_per_row_in, color=color,
    )
    try:
        return render_inline_plot(fig, width_in=width_in, height_in=height)
    finally:
        # Close the figure even when serialisation fails; otherwise it
        # stays registered with pyplot and is never released.
        try:
            import matplotlib.pyplot as plt
        except ImportError:  # pragma: no cover
            pass
        else:
            plt.close(fig)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def forest_plot_svg(
    table: SofraTable,
    *,
    log_x: bool = True,
    null_line: float = 1.0,
    width_in: float = 6.5,
    height_per_row_in: float = 0.42,
    color: str = "#0b3d91",
) -> str:
    """Render a forest plot for a regression :class:`SofraTable`.

    Reads point estimates and CI bounds from the body rows: it looks for
    a numeric cell of kind ``numeric`` (the point estimate) followed by
    a cell of kind ``ci`` carrying ``(lo, hi)`` tuples — exactly the
    layout produced by :func:`pysofra.tbl_regression`.

    Parameters
    ----------
    table
        A SofraTable produced by ``tbl_regression`` (single- or multi-model).
    log_x
        Plot on a log-scale x-axis. Default ``True`` because the natural
        scale for ORs / HRs / IRRs is multiplicative.
    null_line
        x-coordinate of the null reference (1 for exponentiated, 0 for raw).
    width_in
        Figure width in inches.
    height_per_row_in
        Vertical space per coefficient row.
    color
        Hex string for the point + CI segments.

    Returns
    -------
    str
        The figure serialised as SVG markup.
    """
    fig, _ = _build_forest_figure(
        table, log_x=log_x, null_line=null_line,
        width_in=width_in, height_per_row_in=height_per_row_in, color=color,
    )
    try:
        return fig_to_svg(fig)
    finally:
        # Close the figure even when serialisation fails; otherwise it
        # stays registered with pyplot and is never released.
        try:
            import matplotlib.pyplot as plt
        except ImportError:  # pragma: no cover
            pass
        else:
            plt.close(fig)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _build_forest_figure(
    table: SofraTable,
    *,
    log_x: bool,
    null_line: float,
    width_in: float,
    height_per_row_in: float,
    color: str,
) -> tuple[Any, float]:
    """Draw the forest plot for ``table`` and return ``(figure, height_in)``.

    Each body row contributes one point when it has a ``ci`` cell holding
    a ``(lo, hi)`` tuple and a ``numeric`` cell holding the estimate. The
    estimate is the numeric cell closest to the *left* of the CI cell, so
    rows that carry other numeric columns before the estimate (such as a
    sample-size column) are read correctly; if no numeric cell precedes
    the CI, the first numeric cell of the row is used. Rows without such
    a pair, or with a NaN estimate / bound, are skipped.

    The caller owns the returned figure and is responsible for closing it.

    Raises
    ------
    ImportError
        If matplotlib is not available.
    ValueError
        If no row yields an (estimate, CI) pair.
    """
    try:
        from ._backend import use_headless_backend
        use_headless_backend()
        import matplotlib.pyplot as plt
    except ImportError as e:  # pragma: no cover
        raise ImportError(
            "Forest plots require matplotlib. Install with "
            "`pip install matplotlib`."
        ) from e

    def _is_estimate(cell: Any) -> bool:
        return cell.kind == "numeric" and isinstance(cell.value, (int, float))

    def _is_interval(cell: Any) -> bool:
        return (
            cell.kind == "ci"
            and isinstance(cell.value, tuple)
            and len(cell.value) == 2
        )

    points: list[tuple[str, float, float, float]] = []
    est_col: int | None = None  # column of the estimate in the first plotted row
    for r in table.rows:
        cells = r.cells
        ci_idx = next((i for i, c in enumerate(cells) if _is_interval(c)), None)
        if ci_idx is None:
            continue
        # Prefer the numeric cell nearest to the left of the CI; taking the
        # first numeric cell of the row would pick up a leading N column.
        est_idx = next(
            (i for i in range(ci_idx - 1, -1, -1) if _is_estimate(cells[i])),
            None,
        )
        if est_idx is None:
            est_idx = next(
                (i for i, c in enumerate(cells) if _is_estimate(c)), None,
            )
        if est_idx is None:
            continue
        est_value = cells[est_idx].value
        lo, hi = cells[ci_idx].value
        if any(_isnan(x) for x in (est_value, lo, hi)):
            continue
        if est_col is None:
            est_col = est_idx
        points.append((cells[0].text, float(est_value), float(lo), float(hi)))

    if not points:
        raise ValueError("No (estimate, CI) pairs found in table; "
                         "is this a regression table?")

    # Label the x-axis with the header of the estimate column; fall back
    # to the second header cell when that column has no header cell.
    if table.headers:
        header_cells = table.headers[0].cells
        header_idx = est_col if est_col is not None and est_col < len(header_cells) else 1
        x_label = header_cells[header_idx].text
    else:
        x_label = "Estimate"

    n = len(points)
    height = max(2.0, height_per_row_in * n + 1.0)
    fig, ax = plt.subplots(figsize=(width_in, height))

    try:
        labels = [p[0] for p in points]
        estimates = [p[1] for p in points]
        lows = [p[2] for p in points]
        highs = [p[3] for p in points]

        # First row at the top of the plot.
        ys = list(range(n, 0, -1))
        ax.errorbar(
            estimates, ys,
            xerr=[[e - lo for e, lo in zip(estimates, lows, strict=True)],
                  [hi - e for e, hi in zip(estimates, highs, strict=True)]],
            fmt="s", color=color, ecolor=color,
            elinewidth=1.5, capsize=4, markersize=7,
        )

        ax.axvline(null_line, color="#888", linewidth=1, linestyle="--", zorder=0)
        if log_x:
            ax.set_xscale("log")
        ax.set_yticks(ys)
        ax.set_yticklabels(labels)
        ax.set_ylim(0.5, n + 0.5)
        ax.tick_params(axis="y", left=False)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["left"].set_visible(False)
        ax.set_xlabel(x_label)
    except Exception:
        # The caller never receives the figure on failure, so release it here.
        plt.close(fig)
        raise

    return fig, height
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _isnan(x: Any) -> bool:
    """Return ``True`` only when ``x`` converts to a float that is NaN.

    Values that cannot be converted to ``float`` (raising ``TypeError``
    or ``ValueError``) are reported as not-NaN instead of raising.
    """
    try:
        as_float = float(x)
    except (TypeError, ValueError):
        return False
    return math.isnan(as_float)
|