pysofra 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. pysofra/__init__.py +82 -0
  2. pysofra/core/__init__.py +14 -0
  3. pysofra/core/compose.py +167 -0
  4. pysofra/core/format.py +155 -0
  5. pysofra/core/frames.py +69 -0
  6. pysofra/core/schema.py +128 -0
  7. pysofra/core/table.py +924 -0
  8. pysofra/io/__init__.py +1 -0
  9. pysofra/models/__init__.py +6 -0
  10. pysofra/models/extract.py +249 -0
  11. pysofra/models/pool.py +119 -0
  12. pysofra/models/regression.py +507 -0
  13. pysofra/models/survival.py +395 -0
  14. pysofra/models/uvregression.py +438 -0
  15. pysofra/notebook/__init__.py +6 -0
  16. pysofra/plot/__init__.py +23 -0
  17. pysofra/plot/_backend.py +32 -0
  18. pysofra/plot/forest.py +159 -0
  19. pysofra/plot/inline.py +171 -0
  20. pysofra/plot/km.py +249 -0
  21. pysofra/render/__init__.py +28 -0
  22. pysofra/render/_zip_determinism.py +57 -0
  23. pysofra/render/base.py +22 -0
  24. pysofra/render/docx.py +286 -0
  25. pysofra/render/html.py +442 -0
  26. pysofra/render/image.py +130 -0
  27. pysofra/render/latex.py +253 -0
  28. pysofra/render/markdown.py +128 -0
  29. pysofra/render/pptx.py +340 -0
  30. pysofra/render/xlsx.py +226 -0
  31. pysofra/summary/__init__.py +6 -0
  32. pysofra/summary/calibrate.py +214 -0
  33. pysofra/summary/design.py +246 -0
  34. pysofra/summary/effect_size.py +187 -0
  35. pysofra/summary/extras.py +745 -0
  36. pysofra/summary/smd.py +133 -0
  37. pysofra/summary/stats.py +135 -0
  38. pysofra/summary/tbl_cross.py +339 -0
  39. pysofra/summary/tbl_one.py +1220 -0
  40. pysofra/summary/tbl_summary.py +51 -0
  41. pysofra/summary/tests.py +370 -0
  42. pysofra/summary/typing.py +129 -0
  43. pysofra/summary/weights.py +161 -0
  44. pysofra/themes/__init__.py +5 -0
  45. pysofra/themes/registry.py +272 -0
  46. pysofra-0.1.0a1.dist-info/METADATA +301 -0
  47. pysofra-0.1.0a1.dist-info/RECORD +50 -0
  48. pysofra-0.1.0a1.dist-info/WHEEL +4 -0
  49. pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
  50. pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
pysofra/io/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """I/O utilities — reserved for future readers (Stata/SAS/Excel)."""
@@ -0,0 +1,6 @@
1
+ """Model-output table builders."""
2
+
3
+ from .regression import tbl_regression
4
+ from .survival import tbl_survival
5
+
6
+ __all__ = ["tbl_regression", "tbl_survival"]
@@ -0,0 +1,249 @@
1
+ """Coefficient extraction from fitted models.
2
+
3
+ Different libraries expose fitted-model summaries in different ways. This
4
+ module abstracts the extraction into a single :class:`ModelSummary` and
5
+ detects the source by duck-typing — we never hard-import optional
6
+ dependencies (lifelines, sklearn) at module load time.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+
18
@dataclass(frozen=True)
class ModelSummary:
    """Immutable per-coefficient result bundle consumed by :func:`tbl_regression`.

    The four Series fields are expected to be aligned with one another and
    indexed by coefficient name. The extractors in this module build them
    from statsmodels, lifelines, or sklearn fits; a value that a library
    does not provide is stored as NaN rather than omitted.
    """

    # Point estimates as reported by the fitted model.
    estimates: pd.Series
    # Lower confidence bound for each coefficient.
    ci_lo: pd.Series
    # Upper confidence bound for each coefficient.
    ci_hi: pd.Series
    # p-value for each coefficient.
    pvalues: pd.Series
    # Short human-readable model label (e.g. "Logit", "OLS", "CoxPHFitter",
    # "LogisticRegression"), used to pick a sensible estimate column label.
    family: str
    # True when exp(estimate) is the natural reporting metric.
    natural_exponentiate: bool
34
+
35
+
36
def extract(model: Any, conf_level: float = 0.95) -> ModelSummary:
    """Extract a :class:`ModelSummary` from any supported fitted model.

    Detection is by duck-typing, tried in this order:

    1. an existing :class:`ModelSummary` is returned unchanged;
    2. a type whose module path starts with ``lifelines.``;
    3. any object exposing ``params``, ``pvalues`` and ``conf_int``
       (the statsmodels results interface);
    4. any object exposing ``coef_`` plus ``predict`` or ``fit``
       (sklearn-style linear models).

    Raises
    ------
    TypeError
        If ``model`` matches none of the supported interfaces.
    """
    model_type = type(model)
    qualname = f"{model_type.__module__}.{model_type.__name__}"

    # An already-built summary (e.g. a pooled multiple-imputation result)
    # needs no further extraction.
    if isinstance(model, ModelSummary):
        return model

    # lifelines is checked before the statsmodels interface because its
    # fitters don't all expose ``.params`` (CoxPHFitter exposes ``.params_``
    # and ``.summary``).
    if qualname.startswith("lifelines."):
        return _extract_lifelines(model, conf_level)

    # statsmodels results wrappers, recognised by their attribute interface.
    # MixedLM and GEE results honour it too; the family-label helper tells
    # them apart.
    if all(hasattr(model, attr) for attr in ("params", "pvalues", "conf_int")):
        return _extract_statsmodels(model, conf_level)

    # sklearn linear models: point estimates only; CIs and p-values are
    # filled with NaN by the sklearn extractor.
    if hasattr(model, "coef_") and any(
        hasattr(model, method) for method in ("predict", "fit")
    ):
        return _extract_sklearn(model)

    message = (
        f"Unsupported model type {qualname!r}. "
        "tbl_regression supports statsmodels Results, lifelines fitters, "
        "and sklearn linear models."
    )
    raise TypeError(message)
66
+
67
+
68
+ # ----------------------------------------------------------------------
69
+ # statsmodels
70
+ # ----------------------------------------------------------------------
71
+
72
def _extract_statsmodels(model: Any, conf_level: float) -> ModelSummary:
    """Build a :class:`ModelSummary` from a statsmodels-style results object.

    Reads ``model.params``, ``model.pvalues`` and ``model.conf_int``. The
    confidence bounds and p-values are reindexed to the parameter index so
    all four Series share the same labels.
    """
    coefficients = pd.Series(model.params)
    p_series = pd.Series(getattr(model, "pvalues", pd.Series(dtype=float)))

    alpha = 1.0 - conf_level
    try:
        raw_bounds = model.conf_int(alpha=alpha)
    except TypeError:  # pragma: no cover — older statsmodels signature
        # Fall back to the model's default interval when ``alpha`` is not
        # accepted as a keyword.
        raw_bounds = model.conf_int()

    # Normalise to a two-column frame with fixed names, aligned on params.
    bounds = pd.DataFrame(raw_bounds)
    bounds.columns = ["lo", "hi"]
    bounds = bounds.reindex(coefficients.index)

    label = _statsmodels_family_label(model)

    return ModelSummary(
        estimates=coefficients.astype(float),
        ci_lo=bounds["lo"].astype(float),
        ci_hi=bounds["hi"].astype(float),
        pvalues=p_series.reindex(coefficients.index).astype(float),
        family=label,
        natural_exponentiate=_is_log_link(label),
    )
94
+
95
+
96
+ def _statsmodels_family_label(model: Any) -> str:
97
+ cls = type(model).__name__
98
+ fam = getattr(model, "family", None)
99
+ if fam is not None:
100
+ return f"{cls} ({type(fam).__name__})"
101
+ inner = getattr(model, "model", None)
102
+ if inner is not None:
103
+ inner_name = type(inner).__name__
104
+ # MixedLM / GEE add their own family/link info that's worth surfacing.
105
+ if "MixedLM" in inner_name:
106
+ return f"{cls} (MixedLM)"
107
+ if "GEE" in inner_name or "GeneralizedEstimatingEquations" in inner_name:
108
+ cov = getattr(inner, "cov_struct", None)
109
+ if cov is not None:
110
+ return f"{cls} (GEE, {type(cov).__name__})"
111
+ return f"{cls} (GEE)"
112
+ return f"{cls} ({inner_name})"
113
+ return cls
114
+
115
+
116
+ def _is_log_link(family_label: str) -> bool:
117
+ fl = family_label.lower()
118
+ return any(k in fl for k in ("logit", "binomial", "probit", "poisson",
119
+ "negativebinomial"))
120
+
121
+
122
+ # ----------------------------------------------------------------------
123
+ # lifelines (Cox PH, AFT, etc.)
124
+ # ----------------------------------------------------------------------
125
+
126
def _extract_lifelines(model: Any, conf_level: float) -> ModelSummary:
    """Build a :class:`ModelSummary` from a fitted lifelines regression model.

    All values are read from ``model.summary``, a DataFrame with a ``coef``
    column, a lower/upper pair whose names start with ``coef lower`` /
    ``coef upper`` (e.g. ``coef lower 95%``), and optionally ``p``. The
    exact CI column names vary, so they are resolved by prefix.

    ``conf_level`` is accepted for signature parity with the other
    extractors but is not used: the interval columns are taken as they
    appear in the summary table.

    Raises
    ------
    TypeError
        If the model has no ``summary`` attribute or it is not a DataFrame.
    ValueError
        If no lower/upper CI column pair can be located.
    """
    del conf_level  # honoured by lifelines at fit time

    if not hasattr(model, "summary"):
        raise TypeError(
            "lifelines model has no .summary attribute; "
            "make sure you called .fit() before tbl_regression()."
        )
    table = model.summary
    if not isinstance(table, pd.DataFrame):
        raise TypeError("lifelines .summary is not a DataFrame.")

    # Accept any matching lower/upper pair, whatever percentage the column
    # name carries.
    lower_name = _find_col(table, ["coef lower"])
    upper_name = _find_col(table, ["coef upper"])
    if lower_name is None or upper_name is None:
        raise ValueError(
            f"Could not locate CI columns in lifelines summary "
            f"(columns: {list(table.columns)})."
        )

    coef = table["coef"].astype(float)
    lower = table[lower_name].astype(float)
    upper = table[upper_name].astype(float)
    if "p" in table.columns:
        p_values = table["p"].astype(float)
    else:
        # No p-value column: fill with NaN on the same index.
        p_values = pd.Series([float("nan")] * len(table), index=table.index)

    # A two-level ``(param, covariate)`` index — e.g. ``('lambda_', 'age')``
    # — is flattened to ``"covariate (param)"`` strings because renderers
    # expect string row labels rather than tuples.
    if isinstance(coef.index, pd.MultiIndex):
        flat = [f"{cov} ({param})" for param, cov in coef.index]
        for series in (coef, lower, upper, p_values):
            series.index = pd.Index(flat)

    # Every lifelines fitter is reported on the exp(coef) scale.
    return ModelSummary(
        estimates=coef,
        ci_lo=lower,
        ci_hi=upper,
        pvalues=p_values,
        family=type(model).__name__,
        natural_exponentiate=True,
    )
184
+
185
+
186
+ def _find_col(df: pd.DataFrame, prefixes: list[str]) -> str | None:
187
+ # ``df.columns`` items are ``Hashable`` (e.g. tuples for MultiIndex,
188
+ # ints for default-named frames), so coerce to ``str`` for both the
189
+ # match and the return — keeps the declared ``str | None`` return
190
+ # type honest under strict typing.
191
+ for col in df.columns:
192
+ s = str(col).lower()
193
+ if any(s.startswith(p) for p in prefixes):
194
+ return str(col)
195
+ return None
196
+
197
+
198
+ # ----------------------------------------------------------------------
199
+ # sklearn (point estimates only; no native CIs)
200
+ # ----------------------------------------------------------------------
201
+
202
def _extract_sklearn(model: Any) -> ModelSummary:
    """Build a :class:`ModelSummary` from an sklearn-style linear model.

    Only point estimates are available: ``ci_lo``, ``ci_hi`` and ``pvalues``
    are all-NaN Series on the same index as ``estimates``. The intercept is
    not included; only ``model.coef_`` is read.

    Row labels:

    * one coefficient row (``coef_`` is 1-D or ``(1, n_features)``): the
      feature names from ``feature_names_in_``, or ``x0, x1, ...`` when
      that attribute is absent;
    * several rows: ``"feature (class=X)"``, one label per (row, feature)
      pair in row-major order. ``X`` comes from ``model.classes_`` when it
      has exactly one entry per coefficient row, otherwise the positional
      fallback ``class_0, class_1, ...`` is used.
    """
    coef = np.atleast_2d(model.coef_)
    n_outputs, n_features = coef.shape

    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is None:
        feature_names = [f"x{i}" for i in range(n_features)]
    feature_names = list(feature_names)

    family = type(model).__name__
    family_lower = family.lower()
    natural_exp = "logistic" in family_lower or "poisson" in family_lower

    if n_outputs == 1:
        # Binary / single-output: index by raw feature name.
        labels = list(feature_names)
        values = coef[0, :]
    else:
        classes = getattr(model, "classes_", None)
        # ``classes_`` can be absent (multi-output regressors) or have a
        # different length than the number of coefficient rows (e.g.
        # one-vs-one linear SVMs have one row per class *pair*). Using it
        # in the second case would make labels and values differ in
        # length and crash the Series constructor below, so fall back to
        # positional labels whenever the counts disagree.
        if classes is None or len(classes) != n_outputs:
            class_labels = [f"class_{k}" for k in range(n_outputs)]
        else:
            class_labels = [str(c) for c in classes]
        # Row-major order matches ``coef.reshape(-1)``; labels must be
        # plain strings for the renderers.
        labels = [
            f"{feat} (class={cls})"
            for cls in class_labels
            for feat in feature_names
        ]
        values = coef.reshape(-1)

    estimates = pd.Series(values, index=labels, dtype=float)
    nan = pd.Series([float("nan")] * len(labels), index=labels, dtype=float)

    return ModelSummary(
        estimates=estimates,
        ci_lo=nan.copy(),
        ci_hi=nan.copy(),
        pvalues=nan.copy(),
        family=family,
        natural_exponentiate=natural_exp,
    )
pysofra/models/pool.py ADDED
@@ -0,0 +1,119 @@
1
+ """Multiple-imputation pooling — Rubin's rules.
2
+
3
+ Combines a list of fitted models (one per imputed dataset) into a
4
+ single :class:`~pysofra.models.extract.ModelSummary` ready for
5
+ :func:`pysofra.tbl_regression`.
6
+
7
+ Implementation
8
+ --------------
9
+
10
+ * Pooled point estimate: arithmetic mean of imputation-specific
11
+ estimates.
12
+ * Total variance ``T = Ubar + (1 + 1/m) * B`` (Rubin 1987), with
13
+ within-imputation variance ``Ubar`` recovered from the
14
+ per-imputation CIs and between-imputation variance ``B`` taken as
15
+ the sample variance of the estimates.
16
+ * Degrees of freedom: **Rubin (1987)** ``ν = (m-1)·(1 + 1/r)²`` where
17
+ ``r = ((1+1/m)·B) / Ubar``. The Barnard–Rubin (1999) refinement
18
+ ``ν* = (ν · ν_obs) / (ν + ν_obs)`` further trims ``ν`` to respect
19
+ the complete-data degrees of freedom but requires per-imputation
20
+ ``df_resid``, which PySofra does not currently extract for every
21
+ supported model family. For small per-imputation residual df this
22
+ means the CIs / p-values are very slightly narrower than R's
23
+ ``mice::pool`` would produce; the practical difference is
24
+ negligible for the typical clinical-trial sample size (n ≳ 60).
25
+
26
+ References
27
+ ----------
28
+ * Rubin, D.B. (1987). *Multiple Imputation for Nonresponse in
29
+ Surveys*. Wiley.
30
+ * Barnard, J. & Rubin, D.B. (1999). Small-sample degrees of freedom
31
+ with multiple imputation. *Biometrika* 86 (4), 948–955.
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ from typing import Any
37
+
38
+ import numpy as np
39
+ import pandas as pd
40
+ from scipy import stats as sp_stats
41
+
42
+ from .extract import ModelSummary, extract
43
+
44
+
45
def pool(models: list[Any], *, conf_level: float = 0.95) -> ModelSummary:
    """Pool a list of fitted models via Rubin's rules.

    Returns a :class:`ModelSummary` whose estimates / CIs / p-values
    reflect the across-imputation combination. Pass the result directly
    into :func:`pysofra.tbl_regression`.

    Parameters
    ----------
    models
        Fitted models, one per imputed dataset. Each must be recognised by
        :func:`pysofra.models.extract.extract` — statsmodels, lifelines,
        or sklearn. sklearn fits carry no CIs, so their pooled CIs and
        p-values are NaN and the pool degenerates to a simple
        mean-of-coefficients.
    conf_level
        Confidence level of the pooled intervals, strictly between 0 and 1.
        It is also used to convert each per-imputation CI half-width back
        into a standard error via the normal quantile.

    Returns
    -------
    ModelSummary
        Pooled estimates, CI bounds and p-values indexed by the coefficient
        names of the first fit. ``natural_exponentiate`` is taken from the
        first fit.

    Raises
    ------
    ValueError
        If fewer than two models are given, if ``conf_level`` is not
        strictly between 0 and 1, or if the fits do not share the same
        coefficient names in the same order.
    """
    if len(models) < 2:
        raise ValueError(
            "pool requires at least two imputed-dataset fits "
            f"(got {len(models)})."
        )
    # Outside (0, 1) the quantile functions below return NaN and every
    # pooled CI / p-value would silently become NaN; fail loudly instead.
    if not 0.0 < conf_level < 1.0:
        raise ValueError(
            f"conf_level must be strictly between 0 and 1 (got {conf_level!r})."
        )

    summaries = [extract(m, conf_level=conf_level) for m in models]
    coef_names = list(summaries[0].estimates.index)

    # Each summary must share the same coefficients to pool them coherently.
    for s in summaries[1:]:
        if list(s.estimates.index) != coef_names:
            raise ValueError(
                "All imputed fits must share the same coefficient names; "
                "got different sets."
            )

    m = len(summaries)

    # (m, k) matrix of per-imputation estimates, stacked once and reused
    # for both the pooled mean and the between-imputation variance.
    Q = np.array([s.estimates.to_numpy() for s in summaries])
    Qbar = pd.Series(np.mean(Q, axis=0), index=coef_names)

    # Within-imputation variance Ubar (mean of squared SE estimates) —
    # derived from CI half-widths so it works for any model with CIs.
    ses = np.zeros((m, len(coef_names)), dtype=float)
    z_crit = float(sp_stats.norm.ppf(0.5 + conf_level / 2))
    for i, s in enumerate(summaries):
        half = (s.ci_hi.to_numpy() - s.ci_lo.to_numpy()) / 2.0
        ses[i, :] = half / z_crit

    # NaN-skipping column mean. Computed by hand rather than with
    # ``np.nanmean`` because that emits a "Mean of empty slice"
    # RuntimeWarning for all-NaN columns, which is the normal case for
    # models without CIs (sklearn). All-NaN columns still yield NaN.
    sq_se = ses ** 2
    observed = ~np.isnan(sq_se)
    n_observed = observed.sum(axis=0)
    with np.errstate(divide="ignore", invalid="ignore"):
        Ubar = np.where(
            n_observed > 0,
            np.where(observed, sq_se, 0.0).sum(axis=0) / n_observed,
            np.nan,
        )

    # Between-imputation variance B (sample variance of the estimates).
    B = np.var(Q, axis=0, ddof=1)

    # Total variance T = Ubar + (1 + 1/m) * B.
    T = Ubar + (1.0 + 1.0 / m) * B
    se_pool = np.sqrt(np.maximum(T, 0.0))

    # Rubin (1987) degrees of freedom. Non-finite results (zero or missing
    # Ubar, zero B) fall back to a large df, i.e. effectively normal.
    with np.errstate(divide="ignore", invalid="ignore"):
        r = ((1.0 + 1.0 / m) * B) / np.where(Ubar > 0, Ubar, np.nan)
        df_old = (m - 1) * (1.0 + 1.0 / np.where(r > 0, r, np.nan)) ** 2
    df_old = np.where(np.isfinite(df_old), df_old, 10_000.0)

    # Compute CI bounds and p-values from the pooled t-statistic.
    t_crit = sp_stats.t.ppf(0.5 + conf_level / 2, df=df_old)
    q_bar = Qbar.to_numpy()
    ci_lo = q_bar - t_crit * se_pool
    ci_hi = q_bar + t_crit * se_pool

    with np.errstate(divide="ignore", invalid="ignore"):
        t_stat = q_bar / np.where(se_pool > 0, se_pool, np.nan)
        p_vals = 2.0 * sp_stats.t.sf(np.abs(t_stat), df=df_old)
    p_vals = np.where(np.isfinite(p_vals), p_vals, float("nan"))

    return ModelSummary(
        estimates=Qbar.astype(float),
        ci_lo=pd.Series(ci_lo, index=coef_names, dtype=float),
        ci_hi=pd.Series(ci_hi, index=coef_names, dtype=float),
        pvalues=pd.Series(p_vals, index=coef_names, dtype=float),
        family=f"Pooled MI ({m} imputations) — Rubin's rules",
        natural_exponentiate=summaries[0].natural_exponentiate,
    )