PyPI - pysofra - Versions diffs - 0.1.0a1__py3-none-any.whl - Mend

pysofra 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

pysofra/__init__.py +82 -0
pysofra/core/__init__.py +14 -0
pysofra/core/compose.py +167 -0
pysofra/core/format.py +155 -0
pysofra/core/frames.py +69 -0
pysofra/core/schema.py +128 -0
pysofra/core/table.py +924 -0
pysofra/io/__init__.py +1 -0
pysofra/models/__init__.py +6 -0
pysofra/models/extract.py +249 -0
pysofra/models/pool.py +119 -0
pysofra/models/regression.py +507 -0
pysofra/models/survival.py +395 -0
pysofra/models/uvregression.py +438 -0
pysofra/notebook/__init__.py +6 -0
pysofra/plot/__init__.py +23 -0
pysofra/plot/_backend.py +32 -0
pysofra/plot/forest.py +159 -0
pysofra/plot/inline.py +171 -0
pysofra/plot/km.py +249 -0
pysofra/render/__init__.py +28 -0
pysofra/render/_zip_determinism.py +57 -0
pysofra/render/base.py +22 -0
pysofra/render/docx.py +286 -0
pysofra/render/html.py +442 -0
pysofra/render/image.py +130 -0
pysofra/render/latex.py +253 -0
pysofra/render/markdown.py +128 -0
pysofra/render/pptx.py +340 -0
pysofra/render/xlsx.py +226 -0
pysofra/summary/__init__.py +6 -0
pysofra/summary/calibrate.py +214 -0
pysofra/summary/design.py +246 -0
pysofra/summary/effect_size.py +187 -0
pysofra/summary/extras.py +745 -0
pysofra/summary/smd.py +133 -0
pysofra/summary/stats.py +135 -0
pysofra/summary/tbl_cross.py +339 -0
pysofra/summary/tbl_one.py +1220 -0
pysofra/summary/tbl_summary.py +51 -0
pysofra/summary/tests.py +370 -0
pysofra/summary/typing.py +129 -0
pysofra/summary/weights.py +161 -0
pysofra/themes/__init__.py +5 -0
pysofra/themes/registry.py +272 -0
pysofra-0.1.0a1.dist-info/METADATA +301 -0
pysofra-0.1.0a1.dist-info/RECORD +50 -0
pysofra-0.1.0a1.dist-info/WHEEL +4 -0
pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0

pysofra/io/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """I/O utilities — reserved for future readers (Stata/SAS/Excel)."""

pysofra/models/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Model-output table builders."""
+from .regression import tbl_regression
+from .survival import tbl_survival
+__all__ = ["tbl_regression", "tbl_survival"]

pysofra/models/extract.py ADDED Viewed

@@ -0,0 +1,249 @@
+"""Coefficient extraction from fitted models.
+Different libraries expose fitted-model summaries in different ways. This
+module abstracts the extraction into a single :class:`ModelSummary` and
+detects the source by duck-typing — we never hard-import optional
+dependencies (lifelines, sklearn) at module load time.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+import numpy as np
+import pandas as pd
+# Immutable record (``frozen=True``): fields cannot be reassigned once built.
+@dataclass(frozen=True)
+class ModelSummary:
+    """Per-coefficient summary used by :func:`tbl_regression`.
+    ``estimates`` / ``ci_lo`` / ``ci_hi`` / ``pvalues`` are aligned Series
+    indexed by coefficient name. ``family`` is a short human-readable
+    string (``"Logit"``, ``"OLS"``, ``"CoxPHFitter"``, ``"LogisticRegression"``)
+    used to pick a sensible estimate column label.
+    """
+    # Point estimates as reported by the source model; the extractors in
+    # this module store them without applying exp().
+    estimates: pd.Series
+    # Lower / upper confidence bounds aligned with ``estimates``. The
+    # sklearn extractor fills both with NaN because it has no intervals.
+    ci_lo: pd.Series
+    ci_hi: pd.Series
+    # p-values aligned with ``estimates``; NaN where the source has none.
+    pvalues: pd.Series
+    # Short label describing the model class / family.
+    family: str
+    natural_exponentiate: bool  # whether exp() is the natural reporting metric
+def extract(model: Any, conf_level: float = 0.95) -> ModelSummary:
+    """Extract a :class:`ModelSummary` from any supported fitted model."""
+    qualname = f"{type(model).__module__}.{type(model).__name__}"
+    # PooledSummary (multiple-imputation Rubin'd results) — already extracted.
+    if isinstance(model, ModelSummary):
+        return model
+    # lifelines first — its fitters don't all expose the statsmodels
+    # ``.params`` interface (CoxPHFitter exposes ``.params_`` and ``.summary``).
+    if qualname.startswith("lifelines."):
+        return _extract_lifelines(model, conf_level)
+    # statsmodels — Results wrapper, recognised by the .params/.bse interface.
+    # MixedLM and GEE both honour this interface; the family-label helper
+    # picks them out.
+    if hasattr(model, "params") and hasattr(model, "pvalues") and hasattr(model, "conf_int"):
+        return _extract_statsmodels(model, conf_level)
+    # sklearn linear models — recognised by .coef_ + a `predict`/`fit` method.
+    # We extract point estimates only; CIs / p-values are not natively
+    # available and are filled with NaN.
+    if hasattr(model, "coef_") and (hasattr(model, "predict") or hasattr(model, "fit")):
+        return _extract_sklearn(model)
+    raise TypeError(
+        f"Unsupported model type {qualname!r}. "
+        "tbl_regression supports statsmodels Results, lifelines fitters, "
+        "and sklearn linear models."
+    )
+# ----------------------------------------------------------------------
+# statsmodels
+# ----------------------------------------------------------------------
+def _extract_statsmodels(model: Any, conf_level: float) -> ModelSummary:
+    params = pd.Series(model.params)
+    pvalues = pd.Series(getattr(model, "pvalues", pd.Series(dtype=float)))
+    try:
+        ci = model.conf_int(alpha=1.0 - conf_level)
+    except TypeError:  # pragma: no cover — older statsmodels signature
+        ci = model.conf_int()
+    ci = pd.DataFrame(ci)
+    ci.columns = ["lo", "hi"]
+    ci = ci.reindex(params.index)
+    family_label = _statsmodels_family_label(model)
+    natural_exp = _is_log_link(family_label)
+    return ModelSummary(
+        estimates=params.astype(float),
+        ci_lo=ci["lo"].astype(float),
+        ci_hi=ci["hi"].astype(float),
+        pvalues=pvalues.reindex(params.index).astype(float),
+        family=family_label,
+        natural_exponentiate=natural_exp,
+    )
+def _statsmodels_family_label(model: Any) -> str:
+    cls = type(model).__name__
+    fam = getattr(model, "family", None)
+    if fam is not None:
+        return f"{cls} ({type(fam).__name__})"
+    inner = getattr(model, "model", None)
+    if inner is not None:
+        inner_name = type(inner).__name__
+        # MixedLM / GEE add their own family/link info that's worth surfacing.
+        if "MixedLM" in inner_name:
+            return f"{cls} (MixedLM)"
+        if "GEE" in inner_name or "GeneralizedEstimatingEquations" in inner_name:
+            cov = getattr(inner, "cov_struct", None)
+            if cov is not None:
+                return f"{cls} (GEE, {type(cov).__name__})"
+            return f"{cls} (GEE)"
+        return f"{cls} ({inner_name})"
+    return cls
+def _is_log_link(family_label: str) -> bool:
+    fl = family_label.lower()
+    return any(k in fl for k in ("logit", "binomial", "probit", "poisson",
+                                 "negativebinomial"))
+# ----------------------------------------------------------------------
+# lifelines (Cox PH, AFT, etc.)
+# ----------------------------------------------------------------------
+def _extract_lifelines(model: Any, conf_level: float) -> ModelSummary:
+    """Extract coefficients from a fitted lifelines regression model.
+    ``model.summary`` is a DataFrame with the standard columns
+    ``coef``, ``coef lower X%``, ``coef upper X%``, ``p``. The exact column
+    names vary by lifelines version and confidence level — we resolve them
+    dynamically.
+    """
+    if not hasattr(model, "summary"):
+        raise TypeError(
+            "lifelines model has no .summary attribute; "
+            "make sure you called .fit() before tbl_regression()."
+        )
+    summary = model.summary
+    if not isinstance(summary, pd.DataFrame):
+        raise TypeError("lifelines .summary is not a DataFrame.")
+    # Find the CI columns. Lifelines reports ``coef lower 95%`` /
+    # ``coef upper 95%`` by default; we accept any matching lower/upper
+    # pair.
+    lo_col = _find_col(summary, ["coef lower"])
+    hi_col = _find_col(summary, ["coef upper"])
+    if lo_col is None or hi_col is None:
+        raise ValueError(
+            f"Could not locate CI columns in lifelines summary "
+            f"(columns: {list(summary.columns)})."
+        )
+    estimates = summary["coef"].astype(float)
+    ci_lo = summary[lo_col].astype(float)
+    ci_hi = summary[hi_col].astype(float)
+    pvalues = summary["p"].astype(float) if "p" in summary.columns else pd.Series(
+        [float("nan")] * len(summary), index=summary.index
+    )
+    # AFT models (Weibull / log-logistic / log-normal) carry a MultiIndex
+    # ``(param, covariate)`` index — e.g. ``('lambda_', 'age')``. Renderers
+    # expect string row labels; flatten with ``covariate (param)`` so the
+    # table reads naturally ("age (lambda_)") rather than emitting a tuple
+    # that crashes the markdown / HTML escapers.
+    if isinstance(estimates.index, pd.MultiIndex):
+        flat = [f"{cov} ({param})" for param, cov in estimates.index]
+        estimates.index = pd.Index(flat)
+        ci_lo.index = pd.Index(flat)
+        ci_hi.index = pd.Index(flat)
+        pvalues.index = pd.Index(flat)
+    family = type(model).__name__
+    # Cox / Weibull / log-normal AFT all naturally report exp(coef) = HR.
+    natural_exp = True
+    del conf_level  # honoured by lifelines at fit time
+    return ModelSummary(
+        estimates=estimates,
+        ci_lo=ci_lo,
+        ci_hi=ci_hi,
+        pvalues=pvalues,
+        family=family,
+        natural_exponentiate=natural_exp,
+    )
+def _find_col(df: pd.DataFrame, prefixes: list[str]) -> str | None:
+    # ``df.columns`` items are ``Hashable`` (e.g. tuples for MultiIndex,
+    # ints for default-named frames), so coerce to ``str`` for both the
+    # match and the return — keeps the declared ``str | None`` return
+    # type honest under strict typing.
+    for col in df.columns:
+        s = str(col).lower()
+        if any(s.startswith(p) for p in prefixes):
+            return str(col)
+    return None
+# ----------------------------------------------------------------------
+# sklearn (point estimates only; no native CIs)
+# ----------------------------------------------------------------------
+def _extract_sklearn(model: Any) -> ModelSummary:
+    coef = np.atleast_2d(model.coef_)
+    n_outputs, n_features = coef.shape
+    feature_names = getattr(model, "feature_names_in_", None)
+    if feature_names is None:
+        feature_names = np.array([f"x{i}" for i in range(n_features)])
+    feature_names = list(feature_names)
+    family = type(model).__name__
+    natural_exp = "logistic" in family.lower() or "poisson" in family.lower()
+    if n_outputs == 1:
+        # Binary / single-output: one coefficient vector. Index the
+        # ModelSummary by raw feature name.
+        labels = list(feature_names)
+        values = coef[0, :]
+    else:
+        # Multi-class (e.g. LogisticRegression(multi_class='multinomial')
+        # with 3+ classes, or one-vs-rest). ``coef_`` is
+        # (n_classes, n_features); pull the per-class labels from
+        # ``model.classes_`` when available. Flatten to one row per
+        # (class, feature) pair using the same ``"feature (class=X)"``
+        # convention as lifelines AFT models so renderers see clean
+        # string labels (the index must be hashable strings — see the
+        # AFT path).
+        classes = getattr(model, "classes_", None)
+        if classes is None:  # pragma: no cover — sklearn fits always set classes_
+            classes = np.array([f"class_{k}" for k in range(n_outputs)])
+        class_labels = [str(c) for c in classes]
+        labels = [
+            f"{feat} (class={cls})"
+            for cls in class_labels for feat in feature_names
+        ]
+        values = coef.reshape(-1)
+    estimates = pd.Series(values, index=labels, dtype=float)
+    nan = pd.Series([float("nan")] * len(labels),
+                    index=labels, dtype=float)
+    return ModelSummary(
+        estimates=estimates,
+        ci_lo=nan.copy(),
+        ci_hi=nan.copy(),
+        pvalues=nan.copy(),
+        family=family,
+        natural_exponentiate=natural_exp,
+    )

pysofra/models/pool.py ADDED Viewed

@@ -0,0 +1,119 @@
+"""Multiple-imputation pooling — Rubin's rules.
+Combines a list of fitted models (one per imputed dataset) into a
+single :class:`~pysofra.models.extract.ModelSummary` ready for
+:func:`pysofra.tbl_regression`.
+Implementation
+--------------
+* Pooled point estimate: arithmetic mean of imputation-specific
+  estimates.
+* Total variance ``T = Ubar + (1 + 1/m) * B`` (Rubin 1987), with
+  within-imputation variance ``Ubar`` recovered from the
+  per-imputation CIs and between-imputation variance ``B`` taken as
+  the sample variance of the estimates.
+* Degrees of freedom: **Rubin (1987)** ``ν = (m-1)·(1 + 1/r)²`` where
+  ``r = ((1+1/m)·B) / Ubar``. The Barnard–Rubin (1999) refinement
+  ``ν* = (ν · ν_obs) / (ν + ν_obs)`` further trims ``ν`` to respect
+  the complete-data degrees of freedom but requires per-imputation
+  ``df_resid``, which PySofra does not currently extract for every
+  supported model family. For small per-imputation residual df this
+  means the CIs / p-values are very slightly narrower than R's
+  ``mice::pool`` would produce; the practical difference is
+  negligible for the typical clinical-trial sample size (n ≳ 60).
+References
+----------
+* Rubin, D.B. (1987). *Multiple Imputation for Nonresponse in
+  Surveys*. Wiley.
+* Barnard, J. & Rubin, D.B. (1999). Small-sample degrees of freedom
+  with multiple imputation. *Biometrika* 86 (4), 948–955.
+"""
+from __future__ import annotations
+from typing import Any
+import numpy as np
+import pandas as pd
+from scipy import stats as sp_stats
+from .extract import ModelSummary, extract
+def pool(models: list[Any], *, conf_level: float = 0.95) -> ModelSummary:
+    """Pool a list of fitted models via Rubin's rules.
+    Returns a :class:`ModelSummary` whose estimates / CIs / p-values
+    reflect the across-imputation combination. Pass the result directly
+    into :func:`pysofra.tbl_regression`.
+    Each input must be a fitted model recognised by
+    :func:`pysofra.models.extract.extract` — statsmodels, lifelines,
+    sklearn (sklearn has no SEs so the pool degenerates to a simple
+    mean-of-coefficients).
+    """
+    if len(models) < 2:
+        raise ValueError(
+            "pool requires at least two imputed-dataset fits "
+            f"(got {len(models)})."
+        )
+    summaries = [extract(m, conf_level=conf_level) for m in models]
+    coef_names = list(summaries[0].estimates.index)
+    # Each summary must share the same coefficients to pool them coherently.
+    for s in summaries[1:]:
+        if list(s.estimates.index) != coef_names:
+            raise ValueError(
+                "All imputed fits must share the same coefficient names; "
+                "got different sets."
+            )
+    m = len(summaries)
+    Qbar = pd.Series(
+        np.mean([s.estimates.to_numpy() for s in summaries], axis=0),
+        index=coef_names,
+    )
+    # Within-imputation variance Ubar (mean of squared SE estimates) —
+    # derived from CI half-widths so it works for any model with CIs.
+    ses = np.zeros((m, len(coef_names)), dtype=float)
+    z_crit = float(sp_stats.norm.ppf(0.5 + conf_level / 2))
+    for i, s in enumerate(summaries):
+        half = (s.ci_hi.to_numpy() - s.ci_lo.to_numpy()) / 2.0
+        ses[i, :] = half / z_crit
+    Ubar = np.nanmean(ses ** 2, axis=0)
+    # Between-imputation variance B.
+    Q = np.array([s.estimates.to_numpy() for s in summaries])
+    B = np.var(Q, axis=0, ddof=1)
+    # Total variance T = Ubar + (1 + 1/m) * B.
+    T = Ubar + (1.0 + 1.0 / m) * B
+    se_pool = np.sqrt(np.maximum(T, 0.0))
+    # Rubin (1987) degrees of freedom.
+    with np.errstate(divide="ignore", invalid="ignore"):
+        r = ((1.0 + 1.0 / m) * B) / np.where(Ubar > 0, Ubar, np.nan)
+        df_old = (m - 1) * (1.0 + 1.0 / np.where(r > 0, r, np.nan)) ** 2
+    df_old = np.where(np.isfinite(df_old), df_old, 10_000.0)
+    # Compute CI bounds and p-values from the pooled t-statistic.
+    t_crit = sp_stats.t.ppf(0.5 + conf_level / 2, df=df_old)
+    ci_lo = Qbar.to_numpy() - t_crit * se_pool
+    ci_hi = Qbar.to_numpy() + t_crit * se_pool
+    with np.errstate(divide="ignore", invalid="ignore"):
+        t_stat = Qbar.to_numpy() / np.where(se_pool > 0, se_pool, np.nan)
+    p_vals = 2.0 * sp_stats.t.sf(np.abs(t_stat), df=df_old)
+    p_vals = np.where(np.isfinite(p_vals), p_vals, float("nan"))
+    return ModelSummary(
+        estimates=Qbar.astype(float),
+        ci_lo=pd.Series(ci_lo, index=coef_names, dtype=float),
+        ci_hi=pd.Series(ci_hi, index=coef_names, dtype=float),
+        pvalues=pd.Series(p_vals, index=coef_names, dtype=float),
+        family=f"Pooled MI ({m} imputations) — Rubin's rules",
+        natural_exponentiate=summaries[0].natural_exponentiate,
+    )