PyPI - pysofra - Versions diffs - 0.1.0a1__py3-none-any.whl - Mend

pysofra 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

pysofra/__init__.py +82 -0
pysofra/core/__init__.py +14 -0
pysofra/core/compose.py +167 -0
pysofra/core/format.py +155 -0
pysofra/core/frames.py +69 -0
pysofra/core/schema.py +128 -0
pysofra/core/table.py +924 -0
pysofra/io/__init__.py +1 -0
pysofra/models/__init__.py +6 -0
pysofra/models/extract.py +249 -0
pysofra/models/pool.py +119 -0
pysofra/models/regression.py +507 -0
pysofra/models/survival.py +395 -0
pysofra/models/uvregression.py +438 -0
pysofra/notebook/__init__.py +6 -0
pysofra/plot/__init__.py +23 -0
pysofra/plot/_backend.py +32 -0
pysofra/plot/forest.py +159 -0
pysofra/plot/inline.py +171 -0
pysofra/plot/km.py +249 -0
pysofra/render/__init__.py +28 -0
pysofra/render/_zip_determinism.py +57 -0
pysofra/render/base.py +22 -0
pysofra/render/docx.py +286 -0
pysofra/render/html.py +442 -0
pysofra/render/image.py +130 -0
pysofra/render/latex.py +253 -0
pysofra/render/markdown.py +128 -0
pysofra/render/pptx.py +340 -0
pysofra/render/xlsx.py +226 -0
pysofra/summary/__init__.py +6 -0
pysofra/summary/calibrate.py +214 -0
pysofra/summary/design.py +246 -0
pysofra/summary/effect_size.py +187 -0
pysofra/summary/extras.py +745 -0
pysofra/summary/smd.py +133 -0
pysofra/summary/stats.py +135 -0
pysofra/summary/tbl_cross.py +339 -0
pysofra/summary/tbl_one.py +1220 -0
pysofra/summary/tbl_summary.py +51 -0
pysofra/summary/tests.py +370 -0
pysofra/summary/typing.py +129 -0
pysofra/summary/weights.py +161 -0
pysofra/themes/__init__.py +5 -0
pysofra/themes/registry.py +272 -0
pysofra-0.1.0a1.dist-info/METADATA +301 -0
pysofra-0.1.0a1.dist-info/RECORD +50 -0
pysofra-0.1.0a1.dist-info/WHEEL +4 -0
pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0

pysofra/models/survival.py ADDED Viewed

@@ -0,0 +1,395 @@
+"""Kaplan–Meier summary tables via :func:`tbl_survival`.
+Produces a publication-ready survival summary with:
+* N total / N events / N censored, per group
+* Median survival with confidence interval
+* Survival probability at user-specified time points (with N at risk)
+* Log-rank p-value across groups (when ``by=`` is provided)
+Requires the optional ``lifelines`` dependency. Install it with
+``pip install lifelines`` or as part of a survival-workflow extra.
+"""
+from __future__ import annotations
+from typing import Any
+import numpy as np
+import pandas as pd
+from ..core.format import fmt_number, fmt_p_value
+from ..core.frames import to_pandas
+from ..core.schema import Cell, HeaderCell, HeaderRow, Row, make_cell
+from ..core.table import SofraTable, TableSpec
+def tbl_survival(
+    data: Any,
+    *,
+    time: str,
+    event: str,
+    by: str | None = None,
+    times: list[float] | tuple[float, ...] | None = None,
+    times_label: str | None = None,
+    conf_level: float = 0.95,
+    digits: int = 2,
+    pct_digits: int = 1,
+    labels: dict[str, str] | None = None,
+    show_logrank: bool = True,
+) -> SofraTable:
+    """Build a Kaplan–Meier summary table.
+    Parameters
+    ----------
+    data
+        Source dataframe (pandas or polars).
+    time
+        Column carrying follow-up time.
+    event
+        Column carrying the event indicator (1 = event, 0 = censored).
+    by
+        Optional stratification column. Without it, a single
+        ``"Overall"`` column is produced.
+    times
+        Optional follow-up times at which to report survival
+        probability and N at risk. For example ``[12, 24, 36]`` for
+        1/2/3-year survival in a months-scaled study. Any iterable of
+        numbers is accepted; it is materialised into a tuple once.
+    times_label
+        Unit label used in each ``times`` row label (e.g. ``"months"``
+        → ``"S(12 months)"``). Without it the label is ``"S(t = 12)"``.
+    conf_level
+        Confidence level for the median survival CI.
+    digits
+        Decimal places for the median survival time and its CI bounds.
+    pct_digits
+        Decimal places for survival percentages.
+    labels
+        Optional mapping from group level → display label.
+    show_logrank
+        Whether to compute the multi-group log-rank test. The test is
+        only run when ``by`` yields more than one group.
+    Returns
+    -------
+    SofraTable
+        Rows for N, events, censored, median survival and one row per
+        entry of ``times``. ``metadata["_km_source"]`` keeps the frame
+        returned by ``to_pandas`` plus the column names so that
+        :func:`attach_km_plot` can refit the curves.
+    Raises
+    ------
+    ImportError
+        If ``lifelines`` is not installed.
+    KeyError
+        If ``time``, ``event`` or ``by`` is not a column of ``data``.
+    Notes
+    -----
+    lifelines reports a median that is never reached as ``inf``. Such
+    a median (or CI bound) is rendered as ``"—"`` rather than being
+    handed to the number formatter.
+    """
+    try:
+        from lifelines import KaplanMeierFitter
+        from lifelines.statistics import multivariate_logrank_test
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            "tbl_survival requires lifelines. Install with `pip install lifelines`."
+        ) from e
+    data = to_pandas(data)
+    for col in (time, event):
+        if col not in data.columns:
+            raise KeyError(f"column {col!r} not in data")
+    if by is not None and by not in data.columns:
+        raise KeyError(f"by column {by!r} not in data")
+    labels = dict(labels or {})
+    # Materialise once: ``if times:`` is ambiguous for array-likes, and a
+    # tuple is what ends up in the spec anyway.
+    time_points: tuple[float, ...] = tuple(times) if times is not None else ()
+    if by is None:
+        group_keys: list[Any] = ["Overall"]
+        group_masks = {"Overall": pd.Series(True, index=data.index)}
+    else:
+        by_series = data[by]
+        if isinstance(by_series.dtype, pd.CategoricalDtype):
+            # Keep the declared category order, skipping unused levels.
+            group_keys = [k for k in by_series.cat.categories if (by_series == k).any()]
+        else:
+            group_keys = sorted(by_series.dropna().unique(), key=_sort_key)
+        group_keys = list(group_keys)
+        group_masks = {k: (by_series == k) for k in group_keys}
+    # The p-value column exists only when a log-rank test is meaningful.
+    has_p_col = show_logrank and by is not None and len(group_keys) > 1
+    # ------------------------------------------------------------------
+    # Headers
+    # ------------------------------------------------------------------
+    header_cells: list[HeaderCell] = [HeaderCell(text="Statistic", align="left")]
+    for k in group_keys:
+        header_cells.append(HeaderCell(text=str(labels.get(k, k))))
+    if has_p_col:
+        header_cells.append(HeaderCell(text="p-value"))
+    headers = (HeaderRow(cells=tuple(header_cells)),)
+    # ------------------------------------------------------------------
+    # KM fits per group
+    # ------------------------------------------------------------------
+    fits: dict[Any, Any] = {}
+    n_total: dict[Any, int] = {}
+    n_events: dict[Any, int] = {}
+    n_censored: dict[Any, int] = {}
+    medians: dict[Any, tuple[float | None, float | None, float | None]] = {}
+    for k in group_keys:
+        # Rows with a missing time or event are excluded from the fit
+        # and from every count below.
+        sub = data.loc[group_masks[k], [time, event]].dropna()
+        if len(sub) > 0:
+            kmf = KaplanMeierFitter()
+            kmf.fit(sub[time], sub[event], alpha=1 - conf_level)
+            fits[k] = kmf
+            n_total[k] = int(len(sub))
+            n_events[k] = int(sub[event].sum())
+            # Derived from the two counts above so the rows always add up.
+            n_censored[k] = n_total[k] - n_events[k]
+            med = float(kmf.median_survival_time_)
+            med_ci = _median_ci(kmf, conf_level)
+            medians[k] = (med, med_ci[0], med_ci[1])
+        else:
+            fits[k] = None
+            n_total[k] = 0
+            n_events[k] = 0
+            n_censored[k] = 0
+            medians[k] = (None, None, None)
+    # ------------------------------------------------------------------
+    # Log-rank
+    # ------------------------------------------------------------------
+    logrank_p: float | None = None
+    if has_p_col:
+        df = data.dropna(subset=[time, event, by])
+        # Suppress only the third-party deprecation warnings emitted by
+        # lifelines/pandas during the log-rank call (these are
+        # informational and escalate to errors under our strict
+        # ``filterwarnings = error`` configuration). Any other
+        # exception is a genuine numerical failure and surfaces.
+        import warnings
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
+            warnings.filterwarnings("ignore", category=FutureWarning)
+            warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
+            try:
+                result = multivariate_logrank_test(df[time], df[by], df[event])
+                logrank_p = float(result.p_value)
+            except (ValueError, ZeroDivisionError):  # pragma: no cover
+                logrank_p = None
+    # ------------------------------------------------------------------
+    # Body rows
+    # ------------------------------------------------------------------
+    rows: list[Row] = []
+    def _row_with_blank_p(label_cell: Cell, value_cells: list[Cell]) -> Row:
+        # Pad with an empty p-value cell so every row has the same width.
+        cells = [label_cell, *value_cells]
+        if has_p_col:
+            cells.append(make_cell("", value=None))
+        return Row(cells=tuple(cells))
+    def _finite_or_dash(x: float) -> str:
+        # ``inf`` means "not reached"; show the same placeholder as NaN.
+        return fmt_number(x, digits) if np.isfinite(x) else "—"
+    # N total / N events / N censored share one layout.
+    for row_label, counts in (
+        ("N", n_total),
+        ("Events", n_events),
+        ("Censored", n_censored),
+    ):
+        rows.append(_row_with_blank_p(
+            make_cell(row_label, align="left"),
+            [make_cell(f"{counts[k]:,}", value=counts[k], kind="numeric", align="right")
+             for k in group_keys],
+        ))
+    # Median survival with CI; the log-rank p attaches to this row.
+    median_cells = []
+    for k in group_keys:
+        med_val, lo, hi = medians[k]
+        if med_val is None or np.isnan(med_val):
+            text = "—"
+        else:
+            ci_part = ""
+            if lo is not None and hi is not None and not (np.isnan(lo) or np.isnan(hi)):
+                ci_part = f" ({_finite_or_dash(lo)}, {_finite_or_dash(hi)})"
+            text = f"{_finite_or_dash(med_val)}{ci_part}"
+        median_cells.append(make_cell(text, value=med_val, kind="numeric", align="right"))
+    median_row_cells = [make_cell(
+        f"Median survival ({int(round(conf_level * 100))}% CI)", align="left",
+    ), *median_cells]
+    if has_p_col:
+        median_row_cells.append(make_cell(
+            fmt_p_value(logrank_p), value=logrank_p,
+            kind="p_value", align="right",
+        ))
+    rows.append(Row(cells=tuple(median_row_cells)))
+    # Survival probability at each fixed time
+    for t in time_points:
+        value_cells: list[Cell] = []
+        for k in group_keys:
+            kmf = fits[k]
+            # An empty group has no fit; an unavailable S(t) comes back as None.
+            surv = _survival_at(kmf, t) if kmf is not None else None
+            if surv is None:
+                value_cells.append(make_cell("—", value=None,
+                                             kind="numeric", align="right"))
+                continue
+            n_at_risk = _n_at_risk(kmf, t)
+            pct = surv * 100.0
+            text = f"{pct:.{pct_digits}f}% (n={n_at_risk})"
+            value_cells.append(make_cell(text, value=surv,
+                                         kind="numeric", align="right"))
+        rows.append(_row_with_blank_p(
+            make_cell(_format_time_label(t, times_label), align="left"),
+            value_cells,
+        ))
+    # ------------------------------------------------------------------
+    # Footnotes
+    # ------------------------------------------------------------------
+    footnotes: list[str] = []
+    if time_points:
+        footnotes.append(
+            "Survival probability shown with N at risk at each time point."
+        )
+    footnotes.append(
+        f"Median survival reported with {int(round(conf_level * 100))}% confidence interval."
+    )
+    if has_p_col and logrank_p is not None:
+        footnotes.append("p-value: multivariate log-rank test across groups.")
+    spec = TableSpec(
+        builder="tbl_survival",
+        options={
+            "time": time,
+            "event": event,
+            "by": by,
+            "times": time_points,
+            "conf_level": conf_level,
+            "digits": digits,
+            "pct_digits": pct_digits,
+        },
+    )
+    table = SofraTable(
+        rows=tuple(rows),
+        headers=headers,
+        footnotes=tuple(footnotes),
+        metadata={"builder": "tbl_survival",
+                  "logrank_p": logrank_p,
+                  "n_groups": len(group_keys),
+                  # Source frame and column names, read back by
+                  # attach_km_plot to fit curves with the *same* data
+                  # the table was computed from.
+                  "_km_source": {
+                      "data": data,
+                      "time": time,
+                      "event": event,
+                      "by": by,
+                  }},
+        _spec=spec,
+    )
+    return table
+def attach_km_plot(
+    table: SofraTable,
+    *,
+    position: str = "above",
+    **plot_kwargs: Any,
+) -> SofraTable:
+    """Return a copy of a :func:`tbl_survival` table with a KM curve attached.
+    The data frame and the time / event / by column names are read from
+    ``table.metadata["_km_source"]`` and handed to ``km_curve`` together
+    with ``plot_kwargs``. The resulting plot object is stored on the
+    copy as ``inline_plot``, its ``svg`` attribute as ``inline_svg``,
+    and ``position`` as ``inline_svg_position``.
+    Raises
+    ------
+    ValueError
+        If the table carries no ``_km_source`` metadata, or if
+        ``position`` is neither ``"above"`` nor ``"below"``.
+    """
+    metadata = table.metadata
+    km_source = metadata.get("_km_source") if metadata else None
+    if not km_source:
+        raise ValueError(
+            "attach_km_plot expects a SofraTable produced by tbl_survival."
+        )
+    allowed_positions = ("above", "below")
+    if position not in allowed_positions:
+        raise ValueError("position must be 'above' or 'below'")
+    # Imported only after validation so bad arguments fail before the
+    # plotting module is loaded.
+    from dataclasses import replace as dc_replace
+    from ..plot.km import km_curve
+    curve = km_curve(
+        km_source["data"],
+        time=km_source["time"],
+        event=km_source["event"],
+        by=km_source["by"],
+        **plot_kwargs,
+    )
+    return dc_replace(
+        table,
+        inline_svg=curve.svg,
+        inline_svg_position=position,
+        inline_plot=curve,
+    )
+# ----------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------
+def _median_ci(kmf: Any, conf_level: float) -> tuple[float | None, float | None]:
+    """Best-effort CI for the median survival time of a fitted KMF.
+    Feeds ``kmf.confidence_interval_`` to lifelines'
+    ``median_survival_times`` and returns the first two values of the
+    first result row as floats (presumably the lower and upper bound,
+    in that column order). Returns ``(None, None)`` when the result has
+    fewer than two columns or when anything in the lookup raises.
+    ``conf_level`` is accepted for signature symmetry but not used here.
+    """
+    bounds: tuple[float | None, float | None] = (None, None)
+    try:
+        from lifelines.utils import median_survival_times
+        first_row = median_survival_times(kmf.confidence_interval_).iloc[0]
+        if len(first_row) >= 2:
+            bounds = (float(first_row.iloc[0]), float(first_row.iloc[1]))
+    except Exception:  # pragma: no cover
+        # Deliberately broad: a missing CI must not break the table.
+        bounds = (None, None)
+    return bounds
+def _survival_at(kmf: Any, t: float) -> float | None:
+    """Look up ``S(t)`` on a fitted KaplanMeierFitter.
+    Returns the first value of ``kmf.survival_function_at_times(t)`` as
+    a float, or ``None`` when that value is NaN or the lookup raises.
+    """
+    try:
+        prob = float(kmf.survival_function_at_times(t).iloc[0])
+    except Exception:  # pragma: no cover
+        return None
+    return None if np.isnan(prob) else prob
+def _n_at_risk(kmf: Any, t: float) -> int:
+    """Return the number of individuals at risk *just before* ``t``.
+    Reads ``kmf.event_table`` and takes the ``at_risk`` value of the
+    row with the smallest index that is ``>= t``. This relies on each
+    row's ``at_risk`` being the count just before that row's time, so
+    the first row at or after ``t`` describes the pool still under
+    observation at ``t``. When no row has an index ``>= t`` the pool is
+    empty and ``0`` is returned; ``0`` is also returned if the lookup
+    raises.
+    """
+    try:
+        events = kmf.event_table
+        at_or_after = events.index[events.index >= t]
+        return int(events.loc[at_or_after.min(), "at_risk"]) if len(at_or_after) else 0
+    except Exception:  # pragma: no cover
+        return 0
+def _format_time_label(t: float, unit: str | None) -> str:
+    """Row label for a survival time point.
+    ``t`` is formatted with the ``g`` presentation type. With a
+    non-empty ``unit`` the label is ``S(<t> <unit>)``; otherwise it is
+    ``S(t = <t>)``.
+    """
+    stamp = format(t, "g")
+    return f"S({stamp} {unit})" if unit else f"S(t = {stamp})"
+def _sort_key(x: Any) -> tuple[int, Any]:
+    """Sort key that orders mixed group levels deterministically.
+    Numbers (including booleans) sort first by numeric value, then
+    strings alphabetically, then everything else by ``repr``. The
+    leading rank keeps values of different kinds from being compared
+    with each other.
+    NumPy scalars are recognised explicitly: ``Series.unique()`` on an
+    integer column yields ``np.int64`` values, which are not ``int``
+    instances and would otherwise be ordered by ``repr`` (placing 10
+    before 2).
+    """
+    if isinstance(x, (bool, np.bool_)):
+        return (0, int(x))
+    if isinstance(x, (int, float, np.integer, np.floating)):
+        return (0, float(x))
+    if isinstance(x, str):
+        return (1, x)
+    return (2, repr(x))