PyPI - pysofra - Versions diffs - 0.1.0a1__py3-none-any.whl - Mend

pysofra 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

pysofra/__init__.py +82 -0
pysofra/core/__init__.py +14 -0
pysofra/core/compose.py +167 -0
pysofra/core/format.py +155 -0
pysofra/core/frames.py +69 -0
pysofra/core/schema.py +128 -0
pysofra/core/table.py +924 -0
pysofra/io/__init__.py +1 -0
pysofra/models/__init__.py +6 -0
pysofra/models/extract.py +249 -0
pysofra/models/pool.py +119 -0
pysofra/models/regression.py +507 -0
pysofra/models/survival.py +395 -0
pysofra/models/uvregression.py +438 -0
pysofra/notebook/__init__.py +6 -0
pysofra/plot/__init__.py +23 -0
pysofra/plot/_backend.py +32 -0
pysofra/plot/forest.py +159 -0
pysofra/plot/inline.py +171 -0
pysofra/plot/km.py +249 -0
pysofra/render/__init__.py +28 -0
pysofra/render/_zip_determinism.py +57 -0
pysofra/render/base.py +22 -0
pysofra/render/docx.py +286 -0
pysofra/render/html.py +442 -0
pysofra/render/image.py +130 -0
pysofra/render/latex.py +253 -0
pysofra/render/markdown.py +128 -0
pysofra/render/pptx.py +340 -0
pysofra/render/xlsx.py +226 -0
pysofra/summary/__init__.py +6 -0
pysofra/summary/calibrate.py +214 -0
pysofra/summary/design.py +246 -0
pysofra/summary/effect_size.py +187 -0
pysofra/summary/extras.py +745 -0
pysofra/summary/smd.py +133 -0
pysofra/summary/stats.py +135 -0
pysofra/summary/tbl_cross.py +339 -0
pysofra/summary/tbl_one.py +1220 -0
pysofra/summary/tbl_summary.py +51 -0
pysofra/summary/tests.py +370 -0
pysofra/summary/typing.py +129 -0
pysofra/summary/weights.py +161 -0
pysofra/themes/__init__.py +5 -0
pysofra/themes/registry.py +272 -0
pysofra-0.1.0a1.dist-info/METADATA +301 -0
pysofra-0.1.0a1.dist-info/RECORD +50 -0
pysofra-0.1.0a1.dist-info/WHEEL +4 -0
pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0

pysofra/__init__.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""PySofra — the missing statistical reporting layer for Python.
+PySofra transforms datasets and statistical model outputs into
+publication-ready tables across HTML, Markdown, DOCX, LaTeX, PPTX, XLSX,
+and PNG. The same underlying :class:`SofraTable` object renders
+beautifully in Jupyter and exports identically to disk.
+Quick start
+-----------
+>>> import pandas as pd
+>>> import pysofra as ps
+>>> df = pd.DataFrame({
+...     "age": [55, 62, 47, 68, 51],
+...     "sex": ["F", "M", "F", "M", "F"],
+...     "arm": ["A", "A", "B", "B", "A"],
+... })
+>>> tbl = (
+...     ps.tbl_one(df, by="arm")
+...       .add_p()
+...       .add_smd()
+...       .theme("clinical")
+... )
+>>> _ = tbl.to_html()
+"""
+from __future__ import annotations
+from .core.compose import tbl_merge, tbl_stack
+from .core.schema import CellPart
+from .core.table import SofraTable
+from .models.pool import pool
+from .models.regression import tbl_regression
+from .models.survival import tbl_survival
+from .models.uvregression import tbl_uvregression
+from .summary.calibrate import design_effect, post_stratify, rake
+from .summary.design import SurveyDesign
+from .summary.effect_size import (
+    auto_effect_size,
+    cohen_d,
+    cramers_v,
+    eta_squared,
+    hedges_g,
+    omega_squared,
+    phi_coefficient,
+)
+from .summary.tbl_cross import tbl_cross
+from .summary.tbl_one import tbl_one
+from .summary.tbl_summary import tbl_summary
+from .summary.tests import available_tests
+from .themes.registry import available_themes, register_theme
+__version__ = "0.1.0a1"
+__all__ = [
+    "CellPart",
+    "SofraTable",
+    "SurveyDesign",
+    "__version__",
+    "auto_effect_size",
+    "available_tests",
+    "available_themes",
+    "cohen_d",
+    "cramers_v",
+    "design_effect",
+    "eta_squared",
+    "hedges_g",
+    "omega_squared",
+    "phi_coefficient",
+    "pool",
+    "post_stratify",
+    "rake",
+    "register_theme",
+    "tbl_cross",
+    "tbl_merge",
+    "tbl_one",
+    "tbl_regression",
+    "tbl_stack",
+    "tbl_summary",
+    "tbl_survival",
+    "tbl_uvregression",
+]

pysofra/core/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Core types: :class:`SofraTable` and its schema."""
+from .schema import Cell, HeaderCell, HeaderRow, Row, SpanningHeader
+from .table import SofraTable, TableSpec
+__all__ = [
+    "Cell",
+    "HeaderCell",
+    "HeaderRow",
+    "Row",
+    "SofraTable",
+    "SpanningHeader",
+    "TableSpec",
+]

pysofra/core/compose.py ADDED Viewed

@@ -0,0 +1,167 @@
+"""Table composition: :func:`tbl_merge` and :func:`tbl_stack`.
+Both functions take a sequence of :class:`SofraTable` objects and combine
+them into a single SofraTable. Merging glues tables side-by-side (sharing
+the first / label column by default); stacking concatenates them vertically.
+"""
+from __future__ import annotations
+from itertools import zip_longest
+from .schema import Cell, HeaderCell, HeaderRow, Row, SpanningHeader
+from .table import SofraTable
+def _merge_column_count(table: SofraTable) -> int:
+    """Return how many columns ``table`` contributes before any label-column drop.
+    The first header row is authoritative. A headerless table falls back to
+    the width of its first body row; a table with neither headers nor rows
+    counts as a single column.
+    """
+    if table.headers:
+        return len(table.headers[0].cells)
+    if table.rows:
+        return len(table.rows[0].cells)
+    return 1
+def _shares_label_column(tables: list[SofraTable]) -> bool:
+    """Return ``True`` when every table repeats the first table's label column.
+    Callers must already have checked that all tables have the same number
+    of rows. Tables without rows, or rows without cells, never qualify.
+    """
+    if not all(t.rows for t in tables):
+        return False
+    reference = tables[0]
+    for other in tables[1:]:
+        for ref_row, row in zip(reference.rows, other.rows):
+            # A row with no cells has no label to compare (and would
+            # otherwise raise IndexError on ``cells[0]``).
+            if not ref_row.cells or not row.cells:
+                return False
+            if row.cells[0].text != ref_row.cells[0].text:
+                return False
+    return True
+def tbl_merge(
+    tables: list[SofraTable] | tuple[SofraTable, ...],
+    *,
+    tab_spanners: list[str] | None = None,
+    share_first_column: bool = True,
+) -> SofraTable:
+    """Merge tables side-by-side.
+    Parameters
+    ----------
+    tables
+        Two or more :class:`SofraTable` objects with the same number of
+        body rows.
+    tab_spanners
+        Optional list of spanning-header labels, one per input table.
+    share_first_column
+        When ``True`` (default) and every input has the same first column
+        in every row, the duplicate label columns are dropped from the
+        2nd-Nth tables.
+    Returns
+    -------
+    SofraTable
+        A new table whose caption and theme come from the first input,
+        whose footnotes are the de-duplicated union of all inputs (first
+        occurrence order), and whose ``metadata["merged_from"]`` lists each
+        input's ``"builder"`` metadata entry (``"?"`` when absent).
+    Raises
+    ------
+    ValueError
+        If fewer than two tables are given, if the row counts differ, or
+        if ``tab_spanners`` does not have one entry per table.
+    """
+    tables = list(tables)
+    if len(tables) < 2:
+        raise ValueError("tbl_merge requires at least two tables.")
+    n_rows_sets = {len(t.rows) for t in tables}
+    if len(n_rows_sets) != 1:
+        raise ValueError(
+            f"tbl_merge requires all tables to have the same number of rows; got {n_rows_sets}."
+        )
+    drop_first = share_first_column and _shares_label_column(tables)
+    widths = [_merge_column_count(t) for t in tables]
+    # Headers — pick the deepest header rows across inputs.
+    max_header_depth = max(len(t.headers) for t in tables)
+    merged_headers: list[HeaderRow] = []
+    for level in range(max_header_depth):
+        header_row_cells: list[HeaderCell] = []
+        for i, t in enumerate(tables):
+            if level < len(t.headers):
+                row_cells: list[HeaderCell] = list(t.headers[level].cells)
+            else:
+                # Pad shallower tables with blank header cells spanning the
+                # table's real column count, so the merged header stays
+                # aligned with the merged body even for headerless inputs.
+                row_cells = [HeaderCell(text="") for _ in range(widths[i])]
+            if drop_first and i > 0:
+                row_cells = row_cells[1:]
+            header_row_cells.extend(row_cells)
+        merged_headers.append(HeaderRow(cells=tuple(header_row_cells)))
+    # Spanning headers from tab_spanners (if provided).
+    spanning: list[SpanningHeader] = []
+    if tab_spanners:
+        if len(tab_spanners) != len(tables):
+            raise ValueError("tab_spanners must have one entry per table.")
+        col = 0
+        for i, (label, width) in enumerate(zip(tab_spanners, widths, strict=True)):
+            if drop_first and i > 0:
+                width -= 1
+            # ``start``/``end`` are inclusive column indices.
+            spanning.append(SpanningHeader(label=label, start=col, end=col + width - 1))
+            col += width
+    # Body rows — concatenate cells horizontally.
+    n_rows = next(iter(n_rows_sets))
+    merged_rows: list[Row] = []
+    for i in range(n_rows):
+        body_cells: list[Cell] = []
+        for j, t in enumerate(tables):
+            body_row_cells: list[Cell] = list(t.rows[i].cells)
+            if drop_first and j > 0:
+                body_row_cells = body_row_cells[1:]
+            body_cells.extend(body_row_cells)
+        # The group-header flag is taken from the first table's row.
+        merged_rows.append(Row(cells=tuple(body_cells),
+                               is_group_header=tables[0].rows[i].is_group_header))
+    footnotes: list[str] = []
+    for t in tables:
+        for f in t.footnotes:
+            if f not in footnotes:
+                footnotes.append(f)
+    return SofraTable(
+        rows=tuple(merged_rows),
+        headers=tuple(merged_headers),
+        spanning_headers=tuple(spanning),
+        footnotes=tuple(footnotes),
+        caption=tables[0].caption,
+        theme_name=tables[0].theme_name,
+        metadata={"merged_from": [t.metadata.get("builder", "?") for t in tables]},
+    )
+def tbl_stack(
+    tables: list[SofraTable] | tuple[SofraTable, ...],
+    *,
+    group_labels: list[str] | None = None,
+) -> SofraTable:
+    """Concatenate the body rows of several tables into one table.
+    Only the column count is validated here; headers, spanning headers,
+    caption and theme are taken from the first table. When ``group_labels``
+    is given (one label per table), a bold, left-aligned group-header row
+    padded with empty cells is inserted ahead of each table's rows.
+    Footnotes are the de-duplicated union of all inputs in first-seen
+    order. Raises ``ValueError`` for fewer than two tables, mismatched
+    column counts, or a ``group_labels`` length that differs from the
+    number of tables.
+    """
+    tables = list(tables)
+    if len(tables) < 2:
+        raise ValueError("tbl_stack requires at least two tables.")
+    def _width(tbl: SofraTable) -> int:
+        # Header width when headers exist, otherwise the first body row's width.
+        if tbl.headers:
+            return len(tbl.headers[0].cells)
+        return len(tbl.rows[0].cells)
+    first, *others = tables
+    ncols = _width(first)
+    for other in others:
+        nc = _width(other)
+        if nc != ncols:
+            raise ValueError(
+                f"tbl_stack requires equal column counts; got {ncols} and {nc}."
+            )
+    if group_labels is not None and len(group_labels) != len(tables):
+        raise ValueError("group_labels must have one entry per table.")
+    stacked: list[Row] = []
+    for idx, tbl in enumerate(tables):
+        if group_labels:
+            label_cell = Cell(text=group_labels[idx], bold=True, align="left")
+            fillers = [Cell(text="") for _ in range(ncols - 1)]
+            stacked.append(Row(cells=(label_cell, *fillers), is_group_header=True))
+        stacked.extend(tbl.rows)
+    notes: list[str] = []
+    for tbl in tables:
+        for note in tbl.footnotes:
+            if note in notes:
+                continue
+            notes.append(note)
+    origins = [tbl.metadata.get("builder", "?") for tbl in tables]
+    return SofraTable(
+        rows=tuple(stacked),
+        headers=first.headers,
+        spanning_headers=first.spanning_headers,
+        footnotes=tuple(notes),
+        caption=first.caption,
+        theme_name=first.theme_name,
+        metadata={"stacked_from": origins},
+    )
+# Silence unused
+_ = zip_longest

pysofra/core/format.py ADDED Viewed

@@ -0,0 +1,155 @@
+"""Formatting helpers for numbers, p-values, percents, and confidence intervals.
+These are deterministic, locale-agnostic, and unit-testable. Fixed-point
+rounding goes through Python's ``format`` mini language, which rounds the
+exact binary value of the float to the nearest decimal and breaks exact
+ties to even (``f"{2.5:.0f}"`` is ``"2"``, ``f"{0.125:.2f}"`` is ``"0.12"``);
+it is not schoolbook half-up rounding.
+"""
+from __future__ import annotations
+import math
+from typing import Final
+# Placeholder rendered for missing, non-finite, or non-numeric values.
+NA_STRING: Final[str] = "—"
+def fmt_number(value: float | int | None, digits: int = 2) -> str:
+    """Format a numeric value to ``digits`` decimal places.
+    ``None``, ``NaN``, infinite, and anything that can't be coerced to a
+    ``float`` (including integers too large for a ``float``) render as
+    :data:`NA_STRING`. Both IEEE-754 negative zero AND small negative
+    numbers that round to all-zero at ``digits`` precision are normalised
+    so cells never display as ``"-0.00"`` (which is confusing and
+    uninformative).
+    """
+    if value is None:
+        return NA_STRING
+    try:
+        v = float(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: e.g. ``float(10**400)``; such a value cannot be
+        # rendered with the ``f`` presentation type either.
+        return NA_STRING
+    if not math.isfinite(v):
+        return NA_STRING
+    out = f"{v:.{digits}f}"
+    # If the formatted result is a "negative zero" representation —
+    # leading minus on a string of zeros and a decimal point — drop
+    # the sign. Covers both IEEE -0.0 (renders as "-0.00") and small
+    # negative inputs that round to zero at this precision (e.g.
+    # -0.001 at 2dp renders as "-0.00"). The information loss is
+    # already in the round-to-2dp step; preserving the sign on what
+    # the reader sees as zero would be misleading.
+    if out.startswith("-") and set(out[1:]) <= {"0", "."}:
+        out = out[1:]
+    return out
+def fmt_int(value: float | int | None) -> str:
+    """Format a value as an integer string.
+    ``int`` inputs (including ``bool``) are rendered exactly, without a
+    round-trip through ``float`` that would lose precision above ``2**53``
+    or overflow for very large integers. Other inputs are coerced to
+    ``float`` and rounded with the built-in :func:`round` (exact ties go to
+    the even integer). ``None``, ``NaN``, infinite, and non-coercible
+    values render as :data:`NA_STRING`.
+    """
+    if value is None:
+        return NA_STRING
+    if isinstance(value, int):
+        # ``int(...)`` keeps ``True``/``False`` rendering as "1"/"0".
+        return f"{int(value)}"
+    try:
+        v = float(value)
+    except (TypeError, ValueError, OverflowError):
+        return NA_STRING
+    if not math.isfinite(v):
+        return NA_STRING
+    return f"{round(v)}"
+def fmt_percent(value: float | None, digits: int = 1) -> str:
+    """Format a fraction (0–1) as a percent. Pass 0.234 → '23.4'.
+    ``None``, ``NaN``, infinite, and anything that can't be coerced to a
+    ``float`` render as :data:`NA_STRING`, matching :func:`fmt_number`.
+    The check runs on the coerced value, so NaN carried by non-``float``
+    numeric types is caught as well.
+    Negative-zero output ("-0.0") is normalised to "0.0" for the same
+    reason :func:`fmt_number` does (it's confusing in publication tables).
+    """
+    if value is None:
+        return NA_STRING
+    try:
+        scaled = 100.0 * float(value)
+    except (TypeError, ValueError, OverflowError):
+        return NA_STRING
+    # Checking the scaled value also covers inputs so large that the
+    # multiplication itself overflows to infinity.
+    if not math.isfinite(scaled):
+        return NA_STRING
+    out = f"{scaled:.{digits}f}"
+    if out.startswith("-") and set(out[1:]) <= {"0", "."}:
+        out = out[1:]
+    return out
+def fmt_n_pct(n: int, total: int, digits: int = 1) -> str:
+    """Format a count with its share of ``total`` as ``n (xx.x%)``.
+    The percentage is ``100 * n / total`` at ``digits`` decimal places.
+    When ``total`` is zero or negative the percentage is replaced by
+    :data:`NA_STRING`, giving ``n (—)``.
+    """
+    if not (total <= 0):
+        share = 100.0 * n / total
+        return f"{int(n)} ({share:.{digits}f}%)"
+    return f"{int(n)} ({NA_STRING})"
+def fmt_mean_sd(mean: float | None, sd: float | None, digits: int = 2) -> str:
+    """Format a mean and standard deviation as ``mean (sd)``.
+    Both values are rendered by :func:`fmt_number` at ``digits`` decimals.
+    """
+    centre = fmt_number(mean, digits)
+    spread = fmt_number(sd, digits)
+    return f"{centre} ({spread})"
+def fmt_median_iqr(
+    median: float | None,
+    q1: float | None,
+    q3: float | None,
+    digits: int = 2,
+) -> str:
+    """Format a median with its quartiles as ``median (Q1, Q3)``.
+    Each of the three values is rendered by :func:`fmt_number` at
+    ``digits`` decimals.
+    """
+    centre, lower, upper = (fmt_number(x, digits) for x in (median, q1, q3))
+    return f"{centre} ({lower}, {upper})"
+def fmt_range(lo: float | None, hi: float | None, digits: int = 2) -> str:
+    """Format a range as ``lo, hi``, each bound rendered by :func:`fmt_number`."""
+    low_text = fmt_number(lo, digits)
+    high_text = fmt_number(hi, digits)
+    return f"{low_text}, {high_text}"
+def fmt_ci(
+    lo: float | None,
+    hi: float | None,
+    digits: int = 2,
+    *,
+    sep: str = ", ",
+) -> str:
+    """Format a confidence interval as ``lo<sep>hi`` (``lo, hi`` by default).
+    Both bounds are rendered by :func:`fmt_number` at ``digits`` decimals;
+    ``sep`` is the keyword-only text placed between them.
+    """
+    lower_text = fmt_number(lo, digits)
+    upper_text = fmt_number(hi, digits)
+    return f"{lower_text}{sep}{upper_text}"
+def fmt_estimate_ci(
+    estimate: float | None,
+    lo: float | None,
+    hi: float | None,
+    digits: int = 2,
+) -> str:
+    """Format a point estimate with its interval as ``estimate (lo, hi)``.
+    The estimate goes through :func:`fmt_number` and the bounds through
+    :func:`fmt_ci` with its default separator, all at ``digits`` decimals.
+    """
+    point = fmt_number(estimate, digits)
+    interval = fmt_ci(lo, hi, digits)
+    return f"{point} ({interval})"
+def fmt_p_value(p: float | None, digits: int = 3) -> str:
+    """Journal-style p-value formatting.
+    Rules:
+      * ``None`` / ``NaN`` / infinite        → :data:`NA_STRING`
+      * not coercible to ``float``            → :data:`NA_STRING`
+      * out-of-range (``p < 0`` or ``p > 1``) → :data:`NA_STRING`
+        (silently coercing an invalid p-value would mask a real bug in
+        the upstream computation)
+      * ``p < 10^-digits``                    → ``"<0.001"`` (for ``digits=3``)
+      * ``p > 0.99``                          → ``">0.99"``
+      * otherwise                             → ``"0.xxx"``
+    The NaN / infinity check runs on the coerced value. A NaN carried by a
+    non-``float`` numeric type would otherwise fail every comparison below
+    and be printed as ``"nan"``.
+    """
+    if p is None:
+        return NA_STRING
+    try:
+        value = float(p)
+    except (TypeError, ValueError, OverflowError):
+        return NA_STRING
+    if not math.isfinite(value):
+        return NA_STRING
+    if value < 0.0 or value > 1.0:
+        return NA_STRING
+    threshold = 10 ** (-digits)
+    if value < threshold:
+        return f"<{threshold:.{digits}f}"
+    if value > 0.99:
+        return ">0.99"
+    return f"{value:.{digits}f}"
+def fmt_smd(smd: float | None, digits: int = 3) -> str:
+    """Format a standardized mean difference to ``digits`` decimal places.
+    The sign of the value is kept in the output. ``None``, ``NaN``,
+    infinite, and anything that can't be coerced to a ``float`` render as
+    :data:`NA_STRING`; the check runs on the coerced value so NaN carried
+    by non-``float`` numeric types is caught too.
+    """
+    if smd is None:
+        return NA_STRING
+    try:
+        value = float(smd)
+    except (TypeError, ValueError, OverflowError):
+        return NA_STRING
+    if not math.isfinite(value):
+        return NA_STRING
+    return f"{value:.{digits}f}"

pysofra/core/frames.py ADDED Viewed

@@ -0,0 +1,69 @@
+"""DataFrame adaptation.
+PySofra's public API accepts any object with the pandas DataFrame shape —
+``pandas.DataFrame``, ``polars.DataFrame``, or ``polars.LazyFrame``. The
+adapter in this module normalises the input to pandas internally so the
+statistical engines have one type to reason about.
+We keep the dependency on polars *optional* — importing this module does
+not require polars to be installed.
+"""
+from __future__ import annotations
+from typing import Any
+import pandas as pd
+def to_pandas(data: Any) -> pd.DataFrame:
+    """Normalise a DataFrame-like object to a ``pandas.DataFrame``.
+    Handling by input type:
+    * ``pandas.DataFrame`` — handed back unchanged (same object).
+    * objects whose class lives under the ``polars.`` namespace — a class
+      named ``LazyFrame`` is collected first, then ``.to_pandas()`` is
+      called.
+    * any other object with a ``to_pandas`` attribute — it is called and the
+      result is used if it is a ``pandas.DataFrame``.
+    Anything else raises ``TypeError``.
+    """
+    if isinstance(data, pd.DataFrame):
+        return data
+    # Recognise polars by class path so polars itself is never imported here.
+    kind = type(data)
+    qualname = f"{kind.__module__}.{kind.__name__}"
+    if qualname.startswith("polars."):
+        # A LazyFrame has to be materialised before conversion.
+        frame = data.collect() if kind.__name__ == "LazyFrame" else data
+        try:
+            converted = frame.to_pandas()
+        except ImportError:  # pragma: no cover
+            # ``polars.DataFrame.to_pandas`` routes through pyarrow by
+            # default; when that import fails, rebuild the frame column by
+            # column from plain Python lists instead.
+            converted = pd.DataFrame(
+                {name: frame[name].to_list() for name in frame.columns}
+            )
+        if isinstance(converted, pd.DataFrame):
+            return converted
+        raise TypeError(  # pragma: no cover
+            "polars to_pandas() did not return a pandas DataFrame; "
+            f"got {type(converted).__name__}."
+        )
+    # Last resort: trust any object that offers its own pandas conversion.
+    if hasattr(data, "to_pandas"):
+        candidate = data.to_pandas()
+        if isinstance(candidate, pd.DataFrame):
+            return candidate
+    raise TypeError(
+        f"Unsupported DataFrame type {qualname!r}. "
+        "PySofra accepts pandas.DataFrame and polars.DataFrame / LazyFrame."
+    )

pysofra/core/schema.py ADDED Viewed

@@ -0,0 +1,128 @@
+"""Internal schema for SofraTable.
+This module defines the backend-agnostic representation of a statistical
+table. Every renderer (HTML, Markdown, DOCX, PPTX, LaTeX) consumes the same
+schema; statistical engines (Table 1, summary, regression) produce it.
+The schema is intentionally simple and immutable. Cells carry both the
+*display* string and the *raw* value when known, so renderers can choose
+the appropriate format (e.g., right-alignment for numeric cells).
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Literal
+CellKind = Literal[
+    "text", "numeric", "p_value", "q_value", "ci", "header_label", "group_label"
+]
+Alignment = Literal["left", "center", "right"]
+@dataclass(frozen=True)
+class CellPart:
+    """A typographically distinct run inside a single cell.
+    Used by ``SofraTable.compose()`` to embed multi-format content
+    (bold + italic + colour) inside one cell. Renderers honour
+    ``CellPart.bold``, ``italic``, ``superscript``, ``subscript``,
+    ``code``, ``color``, and ``link`` where the backend supports them;
+    unsupported flags degrade to plain text.
+    Instances are immutable (frozen dataclass). Only ``text`` is required;
+    every formatting flag defaults to off / ``None``.
+    """
+    # Text of this run.
+    text: str
+    # Boolean formatting flags, all off by default.
+    bold: bool = False
+    italic: bool = False
+    superscript: bool = False
+    subscript: bool = False
+    code: bool = False
+    # Optional colour for the run; the accepted value format is not shown
+    # in this module — confirm against the renderers.
+    color: str | None = None
+    # Optional link target for the run (presumably a URL — confirm against
+    # the renderers).
+    link: str | None = None
+@dataclass(frozen=True)
+class Cell:
+    """One cell of a SofraTable.
+    `value` carries the raw underlying value when meaningful (e.g., a float
+    p-value). `text` is the rendered string used by all backends. Renderers
+    may consult `kind` and `value` for additional formatting decisions.
+    `style` is an optional mapping of renderer-specific overrides:
+    - ``html`` — extra inline-CSS declarations applied to the ``<td>``.
+    - ``docx`` — keys like ``padding_pt``, ``shading_hex``, ``borders``.
+    - ``xlsx`` — keys like ``bg_color``, ``num_format`` (forwarded to xlsxwriter).
+    - ``latex`` — prepended raw LaTeX (e.g. ``\\rowcolor{...}``).
+    All renderers ignore keys they don't understand.
+    The dataclass is frozen, so attributes cannot be reassigned; note that
+    a ``style`` dict passed in is stored as-is, not copied.
+    """
+    # Rendered display string.
+    text: str
+    # Raw underlying value, when known.
+    value: Any = None
+    # Semantic category of the cell; one of the ``CellKind`` literals.
+    kind: CellKind = "text"
+    # Explicit horizontal alignment; ``None`` means no explicit alignment is
+    # set (how renderers choose in that case is not shown in this module).
+    align: Alignment | None = None
+    bold: bool = False
+    italic: bool = False
+    # Indentation level (unit is renderer-defined — confirm against renderers).
+    indent: int = 0
+    # Renderer-specific style overrides, see the class docstring.
+    style: dict[str, Any] | None = None
+    # Optional sequence of formatted runs making up the cell content.
+    parts: tuple[CellPart, ...] | None = None
+@dataclass(frozen=True)
+class Row:
+    """One row of the table body.
+    Frozen dataclass: attributes cannot be reassigned after construction.
+    """
+    # Cells of the row, in column order.
+    cells: tuple[Cell, ...]
+    # Marks the row as a group-header row rather than a regular data row.
+    is_group_header: bool = False
+    # Free-form per-row mapping; each instance gets its own empty dict by
+    # default (``default_factory``).
+    metadata: dict[str, Any] = field(default_factory=dict)
+@dataclass(frozen=True)
+class HeaderCell:
+    """One cell of a column-header row.
+    Header cells are centred and bold unless overridden.
+    """
+    # Header label text.
+    text: str
+    # Horizontal alignment; defaults to centred.
+    align: Alignment = "center"
+    # Whether the label is bold; defaults to bold.
+    bold: bool = True
+@dataclass(frozen=True)
+class HeaderRow:
+    """A column-header row. Tables may have multiple stacked header rows."""
+    # Header cells of this row, in column order.
+    cells: tuple[HeaderCell, ...]
+@dataclass(frozen=True)
+class SpanningHeader:
+    """A spanning header above the column headers.
+    `start` and `end` are 0-indexed column indices, inclusive, so a
+    spanner over a single column has ``start == end``.
+    """
+    # Text shown in the spanning header.
+    label: str
+    # First covered column (0-indexed, inclusive).
+    start: int
+    # Last covered column (0-indexed, inclusive).
+    end: int
+def make_cell(
+    text: str,
+    value: Any = None,
+    kind: CellKind = "text",
+    align: Alignment | None = None,
+    bold: bool = False,
+    italic: bool = False,
+    indent: int = 0,
+    style: dict[str, Any] | None = None,
+    parts: tuple[CellPart, ...] | None = None,
+) -> Cell:
+    """Convenience constructor used internally.
+    Forwards every argument unchanged to the :class:`Cell` field of the same
+    name. ``parts`` is the last, optional parameter so that existing
+    positional and keyword callers keep working while every ``Cell`` field
+    can now be set through this helper.
+    """
+    return Cell(
+        text=text,
+        value=value,
+        kind=kind,
+        align=align,
+        bold=bold,
+        italic=italic,
+        indent=indent,
+        style=style,
+        parts=parts,
+    )