roll-rate-analysis 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ """Roll rate analysis for credit risk scorecards."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ from .mom import MOMRollRateTable
6
+ from .snapshot import SnapshotRollRateTable
7
+
8
# Resolve the installed distribution's version at import time; when the
# package metadata is unavailable (e.g. running from a source checkout
# without an installed dist-info), fall back to a recognisable sentinel
# instead of failing the import.
try:
    __version__ = version("roll-rate-analysis")
except PackageNotFoundError:
    __version__ = "0.0.0+unknown"

__all__ = ("MOMRollRateTable", "SnapshotRollRateTable", "__version__")
@@ -0,0 +1,138 @@
1
+ """Shared helpers for roll rate computation.
2
+
3
+ The matrix layout used by both classes is row=primary state, column=secondary
4
+ state. Each row has a single "diagonal" column where the account did not change
5
+ state (stable). Cells to the left of the diagonal are roll_down (state
6
+ improved), cells to the right are roll_up (state worsened). ``reduce_matrix``
7
+ collapses each row into those three buckets using a per-row diagonal index,
8
+ which is the only piece of metadata that differs between MOM (square) and
9
+ Snapshot (rectangular with extra rows in detailed mode).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from pathlib import Path
15
+
16
+ import numpy as np
17
+ import polars as pl
18
+
19
+ LABEL_COL = "from_state"
20
+
21
+ LazySource = pl.LazyFrame | pl.DataFrame | str | Path
22
+
23
+
24
+ def load_lazy(source: LazySource) -> pl.LazyFrame:
25
+ """Return a polars LazyFrame regardless of input type."""
26
+ if isinstance(source, pl.LazyFrame):
27
+ return source
28
+ if isinstance(source, pl.DataFrame):
29
+ return source.lazy()
30
+ if isinstance(source, str | Path):
31
+ return pl.scan_csv(source)
32
+ raise TypeError(
33
+ f"Unsupported input type {type(source).__name__}; "
34
+ "expected polars LazyFrame, DataFrame, or a file path."
35
+ )
36
+
37
+
38
def cycle_row_tags(max_delq: int) -> list[str]:
    """Generate the canonical row/column tags ``0_cycle_delinquent`` … ``N+_cycle_delinquent``."""
    tags = [f"{level}_cycle_delinquent" for level in range(max_delq)]
    # The final bucket is open-ended: everything at max_delq or above.
    tags.append(f"{max_delq}+_cycle_delinquent")
    return tags
41
+
42
+
43
def labeled_matrix(
    counts: np.ndarray,
    row_tags: list[str],
    column_tags: list[str],
) -> pl.DataFrame:
    """Wrap a 2D numpy count matrix in a polars DataFrame with labeled axes.

    The first column is named ``from_state`` and holds the row tags; the
    remaining columns are named after ``column_tags`` and hold the integer counts.
    """
    expected_shape = (len(row_tags), len(column_tags))
    if counts.shape != expected_shape:
        raise ValueError(
            f"counts shape {counts.shape} does not match ({len(row_tags)}, {len(column_tags)})."
        )
    columns: dict[str, list | np.ndarray] = {LABEL_COL: row_tags}
    # One output column per tag, sliced straight out of the count matrix.
    columns.update({tag: counts[:, j] for j, tag in enumerate(column_tags)})
    return pl.DataFrame(columns)
61
+
62
+
63
def write_capped_counts(
    counts: np.ndarray,
    row: int,
    grouped: pl.DataFrame,
    value_col: str,
    max_delq: int,
) -> None:
    """Write per-secondary-value counts into ``counts[row]``, capping at ``max_delq``.

    ``grouped`` must be a polars DataFrame with two columns: ``value_col`` (the
    secondary delinquency value) and ``len`` (occurrence count). Values strictly
    greater than ``max_delq - 1`` are summed into ``counts[row, max_delq]``.
    """
    if grouped.height == 0:
        return
    in_range = grouped.filter(pl.col(value_col) <= max_delq - 1)
    overflow = grouped.filter(pl.col(value_col) > max_delq - 1)
    if in_range.height:
        # In-range values map one-to-one onto column indices.
        counts[row, in_range[value_col].to_numpy()] += in_range["len"].to_numpy()
    if overflow.height:
        # Everything past the cap collapses into the N+ bucket.
        counts[row, max_delq] += int(overflow["len"].sum())
84
+
85
+
86
def reduce_matrix(
    matrix: pl.DataFrame,
    diag_cols: list[int],
    percentages: bool = True,
) -> pl.DataFrame:
    """Collapse a roll-rate matrix into roll_down / stable / roll_up per row.

    ``matrix`` is expected in the format produced by :func:`labeled_matrix`:
    a polars DataFrame with a ``from_state`` label column followed by numeric
    transition-count columns.

    ``diag_cols[i]`` gives, for row ``i``, the index (among the numeric
    columns, 0-based) where "stable" lives. Cells left of that index sum to
    ``roll_down``; the diagonal itself is ``stable``; cells right of it sum to
    ``roll_up``.

    When ``percentages`` is True (the default) each row is normalised to
    percentages rounded to one decimal; otherwise the raw counts are returned
    as int64 columns.
    """
    if LABEL_COL not in matrix.columns:
        raise ValueError(f"matrix must contain a '{LABEL_COL}' label column.")
    value_cols = [c for c in matrix.columns if c != LABEL_COL]
    values = matrix.select(value_cols).to_numpy()
    n_rows = values.shape[0]
    if len(diag_cols) != n_rows:
        raise ValueError(f"diag_cols has length {len(diag_cols)} but matrix has {n_rows} rows.")

    buckets = np.zeros((n_rows, 3), dtype=np.float64)
    for i, d in enumerate(diag_cols):
        row = values[i]
        buckets[i, 0] = row[:d].sum()
        buckets[i, 1] = row[d]
        buckets[i, 2] = row[d + 1 :].sum()

    # Restructured as an explicit if/else: previously the integer-counts
    # branch sat after an unconditional return and could never execute, so
    # percentages=False still produced the float frame.
    if percentages:
        totals = buckets.sum(axis=1, keepdims=True)
        # Guard all-zero rows: dividing by a zero total would yield NaN.
        with np.errstate(invalid="ignore", divide="ignore"):
            buckets = np.where(totals > 0, 100 * buckets / totals, 0.0)
        out: np.ndarray = np.round(buckets, 1)
    else:
        out = buckets.astype(np.int64)
    return pl.DataFrame(
        {
            LABEL_COL: matrix[LABEL_COL],
            "roll_down": out[:, 0],
            "stable": out[:, 1],
            "roll_up": out[:, 2],
        }
    )
@@ -0,0 +1,224 @@
1
+ """Month-over-month roll rate table."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Sequence
6
+
7
+ import numpy as np
8
+ import polars as pl
9
+ import polars.selectors as cs
10
+
11
+ from ._common import (
12
+ LazySource,
13
+ cycle_row_tags,
14
+ labeled_matrix,
15
+ load_lazy,
16
+ reduce_matrix,
17
+ write_capped_counts,
18
+ )
19
+
20
+ _MERGED = "_rr_merged_bin"
21
+ _MERGED_SECONDARY = _MERGED + "_secondary"
22
+
23
+
24
class MOMRollRateTable:
    """Month-over-month roll rate table for two consecutive monthly snapshots.

    Parameters
    ----------
    month_i:
        Data for month ``i``. Accepts a polars ``LazyFrame``/``DataFrame`` or a
        path/string pointing to a CSV file.
    month_i_plus_1:
        Data for month ``i+1``. Same supported types as ``month_i``.
    unique_key_col:
        Name of the account identifier column. Must exist in both inputs.
    delinquency_col:
        Name of the delinquency column (integer months past due). Must exist in
        both inputs.
    max_delq:
        Largest delinquency level kept as its own row/column. Anything above
        rolls into the ``N+`` bucket.
    binary_cols:
        Optional binary indicator columns to append to the matrix. Listed in
        descending priority — the first entry wins ties. Each indicator gets
        one extra row and column.

    Use
    ---
    >>> table = MOMRollRateTable(
    ...     "jan.csv", "feb.csv",
    ...     unique_key_col="id", delinquency_col="delq", max_delq=6,
    ... )
    >>> matrix = table.compute()  # polars.DataFrame, the full transition matrix
    >>> reduced = table.reduce()  # polars.DataFrame, roll_down / stable / roll_up

    ``compute`` and ``reduce`` are idempotent; the matrix is cached after the
    first call. Both return polars ``DataFrame``s whose first column
    (``from_state``) holds the row label.
    """

    def __init__(
        self,
        month_i: LazySource,
        month_i_plus_1: LazySource,
        *,
        unique_key_col: str,
        delinquency_col: str,
        max_delq: int = 6,
        binary_cols: Sequence[str] = (),
    ) -> None:
        if max_delq < 1:
            raise ValueError("max_delq must be >= 1.")
        binary_cols = tuple(binary_cols)
        if delinquency_col in binary_cols or unique_key_col in binary_cols:
            raise ValueError("binary_cols must not include the unique_key or delinquency columns.")

        self._month_i_source = month_i
        self._month_i_plus_1_source = month_i_plus_1
        self.unique_key_col = unique_key_col
        self.delinquency_col = delinquency_col
        self.max_delq = max_delq
        self.binary_cols = binary_cols

        # Reversed so that binary_cols[0] (highest priority, which
        # _merge_binary_cols encodes as the LARGEST priority value) lands on
        # the last row/column, keeping tag order aligned with column index.
        self.tags = cycle_row_tags(max_delq) + list(reversed(binary_cols))
        self._matrix: pl.DataFrame | None = None

    @property
    def matrix(self) -> pl.DataFrame:
        """Return the cached transition matrix, computing it on first access."""
        if self._matrix is None:
            self.compute()
        assert self._matrix is not None
        return self._matrix

    def compute(self) -> pl.DataFrame:
        """Compute the transition matrix and return it as a polars DataFrame."""
        # Square matrix: one row/column per delinquency level 0..max_delq
        # plus one per binary indicator.
        n = self.max_delq + 1 + len(self.binary_cols)
        counts = np.zeros((n, n), dtype=np.int64)
        data = self._joined_frame()

        if self.binary_cols:
            # The four quadrants (delq/binary × delq/binary) are disjoint by
            # construction of the filters inside each accumulator.
            self._accumulate_delq_to_delq(counts, data, exclude_binary=True)
            self._accumulate_delq_to_binary(counts, data)
            self._accumulate_binary_to_delq(counts, data)
            self._accumulate_binary_to_binary(counts, data)
        else:
            self._accumulate_delq_to_delq(counts, data, exclude_binary=False)

        self._matrix = labeled_matrix(counts, self.tags, self.tags)
        return self._matrix

    def reduce(self, percentages: bool = True) -> pl.DataFrame:
        """Return roll_down / stable / roll_up per row, in percentages or counts."""
        # Square matrix: the stable cell of row i is simply column i.
        diag_cols = list(range(len(self.tags)))
        return reduce_matrix(self.matrix, diag_cols, percentages=percentages)

    # ----- pipeline -------------------------------------------------------

    def _joined_frame(self) -> pl.DataFrame:
        """Load both months, project the relevant columns, optionally merge binaries, and join."""
        select_cols = [self.unique_key_col, self.delinquency_col, *self.binary_cols]
        left = load_lazy(self._month_i_source).select(select_cols)
        right = load_lazy(self._month_i_plus_1_source).select(select_cols)

        if self.binary_cols:
            left = self._merge_binary_cols(left)
            right = self._merge_binary_cols(right)

        # Left join keeps the month-i account universe; accounts absent from
        # month i+1 get null secondary columns (the downstream equality
        # filters then exclude them from every accumulation case).
        return left.join(right, how="left", on=self.unique_key_col, suffix="_secondary").collect()

    def _merge_binary_cols(self, frame: pl.LazyFrame) -> pl.LazyFrame:
        """Collapse the binary indicator columns into one priority-valued column.

        Priority is encoded as ``len(binary_cols), len(binary_cols)-1, …, 1`` so
        ``binary_cols[0]`` (highest priority) gets the largest value. When more
        than one indicator is set on the same row, ``max_horizontal`` keeps the
        winner.
        """
        n = len(self.binary_cols)
        for idx, col in enumerate(self.binary_cols):
            priority = n - idx
            frame = frame.with_columns(
                pl.when(pl.col(col) == 1)
                .then(pl.lit(priority))
                .otherwise(pl.col(col))
                .alias(f"{col}__priority")
            )
        # Temporary __priority columns are dropped once merged into _MERGED.
        return frame.with_columns(
            pl.max_horizontal(cs.ends_with("__priority")).alias(_MERGED)
        ).drop(cs.ends_with("__priority"))

    # ----- accumulation per case ------------------------------------------

    def _accumulate_delq_to_delq(
        self,
        counts: np.ndarray,
        data: pl.DataFrame,
        *,
        exclude_binary: bool,
    ) -> None:
        """Accounts that had a normal delinquency status in both months."""
        if exclude_binary:
            data = data.filter((pl.col(_MERGED) == 0) & (pl.col(_MERGED_SECONDARY) == 0))
        secondary = f"{self.delinquency_col}_secondary"
        for cycle in self._observed_cycles(data, self.delinquency_col):
            grouped = self._group_counts(data, self.delinquency_col, cycle, secondary)
            # min(cycle, max_delq) folds everything above the cap into the
            # N+ row; write_capped_counts applies the same cap to columns.
            write_capped_counts(
                counts, min(cycle, self.max_delq), grouped, secondary, self.max_delq
            )

    def _accumulate_delq_to_binary(self, counts: np.ndarray, data: pl.DataFrame) -> None:
        """Accounts that moved from a delinquency state into a binary indicator."""
        data = data.filter((pl.col(_MERGED) == 0) & (pl.col(_MERGED_SECONDARY) > 0))
        for cycle in self._observed_cycles(data, self.delinquency_col):
            grouped = self._group_counts(data, self.delinquency_col, cycle, _MERGED_SECONDARY)
            self._write_binary_secondary(counts, min(cycle, self.max_delq), grouped)

    def _accumulate_binary_to_delq(self, counts: np.ndarray, data: pl.DataFrame) -> None:
        """Accounts whose binary indicator was set in month i but had a delq state in i+1."""
        data = data.filter((pl.col(_MERGED) > 0) & (pl.col(_MERGED_SECONDARY) == 0))
        secondary = f"{self.delinquency_col}_secondary"
        for priority in self._observed_cycles(data, _MERGED):
            grouped = self._group_counts(data, _MERGED, priority, secondary)
            # Binary priority k occupies row max_delq + k.
            write_capped_counts(counts, self.max_delq + priority, grouped, secondary, self.max_delq)

    def _accumulate_binary_to_binary(self, counts: np.ndarray, data: pl.DataFrame) -> None:
        """Accounts whose binary indicator was set in both months."""
        data = data.filter((pl.col(_MERGED) > 0) & (pl.col(_MERGED_SECONDARY) > 0))
        for priority in self._observed_cycles(data, _MERGED):
            grouped = self._group_counts(data, _MERGED, priority, _MERGED_SECONDARY)
            self._write_binary_secondary(counts, self.max_delq + priority, grouped)

    # ----- low-level helpers ---------------------------------------------

    @staticmethod
    def _observed_cycles(data: pl.DataFrame, col: str) -> range:
        # Inclusive range over the observed values of ``col``; empty frames
        # yield an empty range so callers can loop unconditionally.
        if data.height == 0:
            return range(0)
        return range(int(data[col].min()), int(data[col].max()) + 1)

    @staticmethod
    def _group_counts(
        data: pl.DataFrame, primary: str, primary_value: int, secondary: str
    ) -> pl.DataFrame:
        # Two-column result: (primary, secondary) plus a "len" count column.
        return (
            data.filter(pl.col(primary) == primary_value)
            .group_by([primary, secondary])
            .len()
            .sort(secondary)
        )

    def _write_binary_secondary(self, counts: np.ndarray, row: int, grouped: pl.DataFrame) -> None:
        """Apply counts where the secondary axis is a binary-priority column.

        Each priority ``k`` maps directly to column ``max_delq + k`` with no
        capping (unlike the delinquency axis, the priority values are exact).
        """
        if grouped.height == 0:
            return
        cols = grouped[_MERGED_SECONDARY].to_numpy() + self.max_delq
        counts[row, cols] += grouped["len"].to_numpy()
222
+
223
+
224
+ __all__ = ("MOMRollRateTable",)
@@ -0,0 +1,284 @@
1
+ """Snapshot roll rate table over observation and performance windows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Sequence
6
+ from dataclasses import dataclass
7
+
8
+ import numpy as np
9
+ import polars as pl
10
+ import polars.selectors as cs
11
+
12
+ from ._common import (
13
+ LazySource,
14
+ cycle_row_tags,
15
+ labeled_matrix,
16
+ load_lazy,
17
+ reduce_matrix,
18
+ write_capped_counts,
19
+ )
20
+
21
+ _OBS_MAX = "obs_max_delq"
22
+ _PERF_MAX = "perf_max_delq"
23
+
24
+
25
@dataclass(frozen=True)
class _RowSpec:
    """One matrix row: its display label and its stable-diagonal column index."""

    # Row label shown in the from_state column of the output matrix.
    label: str
    # Index (0-based, among the numeric columns) that counts as "stable" for
    # this row when the matrix is reduced; detailed sub-rows of the same
    # delinquency level share one level value.
    level: int
29
+
30
+
31
class SnapshotRollRateTable:
    """Roll rate table for a snapshot month with observation and performance windows.

    For every account in the snapshot, the observation window is reduced to its
    maximum delinquency across the supplied observation files, and similarly for
    the performance window. The resulting transition matrix has rows indexed by
    the observation max-delinquency and columns indexed by the performance
    max-delinquency.

    Parameters
    ----------
    snapshot:
        Data for the snapshot month (defines the account universe). Accepts a
        polars ``LazyFrame``/``DataFrame`` or a path/string pointing to a CSV.
    observation:
        Sequence of frames or paths forming the observation window.
    performance:
        Sequence of frames or paths forming the performance window.
    unique_key_col:
        Name of the account identifier column. Must exist in every input.
    delinquency_col:
        Name of the delinquency column. Must exist in every observation and
        performance frame.
    max_delq:
        Largest delinquency level kept as its own row/column. Anything above
        rolls into the ``N+`` bucket.
    detailed:
        Split delinquency levels 3 and 4 into ``granularity`` sub-rows showing
        how many times the account hit that level during the observation window.
    granularity:
        Number of sub-rows per detailed level. Must be ≥ 2 when ``detailed``.
    keep_cols:
        Optional column whitelist applied to each observation/performance frame
        before joining (memory optimisation). Must include ``delinquency_col``.

    Use
    ---
    >>> table = SnapshotRollRateTable(
    ...     "snap.csv",
    ...     ["obs1.csv", "obs2.csv"],
    ...     ["perf1.csv", "perf2.csv"],
    ...     unique_key_col="id",
    ...     delinquency_col="delq",
    ...     detailed=True,
    ...     granularity=2,
    ... )
    >>> matrix = table.compute()  # polars.DataFrame, the full transition matrix
    >>> reduced = table.reduce()  # polars.DataFrame, roll_down / stable / roll_up

    ``compute`` and ``reduce`` are idempotent; the matrix is cached after the
    first call.
    """

    def __init__(
        self,
        snapshot: LazySource,
        observation: Sequence[LazySource],
        performance: Sequence[LazySource],
        *,
        unique_key_col: str,
        delinquency_col: str,
        max_delq: int = 6,
        detailed: bool = False,
        granularity: int = 1,
        keep_cols: Sequence[str] | None = None,
    ) -> None:
        if max_delq < 1:
            raise ValueError("max_delq must be >= 1.")
        if granularity < 1:
            raise ValueError("granularity must be >= 1.")
        if detailed and granularity < 2:
            raise ValueError("granularity must be >= 2 when detailed=True.")

        # Materialise the sequences so generators can't be exhausted twice
        # and emptiness can be checked up front.
        observation = list(observation)
        performance = list(performance)
        if not observation:
            raise ValueError("at least one observation frame is required.")
        if not performance:
            raise ValueError("at least one performance frame is required.")

        if keep_cols is not None:
            keep_cols = tuple(keep_cols)
            if delinquency_col not in keep_cols:
                raise ValueError(
                    "keep_cols must include the delinquency_col so that it survives projection."
                )

        self._snapshot_source = snapshot
        self._observation_sources = observation
        self._performance_sources = performance
        self.unique_key_col = unique_key_col
        self.delinquency_col = delinquency_col
        self.max_delq = max_delq
        self.detailed = detailed
        # granularity only matters in detailed mode; normalise to 1 otherwise
        # so extra_rows and _row_index stay consistent.
        self.granularity = granularity if detailed else 1
        self.keep_cols = keep_cols

        self._row_specs = self._build_row_specs()
        self.row_tags = [s.label for s in self._row_specs]
        self.column_tags = cycle_row_tags(max_delq)
        self._matrix: pl.DataFrame | None = None

    @property
    def extra_rows(self) -> int:
        """Number of additional rows beyond ``max_delq + 1`` due to detailed mode."""
        # Levels 3 and 4 each expand from 1 row to `granularity` rows.
        return 2 * (self.granularity - 1) if self.detailed else 0

    @property
    def matrix(self) -> pl.DataFrame:
        """Return the cached transition matrix, computing it on first access."""
        if self._matrix is None:
            self.compute()
        assert self._matrix is not None
        return self._matrix

    def compute(self) -> pl.DataFrame:
        """Compute the transition matrix and return it as a polars DataFrame."""
        # Rectangular in detailed mode: extra observation rows, but the
        # performance axis always has max_delq + 1 columns.
        n_rows = self.max_delq + 1 + self.extra_rows
        n_cols = self.max_delq + 1
        counts = np.zeros((n_rows, n_cols), dtype=np.int64)

        data = self._build_joined().collect()
        if data.height > 0:
            cycles = range(int(data[_OBS_MAX].min()), int(data[_OBS_MAX].max()) + 1)
            for cycle in cycles:
                self._accumulate_cycle(counts, data, cycle)

        self._matrix = labeled_matrix(counts, self.row_tags, self.column_tags)
        return self._matrix

    def reduce(self, percentages: bool = True) -> pl.DataFrame:
        """Return roll_down / stable / roll_up per row, in percentages or counts."""
        # Detailed sub-rows of the same level share one diagonal column, so
        # the diagonal comes from the row specs rather than the row index.
        diag_cols = [spec.level for spec in self._row_specs]
        return reduce_matrix(self.matrix, diag_cols, percentages=percentages)

    # ----- row layout -----------------------------------------------------

    def _build_row_specs(self) -> list[_RowSpec]:
        # Row order: levels 0..max_delq-1 (levels 3 and 4 expanded into
        # granularity sub-rows when detailed), then the open-ended N+ row.
        specs: list[_RowSpec] = []
        for i in range(self.max_delq):
            if self.detailed and i in (3, 4):
                for j in range(1, self.granularity):
                    specs.append(_RowSpec(f"{i}x{j}_cycle_delinquent", i))
                # Last sub-row is open-ended: "hit level i granularity+ times".
                specs.append(_RowSpec(f"{i}x{self.granularity}+_cycle_delinquent", i))
            else:
                specs.append(_RowSpec(f"{i}_cycle_delinquent", i))
        specs.append(_RowSpec(f"{self.max_delq}+_cycle_delinquent", self.max_delq))
        return specs

    def _row_index(self, cycle: int, rank: int = 1) -> int:
        """Return the matrix row index for ``(cycle, rank)``.

        ``rank`` is only meaningful when ``detailed`` is on and ``cycle`` is 3 or 4.
        """
        if cycle >= self.max_delq:
            # The N+ row is always last.
            return self.max_delq + self.extra_rows
        if self.detailed and cycle in (3, 4):
            # Level 3 sub-rows start right after rows 0-2; level 4 sub-rows
            # start after level 3's `granularity` sub-rows.
            base = 3 if cycle == 3 else 3 + self.granularity
            return base + rank - 1
        if self.detailed and cycle >= 5:
            # Levels above 4 are shifted down by the inserted sub-rows.
            return cycle + self.extra_rows
        return cycle

    # ----- pipeline -------------------------------------------------------

    def _build_joined(self) -> pl.LazyFrame:
        """Build the merged frame of ``(unique_key, obs_max_delq, perf_max_delq, …)``."""
        snapshot = load_lazy(self._snapshot_source).select([self.unique_key_col])
        obs = self._build_window(snapshot, self._observation_sources, _OBS_MAX, "obs")
        if self.detailed:
            # Count how many observation-window columns hit exactly level 3
            # and exactly level 4, per account, for the detailed sub-rows.
            obs = obs.with_columns(
                [
                    pl.sum_horizontal(cs.starts_with(self.delinquency_col) == 3).alias(
                        "obs_times_3_cycle"
                    ),
                    pl.sum_horizontal(cs.starts_with(self.delinquency_col) == 4).alias(
                        "obs_times_4_cycle"
                    ),
                ]
            )
        perf = self._build_window(snapshot, self._performance_sources, _PERF_MAX, "perf")

        joined = obs.join(perf, how="left", on=self.unique_key_col, suffix="_perfwin")
        keep = [self.unique_key_col, _OBS_MAX, _PERF_MAX]
        if self.detailed:
            keep = [
                self.unique_key_col,
                _OBS_MAX,
                "obs_times_3_cycle",
                "obs_times_4_cycle",
                _PERF_MAX,
            ]
        return joined.select(keep)

    def _build_window(
        self,
        snapshot: pl.LazyFrame,
        sources: Sequence[LazySource],
        max_alias: str,
        suffix_tag: str,
    ) -> pl.LazyFrame:
        """Join each window file into ``snapshot`` and reduce to one max-delq column."""
        result = snapshot
        for i, src in enumerate(sources):
            frame = load_lazy(src)
            if self.keep_cols is not None:
                frame = frame.select([self.unique_key_col, *self.keep_cols])
            # Unique suffix per file keeps the repeated delinquency columns
            # distinguishable (delq, delq_obs1, delq_obs2, …).
            result = result.join(
                frame,
                how="left",
                on=self.unique_key_col,
                suffix=f"_{suffix_tag}{i}",
            )
        # Row-wise max across every delinquency-named column in the window.
        return result.with_columns(
            pl.max_horizontal(cs.starts_with(self.delinquency_col)).alias(max_alias)
        )

    # ----- accumulation per cycle ----------------------------------------

    def _accumulate_cycle(self, counts: np.ndarray, data: pl.DataFrame, cycle: int) -> None:
        rows = data.filter(pl.col(_OBS_MAX) == cycle)
        if rows.height == 0:
            return

        if self.detailed and cycle in (3, 4):
            self._accumulate_detailed(counts, rows, cycle)
            return

        grouped = rows.group_by([_OBS_MAX, _PERF_MAX]).len().sort(_PERF_MAX)
        write_capped_counts(counts, self._row_index(cycle), grouped, _PERF_MAX, self.max_delq)

    def _accumulate_detailed(self, counts: np.ndarray, rows: pl.DataFrame, cycle: int) -> None:
        times_col = f"obs_times_{cycle}_cycle"
        grouped = (
            rows.filter(pl.col(times_col) >= 1)
            .group_by([times_col, _PERF_MAX])
            .len()
            .sort([times_col, _PERF_MAX])
        )
        for rank in range(1, self.granularity + 1):
            if rank < self.granularity:
                # Exact hit-count sub-row.
                sub = grouped.filter(pl.col(times_col) == rank)
            else:
                # Final sub-row is open-ended (rank or more hits), so the
                # per-count groups must be re-aggregated per perf level.
                sub = (
                    grouped.filter(pl.col(times_col) >= rank)
                    .group_by(_PERF_MAX)
                    .agg(pl.col("len").sum())
                    .sort(_PERF_MAX)
                )
            row_idx = self._row_index(cycle, rank)
            write_capped_counts(counts, row_idx, sub, _PERF_MAX, self.max_delq)
282
+
283
+
284
+ __all__ = ("SnapshotRollRateTable",)
@@ -0,0 +1,122 @@
1
+ Metadata-Version: 2.4
2
+ Name: roll-rate-analysis
3
+ Version: 0.2.0
4
+ Summary: Roll rate analysis for credit risk scorecards.
5
+ Project-URL: Source, https://github.com/alexliap/roll_rate_analysis
6
+ Author-email: Alexandros Liapatis <alexandrosliapatis@gmail.com>
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Software Development :: Libraries
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: numpy
21
+ Requires-Dist: polars>=1.0
22
+ Description-Content-Type: text/markdown
23
+
24
+ # Roll Rate Analysis
25
+
26
+ ![deploy on pypi](https://github.com/alexliap/roll_rate_analysis/actions/workflows/publish-package.yaml/badge.svg)
27
+ ![PyPI Version](https://img.shields.io/pypi/v/roll-rate-analysis?label=pypi%20package)
28
+ ![Downloads](https://static.pepy.tech/badge/roll-rate-analysis)
29
+
30
+ Roll rate analysis is a credit-risk technique used to define the target variable when building Application or Behavioural scorecards. It's an iterative process — this package parametrises the moving parts so each iteration is a few lines of code rather than a fresh notebook.
31
+
32
+ The library has zero pandas dependency: inputs and outputs are [Polars](https://pola.rs/) frames.
33
+
34
+ ## Installation
35
+
36
+ From PyPI:
37
+
38
+ ```bash
39
+ uv add roll-rate-analysis # uv projects
40
+ pip install roll-rate-analysis # plain pip
41
+ ```
42
+
43
+ Requires Python 3.10 or newer.
44
+
45
+ ## What's in the box
46
+
47
+ Two classes, one method each:
48
+
49
+ | Class | Use case |
50
+ | --- | --- |
51
+ | `MOMRollRateTable` | Transition matrix between two consecutive months. |
52
+ | `SnapshotRollRateTable` | Transition matrix between an observation window and a performance window around a snapshot month. |
53
+
54
+ Both expose `compute()` (full transition matrix) and `reduce()` (roll_down / stable / roll_up summary). Both return polars `DataFrame`s whose first column (`from_state`) holds the row label.
55
+
56
+ ## Quick start
57
+
58
+ ```python
59
+ from roll_rate_analysis import MOMRollRateTable
60
+
61
+ table = MOMRollRateTable(
62
+ "data/jan.csv",
63
+ "data/feb.csv",
64
+ unique_key_col="id",
65
+ delinquency_col="delq",
66
+ max_delq=6,
67
+ )
68
+
69
+ table.compute() # polars.DataFrame, full transition matrix
70
+ table.reduce() # polars.DataFrame, roll_down / stable / roll_up percentages
71
+ ```
72
+
73
+ In-memory polars frames work too:
74
+
75
+ ```python
76
+ import polars as pl
77
+ from roll_rate_analysis import SnapshotRollRateTable
78
+
79
+ snap = pl.read_csv("data/snap.csv")
80
+ obs = [pl.scan_csv(p) for p in ["data/obs1.csv", "data/obs2.csv"]]
81
+ perf = [pl.scan_csv(p) for p in ["data/perf1.csv", "data/perf2.csv"]]
82
+
83
+ table = SnapshotRollRateTable(
84
+ snap, obs, perf,
85
+ unique_key_col="id",
86
+ delinquency_col="delq",
87
+ detailed=True,
88
+ granularity=2,
89
+ )
90
+ table.compute()
91
+ ```
92
+
93
+ See the notebooks under [`examples/`](examples/) for end-to-end walkthroughs.
94
+
95
+ ## Development
96
+
97
+ This project uses [uv](https://docs.astral.sh/uv/). Clone and bootstrap with:
98
+
99
+ ```bash
100
+ git clone https://github.com/alexliap/roll_rate_analysis.git
101
+ cd roll_rate_analysis
102
+ uv sync --dev
103
+ ```
104
+
105
+ Run the test suite, linter, and formatter:
106
+
107
+ ```bash
108
+ uv run pytest
109
+ uv run ruff check .
110
+ uv run ruff format .
111
+ ```
112
+
113
+ Pre-commit hooks (ruff + standard checks) keep the tree clean:
114
+
115
+ ```bash
116
+ uv run pre-commit install
117
+ uv run pre-commit run --all-files
118
+ ```
119
+
120
+ ## License
121
+
122
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,8 @@
1
+ roll_rate_analysis/__init__.py,sha256=vqAO1X2YzXHZqy-ermpbB_nC2kAyKULUnHV_EA0GDI8,383
2
+ roll_rate_analysis/_common.py,sha256=WoB-_4AD-atQBrWqoUraduz6jlDNFNueXkmswReLG8w,5012
3
+ roll_rate_analysis/mom.py,sha256=o3paqctuZ4uCxuXpSNk-0oQ6APE35zV5jUfA1sKAZp4,9136
4
+ roll_rate_analysis/snapshot.py,sha256=jmzagupDkNupzcbyPOuBZPUuWJI9kSNBgl_HF1EKxhY,10694
5
+ roll_rate_analysis-0.2.0.dist-info/METADATA,sha256=o6-fv71lgd0ixudDGDDtrN9QZzQTPW_sjTeSIRAqI8Y,3638
6
+ roll_rate_analysis-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
7
+ roll_rate_analysis-0.2.0.dist-info/licenses/LICENSE,sha256=mlX4UXWKb_RtCmuAP6Rs_XY9s1XZyUuZB8AxsDJnVRQ,1076
8
+ roll_rate_analysis-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Alexandros Liapatis
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.