roll-rate-analysis 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ """Roll rate analysis for credit risk scorecards."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ from .mom import MOMRollRateTable
6
+ from .snapshot import SnapshotRollRateTable
7
+
8
# Resolve the installed distribution's version at import time; when the
# package metadata is unavailable (e.g. running from a source checkout
# without an installed dist-info), fall back to a recognisable sentinel
# instead of failing the import.
try:
    __version__ = version("roll-rate-analysis")
except PackageNotFoundError:
    __version__ = "0.0.0+unknown"

__all__ = ("MOMRollRateTable", "SnapshotRollRateTable", "__version__")
@@ -0,0 +1,138 @@
1
+ """Shared helpers for roll rate computation.
2
+
3
+ The matrix layout used by both classes is row=primary state, column=secondary
4
+ state. Each row has a single "diagonal" column where the account did not change
5
+ state (stable). Cells to the left of the diagonal are roll_down (state
6
+ improved), cells to the right are roll_up (state worsened). ``reduce_matrix``
7
+ collapses each row into those three buckets using a per-row diagonal index,
8
+ which is the only piece of metadata that differs between MOM (square) and
9
+ Snapshot (rectangular with extra rows in detailed mode).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from pathlib import Path
15
+
16
+ import numpy as np
17
+ import polars as pl
18
+
19
+ LABEL_COL = "from_state"
20
+
21
+ LazySource = pl.LazyFrame | pl.DataFrame | str | Path
22
+
23
+
24
+ def load_lazy(source: LazySource) -> pl.LazyFrame:
25
+ """Return a polars LazyFrame regardless of input type."""
26
+ if isinstance(source, pl.LazyFrame):
27
+ return source
28
+ if isinstance(source, pl.DataFrame):
29
+ return source.lazy()
30
+ if isinstance(source, str | Path):
31
+ return pl.scan_csv(source)
32
+ raise TypeError(
33
+ f"Unsupported input type {type(source).__name__}; "
34
+ "expected polars LazyFrame, DataFrame, or a file path."
35
+ )
36
+
37
+
38
def cycle_row_tags(max_delq: int) -> list[str]:
    """Generate the canonical row/column tags ``0_cycle_delinquent`` … ``N+_cycle_delinquent``."""
    tags = [f"{level}_cycle_delinquent" for level in range(max_delq)]
    # The final bucket is open-ended: everything at max_delq or above.
    tags.append(f"{max_delq}+_cycle_delinquent")
    return tags
41
+
42
+
43
def labeled_matrix(
    counts: np.ndarray,
    row_tags: list[str],
    column_tags: list[str],
) -> pl.DataFrame:
    """Wrap a 2D numpy count matrix in a polars DataFrame with labeled axes.

    The first column is named ``from_state`` and holds the row tags; the
    remaining columns are named after ``column_tags`` and hold the integer counts.
    """
    expected_shape = (len(row_tags), len(column_tags))
    if counts.shape != expected_shape:
        raise ValueError(
            f"counts shape {counts.shape} does not match ({len(row_tags)}, {len(column_tags)})."
        )
    columns: dict[str, list | np.ndarray] = {LABEL_COL: row_tags}
    # One output column per tag, sliced straight out of the count matrix.
    columns.update({tag: counts[:, j] for j, tag in enumerate(column_tags)})
    return pl.DataFrame(columns)
61
+
62
+
63
def write_capped_counts(
    counts: np.ndarray,
    row: int,
    grouped: pl.DataFrame,
    value_col: str,
    max_delq: int,
) -> None:
    """Write per-secondary-value counts into ``counts[row]``, capping at ``max_delq``.

    ``grouped`` must be a polars DataFrame with two columns: ``value_col`` (the
    secondary delinquency value) and ``len`` (occurrence count). Values strictly
    greater than ``max_delq - 1`` are summed into ``counts[row, max_delq]``.
    """
    if grouped.height == 0:
        return
    in_range = grouped.filter(pl.col(value_col) <= max_delq - 1)
    overflow = grouped.filter(pl.col(value_col) > max_delq - 1)
    if in_range.height:
        # In-range values map one-to-one onto column indices.
        counts[row, in_range[value_col].to_numpy()] += in_range["len"].to_numpy()
    if overflow.height:
        # Everything past the cap collapses into the N+ bucket.
        counts[row, max_delq] += int(overflow["len"].sum())
84
+
85
+
86
def reduce_matrix(
    matrix: pl.DataFrame,
    diag_cols: list[int],
    percentages: bool = True,
) -> pl.DataFrame:
    """Collapse a roll-rate matrix into roll_down / stable / roll_up per row.

    ``matrix`` is expected in the format produced by :func:`labeled_matrix`:
    a polars DataFrame with a ``from_state`` label column followed by numeric
    transition-count columns.

    ``diag_cols[i]`` gives, for row ``i``, the index (among the numeric
    columns, 0-based) where "stable" lives. Cells left of that index sum to
    ``roll_down``; the diagonal itself is ``stable``; cells right of it sum to
    ``roll_up``.

    When ``percentages`` is True (the default) each row is normalised to
    percentages rounded to one decimal; otherwise the raw counts are returned
    as int64 columns.
    """
    if LABEL_COL not in matrix.columns:
        raise ValueError(f"matrix must contain a '{LABEL_COL}' label column.")
    value_cols = [c for c in matrix.columns if c != LABEL_COL]
    values = matrix.select(value_cols).to_numpy()
    n_rows = values.shape[0]
    if len(diag_cols) != n_rows:
        raise ValueError(f"diag_cols has length {len(diag_cols)} but matrix has {n_rows} rows.")

    buckets = np.zeros((n_rows, 3), dtype=np.float64)
    for i, d in enumerate(diag_cols):
        row = values[i]
        buckets[i, 0] = row[:d].sum()
        buckets[i, 1] = row[d]
        buckets[i, 2] = row[d + 1 :].sum()

    # Restructured as an explicit if/else: previously the integer-counts
    # branch sat after an unconditional return and could never execute, so
    # percentages=False still produced the float frame.
    if percentages:
        totals = buckets.sum(axis=1, keepdims=True)
        # Guard all-zero rows: dividing by a zero total would yield NaN.
        with np.errstate(invalid="ignore", divide="ignore"):
            buckets = np.where(totals > 0, 100 * buckets / totals, 0.0)
        out: np.ndarray = np.round(buckets, 1)
    else:
        out = buckets.astype(np.int64)
    return pl.DataFrame(
        {
            LABEL_COL: matrix[LABEL_COL],
            "roll_down": out[:, 0],
            "stable": out[:, 1],
            "roll_up": out[:, 2],
        }
    )
@@ -0,0 +1,224 @@
1
+ """Month-over-month roll rate table."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Sequence
6
+
7
+ import numpy as np
8
+ import polars as pl
9
+ import polars.selectors as cs
10
+
11
+ from ._common import (
12
+ LazySource,
13
+ cycle_row_tags,
14
+ labeled_matrix,
15
+ load_lazy,
16
+ reduce_matrix,
17
+ write_capped_counts,
18
+ )
19
+
20
+ _MERGED = "_rr_merged_bin"
21
+ _MERGED_SECONDARY = _MERGED + "_secondary"
22
+
23
+
24
class MOMRollRateTable:
    """Month-over-month roll rate table for two consecutive monthly snapshots.

    Parameters
    ----------
    month_i:
        Data for month ``i``. Accepts a polars ``LazyFrame``/``DataFrame`` or a
        path/string pointing to a CSV file.
    month_i_plus_1:
        Data for month ``i+1``. Same supported types as ``month_i``.
    unique_key_col:
        Name of the account identifier column. Must exist in both inputs.
    delinquency_col:
        Name of the delinquency column (integer months past due). Must exist in
        both inputs.
    max_delq:
        Largest delinquency level kept as its own row/column. Anything above
        rolls into the ``N+`` bucket.
    binary_cols:
        Optional binary indicator columns to append to the matrix. Listed in
        descending priority — the first entry wins ties. Each indicator gets
        one extra row and column.

    Use
    ---
    >>> table = MOMRollRateTable(
    ...     "jan.csv", "feb.csv",
    ...     unique_key_col="id", delinquency_col="delq", max_delq=6,
    ... )
    >>> matrix = table.compute()  # polars.DataFrame, the full transition matrix
    >>> reduced = table.reduce()  # polars.DataFrame, roll_down / stable / roll_up

    ``compute`` and ``reduce`` are idempotent; the matrix is cached after the
    first call. Both return polars ``DataFrame``s whose first column
    (``from_state``) holds the row label.
    """

    def __init__(
        self,
        month_i: LazySource,
        month_i_plus_1: LazySource,
        *,
        unique_key_col: str,
        delinquency_col: str,
        max_delq: int = 6,
        binary_cols: Sequence[str] = (),
    ) -> None:
        if max_delq < 1:
            raise ValueError("max_delq must be >= 1.")
        binary_cols = tuple(binary_cols)
        if delinquency_col in binary_cols or unique_key_col in binary_cols:
            raise ValueError("binary_cols must not include the unique_key or delinquency columns.")

        self._month_i_source = month_i
        self._month_i_plus_1_source = month_i_plus_1
        self.unique_key_col = unique_key_col
        self.delinquency_col = delinquency_col
        self.max_delq = max_delq
        self.binary_cols = binary_cols

        # Reversed so that binary_cols[0] (highest priority, which
        # _merge_binary_cols encodes as the LARGEST priority value) lands on
        # the last row/column, keeping tag order aligned with column index.
        self.tags = cycle_row_tags(max_delq) + list(reversed(binary_cols))
        self._matrix: pl.DataFrame | None = None

    @property
    def matrix(self) -> pl.DataFrame:
        """Return the cached transition matrix, computing it on first access."""
        if self._matrix is None:
            self.compute()
        assert self._matrix is not None
        return self._matrix

    def compute(self) -> pl.DataFrame:
        """Compute the transition matrix and return it as a polars DataFrame."""
        # Square matrix: one row/column per delinquency level 0..max_delq
        # plus one per binary indicator.
        n = self.max_delq + 1 + len(self.binary_cols)
        counts = np.zeros((n, n), dtype=np.int64)
        data = self._joined_frame()

        if self.binary_cols:
            # The four quadrants (delq/binary × delq/binary) are disjoint by
            # construction of the filters inside each accumulator.
            self._accumulate_delq_to_delq(counts, data, exclude_binary=True)
            self._accumulate_delq_to_binary(counts, data)
            self._accumulate_binary_to_delq(counts, data)
            self._accumulate_binary_to_binary(counts, data)
        else:
            self._accumulate_delq_to_delq(counts, data, exclude_binary=False)

        self._matrix = labeled_matrix(counts, self.tags, self.tags)
        return self._matrix

    def reduce(self, percentages: bool = True) -> pl.DataFrame:
        """Return roll_down / stable / roll_up per row, in percentages or counts."""
        # Square matrix: the stable cell of row i is simply column i.
        diag_cols = list(range(len(self.tags)))
        return reduce_matrix(self.matrix, diag_cols, percentages=percentages)

    # ----- pipeline -------------------------------------------------------

    def _joined_frame(self) -> pl.DataFrame:
        """Load both months, project the relevant columns, optionally merge binaries, and join."""
        select_cols = [self.unique_key_col, self.delinquency_col, *self.binary_cols]
        left = load_lazy(self._month_i_source).select(select_cols)
        right = load_lazy(self._month_i_plus_1_source).select(select_cols)

        if self.binary_cols:
            left = self._merge_binary_cols(left)
            right = self._merge_binary_cols(right)

        # Left join keeps the month-i account universe; accounts absent from
        # month i+1 get null secondary columns (the downstream equality
        # filters then exclude them from every accumulation case).
        return left.join(right, how="left", on=self.unique_key_col, suffix="_secondary").collect()

    def _merge_binary_cols(self, frame: pl.LazyFrame) -> pl.LazyFrame:
        """Collapse the binary indicator columns into one priority-valued column.

        Priority is encoded as ``len(binary_cols), len(binary_cols)-1, …, 1`` so
        ``binary_cols[0]`` (highest priority) gets the largest value. When more
        than one indicator is set on the same row, ``max_horizontal`` keeps the
        winner.
        """
        n = len(self.binary_cols)
        for idx, col in enumerate(self.binary_cols):
            priority = n - idx
            frame = frame.with_columns(
                pl.when(pl.col(col) == 1)
                .then(pl.lit(priority))
                .otherwise(pl.col(col))
                .alias(f"{col}__priority")
            )
        # Temporary __priority columns are dropped once merged into _MERGED.
        return frame.with_columns(
            pl.max_horizontal(cs.ends_with("__priority")).alias(_MERGED)
        ).drop(cs.ends_with("__priority"))

    # ----- accumulation per case ------------------------------------------

    def _accumulate_delq_to_delq(
        self,
        counts: np.ndarray,
        data: pl.DataFrame,
        *,
        exclude_binary: bool,
    ) -> None:
        """Accounts that had a normal delinquency status in both months."""
        if exclude_binary:
            data = data.filter((pl.col(_MERGED) == 0) & (pl.col(_MERGED_SECONDARY) == 0))
        secondary = f"{self.delinquency_col}_secondary"
        for cycle in self._observed_cycles(data, self.delinquency_col):
            grouped = self._group_counts(data, self.delinquency_col, cycle, secondary)
            # min(cycle, max_delq) folds everything above the cap into the
            # N+ row; write_capped_counts applies the same cap to columns.
            write_capped_counts(
                counts, min(cycle, self.max_delq), grouped, secondary, self.max_delq
            )

    def _accumulate_delq_to_binary(self, counts: np.ndarray, data: pl.DataFrame) -> None:
        """Accounts that moved from a delinquency state into a binary indicator."""
        data = data.filter((pl.col(_MERGED) == 0) & (pl.col(_MERGED_SECONDARY) > 0))
        for cycle in self._observed_cycles(data, self.delinquency_col):
            grouped = self._group_counts(data, self.delinquency_col, cycle, _MERGED_SECONDARY)
            self._write_binary_secondary(counts, min(cycle, self.max_delq), grouped)

    def _accumulate_binary_to_delq(self, counts: np.ndarray, data: pl.DataFrame) -> None:
        """Accounts whose binary indicator was set in month i but had a delq state in i+1."""
        data = data.filter((pl.col(_MERGED) > 0) & (pl.col(_MERGED_SECONDARY) == 0))
        secondary = f"{self.delinquency_col}_secondary"
        for priority in self._observed_cycles(data, _MERGED):
            grouped = self._group_counts(data, _MERGED, priority, secondary)
            # Binary priority k occupies row max_delq + k.
            write_capped_counts(counts, self.max_delq + priority, grouped, secondary, self.max_delq)

    def _accumulate_binary_to_binary(self, counts: np.ndarray, data: pl.DataFrame) -> None:
        """Accounts whose binary indicator was set in both months."""
        data = data.filter((pl.col(_MERGED) > 0) & (pl.col(_MERGED_SECONDARY) > 0))
        for priority in self._observed_cycles(data, _MERGED):
            grouped = self._group_counts(data, _MERGED, priority, _MERGED_SECONDARY)
            self._write_binary_secondary(counts, self.max_delq + priority, grouped)

    # ----- low-level helpers ---------------------------------------------

    @staticmethod
    def _observed_cycles(data: pl.DataFrame, col: str) -> range:
        # Inclusive range over the observed values of ``col``; empty frames
        # yield an empty range so callers can loop unconditionally.
        if data.height == 0:
            return range(0)
        return range(int(data[col].min()), int(data[col].max()) + 1)

    @staticmethod
    def _group_counts(
        data: pl.DataFrame, primary: str, primary_value: int, secondary: str
    ) -> pl.DataFrame:
        # Two-column result: (primary, secondary) plus a "len" count column.
        return (
            data.filter(pl.col(primary) == primary_value)
            .group_by([primary, secondary])
            .len()
            .sort(secondary)
        )

    def _write_binary_secondary(self, counts: np.ndarray, row: int, grouped: pl.DataFrame) -> None:
        """Apply counts where the secondary axis is a binary-priority column.

        Each priority ``k`` maps directly to column ``max_delq + k`` with no
        capping (unlike the delinquency axis, the priority values are exact).
        """
        if grouped.height == 0:
            return
        cols = grouped[_MERGED_SECONDARY].to_numpy() + self.max_delq
        counts[row, cols] += grouped["len"].to_numpy()
222
+
223
+
224
+ __all__ = ("MOMRollRateTable",)
@@ -0,0 +1,284 @@
1
+ """Snapshot roll rate table over observation and performance windows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Sequence
6
+ from dataclasses import dataclass
7
+
8
+ import numpy as np
9
+ import polars as pl
10
+ import polars.selectors as cs
11
+
12
+ from ._common import (
13
+ LazySource,
14
+ cycle_row_tags,
15
+ labeled_matrix,
16
+ load_lazy,
17
+ reduce_matrix,
18
+ write_capped_counts,
19
+ )
20
+
21
+ _OBS_MAX = "obs_max_delq"
22
+ _PERF_MAX = "perf_max_delq"
23
+
24
+
25
@dataclass(frozen=True)
class _RowSpec:
    """One matrix row: its display label and its stable-diagonal column index."""

    # Row label shown in the from_state column of the output matrix.
    label: str
    # Index (0-based, among the numeric columns) that counts as "stable" for
    # this row when the matrix is reduced; detailed sub-rows of the same
    # delinquency level share one level value.
    level: int
29
+
30
+
31
class SnapshotRollRateTable:
    """Roll rate table for a snapshot month with observation and performance windows.

    For every account in the snapshot, the observation window is reduced to its
    maximum delinquency across the supplied observation files, and similarly for
    the performance window. The resulting transition matrix has rows indexed by
    the observation max-delinquency and columns indexed by the performance
    max-delinquency.

    Parameters
    ----------
    snapshot:
        Data for the snapshot month (defines the account universe). Accepts a
        polars ``LazyFrame``/``DataFrame`` or a path/string pointing to a CSV.
    observation:
        Sequence of frames or paths forming the observation window.
    performance:
        Sequence of frames or paths forming the performance window.
    unique_key_col:
        Name of the account identifier column. Must exist in every input.
    delinquency_col:
        Name of the delinquency column. Must exist in every observation and
        performance frame.
    max_delq:
        Largest delinquency level kept as its own row/column. Anything above
        rolls into the ``N+`` bucket.
    detailed:
        Split delinquency levels 3 and 4 into ``granularity`` sub-rows showing
        how many times the account hit that level during the observation window.
    granularity:
        Number of sub-rows per detailed level. Must be ≥ 2 when ``detailed``.
    keep_cols:
        Optional column whitelist applied to each observation/performance frame
        before joining (memory optimisation). Must include ``delinquency_col``.

    Use
    ---
    >>> table = SnapshotRollRateTable(
    ...     "snap.csv",
    ...     ["obs1.csv", "obs2.csv"],
    ...     ["perf1.csv", "perf2.csv"],
    ...     unique_key_col="id",
    ...     delinquency_col="delq",
    ...     detailed=True,
    ...     granularity=2,
    ... )
    >>> matrix = table.compute()  # polars.DataFrame, the full transition matrix
    >>> reduced = table.reduce()  # polars.DataFrame, roll_down / stable / roll_up

    ``compute`` and ``reduce`` are idempotent; the matrix is cached after the
    first call.
    """

    def __init__(
        self,
        snapshot: LazySource,
        observation: Sequence[LazySource],
        performance: Sequence[LazySource],
        *,
        unique_key_col: str,
        delinquency_col: str,
        max_delq: int = 6,
        detailed: bool = False,
        granularity: int = 1,
        keep_cols: Sequence[str] | None = None,
    ) -> None:
        if max_delq < 1:
            raise ValueError("max_delq must be >= 1.")
        if granularity < 1:
            raise ValueError("granularity must be >= 1.")
        if detailed and granularity < 2:
            raise ValueError("granularity must be >= 2 when detailed=True.")

        # Materialise the sequences so generators can't be exhausted twice
        # and emptiness can be checked up front.
        observation = list(observation)
        performance = list(performance)
        if not observation:
            raise ValueError("at least one observation frame is required.")
        if not performance:
            raise ValueError("at least one performance frame is required.")

        if keep_cols is not None:
            keep_cols = tuple(keep_cols)
            if delinquency_col not in keep_cols:
                raise ValueError(
                    "keep_cols must include the delinquency_col so that it survives projection."
                )

        self._snapshot_source = snapshot
        self._observation_sources = observation
        self._performance_sources = performance
        self.unique_key_col = unique_key_col
        self.delinquency_col = delinquency_col
        self.max_delq = max_delq
        self.detailed = detailed
        # granularity only matters in detailed mode; normalise to 1 otherwise
        # so extra_rows and _row_index stay consistent.
        self.granularity = granularity if detailed else 1
        self.keep_cols = keep_cols

        self._row_specs = self._build_row_specs()
        self.row_tags = [s.label for s in self._row_specs]
        self.column_tags = cycle_row_tags(max_delq)
        self._matrix: pl.DataFrame | None = None

    @property
    def extra_rows(self) -> int:
        """Number of additional rows beyond ``max_delq + 1`` due to detailed mode."""
        # Levels 3 and 4 each expand from 1 row to `granularity` rows.
        return 2 * (self.granularity - 1) if self.detailed else 0

    @property
    def matrix(self) -> pl.DataFrame:
        """Return the cached transition matrix, computing it on first access."""
        if self._matrix is None:
            self.compute()
        assert self._matrix is not None
        return self._matrix

    def compute(self) -> pl.DataFrame:
        """Compute the transition matrix and return it as a polars DataFrame."""
        # Rectangular in detailed mode: extra observation rows, but the
        # performance axis always has max_delq + 1 columns.
        n_rows = self.max_delq + 1 + self.extra_rows
        n_cols = self.max_delq + 1
        counts = np.zeros((n_rows, n_cols), dtype=np.int64)

        data = self._build_joined().collect()
        if data.height > 0:
            cycles = range(int(data[_OBS_MAX].min()), int(data[_OBS_MAX].max()) + 1)
            for cycle in cycles:
                self._accumulate_cycle(counts, data, cycle)

        self._matrix = labeled_matrix(counts, self.row_tags, self.column_tags)
        return self._matrix

    def reduce(self, percentages: bool = True) -> pl.DataFrame:
        """Return roll_down / stable / roll_up per row, in percentages or counts."""
        # Detailed sub-rows of the same level share one diagonal column, so
        # the diagonal comes from the row specs rather than the row index.
        diag_cols = [spec.level for spec in self._row_specs]
        return reduce_matrix(self.matrix, diag_cols, percentages=percentages)

    # ----- row layout -----------------------------------------------------

    def _build_row_specs(self) -> list[_RowSpec]:
        # Row order: levels 0..max_delq-1 (levels 3 and 4 expanded into
        # granularity sub-rows when detailed), then the open-ended N+ row.
        specs: list[_RowSpec] = []
        for i in range(self.max_delq):
            if self.detailed and i in (3, 4):
                for j in range(1, self.granularity):
                    specs.append(_RowSpec(f"{i}x{j}_cycle_delinquent", i))
                # Last sub-row is open-ended: "hit level i granularity+ times".
                specs.append(_RowSpec(f"{i}x{self.granularity}+_cycle_delinquent", i))
            else:
                specs.append(_RowSpec(f"{i}_cycle_delinquent", i))
        specs.append(_RowSpec(f"{self.max_delq}+_cycle_delinquent", self.max_delq))
        return specs

    def _row_index(self, cycle: int, rank: int = 1) -> int:
        """Return the matrix row index for ``(cycle, rank)``.

        ``rank`` is only meaningful when ``detailed`` is on and ``cycle`` is 3 or 4.
        """
        if cycle >= self.max_delq:
            # The N+ row is always last.
            return self.max_delq + self.extra_rows
        if self.detailed and cycle in (3, 4):
            # Level 3 sub-rows start right after rows 0-2; level 4 sub-rows
            # start after level 3's `granularity` sub-rows.
            base = 3 if cycle == 3 else 3 + self.granularity
            return base + rank - 1
        if self.detailed and cycle >= 5:
            # Levels above 4 are shifted down by the inserted sub-rows.
            return cycle + self.extra_rows
        return cycle

    # ----- pipeline -------------------------------------------------------

    def _build_joined(self) -> pl.LazyFrame:
        """Build the merged frame of ``(unique_key, obs_max_delq, perf_max_delq, …)``."""
        snapshot = load_lazy(self._snapshot_source).select([self.unique_key_col])
        obs = self._build_window(snapshot, self._observation_sources, _OBS_MAX, "obs")
        if self.detailed:
            # Count how many observation-window columns hit exactly level 3
            # and exactly level 4, per account, for the detailed sub-rows.
            obs = obs.with_columns(
                [
                    pl.sum_horizontal(cs.starts_with(self.delinquency_col) == 3).alias(
                        "obs_times_3_cycle"
                    ),
                    pl.sum_horizontal(cs.starts_with(self.delinquency_col) == 4).alias(
                        "obs_times_4_cycle"
                    ),
                ]
            )
        perf = self._build_window(snapshot, self._performance_sources, _PERF_MAX, "perf")

        joined = obs.join(perf, how="left", on=self.unique_key_col, suffix="_perfwin")
        keep = [self.unique_key_col, _OBS_MAX, _PERF_MAX]
        if self.detailed:
            keep = [
                self.unique_key_col,
                _OBS_MAX,
                "obs_times_3_cycle",
                "obs_times_4_cycle",
                _PERF_MAX,
            ]
        return joined.select(keep)

    def _build_window(
        self,
        snapshot: pl.LazyFrame,
        sources: Sequence[LazySource],
        max_alias: str,
        suffix_tag: str,
    ) -> pl.LazyFrame:
        """Join each window file into ``snapshot`` and reduce to one max-delq column."""
        result = snapshot
        for i, src in enumerate(sources):
            frame = load_lazy(src)
            if self.keep_cols is not None:
                frame = frame.select([self.unique_key_col, *self.keep_cols])
            # Unique suffix per file keeps the repeated delinquency columns
            # distinguishable (delq, delq_obs1, delq_obs2, …).
            result = result.join(
                frame,
                how="left",
                on=self.unique_key_col,
                suffix=f"_{suffix_tag}{i}",
            )
        # Row-wise max across every delinquency-named column in the window.
        return result.with_columns(
            pl.max_horizontal(cs.starts_with(self.delinquency_col)).alias(max_alias)
        )

    # ----- accumulation per cycle ----------------------------------------

    def _accumulate_cycle(self, counts: np.ndarray, data: pl.DataFrame, cycle: int) -> None:
        rows = data.filter(pl.col(_OBS_MAX) == cycle)
        if rows.height == 0:
            return

        if self.detailed and cycle in (3, 4):
            self._accumulate_detailed(counts, rows, cycle)
            return

        grouped = rows.group_by([_OBS_MAX, _PERF_MAX]).len().sort(_PERF_MAX)
        write_capped_counts(counts, self._row_index(cycle), grouped, _PERF_MAX, self.max_delq)

    def _accumulate_detailed(self, counts: np.ndarray, rows: pl.DataFrame, cycle: int) -> None:
        times_col = f"obs_times_{cycle}_cycle"
        grouped = (
            rows.filter(pl.col(times_col) >= 1)
            .group_by([times_col, _PERF_MAX])
            .len()
            .sort([times_col, _PERF_MAX])
        )
        for rank in range(1, self.granularity + 1):
            if rank < self.granularity:
                # Exact hit-count sub-row.
                sub = grouped.filter(pl.col(times_col) == rank)
            else:
                # Final sub-row is open-ended (rank or more hits), so the
                # per-count groups must be re-aggregated per perf level.
                sub = (
                    grouped.filter(pl.col(times_col) >= rank)
                    .group_by(_PERF_MAX)
                    .agg(pl.col("len").sum())
                    .sort(_PERF_MAX)
                )
            row_idx = self._row_index(cycle, rank)
            write_capped_counts(counts, row_idx, sub, _PERF_MAX, self.max_delq)
282
+
283
+
284
+ __all__ = ("SnapshotRollRateTable",)
@@ -0,0 +1,122 @@
1
+ Metadata-Version: 2.4
2
+ Name: roll-rate-analysis
3
+ Version: 0.2.0
4
+ Summary: Roll rate analysis for credit risk scorecards.
5
+ Project-URL: Source, https://github.com/alexliap/roll_rate_analysis
6
+ Author-email: Alexandros Liapatis <alexandrosliapatis@gmail.com>
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Software Development :: Libraries
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: numpy
21
+ Requires-Dist: polars>=1.0
22
+ Description-Content-Type: text/markdown
23
+
24
+ # Roll Rate Analysis
25
+
26
+ ![deploy on pypi](https://github.com/alexliap/roll_rate_analysis/actions/workflows/publish-package.yaml/badge.svg)
27
+ ![PyPI Version](https://img.shields.io/pypi/v/roll-rate-analysis?label=pypi%20package)
28
+ ![Downloads](https://static.pepy.tech/badge/roll-rate-analysis)
29
+
30
+ Roll rate analysis is a credit-risk technique used to define the target variable when building Application or Behavioural scorecards. It's an iterative process — this package parametrises the moving parts so each iteration is a few lines of code rather than a fresh notebook.
31
+
32
+ The library has zero pandas dependency: inputs and outputs are [Polars](https://pola.rs/) frames.
33
+
34
+ ## Installation
35
+
36
+ From PyPI:
37
+
38
+ ```bash
39
+ uv add roll-rate-analysis # uv projects
40
+ pip install roll-rate-analysis # plain pip
41
+ ```
42
+
43
+ Requires Python 3.10 or newer.
44
+
45
+ ## What's in the box
46
+
47
+ Two classes, one method each:
48
+
49
+ | Class | Use case |
50
+ | --- | --- |
51
+ | `MOMRollRateTable` | Transition matrix between two consecutive months. |
52
+ | `SnapshotRollRateTable` | Transition matrix between an observation window and a performance window around a snapshot month. |
53
+
54
+ Both expose `compute()` (full transition matrix) and `reduce()` (roll_down / stable / roll_up summary). Both return polars `DataFrame`s whose first column (`from_state`) holds the row label.
55
+
56
+ ## Quick start
57
+
58
+ ```python
59
+ from roll_rate_analysis import MOMRollRateTable
60
+
61
+ table = MOMRollRateTable(
62
+ "data/jan.csv",
63
+ "data/feb.csv",
64
+ unique_key_col="id",
65
+ delinquency_col="delq",
66
+ max_delq=6,
67
+ )
68
+
69
+ table.compute() # polars.DataFrame, full transition matrix
70
+ table.reduce() # polars.DataFrame, roll_down / stable / roll_up percentages
71
+ ```
72
+
73
+ In-memory polars frames work too:
74
+
75
+ ```python
76
+ import polars as pl
77
+ from roll_rate_analysis import SnapshotRollRateTable
78
+
79
+ snap = pl.read_csv("data/snap.csv")
80
+ obs = [pl.scan_csv(p) for p in ["data/obs1.csv", "data/obs2.csv"]]
81
+ perf = [pl.scan_csv(p) for p in ["data/perf1.csv", "data/perf2.csv"]]
82
+
83
+ table = SnapshotRollRateTable(
84
+ snap, obs, perf,
85
+ unique_key_col="id",
86
+ delinquency_col="delq",
87
+ detailed=True,
88
+ granularity=2,
89
+ )
90
+ table.compute()
91
+ ```
92
+
93
+ See the notebooks under [`examples/`](examples/) for end-to-end walkthroughs.
94
+
95
+ ## Development
96
+
97
+ This project uses [uv](https://docs.astral.sh/uv/). Clone and bootstrap with:
98
+
99
+ ```bash
100
+ git clone https://github.com/alexliap/roll_rate_analysis.git
101
+ cd roll_rate_analysis
102
+ uv sync --dev
103
+ ```
104
+
105
+ Run the test suite, linter, and formatter:
106
+
107
+ ```bash
108
+ uv run pytest
109
+ uv run ruff check .
110
+ uv run ruff format .
111
+ ```
112
+
113
+ Pre-commit hooks (ruff + standard checks) keep the tree clean:
114
+
115
+ ```bash
116
+ uv run pre-commit install
117
+ uv run pre-commit run --all-files
118
+ ```
119
+
120
+ ## License
121
+
122
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,8 @@
1
+ roll_rate_analysis/__init__.py,sha256=vqAO1X2YzXHZqy-ermpbB_nC2kAyKULUnHV_EA0GDI8,383
2
+ roll_rate_analysis/_common.py,sha256=WoB-_4AD-atQBrWqoUraduz6jlDNFNueXkmswReLG8w,5012
3
+ roll_rate_analysis/mom.py,sha256=o3paqctuZ4uCxuXpSNk-0oQ6APE35zV5jUfA1sKAZp4,9136
4
+ roll_rate_analysis/snapshot.py,sha256=jmzagupDkNupzcbyPOuBZPUuWJI9kSNBgl_HF1EKxhY,10694
5
+ roll_rate_analysis-0.2.0.dist-info/METADATA,sha256=o6-fv71lgd0ixudDGDDtrN9QZzQTPW_sjTeSIRAqI8Y,3638
6
+ roll_rate_analysis-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
7
+ roll_rate_analysis-0.2.0.dist-info/licenses/LICENSE,sha256=mlX4UXWKb_RtCmuAP6Rs_XY9s1XZyUuZB8AxsDJnVRQ,1076
8
+ roll_rate_analysis-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Alexandros Liapatis
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.