datasci-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
"""Public API for datasci-toolkit: re-exports the package's main classes and functions."""

from datasci_toolkit.bin_editor import BinEditor
from datasci_toolkit.bin_editor_widget import BinEditorWidget
from datasci_toolkit.grouping import StabilityGrouping, WOETransformer
from datasci_toolkit.metrics import BootstrapGini, feature_power, gini, gini_by_period, iv, ks, lift, lift_by_period, plot_metric_by_period
from datasci_toolkit.model_selection import AUCStepwiseLogit
from datasci_toolkit.variable_clustering import CorrVarClus
from datasci_toolkit.label_imputation import KNNLabelImputer, TargetImputer
from datasci_toolkit.stability import ESI, PSI, StabilityMonitor, plot_psi_comparison, psi_hist

# Explicit public surface for ``from datasci_toolkit import *``.
__all__ = [
    # stability / population-shift metrics
    "PSI",
    "ESI",
    "StabilityMonitor",
    "plot_psi_comparison",
    "psi_hist",
    # binning / WOE transformation
    "WOETransformer",
    "StabilityGrouping",
    # model selection
    "AUCStepwiseLogit",
    # predictive-power metrics
    "gini",
    "ks",
    "lift",
    "iv",
    "BootstrapGini",
    "feature_power",
    # label imputation
    "TargetImputer",
    "KNNLabelImputer",
    # interactive bin editing
    "BinEditor",
    "BinEditorWidget",
    # variable clustering
    "CorrVarClus",
    # metrics over time periods
    "gini_by_period",
    "lift_by_period",
    "plot_metric_by_period",
]
@@ -0,0 +1,373 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ from dataclasses import dataclass
5
+ from enum import Enum
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ import polars as pl
10
+
11
+ from datasci_toolkit.grouping import _rsi
12
+
13
# Additive (Laplace-style) smoothing applied to per-bin event/non-event counts
# in `_bin_stats`, so the WOE logs stay finite for empty or single-class bins.
_SMOOTH = 0.5
14
+
15
+
16
class FeatureDtype(str, Enum):
    """Feature kind tag; the str values match the ``dtype`` strings stored in bin specs."""

    NUMERIC = "float"
    CATEGORICAL = "category"
19
+
20
+
21
@dataclass(frozen=True)
class BinStats:
    """Immutable per-bin statistics; arrays have ``n_bins + 1`` slots (last = NaN/other bin)."""

    counts: np.ndarray       # weighted sample count per bin
    event_rates: np.ndarray  # weighted event rate per bin (NaN for empty bins)
    woe: np.ndarray          # smoothed weight-of-evidence per bin
    iv: float                # information value, summed over the regular (non-NaN) bins only
27
+
28
+
29
@dataclass(frozen=True)
class TemporalStats:
    """Per-time-period stability statistics for one feature's binning."""

    months: list[Any]                      # sorted unique time-period values
    rsi: float                             # stability score from `_rsi`, rounded to 4 decimals
    event_rates: list[list[float | None]]  # [bin][month] event rate; None when the bin is empty
    pop_shares: list[list[float]]          # [bin][month] population share within the month
35
+
36
+
37
@dataclass
class FeatureState:
    """Mutable snapshot of one feature's current binning and its statistics."""

    feature: str                      # feature name
    dtype: FeatureDtype               # NUMERIC or CATEGORICAL
    n_bins: int                       # number of regular bins (excludes the NaN/other bin)
    bins: list[str] | dict[str, int]  # interval labels (numeric) or category -> group map (categorical)
    counts: list[float]               # weighted counts, one per bin incl. the NaN/other bin
    event_rates: list[float | None]   # rounded event rate per bin; None for empty bins
    woe: list[float]                  # rounded weight-of-evidence per bin
    iv: float                         # rounded information value
    splits: list[float] | None = None           # numeric features only: current split points
    groups: dict[int, list[str]] | None = None  # categorical only: group -> sorted categories
    temporal: TemporalStats | None = None       # filled by BinEditor.state() when time data exists
50
+
51
+
52
def _bin_stats(target: np.ndarray, weights: np.ndarray, assignments: np.ndarray, n_bins: int) -> BinStats:
    """Weighted bin statistics: counts, event rates, smoothed WOE, and IV.

    ``assignments`` holds per-sample bin indices in ``0..n_bins``, where index
    ``n_bins`` is the NaN/missing bin. All outputs have ``n_bins + 1`` slots;
    the IV sums over the regular bins only. ``_SMOOTH`` keeps the WOE logs
    finite for empty or single-class bins.
    """
    total_events = float((target * weights).sum())
    total_nonevents = float(((1.0 - target) * weights).sum())
    weighted_events = target * weights
    counts = np.bincount(assignments, weights=weights, minlength=n_bins + 1).astype(float)
    events = np.bincount(assignments, weights=weighted_events, minlength=n_bins + 1).astype(float)
    nonevents = counts - events
    rates = np.where(counts > 0, events / counts, np.nan)

    # Smoothed event / non-event distributions over the regular bins.
    event_share = (events[:n_bins] + _SMOOTH) / (total_events + _SMOOTH * n_bins)
    nonevent_share = (nonevents[:n_bins] + _SMOOTH) / (total_nonevents + _SMOOTH * n_bins)
    woe_regular = np.log(event_share / nonevent_share)
    information_value = float(((event_share - nonevent_share) * woe_regular).sum())

    # The NaN bin gets its own single-pseudo-bin smoothing.
    nan_event_share = (events[n_bins] + _SMOOTH) / (total_events + _SMOOTH)
    nan_nonevent_share = (nonevents[n_bins] + _SMOOTH) / (total_nonevents + _SMOOTH)
    nan_woe = float(np.log(nan_event_share / nan_nonevent_share))

    return BinStats(counts=counts, event_rates=rates, woe=np.append(woe_regular, nan_woe), iv=information_value)
72
+
73
+
74
def _temporal_stats(
    target: np.ndarray,
    weights: np.ndarray,
    assignments: np.ndarray,
    n_bins: int,
    time_periods: np.ndarray,
    threshold: float,
) -> TemporalStats:
    """Per-period event rates / population shares per bin plus an RSI score.

    Each month's samples are re-scored with `_bin_stats`; rates are rounded to
    6 decimals and empty bins recorded as ``None``. Every (bin, month) point
    with an observed rate feeds `_rsi`; with fewer than two such points the
    RSI defaults to 1.0.
    """
    months = np.sort(np.unique(time_periods))
    rates_by_bin: list[list[float | None]] = [[] for _ in range(n_bins)]
    shares_by_bin: list[list[float]] = [[] for _ in range(n_bins)]

    for month in months:
        in_month = time_periods == month
        month_stats = _bin_stats(target[in_month], weights[in_month], assignments[in_month], n_bins)
        # Guard against a month whose regular bins are all empty: avoid /0.
        denom = float(month_stats.counts[:n_bins].sum()) or 1.0
        for b in range(n_bins):
            rate = month_stats.event_rates[b]
            rates_by_bin[b].append(None if np.isnan(rate) else round(float(rate), 6))
            shares_by_bin[b].append(round(float(month_stats.counts[b] / denom), 6))

    # Flatten the observed (bin, rate, month) points in month-major order for the RSI fit.
    points = [
        (float(b), rates_by_bin[b][m_idx], month)
        for m_idx, month in enumerate(months)
        for b in range(n_bins)
        if rates_by_bin[b][m_idx] is not None
    ]
    if len(points) > 1:
        scores, rates, point_months = zip(*points)
        rsi = _rsi(np.array(scores), np.array(rates), np.array(point_months), threshold)
    else:
        rsi = 1.0

    return TemporalStats(
        months=months.tolist(),
        rsi=round(rsi, 4),
        event_rates=rates_by_bin,
        pop_shares=shares_by_bin,
    )
114
+
115
+
116
+ def _num_assign(values: np.ndarray, splits: list[float]) -> np.ndarray:
117
+ missing_mask = np.isnan(values)
118
+ assignments = np.digitize(values, splits)
119
+ assignments[missing_mask] = len(splits) + 1
120
+ return assignments
121
+
122
+
123
+ def _cat_assign(values: np.ndarray, category_bins: dict[str, int]) -> np.ndarray:
124
+ n_groups = max(category_bins.values()) + 1 if category_bins else 0
125
+ assignments = np.full(len(values), n_groups, dtype=np.intp)
126
+ for category, group in category_bins.items():
127
+ assignments[values == category] = group
128
+ return assignments
129
+
130
+
131
+ def _num_labels(splits: list[float]) -> list[str]:
132
+ if not splits:
133
+ return ["-inf to inf", "NaN"]
134
+ split_strs = [f"{v:.4g}" for v in splits]
135
+ return [f"-inf to {split_strs[0]}"] + [f"{split_strs[i]} to {split_strs[i+1]}" for i in range(len(split_strs) - 1)] + [f"{split_strs[-1]} to inf", "NaN"]
136
+
137
+
138
def _num_state(feat: str, splits: list[float], values: np.ndarray, target: np.ndarray, weights: np.ndarray) -> FeatureState:
    """Build a FeatureState snapshot for a numeric feature under the given splits."""
    # k split points define k+1 regular bins (the NaN bin is tracked separately).
    bin_count = len(splits) + 1
    stats = _bin_stats(target, weights, _num_assign(values, splits), bin_count)
    rounded_rates = [None if np.isnan(rate) else round(float(rate), 6) for rate in stats.event_rates]
    rounded_woe = [round(float(w), 6) for w in stats.woe]
    return FeatureState(
        feature=feat,
        dtype=FeatureDtype.NUMERIC,
        n_bins=bin_count,
        splits=list(splits),
        bins=_num_labels(splits),
        counts=stats.counts.tolist(),
        event_rates=rounded_rates,
        woe=rounded_woe,
        iv=round(stats.iv, 6),
    )
152
+
153
+
154
def _cat_state(feat: str, category_bins: dict[str, int], values: np.ndarray, target: np.ndarray, weights: np.ndarray) -> FeatureState:
    """Build a FeatureState snapshot for a categorical feature under the given grouping."""
    group_count = max(category_bins.values()) + 1 if category_bins else 0
    stats = _bin_stats(target, weights, _cat_assign(values, category_bins), group_count)
    # Invert the category -> group map into group -> [categories].
    members: dict[int, list[str]] = {}
    for category, group in category_bins.items():
        members.setdefault(group, []).append(str(category))
    return FeatureState(
        feature=feat,
        dtype=FeatureDtype.CATEGORICAL,
        n_bins=group_count,
        groups={group: sorted(cats) for group, cats in members.items()},
        bins=dict(category_bins),
        counts=stats.counts.tolist(),
        event_rates=[None if np.isnan(rate) else round(float(rate), 6) for rate in stats.event_rates],
        woe=[round(float(w), 6) for w in stats.woe],
        iv=round(stats.iv, 6),
    )
171
+
172
+
173
class BinEditor:
    """Headless state machine for editing bin boundaries.

    Works identically in plain Python scripts, notebooks, and agents. All
    edits are logged per feature with undo support. Call `accept()` to export
    the final bin specs dict for use with `WOETransformer`.

    Args:
        bin_specs: Initial bin specifications — a dict produced by
            `StabilityGrouping.bin_specs_` or built manually.
        features: Feature DataFrame matching the features in ``bin_specs``.
        target: Binary target series (0/1 or float).
        time_periods: Optional time series for temporal stability metrics.
        weights: Optional sample weight series.
        stability_threshold: RSI threshold used to flag unstable bins in the
            state dict (does not block edits).

    Note:
        All state is accessible via `state(feat)`, which returns a `FeatureState`
        dataclass with attributes ``bins``, ``n_bins``, ``counts``, ``event_rates``,
        ``woe``, ``iv``, ``dtype``, ``groups``, and ``temporal``.
    """

    def __init__(
        self,
        bin_specs: dict[str, dict[str, Any]],
        features: pl.DataFrame,
        target: pl.Series,
        time_periods: pl.Series | None = None,
        weights: pl.Series | None = None,
        stability_threshold: float = 0.1,
    ) -> None:
        self._targets = target.cast(pl.Float64).to_numpy()
        # Default to unit weights when no sample weights are supplied.
        self._weights = weights.cast(pl.Float64).to_numpy() if weights is not None else np.ones(len(self._targets))
        self._time: np.ndarray | None = time_periods.to_numpy() if time_periods is not None else None
        self._threshold = stability_threshold
        self._x: dict[str, np.ndarray] = {}  # feature name -> raw value array
        self._splits: dict[str, list[float]] = {}  # numeric features: current finite split points
        self._cat_bins: dict[str, dict[str, int]] = {}  # categorical features: category -> group index
        self._history: dict[str, list[tuple[str, Any]]] = {}  # per-feature undo stack (oldest first)
        self._orig: dict[str, dict[str, Any]] = {}  # original specs, used by reset()

        for feat, spec in bin_specs.items():
            # Silently skip specs for features absent from the DataFrame.
            if feat not in features.columns:
                continue
            self._orig[feat] = spec
            self._history[feat] = []
            if spec["dtype"] == FeatureDtype.NUMERIC:
                self._x[feat] = features[feat].cast(pl.Float64).to_numpy()
                # Keep only the finite interior edges; drop the -inf/inf sentinels.
                self._splits[feat] = [float(s) for s in spec["bins"][1:-1] if np.isfinite(s)]
            else:
                self._x[feat] = features[feat].cast(pl.Utf8).to_numpy().astype(str)
                self._cat_bins[feat] = {str(k): int(v) for k, v in spec["bins"].items()}

    def features(self) -> list[str]:
        """Return all editable feature names (numeric first, then categorical)."""
        return list(self._splits.keys()) + list(self._cat_bins.keys())

    def _base_state(self, feat: str) -> FeatureState:
        # Build the non-temporal part of the state for ``feat``.
        if feat in self._splits:
            return _num_state(feat, self._splits[feat], self._x[feat], self._targets, self._weights)
        return _cat_state(feat, self._cat_bins[feat], self._x[feat], self._targets, self._weights)

    def _assignments(self, feat: str) -> np.ndarray:
        # Per-sample bin indices under the feature's current binning.
        if feat in self._splits:
            return _num_assign(self._x[feat], self._splits[feat])
        return _cat_assign(self._x[feat], self._cat_bins[feat])

    def state(self, feat: str) -> FeatureState:
        """Return the current `FeatureState` for ``feat``, with temporal stats when time data exists."""
        s = self._base_state(feat)
        if self._time is not None:
            s.temporal = _temporal_stats(
                self._targets, self._weights, self._assignments(feat), s.n_bins, self._time, self._threshold
            )
        return s

    def _push(self, feat: str) -> None:
        # Snapshot the current binning onto the undo stack before a mutation.
        if feat in self._splits:
            self._history[feat].append(("splits", list(self._splits[feat])))
        else:
            self._history[feat].append(("cat", copy.deepcopy(self._cat_bins[feat])))

    def split(self, feat: str, value: float) -> FeatureState:
        """Insert a new split point for a numeric feature; no-op if it already exists."""
        if value in self._splits[feat]:
            return self.state(feat)
        self._push(feat)
        self._splits[feat] = sorted(self._splits[feat] + [value])
        return self.state(feat)

    def merge(self, feat: str, bin_idx: int) -> FeatureState:
        """Merge bin ``bin_idx`` with its right neighbour; no-op for out-of-range indices."""
        if feat in self._splits:
            splits = self._splits[feat]
            if bin_idx >= len(splits):
                return self.state(feat)
            self._push(feat)
            # Removing split ``bin_idx`` joins bins ``bin_idx`` and ``bin_idx + 1``.
            self._splits[feat] = [s for i, s in enumerate(splits) if i != bin_idx]
        else:
            cat_bins = self._cat_bins[feat]
            n_groups = max(cat_bins.values()) + 1 if cat_bins else 0
            if bin_idx >= n_groups - 1:
                return self.state(feat)
            self._push(feat)
            # Fold group ``bin_idx + 1`` into ``bin_idx`` and shift higher groups down by one.
            self._cat_bins[feat] = {
                cat: (bin_idx if grp == bin_idx + 1 else (grp - 1 if grp > bin_idx + 1 else grp))
                for cat, grp in cat_bins.items()
            }
        return self.state(feat)

    def move_boundary(self, feat: str, bin_idx: int, new_value: float) -> FeatureState:
        """Move split ``bin_idx`` of a numeric feature to ``new_value``."""
        splits = self._splits[feat]
        if bin_idx >= len(splits):
            return self.state(feat)
        self._push(feat)
        new = list(splits)
        new[bin_idx] = new_value
        # De-duplicate: moving onto an existing split collapses the two bins.
        self._splits[feat] = sorted(set(new))
        return self.state(feat)

    def reset(self, feat: str) -> FeatureState:
        """Restore the feature's original binning and clear its undo history."""
        self._history[feat] = []
        spec = self._orig[feat]
        if spec["dtype"] == FeatureDtype.NUMERIC:
            self._splits[feat] = [float(s) for s in spec["bins"][1:-1] if np.isfinite(s)]
        else:
            self._cat_bins[feat] = {str(k): int(v) for k, v in spec["bins"].items()}
        return self.state(feat)

    def undo(self, feat: str) -> FeatureState:
        """Revert the most recent edit for ``feat``; no-op when history is empty."""
        if not self._history[feat]:
            return self.state(feat)
        kind, prev = self._history[feat].pop()
        if kind == "splits":
            self._splits[feat] = prev
        else:
            self._cat_bins[feat] = prev
        return self.state(feat)

    def history(self, feat: str) -> list[dict[str, Any]]:
        """Return the undo stack as ``{"type", "value"}`` dicts, oldest edit first."""
        return [{"type": k, "value": v} for k, v in self._history[feat]]

    def _suggest_num(self, feat: str, n_suggestions: int) -> list[float]:
        # Rank candidate splits (5th-95th percentile grid) by IV gain over the current binning.
        values = self._x[feat]
        x_valid = values[~np.isnan(values)]
        if len(x_valid) == 0:
            return []
        current = self._splits[feat]
        span = float(x_valid.max() - x_valid.min())
        # Reject candidates closer than 1% of the value span to an existing split.
        min_gap = span * 0.01
        candidates = [
            float(candidate) for candidate in np.unique(np.percentile(x_valid, np.linspace(5, 95, 40)))
            if all(abs(candidate - split) > min_gap for split in current)
        ]
        base_information_value = self._base_state(feat).iv
        # (iv_gain, candidate) pairs, best gain first.
        pairs: list[tuple[float, float]] = sorted(
            [
                (
                    _bin_stats(self._targets, self._weights, _num_assign(values, sorted(current + [candidate])), len(current) + 2).iv - base_information_value,
                    float(candidate),
                )
                for candidate in candidates
            ],
            reverse=True,
        )
        return [v for _, v in pairs[:n_suggestions]]

    def _suggest_cat(self, feat: str, n_suggestions: int) -> list[tuple[int, int]]:
        # Rank adjacent group pairs by the IV lost when merging them (smallest loss first).
        category_bins = self._cat_bins[feat]
        n_groups = max(category_bins.values()) + 1 if category_bins else 0
        if n_groups <= 1:
            return []
        values = self._x[feat]
        base_information_value = self._base_state(feat).iv
        pairs: list[tuple[float, tuple[int, int]]] = sorted(
            [
                (
                    base_information_value - _bin_stats(
                        self._targets, self._weights,
                        _cat_assign(values, {
                            category: (bin_idx if group == bin_idx + 1 else (group - 1 if group > bin_idx + 1 else group))
                            for category, group in category_bins.items()
                        }),
                        n_groups - 1,
                    ).iv,
                    (bin_idx, bin_idx + 1),
                )
                for bin_idx in range(n_groups - 1)
            ]
        )
        return [pair for _, pair in pairs[:n_suggestions]]

    def suggest_splits(self, feat: str, n: int = 5) -> list:  # type: ignore[type-arg]
        """Suggest edits: new split values (numeric) or group pairs to merge (categorical)."""
        if feat in self._splits:
            return self._suggest_num(feat, n)
        return self._suggest_cat(feat, n)

    def accept(self) -> dict[str, dict[str, Any]]:
        """Export final bin specs for all features, keyed by feature name."""
        return {feat: self.accept_feature(feat) for feat in self.features()}

    def accept_feature(self, feat: str) -> dict[str, Any]:
        """Export the final bin spec for one feature, re-adding the -inf/inf edge sentinels."""
        if feat in self._splits:
            return {"dtype": "float", "bins": [-np.inf] + self._splits[feat] + [np.inf]}
        return {"dtype": "category", "bins": dict(self._cat_bins[feat])}