rejectkit 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,69 @@
1
+ """Augmentation-based reject inference."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.base import BaseEstimator
7
+
8
+ from ..base import ArrayTriple, BaseRejectInferencer
9
+
10
+
11
class SimpleAugmentation(BaseRejectInferencer):
    """Reject inference by hard-cutoff augmentation.

    A good/bad model fitted on the accepts scores every reject, and the score
    is turned into a definite 0/1 label by comparing P(bad) with
    ``threshold``. Accepts and the freshly labelled rejects are then stacked
    into one training sample in which every row has weight 1. The method is
    easy to explain, but its outcome hinges on ``threshold`` and the hard
    labels overstate how certain the model really is.

    Parameters
    ----------
    base_estimator : BaseEstimator, optional
        Forwarded unchanged to ``BaseRejectInferencer``.
    threshold : float, default=0.5
        A reject is labelled bad (1) when ``P(bad) >= threshold`` and good
        (0) otherwise.
    """

    def __init__(self, base_estimator: BaseEstimator | None = None, threshold: float = 0.5):
        super().__init__(base_estimator=base_estimator)
        self.threshold = threshold

    def resample(self) -> ArrayTriple:
        """Return ``(X, y, w)``: the accepts followed by hard-labelled rejects."""
        # Boolean comparison -> 1/0; a score exactly at the threshold is bad.
        reject_labels = (self._reject_bad_proba() >= self.threshold).astype(int)
        features = np.vstack((self.X_accept_, self.X_reject_))
        targets = np.concatenate((self.y_accept_, reject_labels))
        weights = np.ones(features.shape[0])
        return features, targets, weights
36
+
37
+
38
class FuzzyAugmentation(BaseRejectInferencer):
    """Fuzzy augmentation, also known as fuzzy parcelling.

    No hard label is chosen for a reject. Every reject is duplicated: one copy
    is labelled bad and weighted by the model's P(bad), the other is labelled
    good and weighted by P(good) = 1 - P(bad). This sidesteps the
    over-confidence of hard cutoffs and is generally the most stable of the
    augmentation methods.

    Parameters
    ----------
    base_estimator : BaseEstimator, optional
        Forwarded unchanged to ``BaseRejectInferencer``.
    reject_weight : float, default=1.0
        Global multiplier applied to the weight of every reject-derived row,
        useful to reflect the reject share of the through-the-door
        population.
    """

    def __init__(self, base_estimator: BaseEstimator | None = None, reject_weight: float = 1.0):
        super().__init__(base_estimator=base_estimator)
        self.reject_weight = reject_weight

    def resample(self) -> ArrayTriple:
        """Return ``(X, y, w)``: accepts, rejects-as-bad, then rejects-as-good."""
        bad_prob = self._reject_bad_proba()
        rejects = self.X_reject_
        reject_count = rejects.shape[0]

        # Row layout: [accepts | rejects labelled 1 | rejects labelled 0].
        features = np.vstack((self.X_accept_, rejects, rejects))
        targets = np.concatenate(
            (
                self.y_accept_,
                np.ones(reject_count, dtype=int),
                np.zeros(reject_count, dtype=int),
            )
        )
        # Accepts keep unit weight; each reject copy gets its scaled probability.
        weights = np.concatenate(
            (
                np.ones(self.X_accept_.shape[0]),
                self.reject_weight * bad_prob,
                self.reject_weight * (1.0 - bad_prob),
            )
        )
        return features, targets, weights
@@ -0,0 +1,47 @@
1
+ """Nearest-neighbour extrapolation ('twins') reject inference."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.base import BaseEstimator
7
+ from sklearn.neighbors import NearestNeighbors
8
+ from sklearn.preprocessing import StandardScaler
9
+
10
+ from ..base import ArrayTriple, BaseRejectInferencer
11
+
12
+
13
class Extrapolation(BaseRejectInferencer):
    """Nearest-neighbour label extrapolation (the 'twins' method).

    For every reject, the ``n_neighbors`` most similar accepts are looked up
    in standardised feature space, and the share of bads among them is taken
    as that reject's P(bad). As in fuzzy augmentation, each reject then
    appears twice in the output: once as bad and once as good, weighted by
    P(bad) and 1 - P(bad). No parametric model is assumed for the reject
    labels.

    Parameters
    ----------
    base_estimator : BaseEstimator, optional
        Forwarded unchanged to ``BaseRejectInferencer``.
    n_neighbors : int, default=10
        Number of accepts consulted per reject. It is capped at the number of
        available accepts when the neighbour index is built.
    """

    def __init__(self, base_estimator: BaseEstimator | None = None,
                 n_neighbors: int = 10):
        super().__init__(base_estimator=base_estimator)
        self.n_neighbors = n_neighbors

    def _fit(self) -> None:
        """Fit the scaler and the neighbour index on the accepts only."""
        accepts = self.X_accept_
        self.scaler_ = StandardScaler().fit(accepts)
        # A query cannot ask for more neighbours than there are accepts.
        n_used = min(self.n_neighbors, accepts.shape[0])
        self.nn_ = NearestNeighbors(n_neighbors=n_used)
        self.nn_.fit(self.scaler_.transform(accepts))

    def resample(self) -> ArrayTriple:
        """Return ``(X, y, w)``: accepts, rejects-as-bad, then rejects-as-good."""
        accepts, rejects = self.X_accept_, self.X_reject_
        accept_labels = self.y_accept_

        scaled_rejects = self.scaler_.transform(rejects)
        neighbour_idx = self.nn_.kneighbors(scaled_rejects, return_distance=False)
        # Mean label over each reject's neighbours = local bad rate.
        bad_prob = accept_labels[neighbour_idx].mean(axis=1).astype(float)

        reject_count = rejects.shape[0]
        features = np.vstack((accepts, rejects, rejects))
        targets = np.concatenate(
            (
                accept_labels,
                np.ones(reject_count, dtype=int),
                np.zeros(reject_count, dtype=int),
            )
        )
        weights = np.concatenate(
            (np.ones(accepts.shape[0]), bad_prob, 1.0 - bad_prob)
        )
        return features, targets, weights
@@ -0,0 +1,98 @@
1
+ """Heckman-style control-function correction for sample selection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+
7
+ import numpy as np
8
+ from sklearn.base import BaseEstimator, ClassifierMixin, clone
9
+ from sklearn.linear_model import LogisticRegression
10
+
11
+ from .._compat import to_numpy_1d, to_numpy_2d
12
+
13
# Element-wise error function built on ``math.erf``. ``otypes`` is pinned
# because ``np.vectorize`` otherwise infers the output type from a trial call
# on the first element and raises ValueError on size-0 input.
_erf = np.vectorize(math.erf, otypes=[float])
14
+
15
+
16
def _norm_pdf(x: np.ndarray) -> np.ndarray:
    """Evaluate the standard normal density element-wise at ``x``."""
    normaliser = np.sqrt(2.0 * np.pi)
    kernel = np.exp(-0.5 * x * x)
    return kernel / normaliser
18
+
19
+
20
def _norm_cdf(x: np.ndarray) -> np.ndarray:
    """Evaluate the standard normal CDF element-wise at ``x``.

    Uses ``0.5 * erfc(-x / sqrt(2))``. The algebraically equal form
    ``0.5 * (1 + erf(x / sqrt(2)))`` loses all significant digits in the lower
    tail, because ``erf`` approaches -1 and the sum cancels; it returns
    exactly 0.0 once ``x`` is below roughly -8.3. ``erfc`` evaluates the tail
    directly and stays accurate there.

    Parameters
    ----------
    x : np.ndarray
        Points at which to evaluate the CDF. Empty arrays are accepted.

    Returns
    -------
    np.ndarray
        Float values with the same shape as ``x``.
    """
    # otypes is pinned so that size-0 input is accepted by np.vectorize.
    erfc = np.vectorize(math.erfc, otypes=[float])
    # x / -sqrt(2) equals -(x / sqrt(2)) exactly and avoids a unary minus on x.
    return 0.5 * erfc(x / -np.sqrt(2.0))
22
+
23
+
24
class HeckmanClassifier(BaseEstimator, ClassifierMixin):
    """Heckman-style two-step control-function correction.

    Step 1 fits a *selection* model separating accepts from rejects and derives
    each applicant's inverse Mills ratio (IMR) from its selection score. Step 2
    trains the outcome model on the accepts with the IMR appended as an extra
    feature; the IMR term absorbs the selection bias. At prediction time the
    IMR is recomputed for new applicants and appended in the same way.

    Unlike the resampling methods, Heckman augments the *feature space* rather
    than the sample, so it is a standalone classifier rather than a
    ``BaseRejectInferencer``. The Gaussian-latent assumption is an
    approximation; treat it as a control-function heuristic.

    Parameters
    ----------
    selection_estimator : sklearn classifier, optional
        Separates accepts (1) from rejects (0). Defaults to
        ``LogisticRegression(max_iter=1000)``.
    outcome_estimator : sklearn classifier, optional
        Trained on accepts + IMR. Defaults to
        ``LogisticRegression(max_iter=1000)``.

    Attributes
    ----------
    selection_ : fitted selection model.
    outcome_ : fitted outcome model (input is ``X`` plus one IMR column).
    z_mean_, z_std_ : float
        Mean and standard deviation of the raw selection score over the
        accepts; used to standardise scores before computing the IMR.
    classes_ : class labels of the outcome model (``[0, 1]`` as fallback).
    n_features_in_ : int
        Number of columns in ``X_accept`` (the IMR column is not counted).
    """

    def __init__(self, selection_estimator: BaseEstimator | None = None,
                 outcome_estimator: BaseEstimator | None = None):
        self.selection_estimator = selection_estimator
        self.outcome_estimator = outcome_estimator

    def _sel(self):
        """Return a fresh, unfitted selection model."""
        if self.selection_estimator is not None:
            return clone(self.selection_estimator)
        return LogisticRegression(max_iter=1000)

    def _out(self):
        """Return a fresh, unfitted outcome model."""
        if self.outcome_estimator is not None:
            return clone(self.outcome_estimator)
        return LogisticRegression(max_iter=1000)

    def _raw_score(self, X):
        """Return the unstandardised selection score of ``X`` as a float array.

        This is the single source of the score used both when estimating
        ``z_mean_``/``z_std_`` in ``fit`` and when standardising in
        ``_latent``, so training-time and prediction-time IMRs cannot drift
        apart.
        """
        if hasattr(self.selection_, "decision_function"):
            z = self.selection_.decision_function(X)
        else:
            # No margin available: fall back to the log-odds of P(accept),
            # clipped away from 0 and 1 so the logit stays finite.
            p = np.clip(self.selection_.predict_proba(X)[:, 1], 1e-6, 1 - 1e-6)
            z = np.log(p / (1 - p))
        return np.asarray(z, dtype=float)

    def _latent(self, X):
        """Return the selection score standardised with the accept statistics."""
        return (self._raw_score(X) - self.z_mean_) / self.z_std_

    def _imr(self, X):
        """Return the inverse Mills ratio ``pdf(z) / cdf(z)`` for each row of ``X``."""
        z = self._latent(X)
        # The CDF is floored at 1e-6 to avoid division by ~0.
        # NOTE(review): below z of about -4.75 the floor is active, so the
        # ratio shrinks towards 0 while the exact IMR keeps growing (~ -z).
        # Confirm this damping of extreme reject-like applicants is intended.
        return _norm_pdf(z) / np.clip(_norm_cdf(z), 1e-6, None)

    def fit(self, X_accept, y_accept, X_reject):
        """Fit the selection model, then the outcome model on accepts + IMR.

        Parameters
        ----------
        X_accept : array-like
            Features of accepted applicants.
        y_accept : array-like
            Observed outcomes of the accepts; cast to ``int``.
        X_reject : array-like
            Features of rejected applicants (no outcomes).

        Returns
        -------
        self
        """
        Xa, Xr = to_numpy_2d(X_accept), to_numpy_2d(X_reject)
        ya = to_numpy_1d(y_accept).astype(int)

        # Step 1: selection model on the full population, accept=1 / reject=0.
        Xs = np.vstack([Xa, Xr])
        s = np.concatenate([np.ones(Xa.shape[0], dtype=int),
                            np.zeros(Xr.shape[0], dtype=int)])
        self.selection_ = self._sel().fit(Xs, s)

        # Standardisation constants come from the accepts only; the epsilon
        # keeps the later division finite when all scores coincide.
        z_raw = self._raw_score(Xa)
        self.z_mean_, self.z_std_ = float(z_raw.mean()), float(z_raw.std() + 1e-12)

        # Step 2: outcome model on the accepts with the IMR as extra column.
        imr = self._imr(Xa)
        self.outcome_ = self._out().fit(np.column_stack([Xa, imr]), ya)
        self.classes_ = getattr(self.outcome_, "classes_", np.array([0, 1]))
        self.n_features_in_ = Xa.shape[1]
        return self

    def predict_proba(self, X):
        """Return the outcome model's class probabilities for ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. ``NotFittedError`` derives from both
            ``ValueError`` and ``AttributeError``, so callers that caught the
            previous bare ``AttributeError`` keep working.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ["selection_", "outcome_"])
        X = to_numpy_2d(X)
        return self.outcome_.predict_proba(np.column_stack([X, self._imr(X)]))

    def predict(self, X):
        """Return the most probable class label for each row of ``X``."""
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
@@ -0,0 +1,87 @@
1
+ """Parcelling (score-band) reject inference."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.base import BaseEstimator
7
+
8
+ from ..base import ArrayTriple, BaseRejectInferencer
9
+
10
+
11
class Parcelling(BaseRejectInferencer):
    """Parcelling.

    Applicants are scored and split into ``n_bins`` score bands (quantiles of
    the accept score distribution). Within each band the observed accept bad
    rate is multiplied by ``uplift`` (>= 1) — encoding the assumption that
    rejects are worse risks than accepts with the same score — and used as the
    reject bad rate for that band.

    With ``assignment='expected'`` (default) each reject contributes two
    deterministically weighted rows (no randomness); with ``'random'`` a label
    is drawn from a Bernoulli with the band's reject bad rate.

    Parameters
    ----------
    base_estimator : BaseEstimator, optional
        Forwarded unchanged to ``BaseRejectInferencer``.
    n_bins : int, default=10
        Number of score bands requested. Duplicate quantile edges are merged,
        so fewer bands may actually be used.
    uplift : float, default=1.0
        Multiplier applied to each band's accept bad rate. ``1.0`` assumes
        rejects behave like accepts of the same score; ``> 1`` makes them
        worse. The product is clipped to ``[0, 1]``.
    assignment : {'expected', 'random'}, default='expected'
        How the band bad rate is turned into training rows (see above).
    random_state : int, optional
        Used only when ``assignment='random'``.
    """

    def __init__(
        self,
        base_estimator: BaseEstimator | None = None,
        n_bins: int = 10,
        uplift: float = 1.0,
        assignment: str = "expected",
        random_state: int | None = None,
    ):
        super().__init__(base_estimator=base_estimator)
        self.n_bins = n_bins
        self.uplift = uplift
        self.assignment = assignment
        self.random_state = random_state

    def _reject_bad_rate(self) -> np.ndarray:
        """Return the assumed P(bad) of every reject from its score band.

        Raises
        ------
        ValueError
            If ``n_bins`` is smaller than 1.
        """
        if self.n_bins < 1:
            raise ValueError("n_bins must be at least 1.")

        accept_score = self.scorer_.predict_proba(self.X_accept_)[:, 1]
        reject_score = self._reject_bad_proba()

        cuts = np.unique(np.quantile(accept_score, np.linspace(0, 1, self.n_bins + 1)))
        # Only the interior cut points matter: np.digitize already sends
        # everything below the first / above the last one to the outer bands.
        # Deriving the band count from the interior cuts also keeps it >= 1
        # when all quantiles coincide (constant scores); counting edges would
        # give 0 bands there and make the final lookup fail with IndexError.
        inner = cuts[1:-1]
        n_eff = inner.size + 1

        a_bin = np.digitize(accept_score, inner)
        r_bin = np.digitize(reject_score, inner)

        # Bands holding no accepts fall back to the overall accept bad rate.
        bad_rate = np.full(n_eff, float(self.y_accept_.mean()))
        for b in range(n_eff):
            mask = a_bin == b
            if mask.any():
                bad_rate[b] = self.y_accept_[mask].mean()
        return np.clip(bad_rate[r_bin] * self.uplift, 0.0, 1.0)

    def resample(self) -> ArrayTriple:
        """Return ``(X, y, w)`` with the rejects labelled per ``assignment``.

        Raises
        ------
        ValueError
            If ``assignment`` is neither ``'expected'`` nor ``'random'``.
        """
        # Validate before scoring so a typo fails fast instead of after the
        # band computation.
        if self.assignment not in ("expected", "random"):
            raise ValueError("assignment must be 'expected' or 'random'.")

        reject_bad_rate = self._reject_bad_rate()

        if self.assignment == "random":
            # One Bernoulli draw per reject; every row keeps unit weight.
            rng = np.random.default_rng(self.random_state)
            y_reject = (rng.random(reject_bad_rate.shape[0]) < reject_bad_rate).astype(int)
            X = np.vstack([self.X_accept_, self.X_reject_])
            y = np.concatenate([self.y_accept_, y_reject])
            w = np.ones(X.shape[0])
            return X, y, w

        # 'expected': each reject appears once as bad and once as good,
        # weighted by the band bad rate and its complement.
        n_a = self.X_accept_.shape[0]
        n_r = self.X_reject_.shape[0]
        X = np.vstack([self.X_accept_, self.X_reject_, self.X_reject_])
        y = np.concatenate(
            [self.y_accept_, np.ones(n_r, dtype=int), np.zeros(n_r, dtype=int)]
        )
        w = np.concatenate([np.ones(n_a), reject_bad_rate, 1.0 - reject_bad_rate])
        return X, y, w
@@ -0,0 +1,45 @@
1
+ """Iterative reclassification reject inference."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.base import BaseEstimator
7
+
8
+ from ..base import ArrayTriple, BaseRejectInferencer
9
+
10
+
11
class Reclassification(BaseRejectInferencer):
    """Iterative reclassification.

    The rejects are first hard-labelled by thresholding the P(bad) produced
    by the model fitted on the accepts. A fresh model is then trained on
    accepts plus labelled rejects and used to relabel the rejects. This
    repeats until the labels stop changing or the iteration budget is used
    up.

    Parameters
    ----------
    base_estimator : BaseEstimator, optional
        Forwarded unchanged to ``BaseRejectInferencer``.
    threshold : float, default=0.5
        A reject is labelled bad (1) when ``P(bad) >= threshold``.
    n_iter : int, default=3
        Maximum number of labelling passes, counting the initial pass made
        with the accepts-only model; at most ``n_iter - 1`` refits happen.
    """

    def __init__(self, base_estimator: BaseEstimator | None = None,
                 threshold: float = 0.5, n_iter: int = 3):
        super().__init__(base_estimator=base_estimator)
        self.threshold = threshold
        self.n_iter = n_iter

    def resample(self) -> ArrayTriple:
        """Return ``(X, y, w)``: accepts plus rejects with their final labels."""
        accepts, rejects = self.X_accept_, self.X_reject_
        accept_labels = self.y_accept_

        def label_rejects(clf):
            # Hard 0/1 label from the positive-class probability.
            return (clf.predict_proba(rejects)[:, 1] >= self.threshold).astype(int)

        reject_labels = label_rejects(self.scorer_)
        refit_budget = max(self.n_iter - 1, 0)
        for _ in range(refit_budget):
            clf = self._make_base()
            clf.fit(
                np.vstack([accepts, rejects]),
                np.concatenate([accept_labels, reject_labels]),
            )
            relabelled = label_rejects(clf)
            converged = np.array_equal(relabelled, reject_labels)
            reject_labels = relabelled
            if converged:
                break

        features = np.vstack([accepts, rejects])
        targets = np.concatenate([accept_labels, reject_labels])
        return features, targets, np.ones(features.shape[0])
@@ -0,0 +1,47 @@
1
+ """Inverse-propensity reweighting (IPW) reject inference."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.base import BaseEstimator
7
+
8
+ from ..base import ArrayTriple, BaseRejectInferencer
9
+
10
+
11
class Reweighting(BaseRejectInferencer):
    """Inverse-probability-of-acceptance reweighting.

    No labels are invented for the rejects. A *selection model* is fitted to
    tell accepts from rejects, and the training sample consists of the
    accepts only, each weighted by ``1 / P(accept | x)``. Accepts that
    resemble rejects therefore count for more, which corrects the
    sample-selection bias under a missing-at-random assumption.

    Parameters
    ----------
    base_estimator : BaseEstimator, optional
        Forwarded unchanged to ``BaseRejectInferencer``.
    clip : float, default=0.01
        Acceptance probabilities are clipped to ``[clip, 1]`` before
        inversion to bound the weights.
    """

    def __init__(self, base_estimator: BaseEstimator | None = None, clip: float = 0.01):
        super().__init__(base_estimator=base_estimator)
        self.clip = clip

    def _fit(self) -> None:
        """Fit the selection model on accepts (1) versus rejects (0)."""
        n_accept = self.X_accept_.shape[0]
        n_reject = self.X_reject_.shape[0]
        population = np.vstack((self.X_accept_, self.X_reject_))
        accepted_flag = np.concatenate(
            (np.ones(n_accept, dtype=int), np.zeros(n_reject, dtype=int))
        )
        self.selection_model_ = self._make_base()
        self.selection_model_.fit(population, accepted_flag)

    def resample(self) -> ArrayTriple:
        """Return copies of the accepts with inverse-propensity weights."""
        accept_prob = np.clip(
            self.selection_model_.predict_proba(self.X_accept_)[:, 1],
            self.clip,
            1.0,
        )
        raw = 1.0 / accept_prob
        # Rescale so that the weights average to exactly 1.
        weights = raw * (raw.shape[0] / raw.sum())
        return self.X_accept_.copy(), self.y_accept_.copy(), weights
@@ -0,0 +1,64 @@
1
+ """Semi-supervised (self-training) reject inference."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.base import BaseEstimator
7
+
8
+ from ..base import ArrayTriple, BaseRejectInferencer
9
+
10
+
11
class SelfLearning(BaseRejectInferencer):
    """Self-training reject inference.

    Rejects are treated as unlabelled data. Beginning with the model fitted
    on the accepts, each round pseudo-labels the still-unlabelled rejects
    whose score is confident enough (``P(bad) >= threshold`` -> bad,
    ``P(bad) <= 1 - threshold`` -> good), then refits a fresh model on the
    accepts plus every reject labelled so far. Rejects that never leave the
    uncertain band are left out of the final sample.

    Parameters
    ----------
    base_estimator : BaseEstimator, optional
        Forwarded unchanged to ``BaseRejectInferencer``.
    threshold : float, default=0.75
        Confidence level defining the two pseudo-labelling bands.
    max_iter : int, default=10
        Maximum number of pseudo-labelling rounds.
    """

    def __init__(self, base_estimator: BaseEstimator | None = None,
                 threshold: float = 0.75, max_iter: int = 10):
        super().__init__(base_estimator=base_estimator)
        self.threshold = threshold
        self.max_iter = max_iter

    def resample(self) -> ArrayTriple:
        """Return ``(X, y, w)``: accepts plus the pseudo-labelled rejects."""
        accepts, rejects = self.X_accept_, self.X_reject_
        accept_labels = self.y_accept_
        upper = self.threshold
        lower = 1.0 - self.threshold

        # -1 marks a reject that has not been pseudo-labelled yet.
        pseudo = np.full(rejects.shape[0], -1, dtype=int)
        clf = self.scorer_

        for _ in range(self.max_iter):
            pending = np.flatnonzero(pseudo == -1)
            if pending.size == 0:
                break
            p_bad = clf.predict_proba(rejects[pending])[:, 1]
            sure_bad = pending[p_bad >= upper]
            sure_good = pending[p_bad <= lower]
            if sure_bad.size == 0 and sure_good.size == 0:
                # Nothing crossed either band; further rounds cannot help.
                break
            # Bad is written first and good second, so a row meeting both
            # conditions ends up labelled good.
            pseudo[sure_bad] = 1
            pseudo[sure_good] = 0

            labelled = pseudo != -1
            clf = self._make_base()
            clf.fit(
                np.vstack([accepts, rejects[labelled]]),
                np.concatenate([accept_labels, pseudo[labelled]]),
            )

        labelled = pseudo != -1
        if not labelled.any():
            features, targets = accepts.copy(), accept_labels.copy()
        else:
            features = np.vstack([accepts, rejects[labelled]])
            targets = np.concatenate([accept_labels, pseudo[labelled]])
        return features, targets, np.ones(features.shape[0])
rejectkit/plotting.py ADDED
@@ -0,0 +1,75 @@
1
+ """Optional matplotlib plotting helpers.
2
+
3
+ Importing this module is cheap; matplotlib is only imported when a plotting
4
+ function is actually called. Install the extra with ``pip install rejectkit[plot]``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+
11
+ from ._compat import to_numpy_1d
12
+
13
+
14
def _get_ax(ax):
    """Return ``ax`` if given, otherwise a new Axes from ``plt.subplots()``.

    A caller-supplied Axes is returned before anything is imported, so code
    that builds its own figures is not forced to load ``matplotlib.pyplot``.
    pyplot is imported only when a figure has to be created here.

    Raises
    ------
    ImportError
        If a new Axes is needed and matplotlib is not installed.
    """
    if ax is not None:
        return ax
    try:
        import matplotlib.pyplot as plt
    except ImportError as exc:  # pragma: no cover - exercised only without matplotlib
        raise ImportError(
            "Plotting requires matplotlib. Install with `pip install rejectkit[plot]`."
        ) from exc
    _, ax = plt.subplots()
    return ax
24
+
25
+
26
def plot_benchmark(results, metric: str = "auc_recovery", ax=None):
    """Draw a bar chart of one metric from a :class:`MaskedRejectBenchmark` table.

    ``results`` is indexed by method name and ``metric`` selects the column to
    plot (presumably a pandas DataFrame; confirm against the benchmark class).
    The ``"oracle"`` row is dropped when plotting ``"auc_recovery"``, and the
    ``"naive"`` bar is drawn in grey. Returns the Axes that was drawn on.
    """
    ax = _get_ax(ax)

    table = results
    if metric == "auc_recovery" and "oracle" in table.index:
        table = table.drop(index="oracle")
    series = table[metric]

    positions = range(len(series))
    bar_colors = []
    for method in series.index:
        bar_colors.append("#999999" if method == "naive" else "#3366cc")

    ax.bar(positions, series.to_numpy(), color=bar_colors)
    ax.set_xticks(positions)
    ax.set_xticklabels(series.index, rotation=45, ha="right")
    ax.axhline(0, color="k", lw=0.8)
    ax.set_ylabel(metric)
    ax.set_title("Reject inference benchmark")
    return ax
41
+
42
+
43
def plot_score_distributions(score_accept, score_reject, bins: int = 30, ax=None):
    """Overlay density histograms of the accept and reject scores.

    Both groups are drawn half-transparent with ``bins`` bins each. Returns
    the Axes that was drawn on.
    """
    ax = _get_ax(ax)
    groups = ((score_accept, "accept"), (score_reject, "reject"))
    for scores, tag in groups:
        ax.hist(to_numpy_1d(scores), bins=bins, alpha=0.5, density=True, label=tag)
    ax.set_xlabel("score / P(bad)")
    ax.set_ylabel("density")
    ax.legend()
    ax.set_title("Accept vs reject score distribution")
    return ax
53
+
54
+
55
def plot_ks(y_true, y_score, ax=None):
    """Plot the KS curve: cumulative bad and good rates against the threshold.

    TPR and FPR from ``roc_curve`` are drawn against the score threshold, and
    a dashed vertical line marks the threshold where ``TPR - FPR`` is largest
    (the KS statistic), provided that threshold is finite. The x-axis is
    inverted. Returns the Axes that was drawn on.
    """
    from sklearn.metrics import roc_curve

    ax = _get_ax(ax)
    labels = to_numpy_1d(y_true)
    scores = to_numpy_1d(y_score).astype(float)
    fpr, tpr, thr = roc_curve(labels, scores)

    gap = tpr - fpr
    ks = float(np.max(gap))
    k = int(np.argmax(gap))

    # Non-finite thresholds returned by roc_curve cannot be placed on the axis.
    plottable = np.isfinite(thr)
    curves = ((tpr, "cumulative bad (TPR)"), (fpr, "cumulative good (FPR)"))
    for rate, legend in curves:
        ax.plot(thr[plottable], rate[plottable], label=legend)

    ks_threshold = thr[k]
    if np.isfinite(ks_threshold):
        ax.axvline(ks_threshold, color="k", ls="--", lw=0.8, label=f"KS = {ks:.3f}")

    ax.set_xlabel("threshold on P(bad)")
    ax.set_ylabel("cumulative rate")
    ax.invert_xaxis()
    ax.legend()
    ax.set_title("KS curve")
    return ax