PyPI - rejectkit - Versions diffs - 0.3.0__py3-none-any.whl - Mend

rejectkit 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

rejectkit/__init__.py +47 -0
rejectkit/_compat.py +19 -0
rejectkit/base.py +94 -0
rejectkit/benchmark.py +141 -0
rejectkit/datasets.py +97 -0
rejectkit/diagnostics.py +109 -0
rejectkit/estimator.py +105 -0
rejectkit/methods/__init__.py +20 -0
rejectkit/methods/augmentation.py +69 -0
rejectkit/methods/extrapolation.py +47 -0
rejectkit/methods/heckman.py +98 -0
rejectkit/methods/parcelling.py +87 -0
rejectkit/methods/reclassification.py +45 -0
rejectkit/methods/reweighting.py +47 -0
rejectkit/methods/semi_supervised.py +64 -0
rejectkit/plotting.py +75 -0
rejectkit-0.3.0.dist-info/METADATA +219 -0
rejectkit-0.3.0.dist-info/RECORD +20 -0
rejectkit-0.3.0.dist-info/WHEEL +4 -0
rejectkit-0.3.0.dist-info/licenses/LICENSE +21 -0

rejectkit/methods/augmentation.py ADDED Viewed

@@ -0,0 +1,69 @@
+"""Augmentation-based reject inference."""
+from __future__ import annotations
+import numpy as np
+from sklearn.base import BaseEstimator
+from ..base import ArrayTriple, BaseRejectInferencer
+class SimpleAugmentation(BaseRejectInferencer):
+    """Hard-cutoff augmentation.
+    Scores the rejects with a good/bad model fitted on the accepts and assigns
+    a hard 0/1 label by thresholding P(bad). The labelled rejects are appended
+    to the accepts with unit weight. Simple and transparent, but sensitive to
+    the chosen ``threshold`` and prone to over-confidence.
+    Parameters
+    ----------
+    threshold : float, default=0.5
+        Rejects with ``P(bad) >= threshold`` are labelled bad (1).
+    """
+    def __init__(self, base_estimator: BaseEstimator | None = None, threshold: float = 0.5):
+        super().__init__(base_estimator=base_estimator)
+        self.threshold = threshold
+    def resample(self) -> ArrayTriple:
+        p_bad = self._reject_bad_proba()
+        y_reject = (p_bad >= self.threshold).astype(int)
+        X = np.vstack([self.X_accept_, self.X_reject_])
+        y = np.concatenate([self.y_accept_, y_reject])
+        w = np.ones(X.shape[0])
+        return X, y, w
+class FuzzyAugmentation(BaseRejectInferencer):
+    """Fuzzy augmentation (a.k.a. fuzzy parcelling).
+    Instead of committing to a hard label, each reject contributes **two** rows
+    — one labelled bad, one labelled good — weighted by the model's P(bad) and
+    P(good). This avoids the over-confidence of hard cutoffs and is generally
+    the most stable augmentation method.
+    Parameters
+    ----------
+    reject_weight : float, default=1.0
+        Global multiplier on the weight of reject-derived rows, useful to
+        reflect the reject share of the through-the-door population.
+    """
+    def __init__(self, base_estimator: BaseEstimator | None = None, reject_weight: float = 1.0):
+        super().__init__(base_estimator=base_estimator)
+        self.reject_weight = reject_weight
+    def resample(self) -> ArrayTriple:
+        p_bad = self._reject_bad_proba()
+        p_good = 1.0 - p_bad
+        n_a = self.X_accept_.shape[0]
+        n_r = self.X_reject_.shape[0]
+        X = np.vstack([self.X_accept_, self.X_reject_, self.X_reject_])
+        y = np.concatenate(
+            [self.y_accept_, np.ones(n_r, dtype=int), np.zeros(n_r, dtype=int)]
+        )
+        w = np.concatenate(
+            [np.ones(n_a), self.reject_weight * p_bad, self.reject_weight * p_good]
+        )
+        return X, y, w

rejectkit/methods/extrapolation.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Nearest-neighbour extrapolation ('twins') reject inference."""
+from __future__ import annotations
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.neighbors import NearestNeighbors
+from sklearn.preprocessing import StandardScaler
+from ..base import ArrayTriple, BaseRejectInferencer
+class Extrapolation(BaseRejectInferencer):
+    """Nearest-neighbour label extrapolation (a.k.a. 'twins').
+    Each reject is matched to its ``n_neighbors`` most similar accepts in
+    standardised feature space; the local bad rate among those neighbours
+    becomes the reject's P(bad). As in fuzzy augmentation, each reject then
+    contributes two weighted rows. No parametric model is assumed for the
+    reject labels.
+    Parameters
+    ----------
+    n_neighbors : int, default=10
+    """
+    def __init__(self, base_estimator: BaseEstimator | None = None,
+                 n_neighbors: int = 10):
+        super().__init__(base_estimator=base_estimator)
+        self.n_neighbors = n_neighbors
+    def _fit(self) -> None:
+        self.scaler_ = StandardScaler().fit(self.X_accept_)
+        k = min(self.n_neighbors, self.X_accept_.shape[0])
+        self.nn_ = NearestNeighbors(n_neighbors=k).fit(
+            self.scaler_.transform(self.X_accept_)
+        )
+    def resample(self) -> ArrayTriple:
+        Xa, ya, Xr = self.X_accept_, self.y_accept_, self.X_reject_
+        idx = self.nn_.kneighbors(self.scaler_.transform(Xr), return_distance=False)
+        p_bad = ya[idx].mean(axis=1).astype(float)
+        n_a, n_r = Xa.shape[0], Xr.shape[0]
+        X = np.vstack([Xa, Xr, Xr])
+        y = np.concatenate([ya, np.ones(n_r, dtype=int), np.zeros(n_r, dtype=int)])
+        w = np.concatenate([np.ones(n_a), p_bad, 1.0 - p_bad])
+        return X, y, w

rejectkit/methods/heckman.py ADDED Viewed

@@ -0,0 +1,98 @@
+"""Heckman-style control-function correction for sample selection."""
+from __future__ import annotations
+import math
+import numpy as np
+from sklearn.base import BaseEstimator, ClassifierMixin, clone
+from sklearn.linear_model import LogisticRegression
+from .._compat import to_numpy_1d, to_numpy_2d
+# Element-wise error function built from the scalar ``math.erf`` via
+# ``np.vectorize``.
+_erf = np.vectorize(math.erf)
+def _norm_pdf(x: np.ndarray) -> np.ndarray:
+    return np.exp(-0.5 * x * x) / np.sqrt(2.0 * np.pi)
+def _norm_cdf(x: np.ndarray) -> np.ndarray:
+    return 0.5 * (1.0 + _erf(x / np.sqrt(2.0)))
+class HeckmanClassifier(BaseEstimator, ClassifierMixin):
+    """Heckman-style two-step control-function correction.
+    Step 1 fits a *selection* model separating accepts from rejects and derives
+    each applicant's inverse Mills ratio (IMR) from its selection score. Step 2
+    trains the outcome model on the accepts with the IMR appended as an extra
+    feature; the IMR term absorbs the selection bias. At prediction time the
+    IMR is recomputed for new applicants and appended in the same way.
+    Unlike the resampling methods, Heckman augments the *feature space* rather
+    than the sample, so it is a standalone classifier rather than a
+    ``BaseRejectInferencer``. The Gaussian-latent assumption is an
+    approximation; treat it as a control-function heuristic.
+    Parameters
+    ----------
+    selection_estimator : sklearn classifier, optional
+        Separates accepts (1) from rejects (0). Defaults to LogisticRegression.
+    outcome_estimator : sklearn classifier, optional
+        Trained on accepts + IMR. Defaults to LogisticRegression.
+    """
+    def __init__(self, selection_estimator: BaseEstimator | None = None,
+                 outcome_estimator: BaseEstimator | None = None):
+        self.selection_estimator = selection_estimator
+        self.outcome_estimator = outcome_estimator
+    def _sel(self):
+        if self.selection_estimator is not None:
+            return clone(self.selection_estimator)
+        return LogisticRegression(max_iter=1000)
+    def _out(self):
+        if self.outcome_estimator is not None:
+            return clone(self.outcome_estimator)
+        return LogisticRegression(max_iter=1000)
+    def _latent(self, X):
+        if hasattr(self.selection_, "decision_function"):
+            z = self.selection_.decision_function(X)
+        else:
+            p = np.clip(self.selection_.predict_proba(X)[:, 1], 1e-6, 1 - 1e-6)
+            z = np.log(p / (1 - p))
+        return (np.asarray(z, dtype=float) - self.z_mean_) / self.z_std_
+    def _imr(self, X):
+        z = self._latent(X)
+        return _norm_pdf(z) / np.clip(_norm_cdf(z), 1e-6, None)
+    def fit(self, X_accept, y_accept, X_reject):
+        Xa, Xr = to_numpy_2d(X_accept), to_numpy_2d(X_reject)
+        ya = to_numpy_1d(y_accept).astype(int)
+        Xs = np.vstack([Xa, Xr])
+        s = np.concatenate([np.ones(Xa.shape[0], dtype=int),
+                            np.zeros(Xr.shape[0], dtype=int)])
+        self.selection_ = self._sel().fit(Xs, s)
+        if hasattr(self.selection_, "decision_function"):
+            z_raw = self.selection_.decision_function(Xa)
+        else:
+            p = np.clip(self.selection_.predict_proba(Xa)[:, 1], 1e-6, 1 - 1e-6)
+            z_raw = np.log(p / (1 - p))
+        z_raw = np.asarray(z_raw, dtype=float)
+        self.z_mean_, self.z_std_ = float(z_raw.mean()), float(z_raw.std() + 1e-12)
+        imr = self._imr(Xa)
+        self.outcome_ = self._out().fit(np.column_stack([Xa, imr]), ya)
+        self.classes_ = getattr(self.outcome_, "classes_", np.array([0, 1]))
+        self.n_features_in_ = Xa.shape[1]
+        return self
+    def predict_proba(self, X):
+        X = to_numpy_2d(X)
+        return self.outcome_.predict_proba(np.column_stack([X, self._imr(X)]))
+    def predict(self, X):
+        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

rejectkit/methods/parcelling.py ADDED Viewed

@@ -0,0 +1,87 @@
+"""Parcelling (score-band) reject inference."""
+from __future__ import annotations
+import numpy as np
+from sklearn.base import BaseEstimator
+from ..base import ArrayTriple, BaseRejectInferencer
+class Parcelling(BaseRejectInferencer):
+    """Parcelling.
+    Applicants are scored and split into ``n_bins`` score bands (quantiles of
+    the accept score distribution). Within each band the observed accept bad
+    rate is multiplied by ``uplift`` (>= 1) — encoding the assumption that
+    rejects are worse risks than accepts with the same score — and used as the
+    reject bad rate for that band.
+    With ``assignment='expected'`` (default) each reject contributes two
+    deterministically weighted rows (no randomness); with ``'random'`` a label
+    is drawn from a Bernoulli with the band's reject bad rate.
+    Parameters
+    ----------
+    n_bins : int, default=10
+        Number of score bands.
+    uplift : float, default=1.0
+        Multiplier applied to each band's accept bad rate. ``1.0`` assumes
+        rejects behave like accepts of the same score; ``> 1`` makes them worse.
+    assignment : {'expected', 'random'}, default='expected'
+    random_state : int, optional
+        Used only when ``assignment='random'``.
+    """
+    def __init__(
+        self,
+        base_estimator: BaseEstimator | None = None,
+        n_bins: int = 10,
+        uplift: float = 1.0,
+        assignment: str = "expected",
+        random_state: int | None = None,
+    ):
+        super().__init__(base_estimator=base_estimator)
+        self.n_bins = n_bins
+        self.uplift = uplift
+        self.assignment = assignment
+        self.random_state = random_state
+    def _reject_bad_rate(self) -> np.ndarray:
+        accept_score = self.scorer_.predict_proba(self.X_accept_)[:, 1]
+        reject_score = self._reject_bad_proba()
+        edges = np.unique(np.quantile(accept_score, np.linspace(0, 1, self.n_bins + 1)))
+        edges[0], edges[-1] = -np.inf, np.inf
+        inner = edges[1:-1]
+        a_bin = np.digitize(accept_score, inner)
+        r_bin = np.digitize(reject_score, inner)
+        n_eff = len(edges) - 1
+        bad_rate = np.full(n_eff, float(self.y_accept_.mean()))
+        for b in range(n_eff):
+            mask = a_bin == b
+            if mask.any():
+                bad_rate[b] = self.y_accept_[mask].mean()
+        return np.clip(bad_rate[r_bin] * self.uplift, 0.0, 1.0)
+    def resample(self) -> ArrayTriple:
+        reject_bad_rate = self._reject_bad_rate()
+        if self.assignment == "random":
+            rng = np.random.default_rng(self.random_state)
+            y_reject = (rng.random(reject_bad_rate.shape[0]) < reject_bad_rate).astype(int)
+            X = np.vstack([self.X_accept_, self.X_reject_])
+            y = np.concatenate([self.y_accept_, y_reject])
+            w = np.ones(X.shape[0])
+            return X, y, w
+        if self.assignment == "expected":
+            n_a = self.X_accept_.shape[0]
+            n_r = self.X_reject_.shape[0]
+            X = np.vstack([self.X_accept_, self.X_reject_, self.X_reject_])
+            y = np.concatenate(
+                [self.y_accept_, np.ones(n_r, dtype=int), np.zeros(n_r, dtype=int)]
+            )
+            w = np.concatenate([np.ones(n_a), reject_bad_rate, 1.0 - reject_bad_rate])
+            return X, y, w
+        raise ValueError("assignment must be 'expected' or 'random'.")

rejectkit/methods/reclassification.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""Iterative reclassification reject inference."""
+from __future__ import annotations
+import numpy as np
+from sklearn.base import BaseEstimator
+from ..base import ArrayTriple, BaseRejectInferencer
+class Reclassification(BaseRejectInferencer):
+    """Iterative reclassification.
+    Fits a good/bad model on the accepts, hard-labels the rejects by
+    thresholding P(bad), adds them to the training data, refits, and repeats.
+    Labels may change between iterations until they stabilise or ``n_iter`` is
+    reached.
+    Parameters
+    ----------
+    threshold : float, default=0.5
+    n_iter : int, default=3
+    """
+    def __init__(self, base_estimator: BaseEstimator | None = None,
+                 threshold: float = 0.5, n_iter: int = 3):
+        super().__init__(base_estimator=base_estimator)
+        self.threshold = threshold
+        self.n_iter = n_iter
+    def resample(self) -> ArrayTriple:
+        Xa, ya, Xr = self.X_accept_, self.y_accept_, self.X_reject_
+        model = self.scorer_
+        y_reject = (model.predict_proba(Xr)[:, 1] >= self.threshold).astype(int)
+        for _ in range(max(self.n_iter - 1, 0)):
+            model = self._make_base()
+            model.fit(np.vstack([Xa, Xr]), np.concatenate([ya, y_reject]))
+            new = (model.predict_proba(Xr)[:, 1] >= self.threshold).astype(int)
+            stable = np.array_equal(new, y_reject)
+            y_reject = new
+            if stable:
+                break
+        X = np.vstack([Xa, Xr])
+        y = np.concatenate([ya, y_reject])
+        return X, y, np.ones(X.shape[0])

rejectkit/methods/reweighting.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Inverse-propensity reweighting (IPW) reject inference."""
+from __future__ import annotations
+import numpy as np
+from sklearn.base import BaseEstimator
+from ..base import ArrayTriple, BaseRejectInferencer
+class Reweighting(BaseRejectInferencer):
+    """Inverse-probability-of-acceptance reweighting.
+    Rather than inventing labels for rejects, this fits a *selection model*
+    that separates accepts from rejects, then trains only on the accepts, each
+    weighted by ``1 / P(accept | x)``. Accepts that look like rejects are
+    up-weighted, correcting the sample-selection bias under a
+    missing-at-random assumption. No reject labels are fabricated.
+    Parameters
+    ----------
+    clip : float, default=0.01
+        Acceptance probabilities are clipped to ``[clip, 1]`` before inversion
+        to bound the weights.
+    """
+    def __init__(self, base_estimator: BaseEstimator | None = None, clip: float = 0.01):
+        super().__init__(base_estimator=base_estimator)
+        self.clip = clip
+    def _fit(self) -> None:
+        X = np.vstack([self.X_accept_, self.X_reject_])
+        s = np.concatenate(
+            [
+                np.ones(self.X_accept_.shape[0], dtype=int),
+                np.zeros(self.X_reject_.shape[0], dtype=int),
+            ]
+        )
+        self.selection_model_ = self._make_base()
+        self.selection_model_.fit(X, s)
+    def resample(self) -> ArrayTriple:
+        p_accept = self.selection_model_.predict_proba(self.X_accept_)[:, 1]
+        p_accept = np.clip(p_accept, self.clip, 1.0)
+        w = 1.0 / p_accept
+        w = w * (w.shape[0] / w.sum())  # normalise mean weight to 1
+        return self.X_accept_.copy(), self.y_accept_.copy(), w

rejectkit/methods/semi_supervised.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""Semi-supervised (self-training) reject inference."""
+from __future__ import annotations
+import numpy as np
+from sklearn.base import BaseEstimator
+from ..base import ArrayTriple, BaseRejectInferencer
+class SelfLearning(BaseRejectInferencer):
+    """Self-training reject inference.
+    Treats rejects as unlabelled. Starting from a model fit on the accepts, it
+    iteratively pseudo-labels the rejects it is most confident about
+    (``P(bad) >= threshold`` -> bad, ``P(bad) <= 1 - threshold`` -> good),
+    refits on the accepts plus newly labelled rejects, and repeats. Rejects
+    that never cross the confidence band are excluded from the final sample.
+    Parameters
+    ----------
+    threshold : float, default=0.75
+    max_iter : int, default=10
+    """
+    def __init__(self, base_estimator: BaseEstimator | None = None,
+                 threshold: float = 0.75, max_iter: int = 10):
+        super().__init__(base_estimator=base_estimator)
+        self.threshold = threshold
+        self.max_iter = max_iter
+    def resample(self) -> ArrayTriple:
+        Xa, ya, Xr = self.X_accept_, self.y_accept_, self.X_reject_
+        n_r = Xr.shape[0]
+        labels = np.full(n_r, -1, dtype=int)
+        lo = 1.0 - self.threshold
+        model = self.scorer_
+        for _ in range(self.max_iter):
+            unl = labels == -1
+            if not unl.any():
+                break
+            p = model.predict_proba(Xr[unl])[:, 1]
+            local = np.where(unl)[0]
+            newly = False
+            bad = local[p >= self.threshold]
+            good = local[p <= lo]
+            if bad.size:
+                labels[bad] = 1
+                newly = True
+            if good.size:
+                labels[good] = 0
+                newly = True
+            if not newly:
+                break
+            keep = labels != -1
+            model = self._make_base()
+            model.fit(np.vstack([Xa, Xr[keep]]), np.concatenate([ya, labels[keep]]))
+        keep = labels != -1
+        if keep.any():
+            X = np.vstack([Xa, Xr[keep]])
+            y = np.concatenate([ya, labels[keep]])
+        else:
+            X, y = Xa.copy(), ya.copy()
+        return X, y, np.ones(X.shape[0])

rejectkit/plotting.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""Optional matplotlib plotting helpers.
+Importing this module is cheap; matplotlib is only imported when a plotting
+function is actually called. Install the extra with ``pip install rejectkit[plot]``.
+"""
+from __future__ import annotations
+import numpy as np
+from ._compat import to_numpy_1d
+def _get_ax(ax):
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError as exc:  # pragma: no cover - exercised only without matplotlib
+        raise ImportError(
+            "Plotting requires matplotlib. Install with `pip install rejectkit[plot]`."
+        ) from exc
+    if ax is None:
+        _, ax = plt.subplots()
+    return ax
+def plot_benchmark(results, metric: str = "auc_recovery", ax=None):
+    """Bar chart of a :class:`MaskedRejectBenchmark` results table."""
+    ax = _get_ax(ax)
+    df = results
+    if metric == "auc_recovery" and "oracle" in df.index:
+        df = df.drop(index="oracle")
+    vals = df[metric]
+    colors = ["#999999" if i == "naive" else "#3366cc" for i in vals.index]
+    ax.bar(range(len(vals)), vals.to_numpy(), color=colors)
+    ax.set_xticks(range(len(vals)))
+    ax.set_xticklabels(vals.index, rotation=45, ha="right")
+    ax.axhline(0, color="k", lw=0.8)
+    ax.set_ylabel(metric)
+    ax.set_title("Reject inference benchmark")
+    return ax
+def plot_score_distributions(score_accept, score_reject, bins: int = 30, ax=None):
+    """Overlaid histograms of accept vs reject scores."""
+    ax = _get_ax(ax)
+    ax.hist(to_numpy_1d(score_accept), bins=bins, alpha=0.5, density=True, label="accept")
+    ax.hist(to_numpy_1d(score_reject), bins=bins, alpha=0.5, density=True, label="reject")
+    ax.set_xlabel("score / P(bad)")
+    ax.set_ylabel("density")
+    ax.legend()
+    ax.set_title("Accept vs reject score distribution")
+    return ax
+def plot_ks(y_true, y_score, ax=None):
+    """Plot the KS curve (cumulative bad vs good across the score threshold)."""
+    from sklearn.metrics import roc_curve
+    ax = _get_ax(ax)
+    y_true = to_numpy_1d(y_true)
+    y_score = to_numpy_1d(y_score).astype(float)
+    fpr, tpr, thr = roc_curve(y_true, y_score)
+    finite = np.isfinite(thr)
+    ks = float(np.max(tpr - fpr))
+    k = int(np.argmax(tpr - fpr))
+    ax.plot(thr[finite], tpr[finite], label="cumulative bad (TPR)")
+    ax.plot(thr[finite], fpr[finite], label="cumulative good (FPR)")
+    if np.isfinite(thr[k]):
+        ax.axvline(thr[k], color="k", ls="--", lw=0.8, label=f"KS = {ks:.3f}")
+    ax.set_xlabel("threshold on P(bad)")
+    ax.set_ylabel("cumulative rate")
+    ax.invert_xaxis()
+    ax.legend()
+    ax.set_title("KS curve")
+    return ax