rejectkit 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rejectkit/__init__.py ADDED
@@ -0,0 +1,47 @@
1
+ """rejectkit — reject inference for credit scoring, scikit-learn style.
2
+
3
+ Reject inference corrects the sampling bias that arises when a credit model is
4
+ trained only on *accepted* applicants whose good/bad outcome is observed, while
5
+ the *rejected* applicants — who are also part of the through-the-door
6
+ population the model must score — are silently dropped.
7
+
8
+ ``rejectkit`` provides the classic reject inference methods behind a single,
9
+ scikit-learn-compatible API, plus a benchmark harness that lets you measure —
10
+ on your own data — whether reject inference actually helps.
11
+ """
12
+
13
+ from . import datasets, diagnostics, plotting
14
+ from .base import BaseRejectInferencer
15
+ from .benchmark import MaskedRejectBenchmark
16
+ from .estimator import RejectInferenceClassifier, get_inferencer
17
+ from .methods import (
18
+ Extrapolation,
19
+ FuzzyAugmentation,
20
+ HeckmanClassifier,
21
+ Parcelling,
22
+ Reclassification,
23
+ Reweighting,
24
+ SelfLearning,
25
+ SimpleAugmentation,
26
+ )
27
+
28
# Package version string.
__version__ = "0.3.0"

# Names exported by ``from rejectkit import *``.
__all__ = [
    # Abstract base class (from .base)
    "BaseRejectInferencer",
    # Reject inference methods (from .methods)
    "SimpleAugmentation",
    "FuzzyAugmentation",
    "Parcelling",
    "Reweighting",
    "Reclassification",
    "Extrapolation",
    "SelfLearning",
    "HeckmanClassifier",
    # High-level estimator and name-based factory (from .estimator)
    "RejectInferenceClassifier",
    "get_inferencer",
    # Benchmark harness (from .benchmark)
    "MaskedRejectBenchmark",
    # Submodules
    "diagnostics",
    "datasets",
    "plotting",
    # Metadata
    "__version__",
]
rejectkit/_compat.py ADDED
@@ -0,0 +1,19 @@
1
+ """Input coercion so rejectkit accepts pandas, polars, or plain numpy."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+
8
def to_numpy_2d(X):
    """Return ``X`` as a float ndarray.

    Objects that expose a ``to_numpy`` method (pandas / polars DataFrames)
    are converted through it first; anything else (ndarray, nested list) is
    handed directly to ``np.asarray``.
    """
    data = X.to_numpy() if hasattr(X, "to_numpy") else X
    return np.asarray(data, dtype=float)
13
+
14
+
15
def to_numpy_1d(y):
    """Return ``y`` as a flat (1D) ndarray.

    Objects that expose a ``to_numpy`` method (pandas / polars Series) are
    converted through it first. The dtype is left as numpy infers it; any
    extra dimensions are flattened with ``ravel``.
    """
    data = y.to_numpy() if hasattr(y, "to_numpy") else y
    flat = np.asarray(data)
    return flat.ravel()
rejectkit/base.py ADDED
@@ -0,0 +1,94 @@
1
+ """Base class shared by all reject inference methods."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+ import numpy as np
8
+ from sklearn.base import BaseEstimator, clone
9
+ from sklearn.linear_model import LogisticRegression
10
+ from sklearn.utils import check_array
11
+
12
+ from ._compat import to_numpy_1d, to_numpy_2d
13
+
14
# Augmented training sample as returned by ``resample``: (X, y, sample_weight).
ArrayTriple = tuple[np.ndarray, np.ndarray, np.ndarray]


class BaseRejectInferencer(BaseEstimator, ABC):
    """Abstract base class for reject inference methods.

    A reject inferencer takes the accepted applicants ``(X_accept, y_accept)``,
    whose good/bad outcome is known, together with the rejected applicants
    ``X_reject``, whose outcome is unknown, and produces an augmented, weighted
    training sample ``(X, y, sample_weight)`` that approximates the full
    "through-the-door" population.

    The label convention is ``1 = bad`` (the event of interest, e.g. default)
    and ``0 = good``, matching ``predict_proba(...)[:, 1] = P(bad)``.

    Accepts pandas, polars, or numpy inputs.

    Parameters
    ----------
    base_estimator : sklearn classifier, optional
        Probabilistic model used internally by the method. Must implement
        ``predict_proba``. Defaults to LogisticRegression.
    """

    def __init__(self, base_estimator: BaseEstimator | None = None):
        self.base_estimator = base_estimator

    def _make_base(self) -> BaseEstimator:
        """Return a fresh, unfitted internal scorer.

        A clone of ``base_estimator`` when one was supplied, otherwise a
        ``LogisticRegression(max_iter=1000)``.
        """
        if self.base_estimator is None:
            return LogisticRegression(max_iter=1000)
        return clone(self.base_estimator)

    def _validate(self, X_accept, y_accept, X_reject):
        """Coerce and sanity-check the three inputs.

        Returns
        -------
        X_accept, y_accept, X_reject : ndarray
            Feature matrices as float arrays (after ``check_array``) and the
            labels as a 1D integer array.

        Raises
        ------
        ValueError
            If the labels cannot be cast to integers, if the cast would change
            their value (e.g. ``0.7`` or NaN), if ``X_accept`` and ``y_accept``
            disagree on the number of rows, if the two feature matrices have a
            different number of columns, or if the labels are not exactly the
            two classes ``{0, 1}``.
        """
        X_accept = check_array(to_numpy_2d(X_accept), dtype=float)
        X_reject = check_array(to_numpy_2d(X_reject), dtype=float)
        y_raw = to_numpy_1d(y_accept)
        try:
            y_accept = y_raw.astype(int)
            # ``astype(int)`` truncates silently (0.9 -> 0, 1.7 -> 1), which
            # could turn invalid labels into a seemingly valid {0, 1} vector.
            # Comparing against the float view exposes any lossy cast.
            lossless = bool(np.array_equal(y_raw.astype(float), y_accept))
        except (ValueError, TypeError) as exc:  # pragma: no cover - defensive
            raise ValueError("y_accept must be coercible to integers 0/1.") from exc
        if not lossless:
            raise ValueError(
                "y_accept must contain only the integer labels 0 and 1; "
                "found non-integer or missing values."
            )
        if X_accept.shape[0] != y_accept.shape[0]:
            raise ValueError(
                f"X_accept has {X_accept.shape[0]} rows but y_accept has "
                f"{y_accept.shape[0]} labels."
            )
        if X_accept.shape[1] != X_reject.shape[1]:
            raise ValueError(
                f"X_accept has {X_accept.shape[1]} features but X_reject has "
                f"{X_reject.shape[1]}."
            )
        classes = set(np.unique(y_accept).tolist())
        if classes != {0, 1}:
            raise ValueError(
                "y_accept must be binary with both labels present, where "
                f"1 = bad (event) and 0 = good. Got classes {sorted(classes)}."
            )
        return X_accept, y_accept, X_reject

    def _reject_bad_proba(self) -> np.ndarray:
        """P(bad) for rejected applicants from the good/bad scorer."""
        return self.scorer_.predict_proba(self.X_reject_)[:, 1]

    def fit(self, X_accept, y_accept, X_reject):
        """Fit any internal models on the accepts (and rejects, if needed).

        Stores the validated inputs as ``X_accept_``, ``y_accept_`` and
        ``X_reject_``, records ``n_features_in_``, runs the method-specific
        :meth:`_fit` hook and returns ``self``.
        """
        X_accept, y_accept, X_reject = self._validate(X_accept, y_accept, X_reject)
        self.X_accept_ = X_accept
        self.y_accept_ = y_accept
        self.X_reject_ = X_reject
        self.n_features_in_ = X_accept.shape[1]
        self._fit()
        self.is_fitted_ = True
        return self

    def _fit(self) -> None:
        """Hook for method-specific fitting; default fits the good/bad scorer."""
        self.scorer_ = self._make_base()
        self.scorer_.fit(self.X_accept_, self.y_accept_)

    @abstractmethod
    def resample(self) -> ArrayTriple:
        """Return the augmented training sample ``(X, y, sample_weight)``."""
        raise NotImplementedError

    def fit_resample(self, X_accept, y_accept, X_reject) -> ArrayTriple:
        """Convenience: :meth:`fit` followed by :meth:`resample`."""
        return self.fit(X_accept, y_accept, X_reject).resample()
rejectkit/benchmark.py ADDED
@@ -0,0 +1,141 @@
1
+ """Benchmark harness for reject inference methods.
2
+
3
+ The core difficulty of reject inference is that you never observe the outcome
4
+ of rejected applicants, so you cannot directly validate any method on real
5
+ data. :class:`MaskedRejectBenchmark` sidesteps this on a *fully labelled*
6
+ dataset: it hides the labels of a synthetically "rejected" subset of the
7
+ training data, asks each method to recover a model, and scores every model on
8
+ an untouched test set representing the true through-the-door population.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from collections.abc import Sequence
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+ from sklearn.base import clone
18
+ from sklearn.linear_model import LogisticRegression
19
+ from sklearn.model_selection import train_test_split
20
+ from sklearn.preprocessing import StandardScaler
21
+
22
+ from ._compat import to_numpy_1d, to_numpy_2d
23
+ from .diagnostics import auc, gini, ks_statistic
24
+ from .estimator import RejectInferenceClassifier
25
+ from .methods import HeckmanClassifier
26
+
27
+
28
class MaskedRejectBenchmark:
    """Compare reject inference methods by masking labels on labelled data.

    Parameters
    ----------
    selection : {'mar', 'mnar', 'cutoff'}, default='mnar'
        Acceptance mechanism applied to the training set.
        ``'mar'`` — acceptance depends only on observed features.
        ``'mnar'`` — acceptance also depends on the hidden outcome.
        ``'cutoff'`` — accept the lowest-PD fraction under a quick model, the
        realistic credit-policy mechanism (strong, score-based truncation).
    accept_rate : float, default=0.6
        Fraction of training applicants accepted.
    test_size : float, default=0.3
        Fraction held out as the unbiased evaluation population.
    selection_strength : float, default=2.0
        Strength of the feature (and, under MNAR, outcome) dependence. Unused
        for ``'cutoff'``.
    random_state : int, default=0
    """

    def __init__(self, selection="mnar", accept_rate=0.6, test_size=0.3,
                 selection_strength=2.0, random_state=0):
        self.selection = selection
        self.accept_rate = accept_rate
        self.test_size = test_size
        self.selection_strength = selection_strength
        self.random_state = random_state

    def _selection_scores(self, X_std, y, rng):
        """Latent acceptance score for the 'mar' / 'mnar' mechanisms.

        Projects the standardised features onto a random direction, rescales
        the projection to unit spread and adds Gaussian noise. Under 'mnar'
        an outcome-dependent shift is added as well.
        """
        n_rows, n_cols = X_std.shape
        # The two draws below must stay in this order to keep results
        # reproducible for a given ``rng``.
        direction = rng.normal(size=n_cols)
        projection = X_std @ direction
        projection = projection / (projection.std() + 1e-12)
        jitter = rng.normal(size=n_rows)
        if self.selection == "mnar":
            # +strength for goods (y=0), -strength for bads (y=1).
            outcome_shift = self.selection_strength * (0.5 - y) * 2.0
            return projection + outcome_shift + jitter
        if self.selection == "mar":
            return self.selection_strength * projection + jitter
        raise ValueError("selection must be 'mar', 'mnar', or 'cutoff'.")

    def _accept_mask(self, X_train, y_train) -> np.ndarray:
        """Boolean mask marking which training rows are 'accepted'.

        Raises ``ValueError`` when the accepted rows contain a single class.
        """
        if self.selection == "cutoff":
            # Score-based policy: accept the lowest-risk share of applicants
            # according to a quick logistic model fitted on the full labels.
            quick_model = LogisticRegression(max_iter=1000)
            quick_model.fit(X_train, y_train)
            risk = quick_model.predict_proba(X_train)[:, 1]
            threshold = np.quantile(risk, self.accept_rate)
            accepted = risk <= threshold
        else:
            rng = np.random.default_rng(self.random_state)
            X_std = StandardScaler().fit_transform(X_train)
            latent = self._selection_scores(X_std, y_train, rng)
            threshold = np.quantile(latent, 1.0 - self.accept_rate)
            accepted = latent >= threshold
        n_classes = len(np.unique(y_train[accepted]))
        if n_classes < 2:
            raise ValueError(
                "The accepted subset contains a single class; lower "
                "selection_strength or change accept_rate."
            )
        return accepted

    def compare(self, methods: Sequence[str], X, y, estimator=None,
                method_params: dict | None = None) -> pd.DataFrame:
        """Run the benchmark and return a tidy results table.

        Methods may include ``'heckman'`` (uses :class:`HeckmanClassifier`) as
        well as any resampling method name. Returns a DataFrame indexed by
        ``['oracle', 'naive', *methods]`` with columns ``auc``, ``ks``, ``gini``
        and ``auc_recovery`` (0 = no better than naive, 1 = matches oracle).
        Sample sizes are in ``df.attrs``.
        """
        X = to_numpy_2d(X)
        y = to_numpy_1d(y).astype(int)
        if not method_params:
            method_params = {}
        if estimator is None:
            template = LogisticRegression(max_iter=1000)
        else:
            template = estimator

        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )
        accepted = self._accept_mask(X_tr, y_tr)
        rejected = ~accepted
        X_a, y_a = X_tr[accepted], y_tr[accepted]
        X_r = X_tr[rejected]

        def evaluate(model):
            # Score on the untouched test split, which still reflects the
            # full population.
            p_bad = model.predict_proba(X_te)[:, 1]
            return {
                "auc": auc(y_te, p_bad),
                "ks": ks_statistic(y_te, p_bad),
                "gini": gini(y_te, p_bad),
            }

        results = {}
        # Oracle sees every training label; naive sees the accepts only.
        results["oracle"] = evaluate(clone(template).fit(X_tr, y_tr))
        results["naive"] = evaluate(clone(template).fit(X_a, y_a))
        for name in methods:
            if str(name).lower() == "heckman":
                model = HeckmanClassifier(outcome_estimator=clone(template))
            else:
                model = RejectInferenceClassifier(
                    estimator=clone(template),
                    method=name,
                    method_params=method_params.get(name, {}),
                )
            model.fit(X_a, y_a, X_r)
            results[name] = evaluate(model)

        table = pd.DataFrame(results).T[["auc", "ks", "gini"]]
        naive_auc = results["naive"]["auc"]
        gap = results["oracle"]["auc"] - naive_auc
        # Recovery is only meaningful when the oracle clearly beats the naive
        # model; with no gap to recover, report NaN rather than a ratio that
        # explodes around a near-zero denominator.
        if gap > 5e-3:
            table["auc_recovery"] = (table["auc"] - naive_auc) / gap
        else:
            table["auc_recovery"] = np.nan
        table.attrs.update(
            n_accept=int(accepted.sum()),
            n_reject=int(rejected.sum()),
            n_test=int(len(y_te)),
            selection=self.selection,
        )
        return table
rejectkit/datasets.py ADDED
@@ -0,0 +1,97 @@
1
+ """Synthetic data generators for examples, tests and demos."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ def _sigmoid(z: np.ndarray) -> np.ndarray:
10
+ """Numerically stable logistic sigmoid."""
11
+ out = np.empty_like(z, dtype=float)
12
+ pos = z >= 0
13
+ out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
14
+ ez = np.exp(z[~pos])
15
+ out[~pos] = ez / (1.0 + ez)
16
+ return out
17
+
18
+
19
def _solve_intercept(logit: np.ndarray, target: float, iters: int = 60) -> float:
    """Find, by bisection, the intercept giving a target mean event rate.

    Searches the bracket ``[-20, 20]`` for the shift ``b`` such that
    ``_sigmoid(logit + b).mean()`` is approximately ``target``, halving the
    bracket ``iters`` times and returning the final midpoint.
    """
    lower, upper = -20.0, 20.0
    for _ in range(iters):
        midpoint = 0.5 * (lower + upper)
        mean_rate = _sigmoid(logit + midpoint).mean()
        # The mean rate grows with the intercept: move whichever bound is on
        # the wrong side of the target.
        if mean_rate < target:
            lower = midpoint
        else:
            upper = midpoint
    return 0.5 * (lower + upper)
29
+
30
+
31
def make_credit_data(
    n_samples: int = 3000,
    n_features: int = 8,
    bad_rate: float = 0.3,
    random_state: int = 0,
) -> tuple[pd.DataFrame, pd.Series]:
    """Generate a synthetic, fully labelled credit dataset.

    Features are standard-normal draws; the outcome follows a logistic model
    on a random linear score whose intercept is tuned to hit ``bad_rate``.

    Returns
    -------
    X : pandas.DataFrame, shape (n_samples, n_features)
        Columns are named ``x1`` ... ``x{n_features}``.
    y : pandas.Series
        ``1 = bad`` (default), ``0 = good``. The mean of ``y`` is approximately
        ``bad_rate``.
    """
    rng = np.random.default_rng(random_state)
    # Draw order (features, coefficients, label uniforms) is part of the
    # reproducibility contract for a given ``random_state``.
    features = rng.normal(size=(n_samples, n_features))
    coefs = rng.normal(size=n_features)
    half = n_features // 2
    coefs[half:] *= 0.2  # second half weakly informative

    raw_score = features @ coefs
    # Centre and rescale the linear score to a spread of 1.5 on the logit scale.
    score = 1.5 * (raw_score - raw_score.mean()) / (raw_score.std() + 1e-12)
    shift = _solve_intercept(score, bad_rate)
    p_bad = _sigmoid(score + shift)
    labels = (rng.random(n_samples) < p_bad).astype(int)

    names = [f"x{j}" for j in range(1, n_features + 1)]
    return pd.DataFrame(features, columns=names), pd.Series(labels, name="bad")
57
+
58
+
59
def make_accept_reject(
    n_samples: int = 3000,
    n_features: int = 8,
    bad_rate: float = 0.3,
    accept_rate: float = 0.6,
    selection: str = "mnar",
    selection_strength: float = 1.5,
    random_state: int = 0,
) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Generate accept/reject data with the reject labels hidden.

    A fully labelled sample is drawn with :func:`make_credit_data` and then
    split by a noisy latent acceptance score: the top ``accept_rate`` share
    is accepted. With ``selection='mar'`` the score depends on the features
    only; with ``'mnar'`` it also depends on the outcome.

    Returns
    -------
    X_accept, y_accept : the accepted applicants and their observed outcomes.
    X_reject : the rejected applicants (features only).
    y_reject_true : the rejects' true outcomes — **hidden** in practice, returned
        only so demos and tests can quantify how well a method recovers them.

    Raises
    ------
    ValueError
        If ``selection`` is neither ``'mar'`` nor ``'mnar'``.
    """
    X, y = make_credit_data(n_samples, n_features, bad_rate, random_state)
    # Separate stream (seed + 1) for the selection mechanism.
    rng = np.random.default_rng(random_state + 1)

    raw = X.to_numpy()
    standardized = (raw - raw.mean(0)) / (raw.std(0) + 1e-12)
    direction = rng.normal(size=n_features)
    projection = standardized @ direction
    projection = projection / (projection.std() + 1e-12)
    jitter = rng.normal(size=n_samples)

    if selection == "mar":
        latent = selection_strength * projection + jitter
    elif selection == "mnar":
        # Goods (y=0) are pushed up by +strength, bads (y=1) down by -strength.
        latent = projection + selection_strength * (0.5 - y.to_numpy()) * 2.0 + jitter
    else:
        raise ValueError("selection must be 'mar' or 'mnar'.")

    keep = latent >= np.quantile(latent, 1.0 - accept_rate)
    drop = ~keep
    parts = (X[keep], y[keep], X[drop], y[drop])
    return tuple(part.reset_index(drop=True) for part in parts)
rejectkit/diagnostics.py ADDED
@@ -0,0 +1,109 @@
1
+ """Lightweight evaluation and drift metrics used across rejectkit.
2
+
3
+ Dependency-free (numpy + scikit-learn + pandas) and using the ``1 = bad``
4
+ convention, so ``y_score`` is always interpreted as P(bad).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+ from ._compat import to_numpy_1d, to_numpy_2d
13
+
14
+
15
def _check_arrays(y_true, y_score):
    """Flatten labels and scores and make sure they line up.

    Returns the labels as a 1D array (dtype untouched) and the scores as a
    1D float array. Raises ``ValueError`` when their lengths differ.
    """
    labels = to_numpy_1d(y_true)
    scores = to_numpy_1d(y_score).astype(float)
    if len(labels) != len(scores):
        raise ValueError("y_true and y_score must have the same length.")
    return labels, scores
21
+
22
+
23
def auc(y_true, y_score) -> float:
    """Area under the ROC curve, returned as a plain Python float."""
    # Imported lazily, inside the function, as in the rest of this module.
    from sklearn.metrics import roc_auc_score

    labels, scores = _check_arrays(y_true, y_score)
    area = roc_auc_score(labels, scores)
    return float(area)
29
+
30
+
31
def gini(y_true, y_score) -> float:
    """Gini coefficient, ``2 * AUC - 1``."""
    area = auc(y_true, y_score)
    return 2.0 * area - 1.0
34
+
35
+
36
def ks_statistic(y_true, y_score) -> float:
    """Kolmogorov-Smirnov statistic: max separation of good/bad CDFs.

    Computed as the largest gap between the true-positive and false-positive
    rates along the ROC curve.
    """
    from sklearn.metrics import roc_curve

    labels, scores = _check_arrays(y_true, y_score)
    false_pos, true_pos, _thresholds = roc_curve(labels, scores)
    separation = true_pos - false_pos
    return float(separation.max())
43
+
44
+
45
def psi(expected, actual, n_bins: int = 10, eps: float = 1e-6) -> float:
    """Population Stability Index between two distributions.

    Bins are quantiles of ``expected`` (duplicates removed, outer edges opened
    to ±inf). Bin shares are floored at ``eps`` before taking logs. Returns
    ``0.0`` when fewer than two distinct edges exist. Rule of thumb: < 0.1 no
    shift, 0.1-0.25 moderate, > 0.25 major.
    """
    reference = to_numpy_1d(expected).astype(float)
    current = to_numpy_1d(actual).astype(float)

    probs = np.linspace(0, 1, n_bins + 1)
    edges = np.unique(np.quantile(reference, probs))
    if edges.size < 2:
        return 0.0
    # Open the outer bins so values outside the reference range are counted.
    edges[0] = -np.inf
    edges[-1] = np.inf

    def _shares(values):
        counts, _ = np.histogram(values, bins=edges)
        return np.clip(counts / max(counts.sum(), 1), eps, None)

    ref_share = _shares(reference)
    cur_share = _shares(current)
    contributions = (cur_share - ref_share) * np.log(cur_share / ref_share)
    return float(np.sum(contributions))
62
+
63
+
64
def feature_drift(X_accept, X_reject, n_bins: int = 10) -> pd.Series:
    """Per-feature PSI between the accept and reject populations.

    A quick read on how unrepresentative your accepts are: large values flag
    features whose distribution differs most between accepts and rejects.

    Feature names come from ``X_accept.columns`` when available, otherwise
    ``x1, x2, ...``. The result is a Series named ``"psi"`` sorted from the
    largest drift to the smallest.
    """
    names = list(X_accept.columns) if hasattr(X_accept, "columns") else None
    accepts = to_numpy_2d(X_accept)
    rejects = to_numpy_2d(X_reject)
    if names is None:
        names = [f"x{j}" for j in range(1, accepts.shape[1] + 1)]

    drift = {}
    for idx, name in enumerate(names):
        drift[name] = psi(accepts[:, idx], rejects[:, idx], n_bins=n_bins)
    return pd.Series(drift, name="psi").sort_values(ascending=False)
77
+
78
+
79
def swap_set(
    y_true,
    score_reference,
    score_challenger,
    cutoff_reference,
    cutoff_challenger,
    lower_is_safer: bool = True,
) -> pd.DataFrame:
    """Swap-set analysis between a reference and a challenger scorecard.

    Compares which applicants each policy accepts (accept if the score is on the
    safe side of its cutoff) and reports counts and bad rates for the four
    groups: kept-accept, kept-reject, swap-in (reference rejects, challenger
    accepts) and swap-out (reference accepts, challenger rejects).

    Parameters
    ----------
    y_true : array-like
        Observed outcomes, ``1 = bad`` and ``0 = good``.
    score_reference, score_challenger : array-like
        Scores of the two scorecards for the same applicants.
    cutoff_reference, cutoff_challenger : float
        Acceptance cutoffs; a score equal to the cutoff is accepted.
    lower_is_safer : bool, default=True
        If True an applicant is accepted when ``score <= cutoff``, otherwise
        when ``score >= cutoff``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``kept_accept``, ``swap_out``, ``swap_in``, ``kept_reject``
        with an integer column ``n`` and a float column ``bad_rate`` (NaN for
        an empty group).

    Raises
    ------
    ValueError
        If the three input vectors do not have the same length.
    """
    y = to_numpy_1d(y_true).astype(int)
    sr = to_numpy_1d(score_reference).astype(float)
    sc = to_numpy_1d(score_challenger).astype(float)
    # Fail early with a clear message instead of an opaque broadcasting or
    # indexing error (or a silent broadcast of a length-1 score vector).
    if not (y.shape[0] == sr.shape[0] == sc.shape[0]):
        raise ValueError(
            "y_true, score_reference and score_challenger must have the same length."
        )

    if lower_is_safer:
        acc_r = sr <= cutoff_reference
        acc_c = sc <= cutoff_challenger
    else:
        acc_r = sr >= cutoff_reference
        acc_c = sc >= cutoff_challenger

    groups = {
        "kept_accept": acc_r & acc_c,
        "swap_out": acc_r & ~acc_c,
        "swap_in": ~acc_r & acc_c,
        "kept_reject": ~acc_r & ~acc_c,
    }
    names = list(groups)
    counts = [int(groups[name].sum()) for name in names]
    bad_rates = [
        float(y[groups[name]].mean()) if count else float("nan")
        for name, count in zip(names, counts)
    ]
    # Building the frame column-wise keeps ``n`` as integers; transposing a
    # row-wise frame of mixed int/float values would upcast counts to float.
    return pd.DataFrame({"n": counts, "bad_rate": bad_rates}, index=names)
rejectkit/estimator.py ADDED
@@ -0,0 +1,105 @@
1
+ """High-level estimator that wraps any classifier with reject inference."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.base import BaseEstimator, ClassifierMixin, clone
7
+ from sklearn.linear_model import LogisticRegression
8
+
9
+ from ._compat import to_numpy_2d
10
+ from .methods import (
11
+ Extrapolation,
12
+ FuzzyAugmentation,
13
+ Parcelling,
14
+ Reclassification,
15
+ Reweighting,
16
+ SelfLearning,
17
+ SimpleAugmentation,
18
+ )
19
+
20
# Registry mapping lower-case method names (and their aliases) to the
# inferencer class that implements them; consumed by ``get_inferencer``.
_METHODS = {
    "simple": SimpleAugmentation,
    "fuzzy": FuzzyAugmentation,
    "parcelling": Parcelling,
    "reweighting": Reweighting,
    "reclassification": Reclassification,
    # Two names for the same class.
    "extrapolation": Extrapolation,
    "twins": Extrapolation,
    # Two spellings for the same class.
    "selflearning": SelfLearning,
    "self-learning": SelfLearning,
}
31
+
32
+
33
def get_inferencer(method: str, base_estimator: BaseEstimator | None = None, **params):
    """Build a reject inferencer by name.

    The lookup is case-insensitive.

    Parameters
    ----------
    method : str
        One of: simple, fuzzy, parcelling, reweighting, reclassification,
        extrapolation (alias twins), selflearning (alias self-learning).
    base_estimator : sklearn classifier, optional
        Forwarded to the inferencer's constructor.
    **params
        Extra keyword arguments for the chosen method (e.g. ``uplift=1.5``).

    Raises
    ------
    ValueError
        If ``method`` is not a registered name.
    """
    factory = _METHODS.get(str(method).lower())
    if factory is None:
        raise ValueError(f"Unknown method {method!r}. Available: {sorted(_METHODS)}.")
    return factory(base_estimator=base_estimator, **params)
49
+
50
+
51
class RejectInferenceClassifier(BaseEstimator, ClassifierMixin):
    """Wrap a scikit-learn classifier with reject inference.

    Infers labels/weights for the rejected applicants using ``method``, builds
    the augmented through-the-door sample, and fits ``estimator`` on it
    (passing ``sample_weight``). It then behaves like an ordinary fitted
    classifier via :meth:`predict` / :meth:`predict_proba`.

    Note the non-standard ``fit`` signature: it takes accepts and rejects
    separately. Accepts pandas, polars, or numpy inputs.

    Parameters
    ----------
    estimator : sklearn classifier, optional
        Final model trained on the augmented sample. Must accept
        ``sample_weight``. Defaults to LogisticRegression.
    method : str, default='fuzzy'
        See :func:`get_inferencer`.
    base_scorer : sklearn classifier, optional
        Internal model used by the reject inference method.
    method_params : dict, optional
        Extra keyword arguments forwarded to the method, e.g. ``{'uplift': 1.5}``.
    """

    def __init__(self, estimator=None, method="fuzzy", base_scorer=None, method_params=None):
        self.estimator = estimator
        self.method = method
        self.base_scorer = base_scorer
        self.method_params = method_params

    def fit(self, X_accept, y_accept, X_reject):
        """Run reject inference, then fit the final model on the augmented sample.

        Sets ``inferencer_``, ``estimator_``, ``classes_`` and
        ``n_features_in_`` and returns ``self``.
        """
        extra = self.method_params or {}
        inferencer = get_inferencer(self.method, base_estimator=self.base_scorer, **extra)
        X_aug, y_aug, w_aug = inferencer.fit_resample(X_accept, y_accept, X_reject)
        self.inferencer_ = inferencer

        # Never fit the user's estimator in place: work on a clone, or on a
        # default logistic regression when none was given.
        if self.estimator is None:
            self.estimator_ = LogisticRegression(max_iter=1000)
        else:
            self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X_aug, y_aug, sample_weight=w_aug)

        # Fall back to the binary 0/1 convention if the model exposes no classes_.
        self.classes_ = getattr(self.estimator_, "classes_", np.array([0, 1]))
        self.n_features_in_ = X_aug.shape[1]
        return self

    def predict(self, X):
        """Predict class labels with the fitted final model."""
        features = to_numpy_2d(X)
        return self.estimator_.predict(features)

    def predict_proba(self, X):
        """Predict class probabilities with the fitted final model."""
        features = to_numpy_2d(X)
        return self.estimator_.predict_proba(features)

    def decision_function(self, X):
        """Return the final model's decision scores.

        Uses the model's own ``decision_function`` when it has one, otherwise
        the second column of ``predict_proba``.
        """
        features = to_numpy_2d(X)
        if not hasattr(self.estimator_, "decision_function"):
            return self.predict_proba(features)[:, 1]
        return self.estimator_.decision_function(features)
rejectkit/methods/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """Reject inference methods."""
2
+
3
+ from .augmentation import FuzzyAugmentation, SimpleAugmentation
4
+ from .extrapolation import Extrapolation
5
+ from .heckman import HeckmanClassifier
6
+ from .parcelling import Parcelling
7
+ from .reclassification import Reclassification
8
+ from .reweighting import Reweighting
9
+ from .semi_supervised import SelfLearning
10
+
11
# Names exported by ``from rejectkit.methods import *``.
__all__ = [
    # from .augmentation
    "SimpleAugmentation",
    "FuzzyAugmentation",
    # from .parcelling
    "Parcelling",
    # from .reweighting
    "Reweighting",
    # from .reclassification
    "Reclassification",
    # from .extrapolation
    "Extrapolation",
    # from .semi_supervised
    "SelfLearning",
    # from .heckman
    "HeckmanClassifier",
]