rejectkit 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rejectkit/__init__.py +47 -0
- rejectkit/_compat.py +19 -0
- rejectkit/base.py +94 -0
- rejectkit/benchmark.py +141 -0
- rejectkit/datasets.py +97 -0
- rejectkit/diagnostics.py +109 -0
- rejectkit/estimator.py +105 -0
- rejectkit/methods/__init__.py +20 -0
- rejectkit/methods/augmentation.py +69 -0
- rejectkit/methods/extrapolation.py +47 -0
- rejectkit/methods/heckman.py +98 -0
- rejectkit/methods/parcelling.py +87 -0
- rejectkit/methods/reclassification.py +45 -0
- rejectkit/methods/reweighting.py +47 -0
- rejectkit/methods/semi_supervised.py +64 -0
- rejectkit/plotting.py +75 -0
- rejectkit-0.3.0.dist-info/METADATA +219 -0
- rejectkit-0.3.0.dist-info/RECORD +20 -0
- rejectkit-0.3.0.dist-info/WHEEL +4 -0
- rejectkit-0.3.0.dist-info/licenses/LICENSE +21 -0
rejectkit/__init__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""rejectkit — reject inference for credit scoring, scikit-learn style.
|
|
2
|
+
|
|
3
|
+
Reject inference corrects the sampling bias that arises when a credit model is
|
|
4
|
+
trained only on *accepted* applicants whose good/bad outcome is observed, while
|
|
5
|
+
the *rejected* applicants — who are also part of the through-the-door
|
|
6
|
+
population the model must score — are silently dropped.
|
|
7
|
+
|
|
8
|
+
``rejectkit`` provides the classic reject inference methods behind a single,
|
|
9
|
+
scikit-learn-compatible API, plus a benchmark harness that lets you measure —
|
|
10
|
+
on your own data — whether reject inference actually helps.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from . import datasets, diagnostics, plotting
|
|
14
|
+
from .base import BaseRejectInferencer
|
|
15
|
+
from .benchmark import MaskedRejectBenchmark
|
|
16
|
+
from .estimator import RejectInferenceClassifier, get_inferencer
|
|
17
|
+
from .methods import (
|
|
18
|
+
Extrapolation,
|
|
19
|
+
FuzzyAugmentation,
|
|
20
|
+
HeckmanClassifier,
|
|
21
|
+
Parcelling,
|
|
22
|
+
Reclassification,
|
|
23
|
+
Reweighting,
|
|
24
|
+
SelfLearning,
|
|
25
|
+
SimpleAugmentation,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__version__ = "0.3.0"
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"BaseRejectInferencer",
|
|
32
|
+
"SimpleAugmentation",
|
|
33
|
+
"FuzzyAugmentation",
|
|
34
|
+
"Parcelling",
|
|
35
|
+
"Reweighting",
|
|
36
|
+
"Reclassification",
|
|
37
|
+
"Extrapolation",
|
|
38
|
+
"SelfLearning",
|
|
39
|
+
"HeckmanClassifier",
|
|
40
|
+
"RejectInferenceClassifier",
|
|
41
|
+
"get_inferencer",
|
|
42
|
+
"MaskedRejectBenchmark",
|
|
43
|
+
"diagnostics",
|
|
44
|
+
"datasets",
|
|
45
|
+
"plotting",
|
|
46
|
+
"__version__",
|
|
47
|
+
]
|
rejectkit/_compat.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Input coercion so rejectkit accepts pandas, polars, or plain numpy."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def to_numpy_2d(X):
    """Coerce a 2D array-like (pandas/polars DataFrame, ndarray, list) to float ndarray.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Any object exposing ``to_numpy()`` (e.g. a DataFrame), an ndarray, or
        a nested sequence.

    Returns
    -------
    numpy.ndarray
        Two-dimensional array with ``dtype=float``.

    Raises
    ------
    ValueError
        If the coerced array is not two-dimensional (for example a Series or
        a flat list), or if its values cannot be converted to float.
    """
    if hasattr(X, "to_numpy"):  # pandas / polars DataFrame
        X = X.to_numpy()
    arr = np.asarray(X, dtype=float)
    # Fail here with a clear message instead of letting 1-D input reach code
    # that indexes ``shape[1]`` or multiplies by a weight vector.
    if arr.ndim != 2:
        raise ValueError(
            f"Expected a 2D array-like, got an array with {arr.ndim} "
            f"dimension(s) and shape {arr.shape}."
        )
    return arr
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def to_numpy_1d(y):
    """Flatten a 1D array-like (Series, ndarray, list) into a 1D ndarray.

    Objects exposing ``to_numpy()`` are converted through it first; the result
    is then passed through ``np.asarray`` and flattened. The dtype is left as
    inferred by numpy.
    """
    values = y.to_numpy() if hasattr(y, "to_numpy") else y
    flat = np.asarray(values)
    return flat.ravel()
|
rejectkit/base.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Base class shared by all reject inference methods."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from sklearn.base import BaseEstimator, clone
|
|
9
|
+
from sklearn.linear_model import LogisticRegression
|
|
10
|
+
from sklearn.utils import check_array
|
|
11
|
+
|
|
12
|
+
from ._compat import to_numpy_1d, to_numpy_2d
|
|
13
|
+
|
|
14
|
+
ArrayTriple = tuple[np.ndarray, np.ndarray, np.ndarray]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BaseRejectInferencer(BaseEstimator, ABC):
    """Abstract base class for reject inference methods.

    A reject inferencer takes the accepted applicants ``(X_accept, y_accept)``,
    whose good/bad outcome is known, together with the rejected applicants
    ``X_reject``, whose outcome is unknown, and produces an augmented, weighted
    training sample ``(X, y, sample_weight)`` that approximates the full
    "through-the-door" population.

    The label convention is ``1 = bad`` (the event of interest, e.g. default)
    and ``0 = good``, matching ``predict_proba(...)[:, 1] = P(bad)``.

    Accepts pandas, polars, or numpy inputs.

    Parameters
    ----------
    base_estimator : sklearn classifier, optional
        Probabilistic model used internally by the method. Must implement
        ``predict_proba``. Defaults to LogisticRegression.
    """

    def __init__(self, base_estimator: BaseEstimator | None = None):
        self.base_estimator = base_estimator

    def _make_base(self) -> BaseEstimator:
        """Return an unfitted scorer: a clone of ``base_estimator`` or the default."""
        if self.base_estimator is None:
            return LogisticRegression(max_iter=1000)
        return clone(self.base_estimator)

    def _validate(self, X_accept, y_accept, X_reject):
        """Coerce and check the three inputs.

        Returns
        -------
        (X_accept, y_accept, X_reject)
            Float feature matrices and an integer label vector.

        Raises
        ------
        ValueError
            If ``X_accept`` and ``y_accept`` differ in length, if the labels
            are not integer-valued, if the two feature matrices have different
            numbers of columns, or if the labels are not exactly ``{0, 1}``.
        """
        X_accept = check_array(to_numpy_2d(X_accept), dtype=float)
        X_reject = check_array(to_numpy_2d(X_reject), dtype=float)
        y_accept = to_numpy_1d(y_accept)
        if y_accept.shape[0] != X_accept.shape[0]:
            raise ValueError(
                f"X_accept has {X_accept.shape[0]} samples but y_accept has "
                f"{y_accept.shape[0]}."
            )
        # ``astype(int)`` truncates, so a float label such as 0.7 would
        # silently become 0; reject non-integral or non-finite floats first.
        if y_accept.dtype.kind == "f":
            integral = (
                np.isfinite(y_accept).all()
                and (y_accept == np.round(y_accept)).all()
            )
            if not integral:
                raise ValueError(
                    "y_accept must be coercible to integers 0/1; got "
                    "non-integer or non-finite values."
                )
        try:
            y_accept = y_accept.astype(int)
        except (ValueError, TypeError) as exc:  # pragma: no cover - defensive
            raise ValueError("y_accept must be coercible to integers 0/1.") from exc
        if X_accept.shape[1] != X_reject.shape[1]:
            raise ValueError(
                f"X_accept has {X_accept.shape[1]} features but X_reject has "
                f"{X_reject.shape[1]}."
            )
        classes = set(np.unique(y_accept).tolist())
        if classes != {0, 1}:
            raise ValueError(
                "y_accept must be binary with both labels present, where "
                f"1 = bad (event) and 0 = good. Got classes {sorted(classes)}."
            )
        return X_accept, y_accept, X_reject

    def _reject_bad_proba(self) -> np.ndarray:
        """P(bad) for rejected applicants from the good/bad scorer."""
        return self.scorer_.predict_proba(self.X_reject_)[:, 1]

    def fit(self, X_accept, y_accept, X_reject):
        """Fit any internal models on the accepts (and rejects, if needed).

        Stores the validated inputs as ``X_accept_``, ``y_accept_`` and
        ``X_reject_``, records ``n_features_in_``, runs the :meth:`_fit` hook
        and returns ``self``.
        """
        X_accept, y_accept, X_reject = self._validate(X_accept, y_accept, X_reject)
        self.X_accept_ = X_accept
        self.y_accept_ = y_accept
        self.X_reject_ = X_reject
        self.n_features_in_ = X_accept.shape[1]
        self._fit()
        self.is_fitted_ = True
        return self

    def _fit(self) -> None:
        """Hook for method-specific fitting; default fits the good/bad scorer."""
        self.scorer_ = self._make_base()
        self.scorer_.fit(self.X_accept_, self.y_accept_)

    @abstractmethod
    def resample(self) -> ArrayTriple:
        """Return the augmented training sample ``(X, y, sample_weight)``."""
        raise NotImplementedError

    def fit_resample(self, X_accept, y_accept, X_reject) -> ArrayTriple:
        """Convenience: :meth:`fit` followed by :meth:`resample`."""
        return self.fit(X_accept, y_accept, X_reject).resample()
|
rejectkit/benchmark.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Benchmark harness for reject inference methods.
|
|
2
|
+
|
|
3
|
+
The core difficulty of reject inference is that you never observe the outcome
|
|
4
|
+
of rejected applicants, so you cannot directly validate any method on real
|
|
5
|
+
data. :class:`MaskedRejectBenchmark` sidesteps this on a *fully labelled*
|
|
6
|
+
dataset: it hides the labels of a synthetically "rejected" subset of the
|
|
7
|
+
training data, asks each method to recover a model, and scores every model on
|
|
8
|
+
an untouched test set representing the true through-the-door population.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from collections.abc import Sequence
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
from sklearn.base import clone
|
|
18
|
+
from sklearn.linear_model import LogisticRegression
|
|
19
|
+
from sklearn.model_selection import train_test_split
|
|
20
|
+
from sklearn.preprocessing import StandardScaler
|
|
21
|
+
|
|
22
|
+
from ._compat import to_numpy_1d, to_numpy_2d
|
|
23
|
+
from .diagnostics import auc, gini, ks_statistic
|
|
24
|
+
from .estimator import RejectInferenceClassifier
|
|
25
|
+
from .methods import HeckmanClassifier
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MaskedRejectBenchmark:
    """Benchmark reject inference methods on a fully labelled dataset.

    The training split is divided into synthetic "accepts" and "rejects"; the
    reject labels are withheld from every method, and each resulting model is
    scored on the held-out test split.

    Parameters
    ----------
    selection : {'mar', 'mnar', 'cutoff'}, default='mnar'
        Acceptance mechanism applied to the training set.
        ``'mar'`` — acceptance depends only on observed features.
        ``'mnar'`` — acceptance also depends on the hidden outcome.
        ``'cutoff'`` — accept the lowest-PD fraction under a quick logistic
        model fitted on the training split.
    accept_rate : float, default=0.6
        Fraction of training applicants accepted.
    test_size : float, default=0.3
        Fraction held out as the evaluation population.
    selection_strength : float, default=2.0
        Multiplier on the feature term (``'mar'``) or on the outcome term
        (``'mnar'``). Unused for ``'cutoff'``.
    random_state : int, default=0
        Seed for the train/test split and the selection draw.
    """

    def __init__(self, selection="mnar", accept_rate=0.6, test_size=0.3,
                 selection_strength=2.0, random_state=0):
        self.selection = selection
        self.accept_rate = accept_rate
        self.test_size = test_size
        self.selection_strength = selection_strength
        self.random_state = random_state

    def _selection_scores(self, X_std, y, rng):
        """Latent acceptance score per row; higher means more likely accepted."""
        n_rows, n_cols = X_std.shape
        # Draw order matters for reproducibility: direction first, then noise.
        direction = rng.normal(size=n_cols)
        projected = X_std @ direction
        projected = projected / (projected.std() + 1e-12)
        jitter = rng.normal(size=n_rows)
        mode = self.selection
        if mode == "mar":
            return self.selection_strength * projected + jitter
        if mode == "mnar":
            # +strength for goods (y=0), -strength for bads (y=1).
            outcome_push = self.selection_strength * (0.5 - y) * 2.0
            return projected + outcome_push + jitter
        raise ValueError("selection must be 'mar', 'mnar', or 'cutoff'.")

    def _accept_mask(self, X_train, y_train) -> np.ndarray:
        """Boolean mask over the training rows marking the synthetic accepts."""
        if self.selection != "cutoff":
            rng = np.random.default_rng(self.random_state)
            standardized = StandardScaler().fit_transform(X_train)
            latent = self._selection_scores(standardized, y_train, rng)
            threshold = np.quantile(latent, 1.0 - self.accept_rate)
            keep = latent >= threshold
        else:
            quick_model = LogisticRegression(max_iter=1000)
            pd_hat = quick_model.fit(X_train, y_train).predict_proba(X_train)[:, 1]
            keep = pd_hat <= np.quantile(pd_hat, self.accept_rate)
        if np.unique(y_train[keep]).size < 2:
            raise ValueError(
                "The accepted subset contains a single class; lower "
                "selection_strength or change accept_rate."
            )
        return keep

    def compare(self, methods: Sequence[str], X, y, estimator=None,
                method_params: dict | None = None) -> pd.DataFrame:
        """Run the benchmark and return one row of metrics per model.

        ``'heckman'`` (case-insensitive) is fitted with
        :class:`HeckmanClassifier`; every other name goes through
        :class:`RejectInferenceClassifier`. The result is indexed by
        ``['oracle', 'naive', *methods]`` with columns ``auc``, ``ks``,
        ``gini`` and ``auc_recovery`` (0 = no better than naive, 1 = matches
        oracle; NaN when the oracle/naive AUC gap is at most 5e-3). Sample
        sizes and the selection mode are stored in ``df.attrs``.
        """
        X = to_numpy_2d(X)
        y = to_numpy_1d(y).astype(int)
        per_method = method_params or {}
        if estimator is None:
            template = LogisticRegression(max_iter=1000)
        else:
            template = estimator

        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )
        accepted = self._accept_mask(X_tr, y_tr)
        rejected = ~accepted
        X_a = X_tr[accepted]
        y_a = y_tr[accepted]
        X_r = X_tr[rejected]

        def _evaluate(model):
            p_bad = model.predict_proba(X_te)[:, 1]
            return {
                "auc": auc(y_te, p_bad),
                "ks": ks_statistic(y_te, p_bad),
                "gini": gini(y_te, p_bad),
            }

        results = {}
        results["oracle"] = _evaluate(clone(template).fit(X_tr, y_tr))
        results["naive"] = _evaluate(clone(template).fit(X_a, y_a))
        for name in methods:
            if str(name).lower() == "heckman":
                model = HeckmanClassifier(outcome_estimator=clone(template))
            else:
                model = RejectInferenceClassifier(
                    estimator=clone(template),
                    method=name,
                    method_params=per_method.get(name, {}),
                )
            model.fit(X_a, y_a, X_r)
            results[name] = _evaluate(model)

        table = pd.DataFrame(results).T[["auc", "ks", "gini"]]
        naive_auc = results["naive"]["auc"]
        gap = results["oracle"]["auc"] - naive_auc
        # Recovery only makes sense when the oracle clearly beats the naive
        # model; otherwise the ratio blows up around a near-zero denominator.
        if gap > 5e-3:
            table["auc_recovery"] = (table["auc"] - naive_auc) / gap
        else:
            table["auc_recovery"] = np.nan
        table.attrs.update(
            n_accept=int(accepted.sum()),
            n_reject=int(rejected.sum()),
            n_test=int(len(y_te)),
            selection=self.selection,
        )
        return table
|
rejectkit/datasets.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Synthetic data generators for examples, tests and demos."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _sigmoid(z: np.ndarray) -> np.ndarray:
|
|
10
|
+
"""Numerically stable logistic sigmoid."""
|
|
11
|
+
out = np.empty_like(z, dtype=float)
|
|
12
|
+
pos = z >= 0
|
|
13
|
+
out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
|
|
14
|
+
ez = np.exp(z[~pos])
|
|
15
|
+
out[~pos] = ez / (1.0 + ez)
|
|
16
|
+
return out
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _solve_intercept(logit: np.ndarray, target: float, iters: int = 60) -> float:
    """Find by bisection the intercept giving a mean event rate of ``target``.

    Searches the bracket ``[-20, 20]`` for ``iters`` halvings and returns the
    midpoint of the final bracket.
    """
    lower, upper = -20.0, 20.0
    for _step in range(iters):
        midpoint = 0.5 * (lower + upper)
        mean_rate = _sigmoid(logit + midpoint).mean()
        # Rate too low -> raise the floor; otherwise lower the ceiling.
        lower, upper = (midpoint, upper) if mean_rate < target else (lower, midpoint)
    return 0.5 * (lower + upper)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def make_credit_data(
    n_samples: int = 3000,
    n_features: int = 8,
    bad_rate: float = 0.3,
    random_state: int = 0,
) -> tuple[pd.DataFrame, pd.Series]:
    """Simulate a fully labelled credit dataset.

    Features are standard-normal draws. The linear predictor uses random
    coefficients (the second half shrunk by 0.2), is standardized and scaled
    to 1.5, and is shifted by an intercept chosen so the mean P(bad) matches
    ``bad_rate``. Labels are Bernoulli draws from that probability.

    Returns
    -------
    X : pandas.DataFrame, shape (n_samples, n_features)
        Columns are named ``x1`` ... ``x{n_features}``.
    y : pandas.Series
        Named ``"bad"``; ``1 = bad`` (default), ``0 = good``. Its mean is
        approximately ``bad_rate``.
    """
    rng = np.random.default_rng(random_state)
    # Draw order is fixed: features, coefficients, then label uniforms.
    features = rng.normal(size=(n_samples, n_features))
    coef = rng.normal(size=n_features)
    half = n_features // 2
    coef[half:] *= 0.2  # second half weakly informative
    raw = features @ coef
    scaled = 1.5 * (raw - raw.mean()) / (raw.std() + 1e-12)
    shift = _solve_intercept(scaled, bad_rate)
    p_bad = _sigmoid(scaled + shift)
    uniforms = rng.random(n_samples)
    labels = (uniforms < p_bad).astype(int)
    names = [f"x{idx}" for idx in range(1, n_features + 1)]
    frame = pd.DataFrame(features, columns=names)
    target = pd.Series(labels, name="bad")
    return frame, target
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def make_accept_reject(
    n_samples: int = 3000,
    n_features: int = 8,
    bad_rate: float = 0.3,
    accept_rate: float = 0.6,
    selection: str = "mnar",
    selection_strength: float = 1.5,
    random_state: int = 0,
) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Simulate an accepted/rejected split of a synthetic credit portfolio.

    Data come from :func:`make_credit_data`; a latent acceptance score is
    built from a random projection of the standardized features plus noise
    (and, for ``'mnar'``, an outcome term), and the top ``accept_rate``
    fraction by that score is accepted.

    Returns
    -------
    X_accept, y_accept : the accepted applicants and their observed outcomes.
    X_reject : the rejected applicants (features only).
    y_reject_true : the rejects' true outcomes — **hidden** in practice, returned
        only so demos and tests can quantify how well a method recovers them.

    Raises
    ------
    ValueError
        If ``selection`` is neither ``'mar'`` nor ``'mnar'``.
    """
    X, y = make_credit_data(n_samples, n_features, bad_rate, random_state)
    # A different seed from the data generator keeps selection independent.
    rng = np.random.default_rng(random_state + 1)
    raw = X.to_numpy()
    standardized = (raw - raw.mean(0)) / (raw.std(0) + 1e-12)
    direction = rng.normal(size=n_features)
    projection = standardized @ direction
    projection = projection / (projection.std() + 1e-12)
    noise = rng.normal(size=n_samples)
    if selection == "mar":
        latent = selection_strength * projection + noise
    elif selection == "mnar":
        latent = projection + selection_strength * (0.5 - y.to_numpy()) * 2.0 + noise
    else:
        raise ValueError("selection must be 'mar' or 'mnar'.")
    accepted = latent >= np.quantile(latent, 1.0 - accept_rate)
    rejected = ~accepted

    def _take(data, mask):
        return data[mask].reset_index(drop=True)

    return (
        _take(X, accepted),
        _take(y, accepted),
        _take(X, rejected),
        _take(y, rejected),
    )
|
rejectkit/diagnostics.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Lightweight evaluation and drift metrics used across rejectkit.
|
|
2
|
+
|
|
3
|
+
No extra dependencies (numpy + scikit-learn + pandas only) and using the ``1 = bad``
|
|
4
|
+
convention, so ``y_score`` is always interpreted as P(bad).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from ._compat import to_numpy_1d, to_numpy_2d
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _check_arrays(y_true, y_score):
    """Return ``(y_true, y_score)`` as 1D arrays, with scores cast to float.

    Raises ``ValueError`` when the two arrays differ in length.
    """
    labels = to_numpy_1d(y_true)
    scores = to_numpy_1d(y_score).astype(float)
    if labels.shape[0] == scores.shape[0]:
        return labels, scores
    raise ValueError("y_true and y_score must have the same length.")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def auc(y_true, y_score) -> float:
    """Return the area under the ROC curve as a Python float."""
    from sklearn.metrics import roc_auc_score

    labels, scores = _check_arrays(y_true, y_score)
    area = roc_auc_score(labels, scores)
    return float(area)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def gini(y_true, y_score) -> float:
    """Return the Gini coefficient, computed as ``2 * AUC - 1``."""
    area = auc(y_true, y_score)
    return 2.0 * area - 1.0
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def ks_statistic(y_true, y_score) -> float:
    """Return the Kolmogorov-Smirnov statistic: the largest ``tpr - fpr`` on the ROC curve."""
    from sklearn.metrics import roc_curve

    labels, scores = _check_arrays(y_true, y_score)
    false_pos, true_pos, _thresholds = roc_curve(labels, scores)
    separation = true_pos - false_pos
    return float(separation.max())
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def psi(expected, actual, n_bins: int = 10, eps: float = 1e-6) -> float:
    """Compute the Population Stability Index of ``actual`` against ``expected``.

    Bin edges are the unique quantiles of ``expected`` with the outer edges
    opened to +/- infinity; bin shares are floored at ``eps`` before taking
    the log ratio. Returns ``0.0`` when fewer than two distinct edges exist.
    Rule of thumb: < 0.1 no shift, 0.1-0.25 moderate, > 0.25 major.
    """
    reference = to_numpy_1d(expected).astype(float)
    current = to_numpy_1d(actual).astype(float)
    probs = np.linspace(0, 1, n_bins + 1)
    edges = np.unique(np.quantile(reference, probs))
    if edges.size < 2:
        return 0.0
    # Open the outer bins so values outside the reference range are counted.
    edges[0] = -np.inf
    edges[-1] = np.inf
    ref_counts, _ = np.histogram(reference, bins=edges)
    cur_counts, _ = np.histogram(current, bins=edges)
    ref_share = np.clip(ref_counts / max(ref_counts.sum(), 1), eps, None)
    cur_share = np.clip(cur_counts / max(cur_counts.sum(), 1), eps, None)
    contributions = (cur_share - ref_share) * np.log(cur_share / ref_share)
    return float(np.sum(contributions))
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def feature_drift(X_accept, X_reject, n_bins: int = 10) -> pd.Series:
    """Return per-feature PSI between accepts and rejects, largest first.

    Feature names come from ``X_accept.columns`` when present, otherwise
    ``x1``, ``x2``, ... are generated. Accepts are the PSI reference and
    rejects the comparison sample. The Series is named ``"psi"``.
    """
    names = list(X_accept.columns) if hasattr(X_accept, "columns") else None
    accepts = to_numpy_2d(X_accept)
    rejects = to_numpy_2d(X_reject)
    if names is None:
        names = [f"x{j}" for j in range(1, accepts.shape[1] + 1)]
    scores = {}
    for j, name in enumerate(names):
        scores[name] = psi(accepts[:, j], rejects[:, j], n_bins=n_bins)
    ranked = pd.Series(scores, name="psi")
    return ranked.sort_values(ascending=False)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def swap_set(
    y_true,
    score_reference,
    score_challenger,
    cutoff_reference,
    cutoff_challenger,
    lower_is_safer: bool = True,
) -> pd.DataFrame:
    """Tabulate the swap set between a reference and a challenger policy.

    Each policy accepts an applicant whose score is on the safe side of its
    cutoff (``<=`` when ``lower_is_safer``, else ``>=``). The result has one
    row per group — ``kept_accept``, ``swap_out`` (reference accepts,
    challenger rejects), ``swap_in`` (reference rejects, challenger accepts)
    and ``kept_reject`` — with columns ``n`` and ``bad_rate``. Empty groups
    get a NaN bad rate.
    """
    y = to_numpy_1d(y_true).astype(int)
    ref_scores = to_numpy_1d(score_reference).astype(float)
    cha_scores = to_numpy_1d(score_challenger).astype(float)
    if lower_is_safer:
        ref_ok = ref_scores <= cutoff_reference
        cha_ok = cha_scores <= cutoff_challenger
    else:
        ref_ok = ref_scores >= cutoff_reference
        cha_ok = cha_scores >= cutoff_challenger
    masks = (
        ("kept_accept", ref_ok & cha_ok),
        ("swap_out", ref_ok & ~cha_ok),
        ("swap_in", ~ref_ok & cha_ok),
        ("kept_reject", ~ref_ok & ~cha_ok),
    )
    summary = {}
    for label, mask in masks:
        count = int(mask.sum())
        rate = float(y[mask].mean()) if count else float("nan")
        summary[label] = {"n": count, "bad_rate": rate}
    return pd.DataFrame(summary).T[["n", "bad_rate"]]
|
rejectkit/estimator.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""High-level estimator that wraps any classifier with reject inference."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from sklearn.base import BaseEstimator, ClassifierMixin, clone
|
|
7
|
+
from sklearn.linear_model import LogisticRegression
|
|
8
|
+
|
|
9
|
+
from ._compat import to_numpy_2d
|
|
10
|
+
from .methods import (
|
|
11
|
+
Extrapolation,
|
|
12
|
+
FuzzyAugmentation,
|
|
13
|
+
Parcelling,
|
|
14
|
+
Reclassification,
|
|
15
|
+
Reweighting,
|
|
16
|
+
SelfLearning,
|
|
17
|
+
SimpleAugmentation,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Registry of reject inference methods, keyed by lower-case name and consulted
# by ``get_inferencer``. Some classes are reachable under more than one key.
_METHODS = {
    "simple": SimpleAugmentation,
    "fuzzy": FuzzyAugmentation,
    "parcelling": Parcelling,
    "reweighting": Reweighting,
    "reclassification": Reclassification,
    "extrapolation": Extrapolation,
    "twins": Extrapolation,  # alias of "extrapolation"
    "selflearning": SelfLearning,
    "self-learning": SelfLearning,  # hyphenated alias of "selflearning"
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_inferencer(method: str, base_estimator: BaseEstimator | None = None, **params):
    """Instantiate the reject inferencer registered under ``method``.

    Parameters
    ----------
    method : str
        Case-insensitive name: simple, fuzzy, parcelling, reweighting,
        reclassification, extrapolation (alias twins), selflearning (alias
        self-learning).
    base_estimator : sklearn classifier, optional
        Passed to the inferencer as ``base_estimator``.
    **params
        Extra keyword arguments for the chosen method (e.g. ``uplift=1.5``).

    Raises
    ------
    ValueError
        If ``method`` is not a registered name.
    """
    name = str(method).lower()
    factory = _METHODS.get(name)
    if factory is None:
        raise ValueError(f"Unknown method {method!r}. Available: {sorted(_METHODS)}.")
    return factory(base_estimator=base_estimator, **params)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class RejectInferenceClassifier(BaseEstimator, ClassifierMixin):
    """Classifier trained on a reject-inference-augmented sample.

    ``fit`` builds the inferencer named by ``method``, asks it for the
    augmented sample ``(X, y, sample_weight)`` and trains ``estimator`` on
    that sample with the weights. Prediction methods delegate to the fitted
    estimator.

    The ``fit`` signature is non-standard: accepts and rejects are passed
    separately. Accepts pandas, polars, or numpy inputs.

    Parameters
    ----------
    estimator : sklearn classifier, optional
        Final model trained on the augmented sample. Must accept
        ``sample_weight``. Defaults to LogisticRegression.
    method : str, default='fuzzy'
        See :func:`get_inferencer`.
    base_scorer : sklearn classifier, optional
        Internal model used by the reject inference method.
    method_params : dict, optional
        Extra keyword arguments forwarded to the method, e.g. ``{'uplift': 1.5}``.
    """

    def __init__(self, estimator=None, method="fuzzy", base_scorer=None, method_params=None):
        self.estimator = estimator
        self.method = method
        self.base_scorer = base_scorer
        self.method_params = method_params

    def fit(self, X_accept, y_accept, X_reject):
        """Infer the rejects, fit the final estimator, and return ``self``."""
        extra = self.method_params or {}
        inferencer = get_inferencer(self.method, base_estimator=self.base_scorer, **extra)
        X_aug, y_aug, w_aug = inferencer.fit_resample(X_accept, y_accept, X_reject)
        self.inferencer_ = inferencer
        if self.estimator is None:
            final_model = LogisticRegression(max_iter=1000)
        else:
            final_model = clone(self.estimator)
        self.estimator_ = final_model
        final_model.fit(X_aug, y_aug, sample_weight=w_aug)
        # Fall back to the 0/1 convention if the estimator exposes no classes_.
        self.classes_ = getattr(final_model, "classes_", np.array([0, 1]))
        self.n_features_in_ = X_aug.shape[1]
        return self

    def predict(self, X):
        """Predict class labels with the fitted estimator."""
        features = to_numpy_2d(X)
        return self.estimator_.predict(features)

    def predict_proba(self, X):
        """Predict class probabilities with the fitted estimator."""
        features = to_numpy_2d(X)
        return self.estimator_.predict_proba(features)

    def decision_function(self, X):
        """Return the estimator's decision scores, or P(class 1) if it has none."""
        features = to_numpy_2d(X)
        if not hasattr(self.estimator_, "decision_function"):
            return self.predict_proba(features)[:, 1]
        return self.estimator_.decision_function(features)
|
rejectkit/methods/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Reject inference methods."""
|
|
2
|
+
|
|
3
|
+
from .augmentation import FuzzyAugmentation, SimpleAugmentation
|
|
4
|
+
from .extrapolation import Extrapolation
|
|
5
|
+
from .heckman import HeckmanClassifier
|
|
6
|
+
from .parcelling import Parcelling
|
|
7
|
+
from .reclassification import Reclassification
|
|
8
|
+
from .reweighting import Reweighting
|
|
9
|
+
from .semi_supervised import SelfLearning
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"SimpleAugmentation",
|
|
13
|
+
"FuzzyAugmentation",
|
|
14
|
+
"Parcelling",
|
|
15
|
+
"Reweighting",
|
|
16
|
+
"Reclassification",
|
|
17
|
+
"Extrapolation",
|
|
18
|
+
"SelfLearning",
|
|
19
|
+
"HeckmanClassifier",
|
|
20
|
+
]
|